########################################################################################
# DECIPHER Developmental Disorders panel in the Gene2Phenotype database (DDG2P), hg38/hg19
# November 5, 2024 - Yesenia Puga, Jairo Navarro, Gerardo Perez

# Download required files
# (all downloads land in the current directory; later steps use relative paths)
wget https://www.deciphergenomics.org/files/downloads/population_cnv_grch38.txt.gz  # Downloads the CNV data file for hg38
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes  # Downloads chromosome size data for hg38
wget https://genome.ucsc.edu/goldenPath/help/examples/bedExample2.as  # Downloads the .as file defining custom track fields
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed  # Downloads the bedToBigBed utility for converting BED to bigBed format
chmod 700 bedToBigBed  # Makes bedToBigBed executable (owner-only permissions)

# Prepare the BED file:
# Decompresses the download and keeps only the first 15 tab-separated fields
# (the header line is still present at this point)
zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed

# Reorders columns to fit BED format: input columns 2-4 move to the front and
# input column 1 becomes the 4th (BED name) column; columns 5-15 keep their order.
# NOTE(review): presumably columns 2-4 are chrom/start/end and column 1 is the
# CNV identifier -- confirm against the header of the DECIPHER file.
awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $1, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38.bed > population_cnv_grch38_reordered.bed

# Removes the header line (it was reordered along with the data rows above)
tail -n +2 population_cnv_grch38_reordered.bed > population_cnv_grch38_final.bed

# Adjust the BED file for bigBed conversion:
# Prepends 'chr' to the chromosome name and emits 20 columns per row:
#   1-4   chrom, start, end, name
#   5-6   score fixed at 0, strand fixed at "."
#   7-8   thickStart/thickEnd, copied from start/end
#   9     placeholder 0, overwritten with an RGB string by assign_rgb_to_bed.py
#   10-20 columns 5-15 of the reordered file (column 19 is the CNV type that
#         assign_rgb_to_bed.py reads)
# NOTE(review): the start coordinate is passed through unchanged here, whereas
# the January 2025 rebuild later in this file subtracts 1 from it -- confirm
# whether the source coordinates are 1-based.
awk 'BEGIN{OFS="\t"} {print "chr"$1, $2, $3, $4, 0, ".", $2, $3, 0, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38_final.bed > population_cnv_grch38_final_chr.bed

# Sort the BED file:
# Sorts by chromosome name in byte order (LC_ALL=C), then numerically by start
# position, which is the ordering bedToBigBed expects
LC_ALL=C sort -k1,1 -k2,2n population_cnv_grch38_final_chr.bed > population_cnv_grch38_final_sorted.bed


# Add RGB colors to the BED file using a Python script:
# (the script, listed further down in this file, is expected one directory up)
python3 ../assign_rgb_to_bed.py population_cnv_grch38_final_sorted.bed output_population_cnv_grch38.bed

# Convert the BED file to bigBed format (9 standard BED fields plus extra
# tab-separated fields described by the .as file), with an extra index on the
# name field for faster lookups.
# NOTE(review): the original comment called this "gene name"; the name column
# holds input column 1 of the DECIPHER file -- confirm what that field contains.
./bedToBigBed -type=bed9+ -as=bedExample2.as -tab -extraIndex=name output_population_cnv_grch38.bed hg38.chrom.sizes population_cnv_grch38.bb

# Add files to hive
# NOTE(review): hg19/population_cnv_grch37.bb is not produced by any command
# shown above; presumably the same steps were repeated with the GRCh37 download
# in an hg19/ directory (and these in an hg38/ directory) -- confirm.
cp hg19/population_cnv_grch37.bb /hive/data/genomes/hg19/bed/ddg2p/ddg2pSyndromes.bb
cp hg38/population_cnv_grch38.bb /hive/data/genomes/hg38/bed/ddg2p/ddg2pSyndromes.bb

# Create symlinks from hive to /gbdb
ln -s /hive/data/genomes/hg19/bed/ddg2p/ddg2pSyndromes.bb /gbdb/hg19/decipher/ddg2pSyndromes.bb
ln -s /hive/data/genomes/hg38/bed/ddg2p/ddg2pSyndromes.bb /gbdb/hg38/decipher/ddg2pSyndromes.bb

#######################################################
# Author: Yesenia Puga
# Program: assign_rgb_to_bed.py
#######################################################
 # The Python script 'assign_rgb_to_bed.py' is crucial in the data processing pipeline.
 # It assigns RGB color values based on CNV type ('loss', 'gain', 'del/dup')
 # to each entry, enhancing visual differentiation on the Genome Browser. It also updates the
 # 'type' column from numeric identifiers to descriptive text,
 # which is used in the browser for informative mouseover tooltips, aiding in quick and clear
 # variant identification.
import csv
import sys

# Map a descriptive CNV type to the RGB string stored in the BED color column
def get_rgb_color(cnv_type):
    """Return the 'R,G,B' color string for a descriptive CNV type.

    Args:
        cnv_type: One of "loss", "gain" or "del/dup".

    Returns:
        '255,0,0' (red) for "loss", '0,0,255' (blue) for "gain" and
        '128,128,128' (grey) for "del/dup".

    Raises:
        ValueError: If cnv_type is none of the three known labels.
    """
    # Ordered (label, color) pairs; compared with == in the listed order.
    palette = (
        ("loss", '255,0,0'),         # Red
        ("gain", '0,0,255'),         # Blue
        ("del/dup", '128,128,128'),  # Grey
    )
    for label, rgb in palette:
        if cnv_type == label:
            return rgb
    raise ValueError(f"Unexpected CNV type: {cnv_type}")

# Lookup table from the numeric CNV code found in the input file (kept as a
# string, exactly as read from the BED row) to its descriptive label.
cnv_descriptions = {
    '-1': 'loss',     # negative code -> copy-number loss
    '0': 'del/dup',   # zero code -> both deletion and duplication
    '1': 'gain',      # positive code -> copy-number gain
}

def process_file(input_bed_file, output_bed_file):
    """Rewrite a tab-separated BED file with CNV type labels and colors.

    For every input row, the numeric CNV code in the 19th column is replaced
    by its descriptive label from ``cnv_descriptions``, and the 9th column is
    set to the matching RGB string from ``get_rgb_color``. All other columns
    are written back unchanged.

    Args:
        input_bed_file: Path of the tab-separated BED file to read.
        output_bed_file: Path of the file to write; it is opened in 'w' mode,
            so any existing content is replaced.

    Raises:
        ValueError: If a row has fewer than 19 columns, or if its CNV code is
            not a key of ``cnv_descriptions``. Rows written before the failing
            row remain in the output file.
    """
    type_col = 18  # 0-based index of the CNV type (19th column)
    rgb_col = 8    # 0-based index of the RGB value (9th column)

    with open(input_bed_file, 'r') as infile, open(output_bed_file, 'w', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            # A row long enough to hold the CNV type is necessarily long
            # enough to hold the RGB column, so one length check suffices.
            if len(row) <= type_col:
                raise ValueError("Row does not contain enough columns to include the CNV type.")

            cnv_type_value = row[type_col]
            if cnv_type_value not in cnv_descriptions:
                raise ValueError(f"Invalid CNV type value found: {cnv_type_value}")

            cnv_type_description = cnv_descriptions[cnv_type_value]
            row[type_col] = cnv_type_description  # numeric code -> descriptive text
            row[rgb_col] = get_rgb_color(cnv_type_description)

            writer.writerow(row)

    # Report success only after both files have been flushed and closed.
    print(f"Updated file successfully. Output saved to {output_bed_file}.")

if __name__ == "__main__":
    # Command-line entry point: expects exactly the input and output BED paths.
    if len(sys.argv) != 3:
        # Report misuse on stderr and exit non-zero so a calling pipeline
        # notices the failure instead of continuing without an output file.
        print(
            "Usage: python3 assign_rgb_to_bed.py <input_bed_file> <output_bed_file>",
            file=sys.stderr,
        )
        sys.exit(1)

    process_file(sys.argv[1], sys.argv[2])

########################################################################################
# Renaming/updating DDG2P track to DECIPHER Population CNVs  #35053
# January 14, 2025 - Gerardo Perez

# Downloaded files
# NOTE(review): these commands appear before the 'cd' into the working
# directory below; presumably they were run from that directory -- confirm.
wget https://www.deciphergenomics.org/files/downloads/population_cnv_grch38.txt.gz
# Decompresses and keeps the first 15 tab-separated fields.
# NOTE(review): the identical command is repeated under "Commands" below.
zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes
# Copies the .as file and the coloring script from the earlier
# ddg2p_syndromes track hub directory into the working directory
cp /cluster/home/yepuga/public_html/trackHubs/ddg2p_syndromes/hg38/bedExample2.as /hive/users/gperez2/tracks/decipher/hg38
cp /cluster/home/yepuga/public_html/trackHubs/ddg2p_syndromes/hg38/assign_rgb_to_bed.py /hive/users/gperez2/tracks/decipher/hg38

# Working directory
cd /hive/users/gperez2/tracks/decipher/hg38

# Commands
# Decompresses the download and keeps only the first 15 tab-separated fields
zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed

# reorders columns to fit BED format: input columns 2-4 move to the front and
# input column 1 becomes the 4th (BED name) column
awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $1, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38.bed > population_cnv_grch38_reordered.bed

# removes the header line (it was reordered along with the data rows above)
tail -n +2 population_cnv_grch38_reordered.bed > population_cnv_grch38_final.bed

# Adjust the BED file for bigBed conversion:
# prepends 'chr' to chromosome numbers, subtracts 1 from the start position, and adjusts fields for bigBed
# (20 output columns: chrom, start-1, end, name, score 0, strand ".",
#  thickStart, thickEnd, placeholder 0 for the RGB column, then columns 5-15)
# NOTE(review): thickStart (column 7) still uses the unshifted start ($2), so it
# is one greater than chromStart ($2-1) -- confirm whether that is intended.
awk 'BEGIN{OFS="\t"} {print "chr"$1, $2-1, $3, $4, 0, ".", $2, $3, 0, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38_final.bed > population_cnv_grch38_final_chr.bed

# Sorts the BED file with the UCSC bedSort utility (input file, then output file)
bedSort population_cnv_grch38_final_chr.bed population_cnv_grch38_sorted.bed

# Add RGB colors to the BED file using a Python script:
# (replaces the numeric CNV code in column 19 with text and fills in column 9)
python3  assign_rgb_to_bed.py population_cnv_grch38_sorted.bed output_population_cnv_grch38.bed

# Converts to bigBed (9 standard BED fields plus extra tab-separated fields
# described by the .as file) with an extra index on the name field
bedToBigBed -type=bed9+ -as=bedExample2.as -tab -extraIndex=name output_population_cnv_grch38.bed hg38.chrom.sizes population_cnv_grch38.bb

# Moving files
# NOTE(review): the source path is relative to hg38/, although the working
# directory set above is already .../decipher/hg38; presumably this was run
# from the parent directory -- confirm.
cp hg38/population_cnv_grch38.bb /hive/data/genomes/hg38/bed/decipher/population_cnv.bb

# Creates the symlink from hive to /gbdb
ln -s /hive/data/genomes/hg38/bed/decipher/population_cnv.bb /gbdb/hg38/decipher/population_cnv.bb
