# for emacs: -*- mode: sh; -*-

# This file describes the process for parsing and loading data
# from ORegAnno Open Regulatory Annotation (www.oreganno.org).

##############################################################################
## Create annotation tracks (DONE - 2015-09-17 - Jonathan)
## Files obtained from Robert Lesurf at WUSTL, refs #15957

# Example file header:
# ORegAnno_ID Species Outcome Type  Gene_Symbol Gene_ID Gene_Source
# Regulatory_Element_Symbol Regulatory_Element_ID Regulatory_Element_Source
# dbSNP_ID  PMID  Dataset hg38_Strand hg38_Chrom  hg38_Start  hg38_End
    cd /hive/data/outside/oreganno/09-17-2015
    unzip ucsc.zip
    # The archive spells the C. briggsae db token "CB3"; rename the file so the
    # token reads "cb3", matching the file-internal name and our database name.
    briggsae=Caenorhabditis_briggsae
    stamp=Filtered_2015.09.11.txt
    mv "${briggsae}_CB3_${stamp}" "${briggsae}_cb3_${stamp}"

    # Set up the working directory and run oreganno.pl once per (db, mode)
    # pair over the files delivered in the ucsc/ directory.
    srcDir=/hive/data/outside/oreganno/09-17-2015/ucsc
    mkdir -p /hive/data/inside/oreganno/09-17-2015/bedResults
    cd /hive/data/inside/oreganno/09-17-2015

    # One db name per input file: the token sitting directly in front of
    # "_Filtered".  With -n and the p flag, names that do not follow that
    # pattern are dropped instead of being passed through into db.list.
    ls "$srcDir" | sed -E -n 's/.*_([^_]+)_Filtered.*/\1/p' > db.list

    # Pick the input file with the same "_<db>_Filtered" anchor that produced
    # db.list, so a db name cannot select paths that merely contain it.
    for db in $(< db.list); do
        for mode in oreganno attr link; do
            ./oreganno.pl "$srcDir"/*_"${db}"_Filtered* "$mode" \
                > "bedResults/$db.$mode.bed"
        done
    done

    # featureBits reported items out of range
    # $ featureBits hg38 hg38.oreganno.bed
    # Coordinate out of allowed range [0,50818468) for chr22 near line 26392 of hg38.oreganno.bed
    #
    # obtained repaired files from data provider with a list of records to
    # omit: out-of-range_9_29_15.txt and ucsc_fixed_9_29_15.zip

    # fixing CRLF for unix
    # Every carriage return becomes a newline, then the second field of each
    # line flagged OMIT is kept.  The set must be written as '\r': a literal
    # caret followed by M makes tr translate the characters '^' and 'M',
    # which splits the word "OMIT" and leaves omit.txt empty.
    cd /hive/data/outside/oreganno/09-17-2015
    tr '\r' '\n' < out-of-range_9_29_15.txt | awk '/OMIT/ {print $2}' > omit.txt

    # Throw away the first extraction and unpack the repaired archive instead.
    rm -rf ucsc/
    unzip ucsc_fixed_9_29_15.zip
    # Same rename as for the first archive: db token "CB3" -> "cb3", to match
    # the file-internal name and our database name.
    briggsae=Caenorhabditis_briggsae
    stamp=Filtered_2015.09.11.txt
    mv "${briggsae}_CB3_${stamp}" "${briggsae}_cb3_${stamp}"

    # Re-run the conversion on the repaired files, filtering out every record
    # listed in omit.txt before it reaches oreganno.pl.
    cd /hive/data/inside/oreganno/09-17-2015
    OREGDIR=/hive/data/outside/oreganno/09-17-2015
    for db in $(< db.list); do
        # Case-insensitive match on "_<db>_" in the file name; looked up once
        # per db rather than once per mode.
        input=$(find "$OREGDIR/ucsc/" -type f -iname "*_${db}_*")
        for mode in oreganno attr link; do
            # -F -w : omit.txt entries are literal whole words, not regexes,
            #         so one entry cannot knock out a longer one containing it
            # -h    : never prefix lines with a file name, even if more than
            #         one input file matched
            # $input is left unquoted on purpose: it may hold several paths.
            grep -v -h -w -F -f "$OREGDIR/omit.txt" $input \
            | ./oreganno.pl "$mode" > "bedResults/$db.$mode.bed"
        done
    done
    
    # fixing chromosome names on two entries for fr3, based on communication
    # with data provider.
    # The working directory here is the 09-17-2015 work dir, while the
    # converted files were written to bedResults/, so address the file there.
    sed -i -e 's/HE598491/HE591800/' bedResults/fr3.oreganno.bed

    # list of the assemblies that we will actually construct tracks for
    # Written one name per line with no leading whitespace and a final
    # newline, so the file is also safe for line-oriented readers.
    printf '%s\n' dm6 canFam3 hg19 hg38 mm10 mm9 sacCer3 > load.list

    # No sacCer3 coordinates were provided, so lift the sacCer1 records over.
    cd /hive/data/inside/oreganno/09-17-2015/bedResults
    chain=/gbdb/sacCer1/liftOver/sacCer1ToSacCer3.over.chain.gz
    liftOver -bedPlus=4 sacCer1.oreganno.bed "$chain" \
        sacCer3.oreganno.bed sacCer3.unmapped.bed

    # joinerCheck complains about attr and link rows whose item did not
    # survive the lift, so drop every row that matches an OREG id found in
    # the unmapped file.
    for kind in attr link; do
        grep -v -f <(grep -o -P 'OREG\w+' sacCer3.unmapped.bed) \
            "sacCer1.$kind.bed" > "sacCer3.$kind.bed"
    done

    # Normalize the type names in the attr files: the data mixes UPPER CASE
    # and Mixed Case spellings, so rewrite the upper-case forms in one pass.
    cd /hive/data/inside/oreganno/09-17-2015/bedResults
    sed -i \
        -e 's/REGULATORY HAPLOTYPE/Regulatory Haplotype/' \
        -e 's/REGULATORY POLYMORPHISM/Regulatory Polymorphism/' \
        -e 's/REGULATORY REGION/Regulatory Region/' \
        -e 's/TRANSCRIPTION FACTOR BINDING SITE/Transcription Factor Binding Site/' \
        -e 's/miRNA BINDING SITE/miRNA Binding Site/' \
        *.attr.bed

    # Fixing the chromosome for a couple of records in hg18 and hg19
    # Use "-i -e" rather than "-iP": sed reads "-iP" as in-place editing with
    # backup suffix "P", which leaves stray *.bedP copies next to the files.
    sed -i -e 's/^chr\t/chr22\t/' hg19.oreganno.bed
    sed -i -e 's/^chrNA\t/chr22\t/' hg18.oreganno.bed

    # For every assembly on load.list: drop the three ORegAnno tables, run
    # oreganno.sql against the database, then load the three files from
    # bedResults with -oldTable.
    cd /hive/data/inside/oreganno/09-17-2015/bedResults
    schema=~/kent/src/hg/lib/oreganno.sql
    for db in $(< ../load.list); do
        hgsql -Ne 'drop table if exists oreganno, oregannoAttr, oregannoLink' "$db"
        hgsql "$db" < "$schema"
        hgLoadBed "$db" oreganno "$db.oreganno.bed" -tab -oldTable
        hgLoadSqlTab -oldTable "$db" oregannoAttr "$schema" "$db.attr.bed"
        hgLoadSqlTab -oldTable "$db" oregannoLink "$schema" "$db.link.bed"
    done

    # Read 4242 elements of size 6 from dm6.oreganno.bed
    # Read 29176 elements of size 6 from canFam3.oreganno.bed
    # Read 1449726 elements of size 6 from hg19.oreganno.bed
    # Read 1449133 elements of size 6 from hg38.oreganno.bed
    # Read 415390 elements of size 6 from mm10.oreganno.bed
    # Read 416691 elements of size 6 from mm9.oreganno.bed
    # Read 7299 elements of size 6 from sacCer3.oreganno.bed

    # Decided to also load data for the rest of the assemblies
    # with old ORegAnno data, as that should simplify the
    # display code and documentation.
    # dm2, dm3, sacCer1, sacCer2, hg18

    cd /hive/data/inside/oreganno/09-17-2015
    # Assemblies that only have the old ORegAnno data.  Written one name per
    # line with no leading whitespace and a final newline, so the file is
    # also safe for line-oriented readers.
    printf '%s\n' dm2 dm3 sacCer1 sacCer2 hg18 > loadOld.list

    # Lift the sacCer1 coordinates to sacCer2.
    cd /hive/data/inside/oreganno/09-17-2015/bedResults
    chain=/gbdb/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz
    liftOver -bedPlus=4 sacCer1.oreganno.bed "$chain" \
        sacCer2.oreganno.bed sacCer2.unmapped.bed
    # The attr and link files are reused from sacCer1 as they are
    # (timestamps and modes preserved by -p).
    for kind in attr link; do
        cp -p "sacCer1.$kind.bed" "sacCer2.$kind.bed"
    done

    # Same load procedure, this time for the assemblies on loadOld.list:
    # drop the three ORegAnno tables, run oreganno.sql against the database,
    # then load the three files from bedResults with -oldTable.
    cd /hive/data/inside/oreganno/09-17-2015/bedResults
    schema=~/kent/src/hg/lib/oreganno.sql
    for db in $(< ../loadOld.list); do
        hgsql -Ne 'drop table if exists oreganno, oregannoAttr, oregannoLink' "$db"
        hgsql "$db" < "$schema"
        hgLoadBed "$db" oreganno "$db.oreganno.bed" -tab -oldTable
        hgLoadSqlTab -oldTable "$db" oregannoAttr "$schema" "$db.attr.bed"
        hgLoadSqlTab -oldTable "$db" oregannoLink "$schema" "$db.link.bed"
    done

    # Read 2071 elements of size 6 from dm2.oreganno.bed
    # Read 4242 elements of size 6 from dm3.oreganno.bed
    # Read 7302 elements of size 6 from sacCer1.oreganno.bed
    # Read 7302 elements of size 6 from sacCer2.oreganno.bed
    # Read 1449113 elements of size 6 from hg18.oreganno.bed
