#########################################################################
# ncbiRefSeq Anomamlies Tracks (DONE - 2018-08-07 - ChrisL)

    mkdir /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies
    cd /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies

    db=hg38
    pre=ncbiRefSeqGenomicDiff
    buildDir=/hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09
    asmId=GCF_000001405.37_GRCh38.p11
    
    time (zcat $buildDir/process/$asmId.rna.cds.gz \
        | egrep '[0-9]+\.\.[0-9]+' \
        | pslMismatchGapToBed -cdsFile=stdin -db=$db -ignoreQNamePrefix=X \
            $buildDir/process/$asmId.$db.psl.gz \
            /hive/data/genomes/$db/$db.2bit \
            $buildDir/$db.rna.fa \
            $pre)
    # real    0m32.676s

    # do some checking
    cut -f1 *.bed | sort | uniq -c | grep -v "alt" | awk '{sum+=$1}END{print sum}'
    # 6727
    wc -l *.bed | grep total
    # 24013
    calc \(6726 \/ 24013\) \* 100
    # (6726 / 24013) * 100 = 28.009828
    # so roughly 72% of the anomalies are on alt chromosomes, good or bad?
    
    bedToBigBed -tab -type=bed4+ -as=$HOME/kent/src/hg/lib/txAliMismatch.as \
      $pre.mismatch.bed /hive/data/genomes/$db/chrom.sizes $pre.mismatch.bb
    bedToBigBed -tab -type=bed4+ -as=$HOME/kent/src/hg/lib/txAliShortGap.as \
      $pre.shortGap.bed /hive/data/genomes/$db/chrom.sizes $pre.shortGap.bb
    bedToBigBed -tab -type=bed8+ -as=$HOME/kent/src/hg/lib/txAliShiftyGap.as \
      $pre.shiftyGap.bed /hive/data/genomes/$db/chrom.sizes $pre.shiftyGap.bb
    bedToBigBed -tab -type=bed4+ -as=$HOME/kent/src/hg/lib/txAliDoubleGap.as \
      $pre.doubleGap.bed /hive/data/genomes/$db/chrom.sizes $pre.doubleGap.bb
    bedToBigBed -tab -type=bed4+ -as=$HOME/kent/src/hg/lib/txAliQSkipped.as \
      $pre.qSkipped.bed /hive/data/genomes/$db/chrom.sizes $pre.qSkipped.bb
    
XXX - stopped here because of big track reformatting. See below for more information.
    ln -s `pwd`/*.bb /gbdb/hg38/ncbiRefSeq/

#########################################################################
# re-format after feedback from MarkD

    # figure out unique fields from the 5 .as files, and manually put them in the right order:
    # for file in txAli*.as; do tail -n +4 $file | head -n -1 | cut -d';' -f1; done | sort -u
cat << EOF > txAliDiff.as
table txAliDiff
"Differences between reference genome and transcript sequences"
    (   
    string chrom;       "Reference sequence chromosome or scaffold"
    uint   chromStart;  "Start position in chromosome of ambiguous gap placement region"
    uint   chromEnd;    "End position in chromosome of ambiguous gap placement region"
    string name;        "Name of item"
    uint   score;       "Not used"
    char[1] strand;     "Transcript orientation on genome: + or -"
    uint   thickStart;  "Start position of 3'-most location for gaps that can shift position without introducing mismatches"
    uint   thickEnd;    "End position of 3'-most gap location for gaps that can shift position without introducing mismatches"
    uint reserved;         "RGB color of this item"
    string txName;      "Transcript identifier"
    uint   txStart;     "Start position in transcript (of ambiguous gap placement region where applicable)"
    uint   txEnd;       "End position in transcript (of ambiguous gap placement region where applicable)"
    uint gSkipped;    "Number of bases skipped on genome, if any"
    uint txSkipped;   "Number of bases skipped on transcript, if any"
    uint shiftL;      "Number of bases that gap can be shifted left on genome with no mismatches"
    uint shiftR;      "Number of bases that gap can be shifted right on genome with no mismatches" 
    lstring hgvsG;      "HGVS g. notation of genome change to match transcript"
    lstring hgvsCN;     "HGVS c./n. notation of part of transcript not matched in genome"
    lstring hgvsN;      "HGVS c./n. notation of transcript change to match genome"
    lstring hgvsPosCN;  "HGVS c./n. position range of transcript ambiguous gap placement region"
    )
EOF

    # fix up pslMismatchGapToBed according to feedback from #21079: 
    # git show 9ed6f979a8ec95ac934e5e8fef028d188b76a035

    # re-run:
    
    cd /hive/data/genomes/hg38/bed/ncbiRefSeqAnomalies

    db=hg38
    pre=ncbiRefSeqGenomicDiff
    buildDir=/hive/data/genomes/hg38/bed/ncbiRefSeq.p11.2018-01-09
    asmId=GCF_000001405.37_GRCh38.p11

    time (zcat $buildDir/process/$asmId.rna.cds.gz \
        | egrep '[0-9]+\.\.[0-9]+' \
        | pslMismatchGapToBed -cdsFile=stdin -db=$db -ignoreQNamePrefix=X \
            $buildDir/process/$asmId.$db.psl.gz \
            /hive/data/genomes/$db/$db.2bit \
            $buildDir/$db.rna.fa \
            $pre)
    # real    0m31.944s
    
    bedToBigBed -type=bed9+ -tab -as=$HOME/kent/src/hg/lib/txAliDiff.as $pre.bed \
        /hive/data/genomes/$db/chrom.sizes $pre.bb
    ln -s `pwd`/$pre.bb /gbdb/hg38/ncbiRefSeq/$pre.bb

#########################################################################
# Updated 2019-01-25 to include patch sequences (p12), see patchUpdate.12.txt
# Eventually this process will be folded into doNcbiRefSeq.pl .
