# for emacs: -*- mode: sh; -*-

# This file describes browser build for the panTro5
# Organism name:  Pan troglodytes - chimp

#########################################################################
#  Initial steps, reusing photograph from panTro4 (DONE - 2016-08-01 - Hiram)

#  since this is not a new species, can reuse the existing Chimp photo

# To start this initialBuild.txt document, from a previous assembly document:

mkdir ~/kent/src/hg/makeDb/doc/panTro5
cd ~/kent/src/hg/makeDb/doc/panTro5

sed -e 's/gorGor5/panTro5/g; s/GorGor5/PanTro5/g; s/DONE/TBD/g;' ../gorGor5/initialBuild.txt > initialBuild.txt

mkdir -p /hive/data/genomes/panTro5/refseq
cd /hive/data/genomes/panTro5/refseq

time rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Pan_troglodytes/all_assembly_versions/GCF_000001515.7_Pan_tro_3.0/ ./
# real    6m28.188s

#  since this is not a new species, can reuse the existing Chimp photo panTro4
# construct the required photoReference.txt
cd /hive/data/genomes/panTro5
printf "photoCreditURL http://www.smm.org/
photoCreditName Science Museum of Minnesota\n" > photoReference.txt

# this information is from the top of
#    refseq/GCF_000001515.7_Pan_tro_3.0_assembly_report.txt

# Assembly name:  Pan_tro 3.0
# Organism name:  Pan troglodytes (chimpanzee)
# Isolate:  Yerkes chimp pedigree #C0471 (Clint)
# Sex:  male
# Taxid:          9598
# BioSample:      SAMN02981217
# BioProject:     PRJNA10627
# Submitter:      Chimpanzee Sequencing and Analysis Consortium
# Date:           2016-5-3
# Assembly type:  haploid
# Release type:   major
# Assembly level: Chromosome
# Genome representation: full
# WGS project:    AACZ04
# Assembly method: DiscoVar v. 51280; PBJelly v. 14.9.9
# Genome coverage: 6x Sanger; 55x Illumina; 9x PacBio
# Sequencing technology: Sanger; Illumina; PacBio
# RefSeq category: Representative Genome
# GenBank assembly accession: GCA_000001515.5
# RefSeq assembly accession: GCF_000001515.7
# RefSeq assembly and GenBank assemblies identical: no
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_000000075.5      GCF_000000075.6 Primary Assembly
##      GCF_000001485.1 non-nuclear

#############################################################################
# establish config.ra file (DONE - Hiram - 2016-08-01)
    cd /hive/data/genomes/panTro5
    ~/kent/src/hg/utils/automation/prepConfig.pl panTro5 mammal \
      chimp ./refseq/*_assembly_report.txt > panTro5.config.ra
# going to need a mitoAcc ?
    # edit result to add note about mito sequence: NC_001643.1 is included
    # in the refseq release, so set 'mitoAcc none'; also verify the
    # genBankAccessionID was picked up correctly,
    # and use the common name of Chimp instead of Chimpanzee

    # verify it looks sane
    cat panTro5.config.ra
# config parameters for makeGenomeDb.pl:
db panTro5
clade mammal
genomeCladePriority 35
scientificName Pan troglodytes
commonName Chimp
assemblyDate May 2016
assemblyLabel Chimpanzee Sequencing and Analysis Consortium
assemblyShortLabel Pan_tro 3.0
orderKey 3337
# mitochondrial sequence included in refseq release
# mitoAcc NC_001643.1
mitoAcc none
fastaFiles /hive/data/genomes/panTro5/ucsc/*.fa.gz
agpFiles /hive/data/genomes/panTro5/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir chimp
photoCreditURL http://www.smm.org/
photoCreditName Science Museum of Minnesota
ncbiGenomeId 202
ncbiAssemblyId 733711
ncbiAssemblyName Pan_tro 3.0
ncbiBioProject 10627
ncbiBioSample SAMN02981217
genBankAccessionID GCF_000001515.7
taxId 9598

(had to fixup common name after the fact.  It was originally here Chimpanzee,
we use just Chimp)

#############################################################################
# setup UCSC named files (DONE - 2016-08-01 - Hiram)

    mkdir /hive/data/genomes/panTro5/ucsc
    cd /hive/data/genomes/panTro5/ucsc
    # measure what is in the refseq release:
    time faSize ../refseq/*0_genomic.fna.gz
# 3231170666 bases (98551050 N's 3132619616 real 1925571363 upper 1207048253 lower) in 44449 sequences in 1 files
# Total size: mean 72693.9 sd 3091024.2 min 984 (NW_016016878.1) max 228573443 (NC_006468.4) median 1522
# %37.36 masked total, %38.53 masked real

# real    1m19.478s

    # check for duplicate sequences (there are now several genomic.fna.gz files)

    time faToTwoBit -noMask ../refseq/*0_genomic.fna.gz refseq.2bit
    #  real    1m29.320s

    twoBitDup refseq.2bit
    # no output is a good result, otherwise, would have to eliminate duplicates
    # real    0m29.874s

    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
      ../refseq/*0_genomic.fna.gz \
         ../refseq/*_assembly_structure/Primary_Assembly
# NC_006468.4 chr1
# NC_006469.4 chr2A
# NC_006470.4 chr2B
# NC_006471.4 chr4
# NC_006472.4 chr5
# NC_006473.4 chr6
# NC_006474.4 chr7
# NC_006475.4 chr8
# NC_006476.4 chr9
# NC_006477.4 chr10
# NC_006478.4 chr11
# NC_006479.4 chr12
# NC_006480.4 chr13
# NC_006481.4 chr14
# NC_006482.4 chr15
# NC_006483.4 chr16
# NC_006484.4 chr17
# NC_006485.4 chr18
# NC_006486.4 chr19
# NC_006487.4 chr20
# NC_006488.4 chr21
# NC_006489.4 chr22
# NC_006490.4 chr3
# NC_006491.4 chrX
# NC_006492.4 chrY

# real    16m56.273s

    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
    # processed 42786 sequences into chrUn.fa.gz
    # real    14m46.345s

    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
# 11
# 21
# 7
# Y
# 17
# 22
# 1
# 18
# 16
# 13
# 6
# X
# 3
# 9
# 2B
# 12
# 20
# 14
# 15
# 8
# 4
# 2A
# 19
# 10
# 5
# processed 1637 sequences into chr*_random.gz 25 files
# real    1m5.584s

    # bash syntax here
    # The active setting in panTro5.config.ra is 'mitoAcc none'; the
    # accession itself is recorded on the commented-out '# mitoAcc' line,
    # so grep for that comment line and take its last field.
    mitoAcc=`grep "^# mitoAcc" ../panTro5.config.ra | awk '{print $NF}'`
    # echo the value as a sanity check before it is used below
    printf "# mitoAcc %s\n" "$mitoAcc"
# mitoAcc NC_001643.1

    zcat \
  ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \
     | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp

    printf ">chrM\n" > chrM.fa
    twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa
    gzip chrM.fa

    # verify fasta and AGPs agree
    time faToTwoBit *.fa.gz test.2bit
    # real    1m27.743s

    cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4
    # All AGP and FASTA entries agree - both files are valid

    # and no sequence lost from original:
    twoBitToFa test.2bit stdout | faSize stdin
# 3231170666 bases (98551050 N's 3132619616 real 3132619616 upper 0 lower)
#    in 44449 sequences in 1 files
# Total size: mean 72693.9 sd 3091024.2 min 984 (chrUn_NW_016016878v1)
#    max 228573443 (chr1) median 1522

    # same numbers as above (except for upper/lower masking)

    # no longer need these temporary 2bit files
    rm test.2bit refseq.2bit


#############################################################################
#  Initial database build (DONE - 2016-08-01 - Hiram)

    # verify sequence and AGP are OK:
    cd /hive/data/genomes/panTro5
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         panTro5.config.ra) > makeGenomeDb.log 2>&1
    # real    30m32.304s

    # check in the trackDb files created and add to trackDb/makefile

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2016-08-01 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/panTro5/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/panTro5/panTro5.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku panTro5) > do.log 2>&1
    # real    18m26.584s

    cat fb.panTro5.cpgIslandExtUnmasked.txt
    # 33615228 bases of 3132620660 (1.073%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2016-08-01 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/cytoBand
    cd /hive/data/genomes/panTro5/bed/cytoBand
    makeCytoBandIdeo.csh panTro5

#############################################################################
# ucscToINSDC table/track (DONE - 2016-08-01 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/ucscToINSDC
    cd /hive/data/genomes/panTro5/bed/ucscToINSDC

    # find the genbank accession for NC_001643.1 at Entrez nucleotide
    # The NC_001643.1 name is the RefSeq name, the D38113.1 is the INSDC name
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
      ../../refseq/GCF_*structure/Primary_Assembly NC_001643.1

    awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt

    # it isn't clear why there isn't the INSDC name listed in the
    # assembly_report for chrM, fix it up here with the sed
    grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \
      | sed -e 's/na\b/D38113.1/;' | awk '{printf "%s\t%s\n", $2, $1}' \
         | sort > refseq.insdc.txt

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    # the tr commands avoid the problem of trying to use the -t argument
    # to the join command which doesn't accept -t'\t' but instead has
    # to use the unseen/can not copy command ctrl-v i
    #
    # field flow through this pipeline:
    #   1. first join matches column 1 of refseq.insdc.txt with column 1 of
    #      insdcToUcsc.txt, giving: key, refseq.insdc column 2, UCSC name
    #   2. sort -k3 orders by the UCSC name, ready for the second join
    #   3. join -2 3 matches name.coordinate.tab column 1 with that UCSC
    #      name, giving: name, 0, size, key, refseq.insdc column 2
    #   4. cut -f1-3,5 keeps name, 0, size and refseq.insdc column 2
    #      (presumably the INSDC accession, going by the file name - verify)
    join refseq.insdc.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \
           > ucscToINSDC.bed

    # should be same line counts throughout:
    wc -l *
    # 44449 insdc.refseq.txt
    # 44449 insdcToUcsc.txt
    # 44449 name.coordinate.tab
    # 44449 refseq.insdc.txt
    # 44449 ucscToINSDC.bed
    # 44449 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 27
    # use the 27 in this sed
    sed -e "s/21/27/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab panTro5 ucscToINSDC stdin ucscToINSDC.bed
    # check table coords should be silent
    checkTableCoords panTro5
    # should cover 100% entirely:
    featureBits -countGaps panTro5 ucscToINSDC
    # 3231170666 bases of 3231170666 (100.000%) in intersection

    join -1 2 <(sort -k2 ucscToINSDC.txt) refseq.insdc.txt | tr '[ ]' '[\t]' \
      | sort -k2 | join -2 2 name.coordinate.tab - |  tr '[ ]' '[\t]' \
        | cut -f1-4 > ucscToRefSeq.bed
    cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1
    # 27
    # use the 27 in this sed
    sed -e "s/21/27/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab panTro5 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    # check table coords should be silent
    checkTableCoords  panTro5 -table=ucscToRefSeq
    # should cover 100% of all bases:
    featureBits -countGaps panTro5 ucscToRefSeq
    # 3231170666 bases of 3231170666 (100.000%) in intersection

#########################################################################
# add chromAlias table (DONE - 2016-10-20 - Hiram)

    mkdir /hive/data/genomes/panTro5/bed/chromAlias
    cd /hive/data/genomes/panTro5/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' panTro5 \
        > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' panTro5 \
        > ucsc.genbank.tab

    awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \
        | sort > panTro5.chromAlias.tab

    join -t$'\t' ../idKeys/panTro5.idKeys.txt \
	../../ensembl/ensemblPanTro5.idKeys.txt \
	| cut -f2,3 | sort > ucsc.ensembl.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl
    sort -o panTro5.chromAlias.tab panTro5.chromAlias.tab

# Sanity check of the chromAlias table contents: for each alias source,
# the number of rows in ucsc.<source>.tab must equal the number of lines
# in panTro5.chromAlias.tab that mention that source.  Prints one
# "# checking ..." line per source ending in OK or ERROR.
for t in refseq genbank ensembl
do
  # cat | wc -l so only the bare count is captured, without a file name
  c0=$(cat "ucsc.$t.tab" | wc -l)
  c1=$(grep -- "$t" panTro5.chromAlias.tab | wc -l)
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  # keep the data out of the printf format string so a stray % or
  # backslash in a value can not be interpreted as a format directive
  printf '# checking %s: %s =? %s %s\n' "$t" "$c0" "$c1" "$ok"
done
# checking refseq: 44449 =? 44449 OK
# checking genbank: 44449 =? 44449 OK
# checking ensembl: 44449 =? 44449 OK

    hgLoadSqlTab panTro5 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        panTro5.chromAlias.tab

#########################################################################
#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2016-08-01 - Hiram)

    cd ~/kent/src/hg/makeDb/trackDb/chimp/panTro5
    # preview prefixes and suffixes:
    hgsql -N -e "select frag from gold;" panTro5 \
      | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c
#    72971 AACZ.1
#       76 AC.1
#        1 AC.10
#        1 AC.15
#      598 AC.2
#     1034 AC.3
#      304 AC.4
#       78 AC.5
#       18 AC.6
#        1 AC.7
#        2 AC.8
#        1 AL.1
#        1 AL.2
#       11 BS.1
#        2 BS.2
#        1 CT.1
#       36 CT.2
#       12 CT.3
#        9 CU.1
#       20 CU.2
#        5 CU.3
#        1 CU.4
#        1 NC_.1

    # implies a search rule of: '[ABCLNSTUZ_]+[0-9]+(\.[0-9]+)?'

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" panTro5 | wc -l
    # 75184

    hgsql -N -e "select frag from gold;" panTro5 \
       | egrep -e '[ABCLNSTUZ_]+[0-9]+(\.[0-9]+)?' | wc -l
    # 75184

    hgsql -N -e "select frag from gold;" panTro5 \
       | egrep -v -e '[ABCLNSTUZ_]+[0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/chimp/panTro5/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [ABCLNSTUZ_]+[0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

##########################################################################
# running repeat masker (DONE - 2016-08-01 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/repeatMasker
    cd /hive/data/genomes/panTro5/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku panTro5) > do.log 2>&1
    # real    498m50.970s
    # one broken item:
# RepeatMasker bug?: Undefined id, line 3685255 of input:
#  2121  11.8  3.8  3.5  chr4      152075963 152076304 (42426029) +  MER61C         LTR/ERV1                86  428    (0)
    # remove that one item:
    grep -v "152075963 152076304" panTro5.fa.out > clean.panTro5.fa.out
    # finish cat step
    /cluster/bin/scripts/extractNestedRepeats.pl clean.panTro5.fa.out | sort -k1,1 -k2,2n > panTro5.nestedRepeats.bed
    #    real    2m24.817s
    # continue with mask step
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -continue=mask -smallClusterHub=ku panTro5) > mask.log 2>&1
    # real    33m53.795s

    cat faSize.rmsk.txt
# 3231170666 bases (98551050 N's 3132619616 real 1483787108 upper 1648832508
#    lower) in 44449 sequences in 1 files
# Total size: mean 72693.9 sd 3091024.2 min 984 (chrUn_NW_016016878v1)
#    max 228573443 (chr1) median 1522
# %51.03 masked total, %52.63 masked real


    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
    #    January 31 2015 (open-4-0-5) version of RepeatMasker
    # CC   RELEASE 20140131;

    time featureBits -countGaps panTro5 rmsk
    # 1648952639 bases of 3231170666 (51.033%) in intersection
    # real    0m57.937s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the faSize count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # with high contig count assemblies, faster way to get the same result:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' panTro5 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    # total 1648952639.000000
    # real    0m41.040s

##########################################################################
# running simple repeat (DONE - 2016-08-01 - Hiram)
    # this was done twice, once with the existing trf v407.b
    # and second with the v4.09
    # there were nearly identical results, a few more bases covered
    # by the v4.09.  Ended up loading the v4.09 results

    mkdir /hive/data/genomes/panTro5/bed/simpleRepeat
    cd /hive/data/genomes/panTro5/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -trf409=6 -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        panTro5) > do.log 2>&1
    # real    104m39.660s

    cat fb.simpleRepeat
    # 173043342 bases of 3132620660 (5.524%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/panTro5
    twoBitMask panTro5.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed panTro5.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa panTro5.2bit stdout | faSize stdin > faSize.panTro5.2bit.txt
    cat faSize.panTro5.2bit.txt
# 3231170666 bases (98551050 N's 3132619616 real 1480952676 upper 1651666940 lower) in 44449 sequences in 1 files
# Total size: mean 72693.9 sd 3091024.2 min 984 (chrUn_NW_016016878v1) max 228573443 (chr1) median 1522
# %51.12 masked total, %52.72 masked real

    rm /gbdb/panTro5/panTro5.2bit
    ln -s `pwd`/panTro5.2bit /gbdb/panTro5/panTro5.2bit

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2016-08-02 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/panTro5/bed/microsat
    cd /cluster/data/panTro5/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed panTro5 microsat microsat.bed
    # Read 31196 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2016-08-02 - Hiram)

    mkdir /hive/data/genomes/panTro5/bed/windowMasker
    cd /hive/data/genomes/panTro5/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev panTro5) > do.log 2>&1
    #  real    234m56.016s

    # Masking statistics
    cat faSize.panTro5.cleanWMSdust.txt
# 3231170666 bases (98551050 N's 3132619616 real 1906460997 upper 1226158619
#    lower) in 44449 sequences in 1 files
# Total size: mean 72693.9 sd 3091024.2 min 984 (chrUn_NW_016016878v1)
#    max 228573443 (chr1) median 1522
# %37.95 masked total, %39.14 masked real

    cat fb.panTro5.rmsk.windowmaskerSdust.txt
    # 973431518 bases of 3231170666 (30.126%) in intersection

#########################################################################
# run up idKeys files for ncbiRefSeq (DONE - 2016-08-02 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/idKeys
    cd /hive/data/genomes/panTro5/bed/idKeys

    time (doIdKeys.pl -buildDir=`pwd`  panTro5) > do.log 2>&1
    # real    22m47.114s

    cat panTro5.keySignature.txt
    #   cea39548ec6babd2ebaf0a11fd9a2bb4

#########################################################################
# LIFTOVER TO panTro4 (DONE - 2016-08-02 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panTro5/bed/blat.panTro4.2016-08-02
    cd /hive/data/genomes/panTro5/bed/blat.panTro4.2016-08-02
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro5/jkStuff/panTro5.11.ooc \
         panTro5 panTro4
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro5/jkStuff/panTro5.11.ooc \
         panTro5 panTro4) > doLiftOverToGalGal4.log 2>&1
    # real    86m43.038s

    # see if the liftOver menus function in the browser from panTro5 to panTro4

#########################################################################
# LIFTOVER TO panTro3 (DONE - 2016-08-04 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panTro5/bed/blat.panTro3.2016-08-04
    cd /hive/data/genomes/panTro5/bed/blat.panTro3.2016-08-04
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro5/jkStuff/panTro5.11.ooc \
         panTro5 panTro3
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro5/jkStuff/panTro5.11.ooc \
         panTro5 panTro3) > doLiftOverToPanTro3.log 2>&1
    # real    220m13.396s

    # see if the liftOver menus function in the browser from panTro5 to panTro3

##########################################################################
# ncbiRefSeq (DONE - 2016-08-02 - Hiram)

    mkdir /hive/data/genomes/panTro5/bed/ncbiRefSeq
    cd /hive/data/genomes/panTro5/bed/ncbiRefSeq
    # running step wise as this script is still under development
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Pan_troglodytes \
      GCF_000001515.7_Pan_tro_3.0 panTro5) > download.log 2>&1
    # real    16m29.536s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Pan_troglodytes \
      GCF_000001515.7_Pan_tro_3.0 panTro5) > process.log 2>&1
    # (was running at Tue Aug  2 21:02:53 PDT 2016; completed, see time below)
    # real    3m58.858s

    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
      refseq vertebrate_mammalian Pan_troglodytes \
      GCF_000001515.7_Pan_tro_3.0 panTro5) > load.log 2>&1
    # real    0m33.205s

    cat fb.ncbiRefSeq.panTro5.txt
    #  82563006 bases of 1218501075 (6.776%) in intersection

    featureBits -enrichment panTro5 refGene ncbiRefSeq
    # refGene 1.181%, ncbiRefSeq 6.776%, both 1.175%, cover 99.49%,
    #    enrich 14.68x

##########################################################################
# cpgIslands - (DONE - 2016-08-02 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/cpgIslands
    cd /hive/data/genomes/panTro5/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku panTro5) > do.log 2>&1
    #  real    8m3.862s

    cat fb.panTro5.cpgIslandExt.txt
    # 24313253 bases of 3132620660 (0.776%) in intersection

##############################################################################
# genscan - (DONE - 2016-08-02 - Hiram)
    mkdir /hive/data/genomes/panTro5/bed/genscan
    cd /hive/data/genomes/panTro5/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku panTro5) > do.log 2>&1
    # real    1759m5.409s

    cat fb.panTro5.genscan.txt
    # 56568317 bases of 3132620660 (1.806%) in intersection

    cat fb.panTro5.genscanSubopt.txt
    # 54406299 bases of 3132620660 (1.737%) in intersection

########################################################################
# Create kluster run files (DONE - 2016-08-02 - Hiram)

    # numerator is panTro5 gapless bases "real" as reported by:
    featureBits -noRandom -noHap panTro5 gap
    # 96494254 bases of 2870630823 (3.361%) in intersection
    #          use this ^^^^^^^^^^ number

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2870630823 / 2861349177 \) \* 1024
    #  ( 2870630823 / 2861349177 ) * 1024 = 1027.321652

    # ==> use -repMatch=1000 according to size scaled up from 1024 for human.
    #   and rounded down to nearest 50
    cd /hive/data/genomes/panTro5
    time blat panTro5.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/panTro5.11.ooc \
        -repMatch=1000
    # Wrote 38245 overused 11-mers to jkStuff/panTro5.11.ooc
    # real    0m58.799s


    # check for non-bridged gaps
    hgsql -e 'select count(*) from gap;' panTro5
# +----------+
# | count(*) |
# +----------+
# |    27779 |
# +----------+

    hgsql -e 'select count(*) from gap where bridge="no";' panTro5
# +----------+
# | count(*) |
# +----------+
# |     1066 |
# +----------+

    # if there were non-bridged gaps, make up a nonBridged.lft file

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
         -e 'select * from gap where bridge="no" order by size;' panTro5 \
         | sort -k7,7nr | ave -col=7 stdin
# Q1 100.000000
# median 100.000000
# Q3 100.000000
# average 66592.843340
# min 10.000000
# max 3000000.000000
# count 1066
    # there are two gaps of size 10, the rest are 100 or above

    #   decide on a minimum gap size for this break: using either 100 or 5000
    #   generates 13387 liftOver rows, but using 6000 gives only 11703 rows,
    #   so use 100 here to get more liftOver rows.
    gapToLift -verbose=2 -minGap=100 panTro5 jkStuff/nonBridged.lft \
        -bedFile=jkStuff/nonBridged.bed
    # -rw-rw-r-- 1 2602128 Aug  2 08:42 nonBridged.lft
    # -rw-rw-r-- 1 2368731 Aug  2 08:42 nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2016-08-02,08 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism     mrnaCnt estCnt  refSeqCnt
    # Pan troglodytes 3533    5066    2041
    # Pan troglodytes schweinfurthii  7       0       0
    # Pan troglodytes troglodytes     42      0       0
    # Pan troglodytes verus   126     12315   0

    # Edit src/lib/gbGenome.c to add new species, this one already here:
# static char *panTroNames[] = {"Pan troglodytes", "Pan troglodytes troglodytes"

    # edit etc/genbank.conf to add panTro5 just after balAcu1

# panTro5 (chimp)
panTro5.serverGenome = /hive/data/genomes/panTro5/panTro5.2bit
panTro5.clusterGenome = /hive/data/genomes/panTro5/panTro5.2bit
panTro5.ooc = /hive/data/genomes/panTro5/jkStuff/panTro5.11.ooc
panTro5.lift = /hive/data/genomes/panTro5/jkStuff/nonBridged.lft
panTro5.perChromTables = no
panTro5.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
panTro5.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
panTro5.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
panTro5.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
panTro5.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
panTro5.genbank.est.xeno.pslCDnaFilter    = ${finished.genbank.est.xeno.pslCDnaFilter}
panTro5.downloadDir = panTro5
panTro5.refseq.mrna.native.load  = yes
panTro5.refseq.mrna.xeno.load  = yes
# DO NOT NEED genbank.mrna.xeno except for human, mouse
panTro5.genbank.mrna.xeno.load = yes
# panTro5.upstreamGeneTbl = ensGene
# panTro5.upstreamMaf = multiz12way
# /hive/data/genomes/panTro5/bed/multiz12way/species.list

    git commit -m "Added panTro5 - Chimp refs #17817" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial panTro5
    # real    3017m33.463s

    # logFile: var/build/logs/2016.08.02-08:57:34.panTro5.initalign.log
    tail -2 var/build/logs/2016.08.02-08:57:34.panTro5.initalign.log
# hgwdev 2016.08.04-10:58:21 panTro5.initalign: Succeeded: panTro5
# hgwdev 2016.08.04-11:15:07 panTro5.initalign: finish

    #   To rerun entire alignment, rm the work dir first:
    #     /cluster/data/genbank/work/initial.panTro5

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad panTro5
    # real    143m28.104s
    tail -2 var/dbload/hgwdev/logs/2016.08.05-10:00:13.panTro5.dbload.log
# gbLoadRna: selected refseq.77 panTro5 native,xeno mrna: delete=0 seqChg=0 metaChg=0 extChg=0 new=426916 orphan=0 derived=0 noChg=0
# hgwdev 2016.08.05-12:23:41 panTro5.dbload: finish

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add panTro5 to:
    #   etc/align.dbs etc/hgwdev.dbs
    git add etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added panTro5 - chimp - Pan troglodytes refs #17817" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#############################################################################
# augustus gene track (DONE - 2016-08-02 - Hiram)

    mkdir /hive/data/genomes/panTro5/bed/augustus
    cd /hive/data/genomes/panTro5/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
        -species=human -dbHost=hgwdev \
           -workhorse=hgwdev panTro5) > do.log 2>&1
    # real    136m39.195s

    cat fb.panTro5.augustusGene.txt
    # 52261749 bases of 3132620660 (1.668%) in intersection

############################################################################
#  BLATSERVERS ENTRY (DONE - 2016-08-02 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("panTro5", "blat1d", "17866", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("panTro5", "blat1d", "17867", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## default position to FOXP2 (DONE - 2016-08-04 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr7:118772635-119046356"
	where name="panTro5";' hgcentraltest

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2016-08-05 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=panTro5 -tableCoverage all.joiner
    joinerCheck -database=panTro5 -times all.joiner
    joinerCheck -database=panTro5 -keys all.joiner

    cd /hive/data/genomes/panTro5
    time (makeDownloads.pl panTro5) > downloads.log 2>&1
    #  real    15m52.648s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/panTro5/pushQ
    cd /hive/data/genomes/panTro5/pushQ
    time makePushQSql.pl panTro5 > panTro5.pushQ.sql 2> stderr.out
    #   real    6m9.066s

    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/panTro5/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/panTro5/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/panTro5/bbi/qualityBw/quality.bw
    # WARNING: panTro5 does not have seq
    # WARNING: panTro5 does not have extFile

    #   copy it to hgwbeta
    scp -p panTro5.pushQ.sql qateam@hgwbeta:/tmp/
    ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/panTro5.pushQ.sql"

    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
# LIFTOVER TO panTro6 (DONE - 2018-03-24 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panTro5/bed/blat.panTro6.2018-03-24
    cd /hive/data/genomes/panTro5/bed/blat.panTro6.2018-03-24
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro5/jkStuff/panTro5.11.ooc \
         panTro5 panTro6) > do.log 2>&1
    # real    460m46.002s

    # see if the liftOver menus function in the browser from panTro5 to panTro6

#########################################################################
