# for emacs: -*- mode: sh; -*-

# This file describes the browser build for macFas5
# Macaca fascicularis/Crab-eating macaque

# DATE:   12-Jun-2013
# ORGANISM:       Macaca fascicularis
# TAXID:  9541
# ASSEMBLY LONG NAME:     Macaca_fascicularis_5.0
# ASSEMBLY SHORT NAME:    Macaca_fascicularis_5.0
# ASSEMBLY SUBMITTER:     Washington University (WashU)
# ASSEMBLY TYPE:  Haploid
# NUMBER OF ASSEMBLY-UNITS:       1
# ASSEMBLY ACCESSION:     GCA_000364345.1
# FTP-RELEASE DATE: 26-Jun-2013

#  chrMT: NC_012670

#############################################################################
# fetch sequence from new style download directory (DONE - 2013-06-27 - Hiram)
    # NCBI has redesigned their FTP download site, new type of address
    #      and naming scheme
    mkdir -p /hive/data/genomes/macFas5/genbank
    cd /hive/data/genomes/macFas5/genbank

    rsync -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macaca_fascicularis/Macaca_fascicularis_5.0/ ./
    # sent 4283 bytes  received 2107814896 bytes  4806885.24 bytes/sec
    #   total size is 2107542107  speedup is 1.00

    # measure what we have here:
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
Primary_Assembly/unlocalized_scaffolds/FASTA/*.unlocalized.scaf.fa.gz \
Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
    # 2946827162 bases (142977352 N's 2803849810 real 2803849810 upper
    #  0 lower) in 7600 sequences in 43 files
    # Total size: mean 387740.4 sd 7560161.6
    #  min 218 (gi|505496605|gb|AQIA01081996.1|)
    #   max 227556264 (gi|511747355|gb|CM001919.1|) median 3505

    # note that these totals do not include chrM since the GenBank ftp directory
    # did not include a non-nuclear directory

    # pick up the same photo used at NCBI
    #   http://www.ncbi.nlm.nih.gov/genome/776
    mkdir /hive/data/genomes/macFas5/photo
    cd /hive/data/genomes/macFas5/photo
    wget --timestamping \
 http://mongabay-images.s3.amazonaws.com/images/indonesia/bali/bali8025.JPG
    convert -quality 80 -geometry 400x400 bali8025.JPG Macaca_fascicularis.jpg

#############################################################################
# fixup to UCSC naming scheme (DONE - 2013-06-27 - Hiram)
    mkdir /hive/data/genomes/macFas5/ucsc
    cd /hive/data/genomes/macFas5/ucsc

    # fetch chrM sequence:
    export mitoAcc=NC_012670
    wget -O ${mitoAcc}.fa \
 "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc"

    echo ">chrM" > chrM.fa
    grep -v "^>" ${mitoAcc}.fa >> chrM.fa

    export mSize=`faCount chrM.fa | grep total | awk '{print $2}'`

    /bin/echo -e "chrM\t1\t$mSize\t1\tF\t$mitoAcc\t1\t$mSize\t+" > chrM.agp

    # scripts (that have become much more generic since this time)
    $HOME/kent/src/hg/makeDb/doc/macFas5/ucscCompositeAgp.pl
    $HOME/kent/src/hg/makeDb/doc/macFas5/unlocalized.pl
    $HOME/kent/src/hg/makeDb/doc/macFas5/unplaced.pl

    # verify what we have here:
    faSize chr*.fa
    # 2946843737 bases (142977352 N's 2803866385 real 2803866385 upper
    #    0 lower) in 7601 sequences in 44 files
    # Total size: mean 387691.6 sd 7559665.4
    #   min 218 (chrUn_AQIA01081996) max 227556264 (chr1) median 3505

    # add chrM to the original genbank to verify this increased size:
    calc 2946827162 \+ 16575
    #  2946827162 + 16575 = 2946843737

#############################################################################
#  Initial database build (DONE - 2013-06-27 - Hiram)

    cd /hive/data/genomes/macFas5

    # Write the makeGenomeDb.pl configuration file into the current directory.
    # The delimiter is quoted so the body is taken literally (no variable or
    # command expansion).  In sh/bash the terminator line must be the bare
    # word _EOF_ (no quotes, no indentation) for the here-document to end.
    cat << '_EOF_' > macFas5.config.ra
# Config parameters for makeGenomeDb.pl:
db macFas5
clade primate
genomeCladePriority 16
scientificName Macaca fascicularis
commonName Crab-eating macaque
assemblyDate Jun. 2013
assemblyLabel Washington University (WashU)
assemblyShortLabel Macaca_fascicularis_5.0
orderKey 3705
# chrM bioproject: 37941 - already included in ucsc directory
# mitoAcc NC_012670.1
mitoAcc none
fastaFiles /hive/data/genomes/macFas5/ucsc/chr*.fa
agpFiles /hive/data/genomes/macFas5/ucsc/chr*.agp
# qualFiles none
dbDbSpeciesDir macaca
photoCreditURL http://travel.mongabay.com/indonesia/images/bali8025.html
photoCreditName Rhett A. Butler mongabay.com
ncbiGenomeId 776
ncbiAssemblyId  704988
ncbiAssemblyName Macaca_fascicularis_5.0
ncbiBioProject 215851
genBankAccessionID GCA_000364345.1
taxId 9541
_EOF_
    # << happy emacs

    # this assembly was constructed without makeGenomeDb.pl
    # for example, after the .fa and .agp files made above:
    cd /hive/data/genomes/macFas5/ucsc
    faToTwoBit chr*.fa ../macFas5.unmasked.2bit
    cat chr*.agp > ../macFas5.agp

    cd /hive/data/genomes/macFas5
    time checkAgpAndFa macFas5.agp macFas5.unmasked.2bit 2>&1 | tail
    # Valid Fasta file entry
    # All AGP and FASTA entries agree - both files are valid
    # real    0m12.280s

    # this step was performed 2015-01-08 after the config.ra file was created:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db macFas5.config.ra) > db.log 2>&1
    # real    24m21.493s

    # check in the trackDb files created and add to trackDb/makefile

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2016-04-05 - Hiram)
    cd ~/kent/src/hg/makeDb/trackDb/macaca/macFas5
    hgsql -N -e "select frag from gold;" macFas5 | sort | head -1
AQIA01000001.1


    hgsql -N -e "select frag from gold;" macFas5 | sort | tail -2
AQIA01087763.1
NC_012670

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" macFas5 | wc -l
    # 87764

    hgsql -N -e "select frag from gold;" macFas5 \
       | egrep -e '[AN][QC][I_][A0][01][0-9]+(\.[0-9]+)?' | wc -l
    # 87764

    hgsql -N -e "select frag from gold;" macFas5 \
       | egrep -v -e '[AN][QC][I_][A0][01][0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/macaca/macFas5/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][QC][I_][A0][01][0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

#########################################################################

##########################################################################
# running repeat masker (DONE - 2014-12-12 - Hiram)
    mkdir /hive/data/genomes/macFas5/bed/repeatMasker
    cd /hive/data/genomes/macFas5/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -stop=mask \
       -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
       -species  "Macaca fascicularis" -useRMBlastn \
       -smallClusterHub=encodek macFas5 > mask.log 2>&1
    # real    395m54.339s

    # this failed during the cat step due to difficulties, finishing
    doRepeatMasker.pl -buildDir=`pwd` -stop=mask \
       -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
       -continue=mask -species  "Macaca fascicularis" -useRMBlastn \
       -smallClusterHub=encodek macFas5 > finiMask.log 2>&1
    #  *** All done!  (through the 'mask' step) - Elapsed time: 2m5s

    # install step performed 2015-01-08
    doRepeatMasker.pl -buildDir=`pwd` -debug \
       -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
       -continue=install -species  "Macaca fascicularis" -useRMBlastn \
       -smallClusterHub=ku macFas5

    time doRepeatMasker.pl -buildDir=`pwd` \
       -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
       -continue=cleanup -species  "Macaca fascicularis" -useRMBlastn \
       -smallClusterHub=ku macFas5

    cat faSize.rmsk.txt
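    # NOTE(review): the figures below (319550 sequences, chrUn_JMHX01238739v1)
    #   do not match macFas5 (7601 sequences in the other faSize outputs in
    #   this file); they appear to have been pasted from a different
    #   assembly's build notes - TODO confirm against faSize.rmsk.txt
    # 3011982740 bases (614872980 N's 2397109760 real 1287460374 upper 1109649386 lower) in 319550 sequences in 1 files
    # Total size: mean 9425.7 sd 1088157.1 min 201 (chrUn_JMHX01238739v1) max 215897965 (chr1) median 823
    # %36.84 masked total, %46.29 masked real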

    # running cleanUp step manually:
    time doRepeatMasker.pl -buildDir=`pwd` -bigClusterHub=swarm \
       -dbHost=hgwdev -workhorse=hgwdev  -continue=cleanup \
       -species  "Macaca fascicularis" -useRMBlastn -smallClusterHub=ku macFas5
    # real    20m7.569s

    egrep -i "versi|relea" mask.log
    # RepeatMasker version open-4.0.3
    #    June 20 2013 (open-4-0-3) version of RepeatMasker
    #    CC   RELEASE 20130422;                                            *

    time featureBits -countGaps macFas5 rmsk
    # 1403493819 bases of 2946843737 (47.627%) in intersection
    # real    0m39.963s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE 2013-06-27 - Hiram)

    mkdir /hive/data/genomes/macFas5/bed/simpleRepeat
    cd /hive/data/genomes/macFas5/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` \
       -smallClusterHub=swarm -workhorse=hgwdev -stop=filter \
       macFas5 > filter.log 2>&1
    # real    72m50.488s

    doSimpleRepeat.pl -buildDir=`pwd` \
       -smallClusterHub=ku -workhorse=hgwdev -continue=load macFas5

    cat fb.simpleRepeat
    # 95343382 bases of 2803866698 (3.400%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/macFas5
    twoBitMask macFas5.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed macFas5.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa macFas5.2bit stdout | faSize stdin > faSize.macFas5.2bit.txt
    cat faSize.macFas5.2bit.txt
    # 2946843737 bases (142977352 N's 2803866385 real 1398959329 upper
    #    1404907056 lower) in 7601 sequences in 1 files
    # Total size: mean 387691.6 sd 7559665.4 min 218 (chrUn_AQIA01081996)
    #    max 227556264 (chr1) median 3505
    # %47.67 masked total, %50.11 masked real

    rm /gbdb/macFas5/macFas5.2bit
    ln -s `pwd`/macFas5.2bit /gbdb/macFas5/macFas5.2bit

##########################################################################
# CREATE MICROSAT TRACK (DONE - 2015-06-22 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/macFas5/bed/microsat
     cd /cluster/data/macFas5/bed/microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
	../simpleRepeat/simpleRepeat.bed > microsat.bed
    hgLoadBed macFas5 microsat microsat.bed
    #	Read 34107 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2013-06-27 - Hiram)

    mkdir /hive/data/genomes/macFas5/bed/windowMasker
    cd /hive/data/genomes/macFas5/bed/windowMasker
    time doWindowMasker.pl -stop=twobit -buildDir=`pwd` \
       -workhorse=hgwdev macFas5 > twobit.log 2>&1
    # real    205m59.525s

    # Masking statistics
    cat faSize.macFas5.cleanWMSdust.txt
    # 2946843737 bases (142977352 N's 2803866385 real 1750648586 upper
    #    1053217799 lower) in 7601 sequences in 1 files
    # Total size: mean 387691.6 sd 7559665.4 min 218 (chrUn_AQIA01081996)
    #   max 227556264 (chr1) median 3505
    # %35.74 masked total, %37.56 masked real

    time featureBits -countGaps macFas5 rmsk windowmaskerSdust \
        > fb.macFas5.rmsk.windowmaskerSdust.txt 2>&1
    # real    5m55.718s
    cat fb.macFas5.rmsk.windowmaskerSdust.txt
    # 820214292 bases of 2946843737 (27.834%) in intersection

    doWindowMasker.pl -continue=cleanup -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev macFas5
    #    Elapsed time: 1m56s

##########################################################################
# cpgIslands - (DONE - 2013-11-25 - Hiram)
    mkdir /hive/data/genomes/macFas5/bed/cpgIslands
    cd /hive/data/genomes/macFas5/bed/cpgIslands
    doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
      -dbHost=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       macFas5 > run.log 2>&1
    # Elapsed time: 8m47s

    cat fb.macFas5.cpgIslandExt.txt
    # 18918384 bases of 2803866698 (0.675%) in intersection

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2014-07-16 - Hiram)
    mkdir /hive/data/genomes/macFas5/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/macFas5/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/macFas5/macFas5.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku macFas5) > do.log 2>&1
    # real    16m26.998s

    cat fb.macFas5.cpgIslandExtUnmasked.txt
    # 33488708 bases of 2803866698 (1.194%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2015-01-09 - Hiram)
    mkdir /hive/data/genomes/macFas5/bed/cytoBand
    cd /hive/data/genomes/macFas5/bed/cytoBand
    makeCytoBandIdeo.csh macFas5

#########################################################################
# genscan - (DONE - 2014-09-10 - Hiram)
    mkdir /hive/data/genomes/macFas5/bed/genscan
    cd /hive/data/genomes/macFas5/bed/genscan
    doGenscan.pl -stop=makeBed \
      -buildDir=/hive/data/genomes/macFas5/bed/genscan \
        -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev macFas5 > do.log 2>&1
    #  Elapsed time: 187m25s

    doGenscan.pl -continue=load -stop=load  \
      -buildDir=/hive/data/genomes/macFas5/bed/genscan \
        -bigClusterHub=ku -workhorse=hgwdev \
            -dbHost=hgwdev macFas5 > load.log 2>&1
    #  Elapsed time: 0m25s

    doGenscan.pl -continue=cleanup \
       -buildDir=/hive/data/genomes/macFas5/bed/genscan \
         -bigClusterHub=ku -workhorse=hgwdev \
            -dbHost=hgwdev macFas5 > cleanup.log 2>&1
    #  Elapsed time: 0m18s

    cat fb.macFas5.genscan.txt
    # 52296622 bases of 2803866698 (1.865%) in intersection

    cat fb.macFas5.genscanSubopt.txt
    # 52200998 bases of 2803866698 (1.862%) in intersection

#########################################################################
# UCSC to RefSeq name correspondence (DONE - 2017-12-14 - Hiram)

    mkdir /hive/data/genomes/macFas5/bed/ucscToRefSeq
    cd /hive/data/genomes/macFas5/bed/ucscToRefSeq

    # construct idKeys in ./refseq
    mkdir refseq
    cd refseq
    ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Macaca_fascicularis/all_assembly_versions/GCF_000364345.1_Macaca_fascicularis_5.0/GCF_000364345.1_Macaca_fascicularis_5.0_genomic.fna.gz .

    # Convert the RefSeq FASTA to 2bit, then compute idKeys for it.
    faToTwoBit *.fna.gz refseq.macFas5.2bit
    # doIdKeys.pl is started in the background with all output captured in
    # idKeys.log.  The join that follows reads refseq/*.idKeys.txt
    # (presumably written by this job into its buildDir), so block here
    # until the background job has finished before moving on.
    time (doIdKeys.pl -buildDir="$(pwd)" -twoBit="$(pwd)/refseq.macFas5.2bit" \
       refseqMacFas5) > idKeys.log 2>&1 &
    wait

    cd /hive/data/genomes/macFas5/bed/ucscToRefSeq

    join -t$'\t' ../idKeys/*.idKeys.txt  refseq/*.idKeys.txt \
	| cut -f2- | sort > ucsc.refseq.tab

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    join ucsc.refseq.tab name.coordinate.tab \
       | awk '{printf "%s\t%d\t%d\t%s\n", $1,$3,$4,$2}' \
          | sort -k1,1 -k2,2n > ucscToRefSeq.bed

    # when working perfectly, all these tab files have the same line count:
    wc -l *.tab *.bed
#   7601 name.coordinate.tab
#   7601 ucsc.refseq.tab
#   7601 ucscToRefSeq.bed

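    # in this run all three counts are 7601, so nothing is extra or missing.
    # (When counts differ: extra counts in the ucsc.refseq.tab are due to
    # duplicates in the refseq assembly, and a missing item in the bed file
    # is typically chrM.)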

    export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    #  25
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab macFas5 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    checkTableCoords  macFas5 -table=ucscToRefSeq
    # should cover 100% of all bases (it does here, chrM included):
    featureBits -countGaps macFas5 ucscToRefSeq
    # 2946843737 bases of 2946843737 (100.000%) in intersection

########################################################################
# ucscToINSDC table/track (DONE - 2015-01-09 - Hiram)
    # rebuild this 2017-12-14, had incorrect chrM name before

    cd /hive/data/genomes/macFas5/bed/ucscToINSDC

    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
      ../../genbank/Primary_Assembly FJ906803.1
    # this one is old, does not have the v1 business:
    sed -e 's/v1//;' ucscToINSDC.txt | sort > ucscToINSDC.tab

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    # should all be the same count:
    wc -l *
#      7601 name.coordinate.tab
#      7601 ucscToINSDC.bed
#      7601 ucscToINSDC.tab
#      7601 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 25
    # use the 25 in this sed
    sed -e "s/21/25/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab macFas5 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords macFas5
    # should cover %100 entirely:
    featureBits -countGaps macFas5 ucscToINSDC
    # 2946843737 bases of 2946843737 (100.000%) in intersection

    # below obsolete procedure:

    # reusing the old-style script since this genbank download is still
    # the old style, this script copied from rn6/bed/ucscToINSDC
    grep chrM ../../*.agp
# chrM    1       16575   1       F       NC_012670       1       16575   +
    ./translateNames.sh  NC_012670.1

    # this NC name is not the genbank name ! 2017-12-14

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    # verify they are all accounted for:
    wc -l ucscToINSDC.bed ../../chrom.sizes
    #   7601 ucscToINSDC.bed
    #   7601 ../../chrom.sizes

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 25
    # use the 25 in this sed
    sed -e "s/21/25/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab macFas5 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords macFas5
    # should cover %100 entirely:
    featureBits -countGaps macFas5 ucscToINSDC
    # 2946843737 bases of 2946843737 (100.000%) in intersection

##############################################################################
# add chromAlias table (DONE - 2017-12-14 - Hiram)

    mkdir /hive/data/genomes/macFas5/bed/chromAlias
    cd /hive/data/genomes/macFas5/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' macFas5 \
        | sort > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' macFas5 \
        |sort > ucsc.genbank.tab

    join -t$'\t' ../idKeys/macFas5.idKeys.txt \
	../../ensembl/ensemblMacFas5.idKeys.txt \
	| cut -f2,3 | sort > ucsc.ensembl.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl
    sort -o macFas5.chromAlias.tab macFas5.chromAlias.tab

# Sanity check: for each alias source, the number of lines in its
# ucsc.<source>.tab input must equal the number of lines in
# macFas5.chromAlias.tab that mention that source.
# A missing or unreadable file is reported as ERROR rather than letting
# both counts fall back to 0 and compare equal.
for t in refseq genbank ensembl
do
  src="ucsc.$t.tab"
  c0="missing"
  c1="missing"
  ok="ERROR"
  if [ -r "$src" ] && [ -r macFas5.chromAlias.tab ]; then
    c0=$(wc -l < "$src")
    # grep -c prints the number of matching lines (0 when none match)
    c1=$(grep -c -- "$t" macFas5.chromAlias.tab)
    if [ "$c0" -eq "$c1" ]; then
      ok="OK"
    fi
  fi
  # values are passed as arguments so they are never parsed as format text
  printf '# checking %s: %s =? %s %s\n' "$t" "$c0" "$c1" "$ok"
done
# checking refseq: 7601 =? 7601 OK
# checking genbank: 7601 =? 7601 OK
# checking ensembl: 7601 =? 7601 OK

    hgLoadSqlTab macFas5 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        macFas5.chromAlias.tab

##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
#########################################################################
# Create kluster run files (DONE - 2015-01-09 - Hiram)

    cd /hive/data/genomes/macFas5
    # numerator is macFas5 gapless bases "real" as reported by:
    head -1 faSize.macFas5.2bit.txt
# 2946843737 bases (142977352 N's 2803866385 real 1398959329 upper 1404907056
# lower) in 7601 sequences in 1 files

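# NOTE(review): the next line disagrees with the head -1 output above (7601
#   sequences) and looks like a leftover from another assembly - TODO confirm
# 3011982740 bases (614872980 N's 2397109760 real 1286534225 upper 1110575535 lower) in 319550 sequences in 1 files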

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2803866385 / 2861349177 \) \* 1024
    # ( 2803866385 / 2861349177 ) * 1024 = 1003.428453

    # ==> use -repMatch=1000 according to size scaled down from 1024 for human.
    #   and rounded down to nearest 50
    cd /hive/data/genomes/macFas5
    time blat macFas5.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/macFas5.11.ooc \
        -repMatch=1000
    # Wrote 29333 overused 11-mers to jkStuff/macFas5.11.ooc
    #  real    0m50.948s

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N -e 'select * from gap where bridge="no" order by size;' macFas5         | ave -tableOut -col=7 stdin
# # min Q1 median Q3 max mean N sum stddev
# 100 1e+06 1e+06 1e+06 1e+06 875012 24 2.10003e+07 337798

 # note the minimum non-bridged gap size is 100 and there are only 24 such gaps
 # on the other hand, there are 68,162 gaps size >= 100
    hgsql -N -e 'select * from gap where size>99 order by size;' macFas5 \
       | ave -tableOut -col=7 stdin
# # min Q1 median Q3 max mean N sum stddev
# 100 100 377 1082 1e+06 2088.37 68162 1.42347e+08 20953.7

    gapToLift -verbose=2 -minGap=100 macFas5 jkStuff/macFas5.nonBridged.lft \
        -bedFile=jkStuff/macFas5.nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2015-01-09 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Macaca fascicularis     11994   163304  2152

    # edit etc/genbank.conf to add macFas5 just before rheMac2

# macFas5 (Crab-eating macaque)
macFas5.serverGenome = /hive/data/genomes/macFas5/macFas5.2bit
macFas5.clusterGenome = /hive/data/genomes/macFas5/macFas5.2bit
macFas5.ooc = /hive/data/genomes/macFas5/jkStuff/macFas5.11.ooc
macFas5.lift = /hive/data/genomes/macFas5/jkStuff/macFas5.nonBridged.lft
macFas5.perChromTables = no
macFas5.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
macFas5.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
macFas5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
macFas5.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
macFas5.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
macFas5.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
macFas5.downloadDir = macFas5
macFas5.refseq.mrna.native.load  = yes
macFas5.refseq.mrna.native.loadDesc  = yes
macFas5.refseq.mrna.xeno.load = yes
macFas5.refseq.mrna.xeno.loadDesc  = yes
macFas5.genbank.mrna.native.load  = yes
macFas5.genbank.mrna.native.loadDesc  = yes
macFas5.genbank.est.native.load  = yes
macFas5.upstreamGeneTbl = refGene

    # Edit src/lib/gbGenome.c to add new species

 git commit -m "Added macFas5; refs #11174" etc/genbank.conf src/lib/gbGenome.c
    git push
    # update /cluster/data/genbank/:
    make etc-update
    make install-server

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial macFas5
    # logFile: var/build/logs/2015.01.09-10:38:58.macFas5.initalign.log
    #   real    170m25.901s

    # verify completed successfully:
    tail var/build/logs/2015.01.09-10:38:58.macFas5.initalign.log
# hgwdev 2015.01.09-13:25:18 macFas5.initalign: Succeeded: macFas5
# hgwdev 2015.01.09-13:29:24 macFas5.initalign: finish

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.macFas5

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad macFas5
    # logFile: var/dbload/hgwdev/logs/2015.01.20-14:14:51.macFas5.dbload.log
    # real    41m8.893s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add macFas5 to:
    #   etc/align.dbs
    #   etc/hgwdev.dbs
    git add etc/align.dbs
    git add etc/hgwdev.dbs
    git commit -m "Added macFas5 - crab-eating macaque refs #11174" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2015-03-20 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev

     # verify doesn't exist yet:
     hgsql -e 'select * from blatServers;' hgcentraltest | grep -i macfas
     # empty result

     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("macFas5", "blat4c", "17864", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("macFas5", "blat4c", "17865", "0", "1");' \
		hgcentraltest

     # verify entry:
     hgsql -e 'select * from blatServers;' hgcentraltest | grep -i macfas
     #    macFas5 blat4c  17864   1       0
     #    macFas5 blat4c  17865   0       1

     #	test it with some sequence

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2015-01-27 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=macFas5 -keys all.joiner
    joinerCheck -database=macFas5 -tableCoverage all.joiner
    joinerCheck -database=macFas5 -times all.joiner

    cd /hive/data/genomes/macFas5
    time (makeDownloads.pl -workhorse=hgwdev -dbHost=hgwdev macFas5) \
       > downloads.log 2>&1
    # real    25m37.738s


    #   now ready for pushQ entry
    mkdir /hive/data/genomes/macFas5/pushQ
    cd /hive/data/genomes/macFas5/pushQ
    time makePushQSql.pl macFas5 > macFas5.pushQ.sql 2> stderr.out
    #  real    5m23.179s
    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/macFas5/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/macFas5/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/macFas5/bbi/gc5BaseBw/gc5Base.bw
    # WARNING: hgwdev does not have /gbdb/macFas5/bbi/qualityBw/quality.bw
    # WARNING: macFas5 does not have seq
    # WARNING: macFas5 does not have extFile

    #   copy it to hgwbeta
    scp -p macFas5.pushQ.sql qateam@hgwbeta:/tmp
    ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/macFas5.pushQ.sql'
    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
