# for emacs: -*- mode: sh; -*-

# This file describes the browser build for bosTau9

#########################################################################
# reuse photograph from bosTau previous versions (DONE - hiram - 2018-11-06)

mkdir /hive/data/genomes/bosTau9
cd /hive/data/genomes/bosTau9
cp -p ../bosTau8/photoReference.txt .


cat photoReference.txt

# photoCreditURL  http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Cow&id=79109
# photoCreditName NHGRI press photos

#########################################################################
#  Initial steps (DONE  - 2018-11-06 - Hiram)

# To start this initialBuild.txt document, from a previous assembly document:

mkdir ~/kent/src/hg/makeDb/doc/bosTau9
cd ~/kent/src/hg/makeDb/doc/bosTau9

# best to use the most recent document since it has the latest features and
# procedures:
sed -e 's/oviAri4/bosTau9/g; s/OviAri4/BosTau9/g; s/DONE/TBD/g;' \
   ../oviAri4/initialBuild.txt > initialBuild.txt

mkdir /hive/data/genomes/bosTau9/refseq
cd /hive/data/genomes/bosTau9/refseq

time (rsync --stats -L -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Bos_taurus/all_assembly_versions/GCF_002263795.1_ARS-UCD1.2/ ./) > fetch.log 2>&1

# sent 3,528 bytes  received 3,992,632,460 bytes  52,882,595.87 bytes/sec
# total size is 3,991,645,023  speedup is 1.00

# real    1m14.770s

# check assembly size for later reference:

faSize G*D1.2_genomic.fna.gz
# 2715853792 bases (28162 N's 2715825630 real 1595305255 upper
#	1120520375 lower) in 2211 sequences in 1 files
# Total size: mean 1228337.3 sd 10762990.9 min 1034 (NW_020192071.1)
#	max 158534110 (NC_037328.1) median 21935
# %41.26 masked total, %41.26 masked real

# this information is from the top of
#    bosTau9/refseq/GCF_002263795.1_ARS-UCD1.2_assembly_report.txt

# Assembly name:  ARS-UCD1.2
# Organism name:  Bos taurus (cattle)
# Infraspecific name:  breed=Hereford
# Isolate:  L1 Dominette 01449 registration number 42190680
# Sex:  female
# Taxid:          9913
# BioSample:      SAMN03145444
# BioProject:     PRJNA391427
# Submitter:      USDA ARS
# Date:           2018-4-11
# Assembly type:  haploid
# Release type:   major
# Assembly level: Chromosome
# Genome representation: full
# WGS project:    NKLS02
# Assembly method: Falcon v. FEB-2016
# Expected final version: yes
# Reference guided assembly: de-novo
# Genome coverage: 80.0x
# Sequencing technology: PacBio; Illumina NextSeq 500; Illumina HiSeq; Illumina GAII
# RefSeq category: Representative Genome
# GenBank assembly accession: GCA_002263795.2
# RefSeq assembly accession: GCF_002263795.1
# RefSeq assembly and GenBank assemblies identical: no
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_002263805.2      GCF_002263805.1 Primary Assembly
## GCA_002263815.2      GCF_000001285.1 non-nuclear

#############################################################################
# establish config.ra file (DONE - Hiram - 2018-11-06)
    # arguments here are: <db> <clade> <trackDbDir> <assembly_report.txt>
    cd /hive/data/genomes/bosTau9
    $HOME/kent/src/hg/utils/automation/prepConfig.pl bosTau9 mammal \
        cow ./refseq/*_assembly_report.txt > bosTau9.config.ra

    # compare to ../bosTau8 to see what might need to be fixed up:
    diff bosTau9.config.ra ../bosTau8/bosTau8.config.ra | less
    # fixup the 'commonName' from Cattle to Cow and orderKey from 3262 to 3626

    cat bosTau9.config.ra
# config parameters for makeGenomeDb.pl:
db bosTau9
clade mammal
genomeCladePriority 35
scientificName Bos taurus
commonName Cow
assemblyDate Apr. 2018
assemblyLabel USDA ARS
assemblyShortLabel ARS-UCD1.2
orderKey 3626
# mitochondrial sequence included in refseq release
# mitoAcc NC_006853.1
mitoAcc none
fastaFiles /hive/data/genomes/bosTau9/ucsc/*.fa.gz
agpFiles /hive/data/genomes/bosTau9/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir cow
photoCreditURL  http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Cow&id=79109
photoCreditName NHGRI press photos
ncbiGenomeId 82
ncbiAssemblyId 1677391
ncbiAssemblyName ARS-UCD1.2
ncbiBioProject 391427
ncbiBioSample SAMN03145444
genBankAccessionID GCF_002263795.1
taxId 9913

#############################################################################
# setup UCSC named files (DONE - 2018-11-06 - Hiram)

    mkdir /hive/data/genomes/bosTau9/ucsc
    cd /hive/data/genomes/bosTau9/ucsc

    # check for duplicate sequences:
    time faToTwoBit -noMask ../refseq/G*D1.2_genomic.fna.gz refseq.2bit
    #  real    0m44.551s

    twoBitDup refseq.2bit
    # no output is a good result, otherwise, would have to eliminate duplicates
    # the scripts creating the fasta here will be using this refseq.2bit file

    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
       ../refseq/G*D1.2_genomic.fna.gz \
          ../refseq/G*D1.2_assembly_structure/Primary_Assembly
# NC_037328.1 chr1
# NC_037329.1 chr2
# NC_037330.1 chr3
# NC_037331.1 chr4
# NC_037332.1 chr5
# NC_037333.1 chr6
# NC_037334.1 chr7
# NC_037335.1 chr8
# NC_037336.1 chr9
# NC_037337.1 chr10
# NC_037338.1 chr11
# NC_037339.1 chr12
# NC_037340.1 chr13
# NC_037341.1 chr14
# NC_037342.1 chr15
# NC_037343.1 chr16
# NC_037344.1 chr17
# NC_037345.1 chr18
# NC_037346.1 chr19
# NC_037347.1 chr20
# NC_037348.1 chr21
# NC_037349.1 chr22
# NC_037350.1 chr23
# NC_037351.1 chr24
# NC_037352.1 chr25
# NC_037353.1 chr26
# NC_037354.1 chr27
# NC_037355.1 chr28
# NC_037356.1 chr29
# NC_037357.1 chrX

# real    10m47.295s

    # unplaced sequences
    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
# processed 2180 sequences into chrUn.fa.gz
# real    0m27.379s

    # unlocalized sequences
    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
       ../refseq/*_assembly_structure/Primary_Assembly
# No unlocalized sequences

    # bash syntax here
    mitoAcc=`grep "^# mitoAcc" ../bosTau9.config.ra | awk '{print $NF}'`
    printf "# mitoAcc %s\n" "$mitoAcc"
# mitoAcc NC_006853.1

    zcat \
  ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \
     | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp

    printf ">chrM\n" > chrM.fa
    twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa
    gzip chrM.fa

    # verify chrM sequence is there:
    faCount chrM.fa.gz
#seq    len     A       C       G       T       N       cpg
chrM    16338   5457    4238    2202    4441    0       358

    # verify fasta and AGPs agree
    time faToTwoBit *.fa.gz test.2bit
    # real    0m58.603s

    time cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4
    # All AGP and FASTA entries agree - both files are valid

    # real    0m3.117s

    # and no sequence lost from original:
    twoBitToFa test.2bit stdout | faSize stdin
# 2715853792 bases (28162 N's 2715825630 real 2715825630 upper 0
#	lower) in 2211 sequences in 1 files
# Total size: mean 1228337.3 sd 10762990.9 min 1034 (chrUn_NW_020192071v1)
#	max 158534110 (chr1) median 21935

    # same numbers as above
# 2715853792 bases (28162 N's 2715825630 real 1595305255 upper
#	1120520375 lower) in 2211 sequences in 1 files
# Total size: mean 1228337.3 sd 10762990.9 min 1034 (NW_020192071.1)
#	max 158534110 (NC_037328.1) median 21935

    # no longer need these temporary 2bit files
    rm refseq.2bit test.2bit

#############################################################################
#  Initial database build (DONE - 2018-11-06 - Hiram)

    cd /hive/data/genomes/bosTau9
    # verify sequence and AGP are OK:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp bosTau9.config.ra) > agp.log 2>&1
    # real    2m36.514s

    # verify there was no error in that step:
    tail agp.log
    #  *** All done!  (through the 'agp' step)

    # then finish it off:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db bosTau9.config.ra) > db.log 2>&1
    # real    15m27.039s

    # verify gaps are all there
    twoBitInfo -nBed bosTau9.unmasked.2bit stdout | awk '{print $3-$2}' \
	| ave stdin | sed -e 's/^/# /;'
# Q1 25.000000
# median 25.000000
# Q3 100.000000
# average 72.958549
# min 23.000000
# max 252.000000
# count 386
# total 28162.000000
# standard deviation 71.040090

    twoBitInfo -nBed bosTau9.unmasked.2bit stdout \
       | awk '{printf "%s\t%d\t%d\t%d\n", $1,$2,$3,$3-$2}' \
         | sort -k4,4nr | cut -f4 | uniq -c
     42 252
    116 100
      1 99
      1 85
      1 66
      1 65
      1 60
      1 44
      1 36
    220 25
      1 23


    # the gap table has nothing
    hgsql -e 'select count(*) from gap;' bosTau9
+----------+
| count(*) |
+----------+
|        0 |
+----------+

    # otherwise, compare to the gap table:
    hgsql -e 'select chromEnd-chromStart from gap;' bosTau9 | ave stdin | sed -e 's/^/# /;'

    # check in the trackDb files created in TemporaryTrackDbCheckout/
    #    and add bosTau9 to trackDb/makefile

    # temporary symlink until masked sequence is available
    cd /hive/data/genomes/bosTau9
    ln -s `pwd`/bosTau9.unmasked.2bit /gbdb/bosTau9/bosTau9.2bit

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2018-11-06 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/bosTau9/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/bosTau9/bosTau9.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku bosTau9) > do.log 2>&1
    # real    3m41.462s

    cat fb.bosTau9.cpgIslandExtUnmasked.txt
    # 33761995 bases of 2715853792 (1.243%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2018-11-06 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/cytoBand
    cd /hive/data/genomes/bosTau9/bed/cytoBand
    makeCytoBandIdeo.csh bosTau9

#############################################################################
# run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2018-11-06 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/idKeys
    cd /hive/data/genomes/bosTau9/bed/idKeys

    time (doIdKeys.pl \
        -twoBit=/hive/data/genomes/bosTau9/bosTau9.unmasked.2bit \
        -buildDir=`pwd` bosTau9) > do.log 2>&1 &
    # real    0m47.105s

    cat bosTau9.keySignature.txt
    #  7850e2d5dabb6134fdc9d7083f1a3a54

    # one ncbiRefSeq item (NM_001143743.1) did not work, joinerCheck complained,
    # remove it from the ncbiRefSeq tables:
hgsql -e 'delete from ncbiRefSeq where name="NM_001143743.1";' bosTau9
hgsql -e 'delete from ncbiRefSeqLink where protAcc="NP_001137215.1";' bosTau9
hgsql -e 'delete from ncbiRefSeqPsl where qName="NM_001143743.1";' bosTau9
hgsql -e 'delete from ncbiRefSeqCurated where name="NM_001143743.1";' bosTau9
hgsql -e 'delete from seqNcbiRefSeq where acc="NM_001143743.1";' bosTau9



#############################################################################
# gapOverlap (DONE - 2018-11-06 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/gapOverlap
    cd /hive/data/genomes/bosTau9/bed/gapOverlap
    time (doGapOverlap.pl \
	-twoBit=/hive/data/genomes/bosTau9/bosTau9.unmasked.2bit bosTau9 ) \
	> do.log 2>&1
    # real    1m51.390s

    cat fb.bosTau9.gapOverlap.txt
    # 150 bases of 2715853792 (0.000%) in intersection

    # 1 items on chr13
    zcat *.bed.gz | cut -f1 | cut -d'_' -f1 | sort | uniq -c |sort -r | head -5
     1 chr13

#############################################################################
# tandemDups (DONE - 2018-11-06 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/tandemDups
    cd /hive/data/genomes/bosTau9/bed/tandemDups
    time (~/kent/src/hg/utils/automation/doTandemDup.pl \
  -twoBit=/hive/data/genomes/bosTau9/bosTau9.unmasked.2bit bosTau9) \
	> do.log 2>&1 &
    # real    140m36.622s

    cat fb.bosTau9.tandemDups.txt
    # 110536957 bases of 2715853792 (4.070%) in intersection

    bigBedInfo bosTau9.tandemDups.bb | sed -e 's/^/#  /;'
#  version: 4
#  fieldCount: 13
#  hasHeaderExtension: yes
#  isCompressed: yes
#  isSwapped: 0
#  extraIndexCount: 0
#  itemCount: 1,556,737
#  primaryDataSize: 41,133,265
#  primaryIndexSize: 152,196
#  zoomLevels: 9
#  chromCount: 1955
#  basesCovered: 2,031,718,211
#  meanDepth (of bases covered): 7.749018
#  minDepth: 1.000000
#  maxDepth: 1559.000000
#  std of depth: 13.615893

#############################################################################
# ucscToINSDC and ucscToRefSeq table/track (DONE - 2018-11-08 - Hiram)
    # construct idKeys for the refseq sequence
    mkdir /hive/data/genomes/bosTau9/refseq/idKeys
    cd /hive/data/genomes/bosTau9/refseq/idKeys
    faToTwoBit ../GCF_002263795.1_ARS-UCD1.2_genomic.fna.gz bosTau9.refSeq.2bit

    time (doIdKeys.pl -buildDir=`pwd` \
        -twoBit=`pwd`/bosTau9.refSeq.2bit refseqBosTau9)  > do.log 2>&1 &
    # real    2m57.680s

    cat refseqBosTau9.keySignature.txt
    #  8eb392728eaf8d55db9d9cf05639cc0e

    # and the genbank sequence needs keys too:
    mkdir /hive/data/genomes/bosTau9/refseq/idKeysGenbank
    cd /hive/data/genomes/bosTau9/refseq/idKeysGenbank
    faToTwoBit /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Bos_taurus/all_assembly_versions/GCA_002263795.2_ARS-UCD1.2/GCA_002263795.2_ARS-UCD1.2_genomic.fna.gz bosTau9.genbank.2bit

    time (doIdKeys.pl -buildDir=`pwd` \
        -twoBit=`pwd`/bosTau9.genbank.2bit genbankBosTau9)  > do.log 2>&1 &
    # real    3m3.732s

    cat genbankBosTau9.keySignature.txt
    #  c5f6bb39f5c7053fa10deecfa9ce4fc6

    mkdir /hive/data/genomes/bosTau9/bed/chromAlias
    cd /hive/data/genomes/bosTau9/bed/chromAlias

    join -t$'\t' ../idKeys/bosTau9.idKeys.txt \
        ../../refseq/idKeysGenbank/genbankBosTau9.idKeys.txt | cut -f2- \
          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
               | sort -k1,1 -k2,2n > ucscToINSDC.bed

    join -t$'\t' ../idKeys/bosTau9.idKeys.txt \
        ../../refseq/idKeys/refseqBosTau9.idKeys.txt | cut -f2- \
          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
               | sort -k1,1 -k2,2n > ucscToRefSeq.bed

    # should be same line counts throughout:
    wc -l * ../../chrom.sizes
    # 2210 ucscToINSDC.bed
    # 2211 ucscToRefSeq.bed
    # 2211 ../../chrom.sizes

    # need to find the accession for the INSDC equivalent to chrM:
    egrep chrM *
# ucscToRefSeq.bed:chrM   0       16338   NC_006853.1

    # lookup that accession at NCBI Entrez: AY526085.1
    # and add to ucscToINSDC.bed:
    printf "chrM\t0\t16338\tAY526085.1\n" >> ucscToINSDC.bed
    # verify:
    grep chrM *
# ucscToINSDC.bed:chrM    0       16338   AY526085.1
# ucscToRefSeq.bed:chrM   0       16338   NC_006853.1

    export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    # 20
    # use the $chrSize in this sed
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab bosTau9 ucscToINSDC stdin ucscToINSDC.bed
     # should be the same for ucscToRefSeq:
    export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    # 20
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' \
         | hgLoadSqlTab bosTau9 ucscToRefSeq stdin ucscToRefSeq.bed

    # should be quiet for all OK
    checkTableCoords bosTau9

    # should cover %100 entirely:
    featureBits -countGaps bosTau9 ucscToINSDC
    # 2715853792 bases of 2715853792 (100.000%) in intersection

    featureBits -countGaps bosTau9 ucscToRefSeq
    # 2715853792 bases of 2715853792 (100.000%) in intersection

#########################################################################
# add chromAlias table (DONE - 2018-11-08 - Hiram)
    # after ucscToRefSeq and ucscToINSDC tables have been created

    # -p: this directory was already created in the ucscToINSDC/ucscToRefSeq
    # step above, so a plain mkdir would fail here with 'File exists'
    mkdir -p /hive/data/genomes/bosTau9/bed/chromAlias
    cd /hive/data/genomes/bosTau9/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' bosTau9 \
        | sort -k1,1 > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' bosTau9 \
        | sort -k1,1 > ucsc.genbank.tab

    ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16
    join -t$'\t' ../idKeys/bosTau9.idKeys.txt \
        ../../ens95/ensBosTau9.idKeys.txt | cut -f2- \
          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
               | sort -k1,1 -k2,2n > ucscToEns.bed
    # Ensembl is missing a chrM sequence:
    wc -l *.bed
  2210 ucscToEns.bed
  2211 ucscToINSDC.bed
  2211 ucscToRefSeq.bed
    cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
	> bosTau9.chromAlias.tab

# Sanity check: for each alias source, the number of lines in its
# ucsc.<source>.tab input must equal the number of lines mentioning that
# source in the combined bosTau9.chromAlias.tab.  Prints one
# '# checking <source>: <in> =? <out> OK|ERROR' line per source.
for t in refseq genbank ensembl
do
  # line count of the per-source input table
  c0=$(wc -l < "ucsc.$t.tab")
  # count of lines in the combined table that match this source name
  c1=$(grep -c -- "$t" bosTau9.chromAlias.tab)
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  # values are passed as printf arguments, never as part of the format string
  printf "# checking %s: %s =? %s %s\n" "$t" "$c0" "$c1" "$ok"
done
# checking refseq: 2211 =? 2211 OK
# checking genbank: 2211 =? 2211 OK
# checking ensembl: 2210 =? 2210 OK

    hgLoadSqlTab bosTau9 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        bosTau9.chromAlias.tab

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2018-11-06 - Hiram)
    cd ~/kent/src/hg/makeDb/trackDb/cow/bosTau9

    # preview prefixes and suffixes:
    hgsql -N -e "select frag from gold;" bosTau9 \
      | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c | sed -e 's/^/#\t/;'
    #             1 NC_.1
    #          2210 NKLS.1

    # implies a rule: 'N[CK][LS0-9_]+(\.[0-9]+)?'

    # verify this rule will find them all and eliminate them all:
    hgsql -N -e "select frag from gold;" bosTau9 | wc -l
    # 2211

    hgsql -Ne "select frag from gold" bosTau9 \
        | egrep -e 'N[CK][LS0-9_]+(\.[0-9]+)?' | wc -l
    # 2211

    hgsql -Ne "select frag from gold" bosTau9 \
        | egrep -v -e 'N[CK][LS0-9_]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/cow/bosTau9/trackDb.ra
searchTable gold
shortCircuit 1
termRegex N[CK][LS0-9_]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

#############################################################################
# running repeat masker (DONE - 2018-11-06 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/repeatMasker
    cd /hive/data/genomes/bosTau9/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku bosTau9) > do.log 2>&1 &
    # real    531m36.333s

    # had quite a few with this error:
# RepeatMasker bug?: Undefined id, line 5255479 of input:
#  1073  25.3  0.4  0.4  chrUn_NW_020191554v1   44876   45087   (19682) C  BTSAT2c        Satellite/centr       (11)  341      1   

    # get that list of items out of the do.log and remove them from
    # the bosTau9.sorted.fa.out to clean it up:
    mv bosTau9.sorted.fa.out bosTau9.sorted.fa.out.broken
    grep chrUn_NW do.log | cut -c24-59 | sort > grep.remove.list
    # there are 171 of these:
    wc -l grep.remove.list
    # 171
    grep -v -f grep.remove.list bosTau9.sorted.fa.out.broken \
	> bosTau9.sorted.fa.out
    # verify 171 lines removed:
    wc -l bosTau9.sorted.fa.out bosTau9.sorted.fa.out.broken
    # 5620007 bosTau9.sorted.fa.out
    # 5620178 bosTau9.sorted.fa.out.broken
    #     171 difference
    mv bosTau9.fa.out bosTau9.fa.out.broken
    grep -v -f grep.remove.list bosTau9.fa.out.broken \
	> bosTau9.fa.out
    wc -l bosTau9.fa.out.broken bosTau9.fa.out
    #	5620178 bosTau9.fa.out.broken
    #	5620007 bosTau9.fa.out
    #       171 difference
    # the last command of doCat.csh:
    time /cluster/bin/scripts/extractNestedRepeats.pl bosTau9.fa.out \
	| sort -k1,1 -k2,2n > bosTau9.nestedRepeats.bed

    # continuing:
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -continue=mask -smallClusterHub=ku bosTau9) > mask.log 2>&1 &
    # real    17m45.760s

    egrep "bases|Total|masked" faSize.rmsk.txt \
	| fold -w 75 -s  | sed -e 's/^/# /;'
# 2715853792 bases (28162 N's 2715825630 real 1376420245 upper 1339405385 
# lower) in 2211 sequences in 1 files
# Total size: mean 1228337.3 sd 10762990.9 min 1034 (chrUn_NW_020192071v1) 
# max 158534110 (chr1) median 21935
# %49.32 masked total, %49.32 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.7
    #    February 01 2017 (open-4-0-7) 1.331 version of RepeatMasker
    # CC    Dfam_Consensus RELEASE 20170127;                            *
    # CC    RepBase RELEASE 20170127;                                   *

    time featureBits -countGaps bosTau9 rmsk
    # 1339405686 bases of 2715853792 (49.318%) in intersection
    # real    0m31.962s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the faSize count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # faster way to get the same result on high contig count assemblies:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' bosTau9 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    # total 1339405686.000000
    # real    0m24.033s

##########################################################################
# running simple repeat (DONE - 2018-11-06 - Hiram)

    mkdir /hive/data/genomes/bosTau9/bed/simpleRepeat
    cd /hive/data/genomes/bosTau9/bed/simpleRepeat
    # using -trf409 6 here since this genome is similar in size to human
    # (human == 6)
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        -trf409 6 bosTau9) > do.log 2>&1 &
    # real    173m39.504s

    cat fb.simpleRepeat
    # 78768566 bases of 2715853792 (2.900%) in intersection

    bigBedInfo *.bb | sed -e 's/^/# /;'
# version: 4
# fieldCount: 16
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 0
# itemCount: 545,256
# primaryDataSize: 17,063,916
# primaryIndexSize: 103,156
# zoomLevels: 10
# chromCount: 2051
# basesCovered: 78,768,566
# meanDepth (of bases covered): 6.563866
# minDepth: 1.000000
# maxDepth: 206.000000
# std of depth: 9.700871

    # adding this trfMask to the other masking
    cd /hive/data/genomes/bosTau9

    # when using the Window Masker result:
#    twoBitMask bed/windowMasker/bosTau9.cleanWMSdust.2bit \
#       -add bed/simpleRepeat/trfMask.bed  bosTau9.2bit
    #   you can safely ignore the warning about fields >= 13

    # when using Rmsk results, add to rmsk after it is done:
    twoBitMask bosTau9.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed bosTau9.2bit
    #   you can safely ignore the warning about fields >= 13

    twoBitToFa bosTau9.2bit stdout | faSize stdin > faSize.bosTau9.2bit.txt
    egrep "bases|Total|masked" faSize.bosTau9.2bit.txt \
	| fold -w 75 -s  | sed -e 's/^/# /;'
# 2715853792 bases (28162 N's 2715825630 real 1375606800 upper 1340218830 
# lower) in 2211 sequences in 1 files
# Total size: mean 1228337.3 sd 10762990.9 min 1034 (chrUn_NW_020192071v1) 
# max 158534110 (chr1) median 21935
# %49.35 masked total, %49.35 masked real

    # reset the symlink
    rm /gbdb/bosTau9/bosTau9.2bit
    ln -s `pwd`/bosTau9.2bit /gbdb/bosTau9/bosTau9.2bit

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2018-11-08 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/bosTau9/bed/microsat
    cd /cluster/data/bosTau9/bed/microsat

    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed bosTau9 microsat microsat.bed
    # Read 25219 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2018-11-07 - Hiram)

    mkdir /hive/data/genomes/bosTau9/bed/windowMasker
    cd /hive/data/genomes/bosTau9/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev bosTau9) > do.log 2>&1
    # real    95m24.111s

    # Masking statistics
    egrep "bases|Total|masked" faSize.bosTau9.cleanWMSdust.txt \
	| fold -w 75 -s  | sed -e 's/^/# /;'
# 2715853792 bases (28162 N's 2715825630 real 1580978410 upper 1134847220 
# lower) in 2211 sequences in 1 files
# Total size: mean 1228337.3 sd 10762990.9 min 1034 (chrUn_NW_020192071v1) 
# max 158534110 (chr1) median 21935
# %41.79 masked total, %41.79 masked real

    cat fb.bosTau9.rmsk.windowmaskerSdust.txt
    # 907805797 bases of 2715853792 (33.426%) in intersection

#############################################################################
# ncbiRefSeq (DONE - 2018-11-08 - Hiram)

    # can be run up after ucscToRefSeq table is constructed
    mkdir /hive/data/genomes/bosTau9/bed/ncbiRefSeq
    cd /hive/data/genomes/bosTau9/bed/ncbiRefSeq

    # adjust the name arguments
    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
      -bigClusterHub=ku -dbHost=hgwdev \
      -fileServer=hgwdev -smallClusterHub=hgwdev-101 -workhorse=hgwdev \
      refseq vertebrate_mammalian Bos_taurus \
      GCF_002263795.1_ARS-UCD1.2 bosTau9) > do.log 2>&1 &
    # real    5m10.572s

    cat fb.ncbiRefSeq.bosTau9.txt
    # 80750008 bases of 2715853792 (2.973%) in intersection

#############################################################################
# cpgIslands - (DONE - 2018-11-08 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/cpgIslands
    cd /hive/data/genomes/bosTau9/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku bosTau9) > do.log 2>&1 &
    # real    3m43.856s

    cat fb.bosTau9.cpgIslandExt.txt
    # 26618121 bases of 2715853792 (0.980%) in intersection

##############################################################################
# genscan - (DONE - 2018-11-08 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/genscan
    cd /hive/data/genomes/bosTau9/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku bosTau9) > do.log 2>&1 &
    # real    48m49.605s

    # the one broken job (chr13) was finished by running it directly with
    # the 2,000,000 window size script; output goes to lastOne.log:
    time (./runGsBig2M.csh chr13 000 gtf/000/chr13.gtf pep/000/chr13.pep subopt/000/chr13.bed) > lastOne.log 2>&1
    # real    28m54.073s

    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -continue=makeBed -bigClusterHub=ku bosTau9) > makeBed.log 2>&1 &

    cat fb.bosTau9.genscan.txt
    # 56389215 bases of 2715853792 (2.076%) in intersection

    cat fb.bosTau9.genscanSubopt.txt
    # 51538764 bases of 2715853792 (1.898%) in intersection

    bigBedInfo bosTau9.genscan.bb | sed -e 's/^/# /;'
# version: 4
# fieldCount: 12
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 0
# itemCount: 43,798
# primaryDataSize: 2,587,464
# primaryIndexSize: 37,772
# zoomLevels: 7
# chromCount: 804
# basesCovered: 1,855,898,050
# meanDepth (of bases covered): 1.000000
# minDepth: 1.000000
# maxDepth: 1.000000
# std of depth: -nan

#############################################################################
# augustus gene track (DONE - 2018-11-08 - Hiram)

    mkdir /hive/data/genomes/bosTau9/bed/augustus
    cd /hive/data/genomes/bosTau9/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
     -species=human -dbHost=hgwdev -workhorse=hgwdev bosTau9) > do.log 2>&1 &
    # real    109m41.672s

    cat fb.bosTau9.augustusGene.txt
    # 52391476 bases of 2715853792 (1.929%) in intersection

     bigBedInfo  bosTau9.augustus.bb | sed -e 's/^/# /;'
# version: 4
# fieldCount: 20
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 0
# itemCount: 30,862
# primaryDataSize: 2,187,353
# primaryIndexSize: 22,468
# zoomLevels: 7
# chromCount: 454
# basesCovered: 1,220,657,991
# meanDepth (of bases covered): 1.261447
# minDepth: 1.000000
# maxDepth: 5.000000
# std of depth: 0.630592

#############################################################################
# lastz/chain/net swap human/hg38 (TBD - 2018-04-25 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg38/bed/lastzBosTau9.2018-04-25

    cat fb.hg38.chainBosTau9Link.txt
    # 1388649593 bases of 3049335806 (45.539%) in intersection
    cat fb.hg38.chainSynBosTau9Link.txt
    # 1330693519 bases of 3049335806 (43.639%) in intersection
    cat fb.hg38.chainRBestBosTau9Link.txt
    # 1278396766 bases of 3049335806 (41.924%) in intersection

    # running the swap
    mkdir /hive/data/genomes/bosTau9/bed/blastz.hg38.swap
    cd /hive/data/genomes/bosTau9/bed/blastz.hg38.swap
    time (doBlastzChainNet.pl -verbose=2 \
        -swap /hive/data/genomes/hg38/bed/lastzBosTau9.2018-04-25/DEF \
        -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > swap.log 2>&1
    # real    104m31.748s

    cat fb.bosTau9.chainHg38Link.txt
    # 1319553403 bases of 2587515673 (50.997%) in intersection
    cat fb.bosTau9.chainSynHg38Link.txt
    # 1280196824 bases of 2587515673 (49.476%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
        bosTau9 hg38) > rbest.log 2>&1 &
    # real    638m15.603s

    cat fb.bosTau9.chainRBestHg38Link.txt 
    # 1279077824 bases of 2587515673 (49.433%) in intersection

#############################################################################
# lastz/chain/net swap mouse/mm10 (TBD - 2018-04-25 - Hiram)

    # alignment to mouse/mm10:
    cd /hive/data/genomes/mm10/bed/lastzBosTau9.2018-04-25

    cat fb.mm10.chainBosTau9Link.txt
    # 693504453 bases of 2652783500 (26.143%) in intersection

    cat fb.mm10.chainRBestBosTau9Link.txt
    # 657097998 bases of 2652783500 (24.770%) in intersection

    # and for the swap: create and enter the work directory (mkdir and cd
    # must name the same blastz.mm10.swap directory; swap.log is written here)
    mkdir /hive/data/genomes/bosTau9/bed/blastz.mm10.swap
    cd /hive/data/genomes/bosTau9/bed/blastz.mm10.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/mm10/bed/lastzBosTau9.2018-04-25/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    63m12.935s

    cat fb.bosTau9.chainMm10Link.txt
    # 680117358 bases of 2587515673 (26.285%) in intersection
    cat fb.bosTau9.chainSynMm10Link.txt
    # 643562837 bases of 2587515673 (24.872%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
	bosTau9 mm10) > rbest.log 2>&1 &
    # real    437m1.637s

    cat fb.bosTau9.chainRBestMm10Link.txt
    # 656602300 bases of 2587515673 (25.376%) in intersection

##############################################################################
# Create kluster run files (DONE - 2018-11-08 - Hiram)

    cd /hive/data/genomes/bosTau9
    # numerator is bosTau9 gapless bases "real" as reported by:
    featureBits -noRandom -noHap bosTau9 gap
    # 0 bases of 2628411261 (0.000%) in intersection
    #               ^^^

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \(2628411261 / 2861349177 \) \* 1024
    # (2628411261 / 2861349177 ) * 1024 = 940.637778

    # ==> use -repMatch=900, the same value that was used for bosTau8
    cd /hive/data/genomes/bosTau9
    blat bosTau9.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=jkStuff/bosTau9.11.ooc -repMatch=900
    #   Wrote 35432 overused 11-mers to jkStuff/bosTau9.11.ooc
    # bosTau8 at repMatch=900 was:
    #   Wrote 33613 overused 11-mers to jkStuff/bosTau8.11.ooc

    # no unbridged gaps so no need to worry about gaplift file:
    hgsql -Ne "select bridge from gap" bosTau9 | sort | uniq -c
    # no gaps of any sort

    hgsql -Ne "select count(*) from gap" bosTau9
    #    +---+
    #    | 0 |
    #    +---+

##############################################################################
# LIFTOVER TO bosTau8 (DONE - 2018-11-08 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau9/bed/blat.bosTau8.2018-11-08
    cd /hive/data/genomes/bosTau9/bed/blat.bosTau8.2018-11-08
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau9 bosTau8) > do.log 2>&1 &
    # real    1255m56.756s

    # verify the convert link on the test browser is now active
    # from bosTau9 to bosTau8

##############################################################################
# LIFTOVER TO bosTau7 (DONE - 2018-11-08 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau9/bed/blat.bosTau7.2018-11-08
    cd /hive/data/genomes/bosTau9/bed/blat.bosTau7.2018-11-08
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau9 bosTau7) > do.log 2>&1 &
    # real    831m15.041s

    # verify the convert link on the test browser is now active
    # from bosTau9 to bosTau7

##############################################################################
# LIFTOVER TO bosTau6 (DONE - 2018-11-08 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau9/bed/blat.bosTau6.2018-11-08
    cd /hive/data/genomes/bosTau9/bed/blat.bosTau6.2018-11-08
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         bosTau9 bosTau6) > do.log 2>&1 &
    # real    1236m43.784s

    # verify the convert link on the test browser is now active
    # from bosTau9 to bosTau6

##############################################################################
# crispr 10K shoulders (DONE - 2018-11-16 - Hiram)
    mkdir  /hive/data/genomes/bosTau9/bed/crispr10K
    cd  /hive/data/genomes/bosTau9/bed/crispr10K

    time (~/kent/src/hg/utils/automation/doCrispr.pl \
   -stop=load -buildDir=`pwd` -smallClusterHub=hgwdev-101 bosTau9 ncbiRefSeq) \
	> do.log 2>&1
    # real    1192m19.444s
    # broke down, fixed, manually completed specScores
    time find tmp/outGuides -type f | xargs cut -f3-6 > ../specScores.tab
    # real    329m49.271s
    # effScores: real      1410m36.918s
    # offTargets: real     99m35.115s
    # load: real   132m24.530s

    # hive cleaning - 2021-04-26 - Hiram
    time (~/kent/src/hg/utils/automation/doCrispr.pl \
     -continue=cleanup -buildDir=`pwd` -smallClusterHub=hgwdev bosTau9 \
          -fileServer=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
            -workhorse=hgwdev) > cleanup.log 2>&1 &

##############################################################################
# crispr whole genome (DONE - 2022-03-01 - Hiram)
    mkdir /hive/data/genomes/bosTau9/bed/crisprAll
    cd /hive/data/genomes/bosTau9/bed/crisprAll

    # the large shoulder argument will cause the entire genome to be scanned
    # this takes a while for a new genome to get the bwa indexing done
    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \
    bosTau9 -tableName=crisprAll \
    -fileServer=hgwdev \
    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev) > indexFa.log 2>&1
    # real    1m10.666s

    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
    -continue=ranges bosTau9 -tableName=crisprAll \
    -fileServer=hgwdev \
    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev) > ranges.log 2>&1
    # real    8554m11.613s

    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
    -continue=effScores bosTau9 -tableName=crisprAll \
    -fileServer=hgwdev \
    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev) > effScores.log 2>&1

    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
    -continue=offTargets bosTau9 -tableName=crisprAll \
    -fileServer=hgwdev \
    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev) > offTargets.log 2>&1

    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
    -continue=load bosTau9 -tableName=crisprAll \
    -fileServer=hgwdev \
    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev) > load.log 2>&1

    cat guides/run.time | sed -e 's/^/# /;'
# Completed: 100 of 100 jobs
# CPU time in finished jobs:      12054s     200.90m     3.35h    0.14d  0.000 y
# IO & Wait Time:                   282s       4.70m     0.08h    0.00d  0.000 y
# Average job time:                 123s       2.06m     0.03h    0.00d
# Longest finished job:             432s       7.20m     0.12h    0.01d
# Submission to last job:           434s       7.23m     0.12h    0.01d

    cat specScores/run.time | sed -e 's/^/# /;'
# Completed: 841413 of 841413 jobs
# CPU time in finished jobs:   71934084s 1198901.39m 19981.69h  832.57d  2.281 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                  85s       1.41m     0.02h    0.00d
# Longest finished job:             170s       2.83m     0.05h    0.00d
# Submission to last job:        203959s    3399.32m    56.66h    2.36d

    grep -c . effScores.tab
    # 288692962
    grep -c . specScores.tab 
    # 218717447

    cat effScores/run.time | sed -e 's/^/# /;'
# Completed: 28864 of 28864 jobs
# CPU time in finished jobs:   12570265s  209504.41m  3491.74h  145.49d  0.399 y
# IO & Wait Time:                 45737s     762.29m    12.70h    0.53d  0.001 y
# Average job time:                 437s       7.28m     0.12h    0.01d
# Longest finished job:            7448s     124.13m     2.07h    0.09d
# Submission to last job:         31650s     527.50m     8.79h    0.37d

    cat offTargets/run.time | sed -e 's/^/# /;'
# Completed: 145438 of 145438 jobs
# CPU time in finished jobs:    2306725s   38445.42m   640.76h   26.70d  0.073 y
# IO & Wait Time:                901605s   15026.74m   250.45h   10.44d  0.029 y
# Average job time:                  22s       0.37m     0.01h    0.00d
# Longest finished job:             133s       2.22m     0.04h    0.00d
# Submission to last job:         14396s     239.93m     4.00h    0.17d

##############################################################################
# GENBANK AUTO UPDATE (DONE - 2018-11-08 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism      mrnaCnt   estCnt  refSeqCnt
    # Bos taurus	20115	1583423	13363

    # edit etc/genbank.conf to add bosTau9 just after bosTau8
# bosTau9 (cow - Bos taurus - refseq GCF_002263795.1 ARS-UCD1.2 - taxId 9913)
bosTau9.serverGenome = /hive/data/genomes/bosTau9/bosTau9.2bit
bosTau9.clusterGenome = /hive/data/genomes/bosTau9/bosTau9.2bit
bosTau9.ooc = /hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc
bosTau9.lift = no
bosTau9.perChromTables = no
bosTau9.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
bosTau9.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
bosTau9.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
bosTau9.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
bosTau9.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
bosTau9.genbank.est.xeno.pslCDnaFilter    = ${finished.genbank.est.xeno.pslCDnaFilter}
bosTau9.downloadDir = bosTau9
# bosTau9.upstreamGeneTbl = refGene
# defaults yes: genbank.mrna.native.load genbank.mrna.native.loadDesc
# yes: genbank.est.native.load refseq.mrna.native.load
# yes: refseq.mrna.native.loadDesc refseq.mrna.xeno.load
# yes: refseq.mrna.xeno.loadDesc
# defaults no: genbank.mrna.xeno.load genbank.mrna.xeno.loadDesc
# no: genbank.est.native.loadDesc genbank.est.xeno.load
# no: genbank.est.xeno.loadDesc

    # verify stated file paths do exist:
    grep bosTau9 etc/genbank.conf | egrep "Genome|ooc|lift" \
	| awk '{print $NF}' | sort -u | xargs ls -og
ls: cannot access no: No such file or directory
-rw-rw-r-- 1 712534740 Nov  7 13:35 /hive/data/genomes/bosTau9/bosTau9.2bit
-rw-rw-r-- 1    141736 Nov  8 13:13 /hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc
    # ls error on the file named 'no' is from the bosTau9.lift = no

    git commit -m 'adding bosTau9 refs #22425' etc/genbank.conf
    git push

    # add bosTau9 to:
    #   etc/hgwdev.dbs
    git commit -m 'adding bosTau9 refs #22425' etc/hgwdev.dbs

    git push
    # update /cluster/data/genbank/:
    make etc-update

#############################################################################
#  BLATSERVERS ENTRY (DONE - 2018-11-08 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
    VALUES ("bosTau9", "blat1c", "17908", "1", "0"); \
    INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
    VALUES ("bosTau9", "blat1c", "17909", "0", "1");' \
    hgcentraltest
    #   test it with some sequence

##############################################################################
## reset default position to the casein gene complex (milk production proteins)
##  (DONE - 2018-11-08 - Hiram)
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC332753/
## quote: The four genes reside on less than 200 kb of DNA in
##        the order CASAS1-CASB-CASAS2-CASK.

    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr6:85405597-85664387" where name="bosTau9";' hgcentraltest

##############################################################################
# all.joiner update, downloads and in pushQ - (TBD - 2018-05-01 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    ~/kent/src/hg/utils/automation/verifyBrowser.pl bosTau9
# 64 tables in database bosTau9 - Cow, Bos taurus
# verified 62 tables in database bosTau9, 2 extra tables, 24 optional tables
# NCBI RefSeq genes     10 optional tables
# chainNetRBestHg38     3 optional tables
# chainNetRBestMm10     3 optional tables
# chainNetSynHg38       3 optional tables
# chainNetSynMm10       3 optional tables
# gapOverlap    1 optional tables
# tandemDups    1 optional tables
# 1     crispr10KRanges - extra table
# 2     crispr10KTargets        - extra table
# 9 genbank tables found
# verified 29 required tables, 0 missing tables
# hg38 chainNet to bosTau9 found 3 required tables
# mm10 chainNet to bosTau9 found 3 required tables
# hg38 chainNet RBest and syntenic to bosTau9 found 6 optional tables
# mm10 chainNet RBest and syntenic to bosTau9 found 3 optional tables
# liftOver to previous versions: 3, from previous versions: 3

    # fixup all.joiner until this is a clean output
    joinerCheck -database=bosTau9 -tableCoverage all.joiner
    joinerCheck -database=bosTau9 -times all.joiner
    joinerCheck -database=bosTau9 -keys all.joiner

    cd /hive/data/genomes/bosTau9
    # clean up obsolete trackDb work, assuming you have already checked
    # these trackDb files into the source tree
    rm -fr TemporaryTrackDbCheckout

    time (makeDownloads.pl -workhorse=hgwdev bosTau9) > downloads.log 2>&1

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/bosTau9/pushQ
    cd /hive/data/genomes/bosTau9/pushQ
  time (makePushQSql.pl -redmineList bosTau9) > bosTau9.pushQ.sql 2> stderr.out
    # real    9m34.930s

    # remove the tandemDups and gapOverlap entries from the table list
    # and the release log:
    sed -i -e "/tandemDups/d" redmine.bosTau9.table.list
    sed -i -e "/Tandem Dups/d" redmine.bosTau9.releaseLog.txt
    sed -i -e "/gapOverlap/d" redmine.bosTau9.table.list
    sed -i -e "/Gap Overlaps/d" redmine.bosTau9.releaseLog.txt

    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: bosTau9 does not have seq
    # WARNING: bosTau9 does not have extFile

    # add the path names to the listing files in the redmine issue
    # in the three appropriate entry boxes:

/hive/data/genomes/bosTau9/pushQ/redmine.bosTau9.file.list
/hive/data/genomes/bosTau9/pushQ/redmine.bosTau9.releaseLog.txt
/hive/data/genomes/bosTau9/pushQ/redmine.bosTau9.table.list

##############################################################################
# LIFTOVER TO bosTau4 (DONE - 2022-12-06 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau9/bed/blat.bosTau4.2022-12-06
    cd /hive/data/genomes/bosTau9/bed/blat.bosTau4.2022-12-06
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/bosTau9/bosTau9.2bit \
        -targetSizes=/hive/data/genomes/bosTau9/chrom.sizes \
        -query2Bit=/hive/data/genomes/bosTau4/bosTau4.2bit \
        -querySizes=/hive/data/genomes/bosTau4/chrom.sizes \
        -ooc=/hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc \
         bosTau9 bosTau4
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/bosTau9/bosTau9.2bit \
        -targetSizes=/hive/data/genomes/bosTau9/chrom.sizes \
        -query2Bit=/hive/data/genomes/bosTau4/bosTau4.2bit \
        -querySizes=/hive/data/genomes/bosTau4/chrom.sizes \
        -ooc=/hive/data/genomes/bosTau9/jkStuff/bosTau9.11.ooc \
         bosTau9 bosTau4) > doLiftOverToBosTau9.log 2>&1
    # real    384m15.787s

    # see if the liftOver menus function in the browser from bosTau9 to bosTau4

##############################################################################
