# for emacs: -*- mode: sh; -*-

# This file describes browser build for the danRer11
#
#  Can use existing photograph (otherwise find one before starting here)

#########################################################################
#  Initial steps, find photograph (DONE - 2017-05-24 - Chris)

# To start this initialBuild.txt document, from a previous assembly document:

mkdir ~/kent/src/hg/makeDb/doc/danRer11
cd ~/kent/src/hg/makeDb/doc/danRer11

sed -e 's/galGal5/danRer11/g; s/GalGal5/DanRer11/g; s/TBD/TBD/g;' \
  ../galGal5/initialBuild.txt > initialBuild.txt

# the files required are probably already here, take a look into:
#  /hive/data/outside/ncbi/genomes/refseq/<subSet>/<scientificName>/all_assembly_versions
# and merely symlink them in:

mkdir -p /hive/data/genomes/danRer11/genbank
cd /hive/data/genomes/danRer11/genbank
ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_other/Danio_rerio/all_assembly_versions/GCA_000002035.4_GRCz11/* ./

#  Can use existing photograph

# Link to the existing photo. 
cd ../
ln -s ../danRer10/photoReference.txt .  

# Assembly name:  GRCz11
# Description:    Genome Reference Consortium Zebrafish Build 11
# Organism name:  Danio rerio (zebrafish)
# Infraspecific name:  strain=Tuebingen
# Taxid:          7955
# BioSample:      SAMN06930106
# BioProject:     PRJNA11776
# Submitter:      Genome Reference Consortium
# Date:           2017-5-9
# Assembly type:  haploid-with-alt-loci
# Release type:   major
# Assembly level: Chromosome
# Genome representation: full
# RefSeq category: Reference Genome
# GenBank assembly accession: GCA_000002035.4
#
## Assembly-Units:
## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
## GCA_000000175.4              Primary Assembly
## GCA_002119155.1              ALT_DRER_TU_1
## GCA_002119165.1              ALT_DRER_TU_2
## GCA_002119175.1              ALT_DRER_TU_3
## GCA_002119185.1              ALT_DRER_TU_4

#############################################################################
# establish config.ra file (DONE - Chris - 2017-05-24)
    cd /hive/data/genomes/danRer11
    ~/kent/src/hg/utils/automation/prepConfig.pl danRer11 vertebrate zebrafish \
       genbank/*_assembly_report.txt > danRer11.config.ra

    # verify it looks sane
    cat danRer11.config.ra
# config parameters for makeGenomeDb.pl:
db danRer11
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate May 2017
assemblyLabel Genome Reference Consortium
assemblyShortLabel GRCz11
orderKey 26172
mitoAcc NC_002333.2
fastaFiles /hive/data/genomes/danRer11/ucsc/*.fa.gz
agpFiles /hive/data/genomes/danRer11/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir zebrafish
photoCreditURL	http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Zebrafish&id=79130
photoCreditName	NHGRI Press Photos
ncbiGenomeId 50
ncbiAssemblyId 1104621
ncbiAssemblyName GRCz11
ncbiBioProject 11776
ncbiBioSample SAMN06930106
genBankAccessionID GCA_000002035.4
taxId 7955

#############################################################################
# setup UCSC named files (DONE - 2017-06-01 - Chris)

    mkdir /hive/data/genomes/danRer11/ucsc
    cd /hive/data/genomes/danRer11/ucsc
    # measure what is in the genbank release:
    faSize ../genbank/*genomic.fna.gz

#1679186873 bases (4693618 N's 1674493255 real 861444935 upper 813048320 lower) in 1922 sequences in 1 files
#Total size: mean 873666.4 sd 6187603.1 min 650 (KN150525.1) max 78093715 (CM002888.2) median 149621
#N count: mean 2442.0 sd 64890.9
#U count: mean 448202.4 sd 3139997.1
#L count: mean 423022.0 sd 3024074.2
#%48.42 masked total, %48.55 masked real

    # check for duplicate sequences:
    time faToTwoBit -noMask ../genbank/*_genomic.fna.gz genbank.2bit
    #  real    0m52.302s
    twoBitDup genbank.2bit
    # no output is a good result, otherwise, would have to eliminate duplicates
    # the scripts creating the fasta here will be using this genbank.2bit file
    # remove it later

    # new option required to ucscCompositeAgp.pl 2016-04-13
    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
      ../genbank/*_genomic.fna.gz ../genbank/*_assembly_structure/Primary_Assembly
# constructing refseq.2bit from ../genbank/GCA_000002035.4_GRCz11_genomic.fna.gz
CM002885.2 chr1
CM002886.2 chr2
CM002887.2 chr3
CM002888.2 chr4
CM002889.2 chr5
CM002890.2 chr6
CM002891.2 chr7
CM002892.2 chr8
CM002893.2 chr9
CM002894.2 chr10
CM002895.2 chr11
CM002896.2 chr12
CM002897.2 chr13
CM002898.2 chr14
CM002899.2 chr15
CM002900.2 chr16
CM002901.2 chr17
CM002902.2 chr18
CM002903.2 chr19
CM002904.2 chr20
CM002905.2 chr21
CM002906.2 chr22
CM002907.2 chr23
CM002908.2 chr24
CM002909.2 chr25

real	8m20.666s
user	8m33.194s
    
	time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
       ../genbank/*_assembly_structure/Primary_Assembly

    # There are no unlocalized sequences, so this step is skipped:
	#./unlocalizedWithChroms.pl ../genbank/*_assembly_structure/Primary_Assembly

    # Handle the alternative sequences
	time ucscAltAgp ../genbank/GCA_000002035.4_GRCz11_assembly_structure/all_alt_scaffold_placement.txt ../genbank/GCA_000002035.4_GRCz11_assembly_structure/
	# Finished creating alt_4.agp
	# Finished creatingalt_4.fa.gz
	# Finished creating alt_2.agp
	# Finished creatingalt_2.fa.gz
	# Finished creating alt_1.agp
	# Finished creatingalt_1.fa.gz
	# Finished creating alt_3.agp
	# Finished creatingalt_3.fa.gz

	# real	1m49.141s
	# user	1m47.337s
	# sys	0m1.237s
	
	# verify fasta and AGPs agree
	time faToTwoBit *.fa.gz test.2bit
    # real    0m46.414s

    cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4
    # All AGP and FASTA entries agree - both files are valid

    # and no sequence lost from original:
    twoBitToFa test.2bit stdout | faSize stdin
	# 1679186873 bases (4693618 N's 1674493255 real 1674493255 upper 0 lower) in 1922 sequences in 1 files
	# Total size: mean 873666.4 sd 6187603.1 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 149621
	# N count: mean 2442.0 sd 64890.9
	# U count: mean 871224.4 sd 6162220.4
	# L count: mean 0.0 sd 0.0
	# %0.00 masked total, %0.00 masked real
	
    # same numbers as above (except for upper/lower masking)

    # no longer need these temporary 2bit files
    rm test.2bit genbank.2bit

#############################################################################
#  Initial database build (DONE - 2016-06-01 - Chris)

	cd /hive/data/genomes/danRer11
    # verify sequence and AGP are OK:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         -stop=agp danRer11.config.ra) > agp.log 2>&1
    # real	1m40.249s

    # then finish it off:
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
       -fileServer=hgwdev -continue=db danRer11.config.ra) > db.log 2>&1
    # real    11m9.465s

    # check in the trackDb files created in TemporaryTrackDbCheckout/
    #    and add danRer11 to trackDb/makefile

    # temporary symlink until masked sequence is available
    cd /hive/data/genomes/danRer11
    ln -s `pwd`/danRer11.unmasked.2bit /gbdb/danRer11/danRer11.2bit

##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2017-06-01 - Chris)
    mkdir /hive/data/genomes/danRer11/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/danRer11/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/danRer11/danRer11.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku danRer11) > do.log 2>&1
    # real    6m59.252s

    cat fb.danRer11.cpgIslandExtUnmasked.txt
	# 34662385 bases of 1674677181 (2.070%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2017-06-02 - Chris)
    mkdir /hive/data/genomes/danRer11/bed/cytoBand
    cd /hive/data/genomes/danRer11/bed/cytoBand
    makeCytoBandIdeo.csh danRer11

#########################################################################
# ucscToINSDC table/track (DONE - 2016-04-14 - Chris)
    # the sequence here is working for a 'refseq' assembly with a chrM
    # situation may be specific depending upon what is available in the assembly

    mkdir /hive/data/genomes/danRer11/bed/ucscToINSDC
    cd /hive/data/genomes/danRer11/bed/ucscToINSDC

    # find accession for chrM
    grep chrM ../../danRer11.agp
	#chrM	1	16596	2	F	NC_002333.2	1	16596	+

    # use that accession here:
    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
        ../../genbank/GCA_*structure/Primary_Assembly NC_002333.2
    
	# Copy over the ucscToINSDC.txt file from idKeys
	cp ../idKeys/idMap.txt .

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    
	# Put in the chrm refseq: NC_002333.2, INSDC: AC024175.3 manually

	join name.coordinate.tab <(sort idMap.txt) | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed

    # should be same line counts throughout:
    wc -l *
    #   1922   3844  46128 insdc.refseq.txt
	#   1922   3844  56859 insdcToUcsc.txt
	#   1923   5769  51957 name.coordinate.tab
	#   1923   7688  73086 ucscToINSDC.bed
	#   1922   3844  56859 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 20
    # use the 20 in this sed
    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab danRer11 ucscToINSDC stdin ucscToINSDC.bed

	checkTableCoords danRer11
    # should cover %100 entirely:
    featureBits -countGaps danRer11 ucscToINSDC
    # 1679203469 bases of 1679203469 (100.000%) in intersection

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2016-10-20 - Hiram)
    cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer11
    # preview prefixes and suffixes:
    hgsql -N -e "select frag from gold;" danRer11 \
      | sed -e 's/[0-9][0-9]*//; s/\.[0-9]\+$//' | sort | uniq -c 

    984 AL
   4518 BX
      5 CAAK
  18560 CABZ
   3251 CR
   1015 CT
   3213 CU
     31 CZQB
    936 FO
    873 FP
    347 FQ
      1 LK
    876 LO
      6 LT
      1 NC_

    # implies a rule: '[ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?'

    # verify this rule will find them all and eliminate them all:
    hgsql -N -e "select frag from gold;" danRer11 | wc -l
    # 34617

    hgsql -N -e "select frag from gold;" danRer11 \
	| egrep -e '[ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?' | wc -l
    # 34617

    hgsql -N -e "select frag from gold;" danRer11 \
	| egrep -v -e '[ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?' | wc -l
    # 0

    # hence, add to trackDb/zebrafish/danRer11/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

    # verify searches work in the position box

##########################################################################
# running repeat masker (DONE - 2017-06-07 - Chris)
    mkdir /hive/data/genomes/danRer11/bed/repeatMasker
    cd /hive/data/genomes/danRer11/bed/repeatMasker
    time  (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku danRer11) > do.log 2>&1
    # real    254m18.208s

	# 9 jobs (out of 3664) crashed on the cluster; they were re-run manually and
	# completed fine, taking ~90 minutes each.

    cat faSize.rmsk.txt
# 1679203469 bases (4693618 N's 1674509851 real 809208275 upper 865301576 lower) in 1923 sequences in 1 files
# Total size: mean 873220.7 sd 6186024.0 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 146921
# N count: mean 2440.8 sd 64874.0
# U count: mean 420805.1 sd 2950984.7
# L count: mean 449974.8 sd 3210628.1
# %51.53 masked total, %51.67 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version open-4.0.5
	# grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
	#    January 31 2015 (open-4-0-5) version of RepeatMasker
	# grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
	# CC   RELEASE 20140131;  

    time featureBits -countGaps danRer11 rmsk
    # 865410761 bases of 1679203469 (51.537%) in intersection
	# real	0m28.805s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the faSize count above
    #   separates out the N's from the bases, it doesn't show lower case N's

    # faster way to get the same result:
    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' danRer11 \
        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
    #  total 865410761.000000
	# real	0m28.896s

##########################################################################
# running simple repeat (DONE - 2017-06-07 - Chris)

    mkdir /hive/data/genomes/danRer11/bed/simpleRepeat
    cd /hive/data/genomes/danRer11/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        -trf409 3 danRer11) > do.log 2>&1
    # real	14m55.497s

    cat fb.simpleRepeat
	# 108632714 bases of 1674677181 (6.487%) in intersection

    cd /hive/data/genomes/danRer11
    # using the Window Masker result:
    #twoBitMask bed/windowMasker/danRer11.cleanWMSdust.2bit \
    #   -add bed/simpleRepeat/trfMask.bed  danRer11.2bit
    #   you can safely ignore the warning about fields >= 13

    # add to rmsk after it is done:
    twoBitMask danRer11.rmsk.2bit \
        -add bed/simpleRepeat/trfMask.bed danRer11.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa danRer11.2bit stdout | faSize stdin > faSize.danRer11.2bit.txt
    cat faSize.danRer11.2bit.txt
# 1679203469 bases (4693618 N's 1674509851 real 807728713 upper 866781138 lower) in 1923 sequences in 1 files
# Total size: mean 873220.7 sd 6186024.0 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 146921
# %51.62 masked total, %51.76 masked real

    rm /gbdb/danRer11/danRer11.2bit
    ln -s `pwd`/danRer11.2bit /gbdb/danRer11/danRer11.2bit

#########################################################################
# CREATE MICROSAT TRACK (DONE - 2017-06-04 - Chris)
    ssh hgwdev
    mkdir /cluster/data/danRer11/bed/microsat
    cd /cluster/data/danRer11/bed/microsat

    # Keep only simpleRepeat.bed rows whose 5th field is 2 or 3, whose 6th
    # field is at least 15, whose 8th field is exactly 100 and whose 9th field
    # is 0; emit a 4-column bed (fields 1-3 plus a name built as
    # "<field 6 as integer>x<field 16>").
    # NOTE(review): presumably fields 5/6/8/9/16 are the TRF period, copy
    # count, percent match, percent indel and repeat unit sequence -- confirm
    # against the simpleRepeat table schema.
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
       ../simpleRepeat/simpleRepeat.bed > microsat.bed

    hgLoadBed danRer11 microsat microsat.bed
    # Read 127131 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2017-06-07 - Chris)

    mkdir /hive/data/genomes/danRer11/bed/windowMasker
    cd /hive/data/genomes/danRer11/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev danRer11) > do.log 2>&1
    # real    53m45.183s

    # Masking statistics
    cat faSize.danRer11.cleanWMSdust.txt
# 1679203469 bases (4693618 N's 1674509851 real 844135283 upper 830374568 lower) in 1923 sequences in 1 files
# Total size: mean 873220.7 sd 6186024.0 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 146921
# N count: mean 2440.8 sd 64874.0
# U count: mean 438967.9 sd 3081301.2
# L count: mean 431812.0 sd 3080639.1
# %49.45 masked total, %49.59 masked real

    cat fb.danRer11.rmsk.windowmaskerSdust.txt
    # 638473395 bases of 1679203469 (38.022%) in intersection

##########################################################################
# run up idKeys files for ncbiRefSeq (DONE - 2017-03-28 - Chris)
    mkdir /hive/data/genomes/danRer11/bed/idKeys
    cd /hive/data/genomes/danRer11/bed/idKeys

    time (doIdKeys.pl -buildDir=`pwd`  danRer11) > do.log 2>&1 &
    # real    12m20.939s

    cat danRer11.keySignature.txt
    #   8ae219619932349bebc17364408bf9d0

	mkdir genbank
	time (doIdKeys.pl -buildDir=`pwd` -twoBit=`pwd`/genbank.2bit genbank) > do.log 2>&1 &

	cd ../
	join danRer11.idKeys.txt genbank/genbank.idKeys.txt | awk '{print $2"\t"$3}' > idMap.txt 


##########################################################################
# cpgIslands - (DONE - 2016-10-20 - Hiram)
    mkdir /hive/data/genomes/danRer11/bed/cpgIslands
    cd /hive/data/genomes/danRer11/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku danRer11) > do.log 2>&1
    # real    5m24.930s

    cat  fb.danRer11.cpgIslandExt.txt
    # 5261052 bases of 1674677181 (0.314%) in intersection

##############################################################################
# genscan - (DONE - 2017-06-07 - Chris)
    mkdir /hive/data/genomes/danRer11/bed/genscan
    cd /hive/data/genomes/danRer11/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku danRer11) > do.log 2>&1
    # real	66m57.728s

    cat fb.danRer11.genscan.txt
    # 57461966 bases of 1674677181 (3.431%) in intersection

    cat fb.danRer11.genscanSubopt.txt
    # 45875776 bases of 1674677181 (2.739%) in intersection

#############################################################################
# lastz/chain/net swap human/hg38 (DONE - 2017-03-30 - Chris)
    # original alignment
	mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
	cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
	# 41036733 bases of 3049335806 (1.346%) in intersection

	mkdir /hive/data/genomes/danRer11/bed/blastz.hg38.swap
    cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -noDbNameCheck -syntenicNet) > swap.log 2>&1
	#  real	39m24.916s

    cat fb.danRer11.chainHg38Link.txt
    # 47869194 bases of 1674677181 (2.858%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \
       > rbest.log 2>&1 &
    # real	638m45.337s

#############################################################################
# lastz/chain/net swap mouse/mm10 (DONE - 2017-03-30 - Chris)

    cd /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12
    cat fb.mm10.chainDanRer11Link.txt
    # 36448414 bases of 2652783500 (1.374%) in intersection

    #	and for the swap
    mkdir /hive/data/genomes/danRer11/bed/blastz.mm10.swap
    cd /hive/data/genomes/danRer11/bed/blastz.mm10.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12/DEF \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet -swap -chainMinScore=5000 -chainLinearGap=loose) \
	> swap.log 2>&1 &
    #	real    53m19.485s

    cat  fb.danRer11.chainMm10Link.txt
    #	90150612 bases of 1369865365 (6.581%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 mm10) \
         > rbest.log 2>&1 &
    #	real    597m52.740s

#########################################################################
# Create kluster run files (DONE - 2017-06-08 - Chris)

    # numerator is danRer11 gapless bases "real" as reported by:
    featureBits -noRandom -noHap danRer11 gap
    # 4323788 bases of 1340794641 (0.322%) in intersection
    #                   ^^^

    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 1340794641 / 2861349177 \) \* 1024
    #  ( 1340794641 / 2861349177 ) * 1024 = 479.834381

    # ==> use -repMatch=500 according to size scaled down from 1024 for human,
    #   rounded to the nearest 50 (479.8 -> 500)
    cd /hive/data/genomes/danRer11
    blat danRer11.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/danRer11.11.ooc \
        -repMatch=500
	# Wrote 57994 overused 11-mers to jkStuff/danRer11.11.ooc

    #   check non-bridged gaps to see what the typical size is:
    hgsql -N \
        -e 'select * from gap where bridge="no" order by size;' danRer11 \
        | sort -k7,7nr | ave -col=7 stdin
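    # minimum gap size is 10 and produces a reasonable number of lifts
    # NOTE(review): the command below uses -minGap=100, not 10 -- confirm
    #   which value was actually intended/used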
    gapToLift -verbose=2 -minGap=100 danRer11 jkStuff/nonBridged.lft \
        -bedFile=jkStuff/nonBridged.bed

########################################################################
# GENBANK AUTO UPDATE (DONE - 2017-06-14 - Chris)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
	# Danio rerio	30816	1488354	15648

    # edit etc/genbank.conf to add danRer11 just before rheMac2
# danRer11 (zebrafish)
danRer11.serverGenome = /hive/data/genomes/danRer11/danRer11.2bit
danRer11.clusterGenome = /hive/data/genomes/danRer11/danRer11.2bit
danRer11.ooc = /hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc
danRer11.lift = /hive/data/genomes/danRer11/jkStuff/nonBridged.lft
danRer11.perChromTables = no
danRer11.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
danRer11.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
danRer11.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
danRer11.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
danRer11.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
# DO NOT NEED genbank.mrna.xeno except for human, mouse
# defaults are fine: genbank.mrna.native refseq.mrna.native refseq.mrna.xeno yes
# and genbank.est.native
danRer11.genbank.mrna.xeno.load  = yes
danRer11.refseq.mrna.xeno.load  = yes
danRer11.downloadDir = danRer11
# danRer11.upstreamGeneTbl = refGene
# danRer11.upstreamMaf = multiz7way
# /hive/data/genomes/danRer10/bed/multiz7way/species.lst

    git commit -m "Added danRer11; refs #19459" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial danRer11
	# logFile: var/build/logs/2017.06.07-13:49:34.danRer11.initalign.log
	# real    2983m0.012s
	tail var/build/logs/2017.06.07-13:49:34.danRer11.initalign.log
    # hgwdev 2017.06.09-15:24:14 danRer11.initalign: Succeeded: danRer11
	# hgwdev 2017.06.09-15:32:33 danRer11.initalign: finish

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.danRer11

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad danRer11
	
    tail -1 var/build/logs/2017.06.07-13:49:34.danRer11.initalign.log
	# hgwdev 2017.06.09-15:32:33 danRer11.initalign: finish

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add danRer11 to:
    #   etc/align.dbs etc/hgwdev.dbs
    git add etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added danRer11 - zebrafish refs #19459" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#############################################################################
# augustus gene track (DONE - 2017-06-09 - Chris)

    mkdir /hive/data/genomes/danRer11/bed/augustus
    cd /hive/data/genomes/danRer11/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
        -species=zebrafish -dbHost=hgwdev \
           -workhorse=hgwdev danRer11) > do.log 2>&1

    cat fb.danRer11.augustusGene.txt
	# 53437277 bases of 1674677181 (3.191%) in intersection

#########################################################################
# add chromAlias table (DONE - 2017-10-20 - Hiram)

    mkdir /hive/data/genomes/danRer11/bed/chromAlias
    cd /hive/data/genomes/danRer11/bed/chromAlias

    hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' danRer11 \
        > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' danRer11 \
        > ucsc.genbank.tab

    awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \
        | sort > danRer11.chromAlias.tab

    # verify chrM is OK:

    grep chrM danRer11.chromAlias.tab
# AC024175.3      chrM    genbank
# NC_002333.2     chrM    refseq

    hgLoadSqlTab danRer11 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        danRer11.chromAlias.tab

#########################################################################
# LIFTOVER TO danRer10 (DONE - 2017-10-25 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer11/bed/blat.danRer10.2017-10-25
    cd /hive/data/genomes/danRer11/bed/blat.danRer10.2017-10-25
    # Build the danRer11 -> danRer10 liftOver chain.  The two positional
    # arguments are the "from" db followed by the "to" db, in the same order
    # as the other doSameSpeciesLiftOver.pl runs in this document; the ooc
    # file and the build directory both belong to danRer11.
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         danRer11 danRer10) > do.log 2>&1
    # real    230m19.256s

    # verify the convert link on the test browser is now active from danRer11 to
    # danRer10

#########################################################################
# LIFTOVER TO danRer10 (DONE - 2017-06-12 - Chris)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer11/bed/blat.danRer10.2017-06-09
    cd /hive/data/genomes/danRer11/bed/blat.danRer10.2017-06-09
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc \
         danRer11 danRer10) > doLiftOverToDanRer10.log 2>&1
    # real    86m43.038s

    # see if the liftOver menus function in the browser from danRer11 to danRer10

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2017-05-19 - Chris)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev
	# Haifang 
	# Done.

	# Starting trans gfServer for danRer11 on host blat1a, port 17874
	# Starting untrans gfServer for danRer11 on host blat1a, port 17875

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("danRer11", "blat1a", "17874", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("danRer11", "blat1a", "17875", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2017-06-09 - Chris)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=danRer11 -tableCoverage all.joiner
    joinerCheck -database=danRer11 -times all.joiner
    joinerCheck -database=danRer11 -keys all.joiner

    cd /hive/data/genomes/danRer11
    time (makeDownloads.pl danRer11) > downloads.log 2>&1
    #  real    27m1.207s


	# New Redmine pushQ System
    #   now ready for pushQ entry
    mkdir /hive/data/genomes/danRer11/pushQ
    cd /hive/data/genomes/danRer11/pushQ
    time (makePushQSql.pl -redmineList danRer11) > danRer11.pushQ.sql 2> stderr.out
    #real	3m32.847s
	# Put a path to the files produced here in the redmine ticket 

#########################################################################
# UCSC to RefSeq name correspondence (DONE - 2017-10-20 - Hiram)

    mkdir /hive/data/genomes/danRer11/bed/ucscToRefSeq
    cd /hive/data/genomes/danRer11/bed/ucscToRefSeq

    # after constructing idKeys in ./refseq/ directory:

    join -t$'\t' ../idKeys/*.idKeys.txt  refseq/*.idKeys.txt \
	| cut -f2- | sort > ucsc.refseq.tab

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab

    join ucsc.refseq.tab name.coordinate.tab \
       | awk '{printf "%s\t%d\t%d\t%s\n", $1,$3,$4,$2}' \
          | sort -k1,1 -k2,2n > ucscToRefSeq.bed

    # NOTE(review): this pipeline writes to the same ucscToRefSeq.bed that the
    # preceding join/awk/sort pipeline just produced, so it overwrites that
    # result.  It also reads refseq.insdc.txt and ucscToRefSeq.txt, which no
    # visible step in this section creates -- confirm whether this is a
    # leftover from an earlier procedure and should not be run.
    join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \
       | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \
           > ucscToRefSeq.bed

    # when working perfectly, all these tab files have the same line count:
    wc -l *.tab *.bed
#    1923 name.coordinate.tab
#    1923 ucsc.refseq.tab
#    1923 ucscToRefSeq.bed

    export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    echo $chrSize
    #  20
    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
    hgLoadSqlTab danRer11 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed

    checkTableCoords  danRer11 -table=ucscToRefSeq
    # should cover %100 all bases:
    featureBits -countGaps danRer11 ucscToRefSeq
    # 1679203469 bases of 1679203469 (100.000%) in intersection

#########################################################################
# LIFTOVER TO danRer7 (DONE - 2022-08-04 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/danRer11/bed/blat.danRer7.2022-08-04
    cd /hive/data/genomes/danRer11/bed/blat.danRer7.2022-08-04
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc \
	-target2Bit=/hive/data/genomes/danRer11/danRer11.2bit \
	-targetSizes=/hive/data/genomes/danRer11/chrom.sizes \
	-query2Bit=/hive/data/genomes/danRer7/danRer7.2bit \
	-querySizes=/hive/data/genomes/danRer7/chrom.sizes \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         danRer11 danRer7) > do.log 2>&1
    # real    230m19.256s

    # verify the convert link on the test browser is now active from danRer11 to
    # danRer7

##############################################################################
# crispr whole genome (DONE - 2023-11-04 - Hiram)
    # redmine issue 21863: https://redmine.soe.ucsc.edu/issues/21863

    mkdir /hive/data/genomes/danRer11/bed/crisprAll
    cd /hive/data/genomes/danRer11/bed/crisprAll

    # make sure it can get started
    time (~/kent/src/hg/utils/automation/doCrispr.pl \
      -stop=guides -buildDir=`pwd` -smallClusterHub=hgwdev danRer11) \
           > guides.log 2>&1
    # real    37m57.115s

    sed -e 's/^/# /;' guides/run.time
# Completed: 99 of 99 jobs
# CPU time in finished jobs:       6868s     114.47m     1.91h    0.08d  0.000 y
# IO & Wait Time:                   234s       3.89m     0.06h    0.00d  0.000 y
# Average job time:                  72s       1.20m     0.02h    0.00d
# Longest finished job:             101s       1.68m     0.03h    0.00d
# Submission to last job:           102s       1.70m     0.03h    0.00d

    # looks good, let it run through the load:
    # Resume at the specScoreJobList step and stop after the load step.
    # The command is grouped in a subshell (as in the -stop=guides run) so
    # that its stdout and stderr both go to load.log; the opening parenthesis
    # is required to balance the closing one after danRer11.
    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScoreJobList \
        -stop=load -buildDir=`pwd` -smallClusterHub=hgwdev danRer11) \
           > load.log 2>&1
    # real    2471m6.435s

    sed -e 's/^/# /;' specScores/run.time
# Completed: 721425 of 721425 jobs
# CPU time in finished jobs:   44390040s  739834.00m 12330.57h  513.77d  1.408 y
# IO & Wait Time:                872244s   14537.40m   242.29h   10.10d  0.028 y
# Average job time:                  63s       1.05m     0.02h    0.00d
# Longest finished job:             148s       2.47m     0.04h    0.00d
# Submission to last job:        112835s    1880.58m    31.34h    1.31d

    sed -e 's/^/# /;'  effScores/run.time
# Completed: 11644 of 11644 jobs
# CPU time in finished jobs:    5938619s   98976.98m  1649.62h   68.73d  0.188 y
# IO & Wait Time:                 73259s    1220.99m    20.35h    0.85d  0.002 y
# Average job time:                 516s       8.61m     0.14h    0.01d
# Longest finished job:            2250s      37.50m     0.62h    0.03d
# Submission to last job:          9850s     164.17m     2.74h    0.11d

    sed -e 's/^/# /;'  offTargets/run.time
# Completed: 36072 of 36072 jobs
# CPU time in finished jobs:     492565s    8209.42m   136.82h    5.70d  0.016 y
# IO & Wait Time:                126854s    2114.23m    35.24h    1.47d  0.004 y
# Average job time:                  17s       0.29m     0.00h    0.00d
# Longest finished job:              25s       0.42m     0.01h    0.00d
# Submission to last job:          1763s      29.38m     0.49h    0.02d

    # that made the table crispr10K and symlinks in /gbdb/danRer11/crispr10K/
    # when it should have been instead crisprAll, reset the links and reload
    # the correct table:
mkdir -p /gbdb/danRer11/crisprAll/
rm -f /gbdb/danRer11/crisprAll/crispr.bb
rm -f /gbdb/danRer11/crisprAll/crisprDetails.tab
ln -sf `pwd`/crispr.bb /gbdb/danRer11/crisprAll/crispr.bb
ln -sf `pwd`/crisprDetails.tab /gbdb/danRer11/crisprAll/crisprDetails.tab
hgBbiDbLink danRer11 crisprAllTargets /gbdb/danRer11/crisprAll/crispr.bb

    hgsql -e 'drop table crispr10K;' danRer11

    grep -c . effScores.tab
    # 116454428
    grep -c . specScores.tab
    # 53058151

    time (cut -f1-3 crispr.bed | bedSingleCover.pl stdin \
       | awk '{sum+=$3-$2}END{printf "%d bases\n", sum}') \
            > coverage.crispr.bed.txt 2>&1
    1145886525 bases
    # real    5m14.538s

    ave -col=2 ../../*.sizes | grep total
    total 1679203469.000000

    # 'featureBits' result:
    echo "scale+=3; 100.0 * 1145886525 / 1679203469" | bc
    68.239

##############################################################################
