# for emacs: -*- mode: sh; -*-

#       DATE:   31-May-2012
#       ORGANISM:       Alligator mississippiensis
#       TAXID:  8496
#       ASSEMBLY LONG NAME:     allMis0.2
#       ASSEMBLY SHORT NAME:    allMis0.2
#       ASSEMBLY SUBMITTER:     International Crocodilian Genomes Working Group
#       ASSEMBLY TYPE:  Haploid
#       NUMBER OF ASSEMBLY-UNITS:       1
#       ASSEMBLY ACCESSION:     GCA_000281125.1

#       FTP-RELEASE DATE: 07-Aug-2012

#       http://www.ncbi.nlm.nih.gov/genome/13409
#       http://www.ncbi.nlm.nih.gov/assembly/406428/
#       http://www.ncbi.nlm.nih.gov/bioproject/159843

#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AKHW01
#       Genome Coverage : 68x

#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=8496

# rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Alligator_mississippiensis/allMis0.2/

##########################################################################
# Download sequence (DONE - 2012-08-23 - Hiram)
    mkdir /hive/data/genomes/allMis1
    cd /hive/data/genomes/allMis1
    mkdir genbank
    cd genbank
rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Alligator_mississippiensis/allMis0.2/ ./

    # verify the size of the sequence here:
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz

# 2174243242 bases (45539765 N's 2128703477 real 2128703477 upper 0 lower)
#       in 14644 sequences in 1 files
# Total size: mean 148473.3 sd 283025.3 min 816 (gi|397224103|gb|AKHW01085896.1|)
#       max 4958242 (gi|397457027|gb|JH738473.1|) median 25764
# %0.00 masked total, %0.00 masked real

    # strip the names down to something reasonable, they can all be:
    # >JH7[0-9]
    zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
        | sed -e "s/^>.*JH7/>JH7/; s/^>.*AKHW01/>AKHW01/; s/.1. Alligator.*//" \
        | gzip -c > ucsc.fa.gz
    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
        | sed -e "s/^\(JH7[0-9]*\).1/\1/; s/^\(AKHW01[0-9]*\).1/\1/;" \
        | gzip -c > ucsc.agp.gz
    time checkAgpAndFa  ucsc.agp.gz ucsc.fa.gz 2>&1 | tail -2
# Valid Fasta file entry
# All AGP and FASTA entries agree - both files are valid
    #   real    0m59.548s

    mkdir /hive/data/genomes/allMis1/photograph
    cd /hive/data/genomes/allMis1/photograph
    wget --timestamping \
        http://www.nasa.gov/centers/kennedy/images/content/91159main_93pc780.jpg

    convert -geometry "220x300" 91159main_93pc780.jpg \
        Alligator_Mississippiensis.jpg
    # check this .jpg file into the source tree kent/src/hg/htdocs/images/
    git commit -m "gator photo from NASA Kennedy" Alligator_Mississippiensis.jpg
    # and copy to /usr/local/apache/htdocs/images
    cp -p Alligator_Mississippiensis.jpg /usr/local/apache/htdocs/images

##########################################################################
# Initial makeGenomeDb.pl (DONE - 2012-08-23 - Hiram)
    # obtain a template for this from the source tree:
    # kent/src/hg/utils/automation/configFiles/
    # and check it back into the source tree when completed here:
    cd /hive/data/genomes/geoFor1
    cat << '_EOF_' > geoFor1.config.ra
# Config parameters for makeGenomeDb.pl:
db allMis1
clade vertebrate
genomeCladePriority 68
scientificName Alligator Mississippiensis
commonName American alligator
assemblyDate Aug. 2012
assemblyLabel International Crocodilian Genomes Working Group
assemblyShortLabel allMis0.2
orderKey 4380
mitoAcc NC_001922
fastaFiles /hive/data/genomes/allMis1/genbank/ucsc.fa.gz
agpFiles /hive/data/genomes/allMis1/genbank/ucsc.agp.gz
# qualFiles none
dbDbSpeciesDir gator
photoCreditURL http://green.soe.ucsc.edu/
photoCreditName Photo courtesy of the Green Lab, U.C. Santa Cruz
ncbiGenomeId 13409
ncbiAssemblyId 406428
ncbiAssemblyName allMis0.2
ncbiBioProject 159843
genBankAccessionID GCA_000281125.1
taxId 8496
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp allMis1.config.ra > agp.log 2>&1
    #   real    2m32.247s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    # finish it off
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev allMis1.config.ra > db.log 2>&1
    #   real    16m30.326s
    # was missing the photograph, it needs to exist for this to complete:
    time makeGenomeDb.pl -continue=trackDb -workhorse=hgwdev \
        -fileServer=hgwdev -dbHost=hgwdev allMis1.config.ra > trackDb.log 2>&1

    #   real    9m24.133s
    #	add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/allMis1.unmasked.2bit /gbdb/allMis1/allMis1.2bit
    #	browser should function now

    hgsql -e 'update defaultDb set name="allMis1" where name="allMis9";' \
        hgcentraltest

##########################################################################
# running repeat masker (DONE - 2013-06-07 - Hiram)
    # with new repeat masker version that includes the Alligator repeats
    mkdir /hive/data/genomes/allMis1/bed/rmsk.2013-06-06
    cd /hive/data/genomes/allMis1/bed/rmsk.2013-06-06

    # the clusterRun portion:
    # real    1251m59.513s
    # user    0m34.986s
    # sys     0m19.211s
# Completed: 6152 of 6152 jobs
# CPU time in finished jobs:   16757781s  279296.34m  4654.94h  193.96d  0.531 y
# IO & Wait Time:               1074282s   17904.71m   298.41h   12.43d  0.034 y
# Average job time:                2899s      48.31m     0.81h    0.03d
# Longest finished job:            4741s      79.02m     1.32h    0.05d
# Submission to last job:         74762s    1246.03m    20.77h    0.87d

    time doRepeatMasker.pl -buildDir=`pwd` \
        -continue=cat -dbHost=hgwdev \
        -bigClusterHub=swarm -workhorse=hgwdev allMis1 > cat.log 2>&1
    # real    42m20.999s

    cat faSize.rmsk.txt
# 2174259888 bases (45539765 N's 2128720123 real 1337574861 upper
#    791145262 lower) in 14645 sequences in 1 files
# Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896)
#    max 4958242 (JH738473) median 25754
# %36.39 masked total, %37.17 masked real

    egrep -i "versi|relea" do.log
# RepeatMasker version open-4.0.2
#    April 29 2013 (open-4-0-2) version of RepeatMasker
# CC   RELEASE 20130422;

    featureBits -countGaps allMis1 rmsk
    # 793968795 bases of 2174259888 (36.517%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-08-28 - Hiram)
    mkdir /hive/data/genomes/allMis1/bed/simpleRepeat
    cd /hive/data/genomes/allMis1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	allMis1 > do.log 2>&1 &
    #   real    9m46.133s

    cat fb.simpleRepeat
    #   22565360 bases of 2129659933 (1.060%) in intersection

    # considering rmsk %37 vs. WM %29, using RM and TRF
    cd /hive/data/genomes/allMis1
    twoBitMask allMis1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed allMis1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa allMis1.2bit stdout | faSize stdin > faSize.allMis1.2bit.txt
    cat faSize.allMis1.2bit.txt

    # 2174259888 bases (45539765 N's 2128720123 real 1337312569 upper
    #    791407554 lower) in 14645 sequences in 1 files
    # Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896)
    #    max 4958242 (JH738473) median 25754
    # %36.40 masked total, %37.18 masked real

    rm /gbdb/allMis1/allMis1.2bit
    ln -s `pwd`/allMis1.2bit /gbdb/allMis1/allMis1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/allMis1/bed/gap
    cd /hive/data/genomes/allMis1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../allMis1.unmasked.2bit > findMotif.txt 2>&1
    #   real    0m29.584s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits allMis1 -not gap -bed=notGap.bed
    #   2129659933 bases of 2129659933 (100.000%) in intersection
    #   real    0m16.373s

    # can see now if allGaps.bed actually is all the gaps:
    hgsql -N -e "select size from gap;" allMis1 | ave stdin | grep total
# total 44599955.000000
    ave -col=5 allGaps.bed | grep total
# total 45539765.000000
    # not the same count, not all gaps are marked

    time featureBits allMis1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   939810 bases of 2129659933 (0.044%) in intersection
    #   real    44m59.537s

    # check if any non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" allMis1 | sort | uniq -c
    #  99514 yes

#########################################################################
# cytoBandIdeo - (DONE - 2013-06-18 - Hiram)
    mkdir /hive/data/genomes/allMis1/bed/cytoBand
    cd /hive/data/genomes/allMis1/bed/cytoBand
    makeCytoBandIdeo.csh allMis1

##########################################################################
## WINDOWMASKER (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/allMis1/bed/windowMasker
    cd /hive/data/genomes/allMis1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev allMis1 > do.log 2>&1 &
    #   real    161m54.809s

    # Masking statistics
    cat faSize.allMis1.wmsk.txt
    #   2174259888 bases (45539765 N's 2128720123 real 1521018315 upper
    #   607701808 lower) in 14645 sequences in 1 files
    #   Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896)
    #   max 4958242 (JH738473) median 25754
    #   %27.95 masked total, %28.55 masked real

    cat faSize.allMis1.wmsk.sdust.txt
    #   2174259888 bases (45539765 N's 2128720123 real 1508788037
    #   upper 619932086 lower) in 14645 sequences in 1 files
    #   Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896)
    #   max 4958242 (JH738473) median 25754
    #   %28.51 masked total, %29.12 masked real

    cat faSize.allMis1.cleanWMSdust.txt
    #   2174259888 bases (45539765 N's 2128720123 real 1508788037
    #   upper 619932086 lower) in 14645 sequences in 1 files
    #   Total size: mean 148464.3 sd 283017.8 min 816 (AKHW01085896)
    #   max 4958242 (JH738473) median 25754
    #   %28.51 masked total, %29.12 masked real

    cat fb.allMis1.windowmaskerSdust.clean.txt
    #   619953052 bases of 2174259888 (28.513%) in intersection

    # how much does this window masker and repeat masker overlap:
    # can be done after rmsk is done
    time featureBits -countGaps allMis1 rmsk windowmaskerSdust \
        > fb.allMis1.rmsk.windowmaskerSdust.txt 2>&1

    cat fb.allMis1.rmsk.windowmaskerSdust.txt
    #   460348480 bases of 2174259888 (21.173%) in intersection

##########################################################################
# add simpleRepeats to WindowMasker result (DONE - 2012-07-26 - Hiram)
    # add to rmsk after it is done:
    cd /hive/data/genomes/allMis1
    twoBitMask -add bed/windowMasker/allMis1.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed allMis1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa allMis1.2bit stdout | faSize stdin > faSize.allMis1.2bit.txt
    cat faSize.allMis1.2bit.txt
    #   1065292181 bases (24006152 N's 1041286029 real 833083628 upper
    #           208202401 lower) in 27239 sequences in 1 files
    #   Total size: mean 39109.1 sd 545732.4 min 131 (JH767125)
    #           max 30495534 (JH739887) median 598
    #   %19.54 masked total, %19.99 masked real

    rm /gbdb/allMis1/allMis1.2bit
    ln -s `pwd`/allMis1.2bit /gbdb/allMis1/allMis1.2bit

##########################################################################
# cpgIslands - (DONE - 2013-06-07 - Hiram)
    mkdir /hive/data/genomes/allMis1/bed/cpgIslands
    cd /hive/data/genomes/allMis1/bed/cpgIslands
    time doCpgIslands.pl allMis1 > do.log 2>&1 &
    # real    72m55.069s

    cat fb.allMis1.cpgIslandExt.txt
    #   20404153 bases of 2129659933 (0.958%) in intersection

#########################################################################
# genscan - (DONE - 2013-06-07 - Hiram)
    mkdir /hive/data/genomes/allMis1/bed/genscan
    cd /hive/data/genomes/allMis1/bed/genscan
    time doGenscan.pl allMis1 > do.log 2>&1 &
    # real    89m34.019s

    cat fb.allMis1.genscan.txt
    #   48525950 bases of 2129659933 (2.279%) in intersection
    cat fb.allMis1.genscanSubopt.txt
    #   53173655 bases of 2129659933 (2.497%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-08 - Hiram)
    # Use -repMatch=800, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2128720123 / 2897316137 \) \* 1024
    #   ( 2128720123 / 2897316137 ) * 1024 = 752.354698

    # round up to 800

    cd /hive/data/genomes/allMis1
    time blat allMis1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/allMis1.11.ooc -repMatch=800
    #   Wrote 12702 overused 11-mers to jkStuff/allMis1.11.ooc
    #   real    0m43.678s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" allMis1 | sort | uniq -c
    #   68589 yes

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2013-06-12,18 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Alligator mississippiensis      107     5428    0
# Alligator sinensis      16      0       0

    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add allMis1 just after ce2
# allMis1 - Alligator mississippiensis
allMis1.serverGenome = /hive/data/genomes/allMis1/allMis1.2bit
allMis1.clusterGenome = /hive/data/genomes/allMis1/allMis1.2bit
allMis1.ooc = /hive/data/genomes/allMis1/allMis1.11.ooc
allMis1.lift = no
allMis1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
allMis1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
allMis1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
allMis1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
allMis1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
allMis1.refseq.mrna.native.load = yes
allMis1.refseq.mrna.xeno.load = yes
allMis1.genbank.mrna.xeno.load = yes
allMis1.downloadDir = allMis1
allMis1.upstreamGeneTbl = refGene
allMis1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "updating allMis0 to allMis1 refs #8750" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S allMis1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial allMis1 &
    #   var/build/logs/2013.06.12-15:21:37.allMis1.initalign.log
    # real    1444m27.582s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad allMis1 &
    #   var/dbload/hgwdev/logs/2013.06.18-13:52:33.dbload.log
    #   real    108m47.912s

    # check the end of that dbload.log to see if it was successful
    #   hgwdev 2013.06.18-15:41:20 dbload: finish

    # enable daily alignment and update of hgwdev (DONE - 2012-05-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add allMis1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added allMis1 refs #8750" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to FOXP2 gene displays  (DONE - 2013-06-28 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH731472:504271-884586" where name="allMis1";' \
	hgcentraltest

############################################################################
# downloads and pushQ entry (DONE - 2013-06-28 - Hiram)
    # after adding allMis1 to the all.joiner file and verifying that
    #   joinerCheck is clean, can construct the downloads:
    cd /hive/data/genomes/allMis1
    time makeDownloads.pl -workhorse=hgwdev allMis1 > downloads.log 2>&1 &
    #   real    21m2.394s

    mkdir /hive/data/genomes/allMis1/pushQ
    cd /hive/data/genomes/allMis1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl allMis1 2> stderr.txt > allMis1.sql
    #   real    3m44.708s

    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/allMis1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/allMis1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/allMis1/bbi/quality.bw
# WARNING: allMis1 does not have seq
# WARNING: allMis1 does not have extFile
# WARNING: allMis1 does not have estOrientInfo

    scp -p allMis1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/allMis1.sql"

##########################################################################
#  BLATSERVERS ENTRY (DONE - 2013-06-26 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("allMis1", "blat4a", "17848", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("allMis1", "blat4a", "17849", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# fixup search rule for assembly track/gold table (DONE - 2013-08-06 - Hiram)
    hgsql -N -e "select frag from gold;" allMis1 | sort | head -1
AKHW01000001.1
    hgsql -N -e "select frag from gold;" allMis1 | sort | tail -2
AKHW01114158.1
NC_001922

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" allMis1 | wc -l
    # 114159

    hgsql -N -e "select frag from gold;" allMis1 | egrep -e '[AN][KC][H_][W0]0[0-9]+(\.1)?' | wc -l
    # 114159

    hgsql -N -e "select frag from gold;" allMis1 | egrep -v -e '[AN][KC][H_][W0]0[0-9]+(\.1)?' | wc -l
    # 0

    # hence, add to trackDb/gator/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][KC][H_][W0]0[0-9]+(\.1)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

#########################################################################
# swap human/gator hg19/allMis1 lastz (DONE - 2013-06-30 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28
    cat fb.hg19.chainAllMis1Link.txt
    #	205244552 bases of 2897316137 (7.084%) in intersection

    #	and for this swap
    mkdir /hive/data/genomes/allMis1/bed/blastz.hg19.swap
    cd /hive/data/genomes/allMis1/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzAllMis1.2013-06-28/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #   forgot to get stderr in the swap.log
    #	real    27m54.364s

    cat  fb.allMis1.chainHg19Link.txt
    #	172919308 bases of 2129659933 (8.120%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/allMis1/bed
    ln -s blastz.hg19.swap lastz.hg19

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
