# for emacs: -*- mode: sh; -*-

#       DATE:   22-May-2012
#       ORGANISM:       Chinchilla lanigera
#       TAXID:  34839
#       ASSEMBLY LONG NAME:     ChiLan1.0
#       ASSEMBLY SHORT NAME:    ChiLan1.0
#       ASSEMBLY SUBMITTER:     Broad Institute
#       ASSEMBLY TYPE:  Haploid
#       NUMBER OF ASSEMBLY-UNITS:       1
#       ASSEMBLY ACCESSION:     GCA_000276665.1

#       FTP-RELEASE DATE: 09-Jul-2012

#       http://www.ncbi.nlm.nih.gov/genome/10466
#       http://www.ncbi.nlm.nih.gov/genome/assembly/397218/

#       http://www.ncbi.nlm.nih.gov/bioproject/68239

#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AGCD01
#       Genome Coverage : 87x Illumina

#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=34839

# rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Chinchilla_lanigera/ChiLan1.0/

##########################################################################
# Download sequence (DONE - 2012-07-27 - Hiram)
    # Fetch the GenBank release of the assembly into genbank/ under the
    # genome build directory.
    mkdir /hive/data/genomes/chiLan1
    cd /hive/data/genomes/chiLan1
    mkdir genbank
    cd genbank
    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Chinchilla_lanigera/ChiLan1.0/ ./

    # verify the size of the sequence here:
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
# 2390852391 bases (106575991 N's 2284276400 real 2284276400 upper 0 lower)
#       in 2838 sequences in 1 files
# Total size: mean 842442.7 sd 4383001.2 min 1000
#       (gi|393809579|gb|AGCD01081635.1|)
#       max 63520090 (gi|394364413|gb|JH721862.1|) median 2911

    # strip the names down to something reasonable, they can all be:
    # >JH7[0-9] or >AGCD01[0-9]
    # The ".1" version suffix is matched with an escaped dot (and the literal
    # "|" that ends the GenBank name, as seen in the faSize output above) so
    # that only the real suffix is removed, never an arbitrary character
    # followed by "1".
    zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
| sed -e "s/^>.*JH7/>JH7/; s/^>.*AGCD01/>AGCD01/; s/\.1| Chinchilla .*//" \
        | gzip -c > ucsc.fa.gz
    # same renaming for the object names in the first column of the AGP file
    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
        | sed -e "s/^\(JH7[0-9]*\)\.1/\1/; s/^\(AGCD01[0-9]*\)\.1/\1/" \
        | gzip -c > ucsc.agp.gz
    # confirm the renamed AGP and FASTA files still describe the same sequences
    checkAgpAndFa  ucsc.agp.gz ucsc.fa.gz 2>&1 | tail -2
# Valid Fasta file entry
# All AGP and FASTA entries agree - both files are valid

    # Obtain the species photograph, scale it for the browser gateway page,
    # and install it.
    mkdir /hive/data/genomes/chiLan1/photograph
    cd /hive/data/genomes/chiLan1/photograph
# from: http://commons.wikimedia.org/wiki/File:Chinchilla_lanigera_%28Wroclaw_zoo%29-2.JPG
# http://commons.wikimedia.org/wiki/User:Gu%C3%A9rin_Nicolas

    wget -O Chinchilla_lanigera_Guerin_Nicolas.jpg \
http://upload.wikimedia.org/wikipedia/commons/1/18/Chinchilla_lanigera_%28Wroclaw_zoo%29-2.JPG

    # scale the original to fit within 350x350
    convert -geometry "350x350" Chinchilla_lanigera_Guerin_Nicolas.jpg \
        Chinchilla_lanigera.jpg
    # check this .jpg file into the source tree kent/src/hg/htdocs/images/
    git commit -m "from Wikimedia Commons - Guerin Nicolas" Chinchilla_lanigera.jpg
    # and copy to /usr/local/apache/htdocs/images
    # (the file to copy is the Chinchilla_lanigera.jpg produced by convert
    # above, not an image left over from another species' notes)
    cp -p Chinchilla_lanigera.jpg /usr/local/apache/htdocs/images

##########################################################################
# Initial makeGenomeDb.pl (DONE - 2012-07-27 - Hiram)
    # obtain a template for this from the source tree:
    # kent/src/hg/utils/automation/configFiles/
    # and check it back into the source tree when completed here:
    # Write the makeGenomeDb.pl configuration, then run the build in two
    # stages: first through the 'agp' step, then continuing from 'db'.
    # photoCreditName is display text, so the photographer's name is written
    # in plain ASCII (as in the image file name and commit message used for
    # the photograph) rather than with URL percent-encoding.
    cd /hive/data/genomes/chiLan1
    cat << '_EOF_' > chiLan1.config.ra
# Config parameters for makeGenomeDb.pl:
db chiLan1
clade mammal
genomeCladePriority 35
scientificName Chinchilla lanigera
commonName Chinchilla
assemblyDate May 2012
assemblyLabel Broad Institute
assemblyShortLabel ChiLan1.0
orderKey 1700
mitoAcc none
fastaFiles /hive/data/genomes/chiLan1/genbank/ucsc.fa.gz
agpFiles /hive/data/genomes/chiLan1/genbank/ucsc.agp.gz
# qualFiles none
dbDbSpeciesDir chiLan
photoCreditURL http://commons.wikimedia.org/wiki/File:Chinchilla_lanigera_%28Wroclaw_zoo%29-2.JPG
photoCreditName Photo courtesy of Guerin Nicolas, Wikimedia Commons
ncbiGenomeId 10466
ncbiAssemblyId 397218
ncbiAssemblyName ChiLan1.0
ncbiBioProject 68239
genBankAccessionID GCA_000276665.1
taxId 34839
'_EOF_'
    # << happy emacs

    # first stage: stop after the 'agp' step so the result can be checked
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp chiLan1.config.ra > agp.log 2>&1
    #   real    2m10.088s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    # finish it off
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev chiLan1.config.ra > db.log 2>&1
    #   real    9m24.133s
    #   add the trackDb entries to the source tree, and the 2bit link
    #   (the link points at the unmasked sequence for now):
    ln -s `pwd`/chiLan1.unmasked.2bit /gbdb/chiLan1/chiLan1.2bit
    #   browser should function now

##########################################################################
# running repeat masker (DONE - 2012-07-27 - Hiram)
    # Run doRepeatMasker.pl in its own build directory; the job is started in
    # the background with stdout and stderr captured in do.log.
    mkdir /hive/data/genomes/chiLan1/bed/repeatMasker
    cd /hive/data/genomes/chiLan1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek chiLan1 > do.log 2>&1 &
    #   real    650m7.044s

    # masking statistics found in the build directory after the run:
    cat faSize.rmsk.txt
    #   2390852391 bases (106575991 N's 2284276400 real 1581183547 upper
    #   703092853 lower) in 2838 sequences in 1 files
    #   Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #   max 63520090 (JH721862) median 2911
    #   %29.41 masked total, %30.78 masked real

    # pull the version and release lines out of the log for the record:
    egrep -i "versi|relea" do.log
#    April 26 2011 (open-3-3-0) version of RepeatMasker
# CC   RELEASE 20110920;
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    # coverage of the rmsk table, measured with -countGaps (the reported
    # denominator is the full 2390852391 bases):
    featureBits -countGaps chiLan1 rmsk
    #   703728919 bases of 2390852391 (29.434%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-07-27 - Hiram)
    # Run doSimpleRepeat.pl in its own build directory, in the background,
    # with all output captured in do.log.
    mkdir /hive/data/genomes/chiLan1/bed/simpleRepeat
    cd /hive/data/genomes/chiLan1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	chiLan1 > do.log 2>&1 &
    #   real    11m57.551s

    # coverage summary found in the build directory after the run:
    cat fb.simpleRepeat
    #   27063329 bases of 2284276400 (1.185%) in intersection

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-07-27 - Hiram)
    # Cross-check the gap table against the N runs in the unmasked sequence:
    # the "#GAP " lines in the findMotif output become allGaps.bed, which is
    # then compared with the gap table.
    mkdir /hive/data/genomes/chiLan1/bed/gap
    cd /hive/data/genomes/chiLan1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../chiLan1.unmasked.2bit > findMotif.txt 2>&1
    #   real    0m32.055s
    # keep only the gap records, dropping the "#GAP " prefix
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    # everything that is not in the gap table, written to notGap.bed
    time featureBits chiLan1 -not gap -bed=notGap.bed
    #   2284276400 bases of 2284276400 (100.000%) in intersection
    #   real    0m13.196s

    # can see now if allGaps.bed actually is all the gaps:
    # (total of the gap table sizes vs. total of column 5 in allGaps.bed)
    hgsql -N -e "select size from gap;" chiLan1 | ave stdin | grep total
# total 106575991.000000
    ave -col=5 allGaps.bed | grep total
# total 106575991.000000
    # equal counts indicates the rest is unnecessary, they are all marked

    # intersect allGaps.bed with the non-gap regions; any overlap would be
    # N's that are not yet in the gap table
    time featureBits chiLan1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   0 bases of 2284276400 (0.000%) in intersection
    #   real    1m20.106s
    # no new gaps, nothing to do here

    # check if any non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" chiLan1 | sort | uniq -c
    # 78817 yes
    # (only "yes" is reported, so every gap is bridged)

##########################################################################
## WINDOWMASKER (DONE - 2012-07-27 - Hiram)
    # Run doWindowMasker.pl in its own build directory, in the background,
    # with all output captured in do.log; then record the masking statistics.
    mkdir /hive/data/genomes/chiLan1/bed/windowMasker
    cd /hive/data/genomes/chiLan1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev chiLan1 > do.log 2>&1 &
    #   real    172m49.925s

    # Masking statistics
     cat faSize.chiLan1.wmsk.txt
    #   2390852391 bases (106575991 N's 2284276400 real 1622493987 upper
    #   661782413 lower) in 2838 sequences in 1 files
    #   Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #   max 63520090 (JH721862) median 2911
    #   %27.68 masked total, %28.97 masked real

    cat faSize.chiLan1.wmsk.sdust.txt
    #   2390852391 bases (106575991 N's 2284276400 real 1611206785 upper
    #   673069615 lower) in 2838 sequences in 1 files
    #   Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #   max 63520090 (JH721862) median 2911
    #   %28.15 masked total, %29.47 masked real

    # the "clean" statistics below are identical to the sdust ones above
    cat faSize.chiLan1.cleanWMSdust.txt
    #   2390852391 bases (106575991 N's 2284276400 real 1611206785 upper
    #   673069615 lower) in 2838 sequences in 1 files
    #   Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #   max 63520090 (JH721862) median 2911
    #   %28.15 masked total, %29.47 masked real

    cat fb.chiLan1.windowmaskerSdust.clean.txt
    #   673069615 bases of 2390852391 (28.152%) in intersection

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps chiLan1 rmsk windowmaskerSdust
    #   332330878 bases of 2390852391 (13.900%) in intersection

##########################################################################
# add simpleRepeats to RepeatMasker result (DONE - 2012-07-30 - Hiram)
    # add to rmsk after it is done:
    # Build the final masked chiLan1.2bit from the RepeatMasker 2bit plus the
    # simple repeat mask, then point the /gbdb link at it.
    cd /hive/data/genomes/chiLan1
    twoBitMask -add chiLan1.rmsk.2bit \
	bed/simpleRepeat/trfMask.bed chiLan1.2bit
    #   you can safely ignore the warning about fields >= 13

    # record the masking statistics of the combined result
    twoBitToFa chiLan1.2bit stdout | faSize stdin > faSize.chiLan1.2bit.txt
    cat faSize.chiLan1.2bit.txt
    #   2390852391 bases (106575991 N's 2284276400 real 1580253341 upper
    #   704023059 lower) in 2838 sequences in 1 files
    #   Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #   max 63520090 (JH721862) median 2911
    #   %29.45 masked total, %30.82 masked real

    # replace the existing /gbdb link so that it points at the masked 2bit
    rm /gbdb/chiLan1/chiLan1.2bit
    ln -s `pwd`/chiLan1.2bit /gbdb/chiLan1/chiLan1.2bit

##########################################################################
# cpgIslands - (DONE - 2012-07-27 - Hiram)
    # Run doCpgIslands.pl in its own build directory (foreground), with all
    # output captured in do.log.
    mkdir /hive/data/genomes/chiLan1/bed/cpgIslands
    cd /hive/data/genomes/chiLan1/bed/cpgIslands
    time doCpgIslands.pl chiLan1 > do.log 2>&1
    #   real    24m13.577s

    # coverage summary found in the build directory after the run:
    cat fb.chiLan1.cpgIslandExt.txt
    #   27037127 bases of 2284276400 (1.184%) in intersection

#########################################################################
# genscan - (DONE - 2012-07-27 - Hiram)
    # Run doGenscan.pl in its own build directory.  Two of the 2838 jobs
    # crashed; they were re-run by hand and the pipeline was then continued
    # from the makeBed step.
    mkdir /hive/data/genomes/chiLan1/bed/genscan
    cd /hive/data/genomes/chiLan1/bed/genscan
    time doGenscan.pl chiLan1 > do.log 2>&1
    #   real    34m54.484s
    # two failed jobs
# Completed: 2836 of 2838 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:      49352s     822.54m    13.71h    0.57d  0.002 y
# IO & Wait Time:                 12838s     213.96m     3.57h    0.15d  0.000 y
# Average job time:                  22s       0.37m     0.01h    0.00d
# Longest finished job:            1435s      23.92m     0.40h    0.02d
# Submission to last job:          1543s      25.72m     0.43h    0.02d

    # changing the window size to -window=2000000 and running:
    # (one manual run for each of the two sequences, JH721897 and JH721865)
    ./testGsBig.csh JH721897 000 gtf/000/JH721897.gtf pep/000/JH721897.pep \
        subopt/000/JH721897.bed
    #   real    8m58.351s
    ./testGsBig.csh JH721865 000 gtf/000/JH721865.gtf pep/000/JH721865.pep subopt/000/JH721865.bed
    #   real    13m56.143s

    # continuing
    time doGenscan.pl -continue=makeBed chiLan1 > bed.log 2>&1
    #   real    3m24.229s

    # coverage summaries found in the build directory after the run:
    cat fb.chiLan1.genscan.txt
    #   65590106 bases of 2284276400 (2.871%) in intersection
    cat fb.chiLan1.genscanSubopt.txt
    #   54325275 bases of 2284276400 (2.378%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-07-27 - Hiram)
    # Use -repMatch=850, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    #   2390852391 bases (106575991 N's 2284276400 real 1580253341 upper
    calc \( 2284276400 / 2897316137 \) \* 1024
    #   ( 2284276400 / 2897316137 ) * 1024 = 807.333036

    # round up to 850 (for comparison, oryCun2 used 920)

    # write the list of overused 11-mers for blat to jkStuff/chiLan1.11.ooc
    cd /hive/data/genomes/chiLan1
    time blat chiLan1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/chiLan1.11.ooc -repMatch=850
    #   Wrote 20262 overused 11-mers to jkStuff/chiLan1.11.ooc
    #   real    0m54.278s
    #   oryCun2 was: Wrote 32543 overused 11-mers to jkStuff/oryCun2.11.ooc

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" chiLan1 | sort | uniq -c
    #    78817 yes
    # The commands below are left commented out, consistent with the finding
    # above that there are no non-bridged gaps.
    # NOTE(review): the sample output names chrX, which does not fit this
    # assembly's JH7*/AGCD01* sequence names, so it is presumably left over
    # from another assembly's notes -- confirm before relying on it.
#    cd /hive/data/genomes/chiLan1/jkStuff
#    gapToLift chiLan1 chiLan1.nonBridged.lift -bedFile=chiLan1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' chiLan1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-07-27 - Hiram)
    # Set up GenBank alignments for chiLan1: add the assembly to the genbank
    # configuration, run the initial alignment and database load, then enable
    # the daily updates.
    # examine the file:
    # (the path on the next line is the file to look at, not a command to run)
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Chinchilla lanigera     45      1228    0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add chiLan1 just before oryLat2
    # (the lines from here to the "end of section" marker are configuration
    # text for etc/genbank.conf, not shell commands)
# chiLan1 (Chinchilla lanigera)
chiLan1.serverGenome = /hive/data/genomes/chiLan1/chiLan1.2bit
chiLan1.clusterGenome = /hive/data/genomes/chiLan1/chiLan1.2bit
chiLan1.lift = no
chiLan1.perChromTables = no
chiLan1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
chiLan1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
chiLan1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
chiLan1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
chiLan1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
chiLan1.ooc = /hive/data/genomes/chiLan1/jkStuff/chiLan1.11.ooc
chiLan1.refseq.mrna.native.load = yes
chiLan1.refseq.mrna.xeno.load = yes
chiLan1.genbank.mrna.xeno.load = no
chiLan1.genbank.est.native.load = yes
chiLan1.downloadDir = chiLan1

    # end of section added to etc/genbank.conf
    git commit -m "adding chiLan1 Chinchilla" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for chiLanNames" src/lib/gbGenome.c
    git push
    make install-server

    # initial alignment, run in the background inside a screen session
    ssh hgwdev			# used to do this on "genbank" machine
    screen -S chiLan1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial chiLan1 &
    #   var/build/logs/2012.07.30-09:39:12.chiLan1.initalign.log
    #   real    345m41.104s
    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad chiLan1 &
    #   real    20m50.723s
    #   var/dbload/hgwdev/logs/2012.07.30-21:45:49.dbload.log

    # enable daily alignment and update of hgwdev (DONE - 2012-05-09 - Hiram)
    # NOTE(review): 2012-05-09 is earlier than the assembly date (22-May-2012)
    # and than every other step recorded in this file, so the date is
    # presumably carried over from a template -- confirm the actual date.
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add chiLan1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added chiLan1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################

##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
