# for emacs: -*- mode: sh; -*-

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRO01

# Dipodomys ordii  Kangaroo rat
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    ssh kkstore05
    mkdir /cluster/store12/dipOrd1
    ln -s /cluster/store12/dipOrd1 /cluster/data
    mkdir /cluster/data/dipOrd1/broad
    cd /cluster/data/dipOrd1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/kangarooRat/Dipord1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/kangarooRat/Dipord1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/kangarooRat/Dipord1.0/assembly.quals.gz 
    md5sum ass* > assembly.md5sum

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin dipOrd1.qual.qac

   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 210053

#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-10-07)
    ssh kolossus
    cd /cluster/data/dipOrd1
cat << _EOF_ >dipOrd1.config.ra
# Config parameters for makeGenomeDb.pl:
db dipOrd1
clade mammal
genomeCladePriority 35
scientificName  Dipodomys ordii  
commonName Kangaroo rat
assemblyDate Jul. 2008
assemblyLabel Broad Institute
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/dipOrd1/broad/assembly.bases.gz
agpFiles /cluster/data/dipOrd1/broad/assembly.agp
qualFiles /cluster/data/dipOrd1/broad/dipOrd1.qual.qac
dbDbSpeciesDir kangarooRat
_EOF_

# use 'screen'; make sure you are on kkstore05
    makeGenomeDb.pl -workhorse kolossus dipOrd1.config.ra > makeGenomeDb.out 2>&1 &

    cut -f 2 chrom.sizes | ave stdin
# Q1 1417.000000
# median 2509.000000
# Q3 8139.000000
# average 10275.987955
# min 554.000000
# max 11421927.000000
# count 210053
# total 2158502098.000000
# standard deviation 40860.463605

#########################################################################
## Repeat Masker (DONE - 2008-10-14 - Hiram)
    screen	# to manage this job
    mkdir /hive/data/genomes/dipOrd1/bed/repeatMasker
    cd /hive/data/genomes/dipOrd1/bed/repeatMasker
    time doRepeatMasker.pl -workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` dipOrd1 > do.log 2>&1 &
    #	real    663m46.841s
    #	failed in doMask.csh due to no dipOrd1.unmasked.2bit file ?
    #	which was there at the beginning, now it is gone ?
    #	the broken doRepeatMasker.pl script removed it due to hive confusion
    #	went back to the jkStuff/makeUnmasked script and re-ran part of that.
    #	then ran the doMask.csh script to finish that, and continuing:
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-continue=install -workhorse=hgwdev -bigClusterHub=swarm \
        -buildDir=`pwd` dipOrd1 > install.log 2>&1 &
    #	real    15m17.199s
    twoBitToFa dipOrd1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2158502098 bases (313540677 N's 1844961421 real 1404133178 upper 440828243
# lower) in 210053 sequences in 1 files
# %20.42 masked total, %23.89 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-17 - Hiram)
    screen # use a screen to manage this job
    mkdir /cluster/data/dipOrd1/bed/simpleRepeat
    cd /cluster/data/dipOrd1/bed/simpleRepeat
    # 
    time  $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/dipOrd1/bed/simpleRepeat \
	dipOrd1 > do.log 2>&1 &
    #	real    2630m12.078s

    featureBits dipOrd1 simpleRepeat
    #	211605979 bases of 1844961421 (11.469%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/dipOrd1
    twoBitMask dipOrd1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dipOrd1.2bit

    twoBitToFa dipOrd1.2bit stdout | faSize stdin > dipOrd1.2bit.faSize.txt
# 2158502098 bases (313540677 N's 1844961421 real 1323236442 upper 521724979
# lower) in 210053 sequences in 1 files
# %24.17 masked total, %28.28 masked real

    #	link to gbdb
    ln -s `pwd`/dipOrd1.2bit /gbdb/dipOrd1

###########################################################################
# prepare for kluster runs (DONE - 2008-10-17 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	dipOrd1: 1844961421
    # thus: 1024 * 1844961421/2881421696 = 655
    #	rounding up to 700 to be a bit conservative
    cd /hive/data/genomes/dipOrd1
    time blat dipOrd1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=dipOrd1.11.ooc -repMatch=700
    #	Wrote 33062 overused 11-mers to dipOrd1.11.ooc
    #	real    1m34.993s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/dipOrd1
    cp -p dipOrd1.2bit chrom.sizes dipOrd1.11.ooc \
	/hive/data/staging/data/dipOrd1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute dipOrd1 (NCBI project 20385, ABRO01000000)" where name="dipOrd1";' hgcentraltest

###########################################################################
#  dipOrd1 - Kangaroo rat - Ensembl Genes version 51  (DONE - 2008-12-02 -
#  hiram)
    ssh hgwdev
    cd /hive/data/genomes/dipOrd1
    cat << '_EOF_' > dipOrd1.ensGene.ra
# required db variable
db dipOrd1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#       names that are not in the UCSC assembly
skipInvalid yes
# ignore the single gene that have invalid structures from Ensembl:
# 11275: ENSDORT00000004734 no exonFrame on CDS exon 12
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 dipOrd1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/dipOrd1/bed/ensGene.51
    featureBits dipOrd1 ensGene
    # 25129085 bases of 1844961421 (1.362%) in intersection

    # *** All done!  (through the 'makeDoc' step)
    # *** Steps were performed in /hive/data/genomes/dipOrd1/bed/ensGene.51

############################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/dipOrd1/bed/cpgIslands
    cd /hive/data/genomes/dipOrd1/bed/cpgIslands
    time doCpgIslands.pl dipOrd1 > do.log 2>&1
    #   real    635m52.950s
    #   fixing broken command in script:
    time ./doLoadCpg.csh
    #   real    2m18.163s
    time doCpgIslands.pl -continue=cleanup dipOrd1 > cleanup.log 2>&1
    #   real    97m27.451s

    cat fb.dipOrd1.cpgIslandExt.txt
    #   16112003 bases of 1844961421 (0.873%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/dipOrd1/bed/genscan
    cd /hive/data/genomes/dipOrd1/bed/genscan
    time doGenscan.pl dipOrd1 > do.log 2>&1
    # recovering from power failure, kluster run had just finished
    # and it had just started on makeBed:
    time ./doMakeBed.csh
    #   real    317m41.669s
    time doGenscan.pl -continue=load dipOrd1 > load.log 2>&1
    #   real    14m39.736s
    # failed after running out of inodes on hive, continuing:
    time doGenscan.pl -continue=cleanup dipOrd1 > cleanup.log 2>&1
    #   real    43m1.037s

    cat fb.dipOrd1.genscan.txt
    #   46775156 bases of 1844961421 (2.535%) in intersection
    cat fb.dipOrd1.genscanSubopt.txt
    #   46026502 bases of 1844961421 (2.495%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    mkdir /hive/data/genomes/dipOrd1/bed/windowMasker
    cd /hive/data/genomes/dipOrd1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev dipOrd1 > do.log 2>&1
    #   Elapsed time: 574m42s

    sed -e 's/^/    #\t/' fb.dipOrd1.windowmaskerSdust.beforeClean.txt
    #   1092394968 bases of 2158502098 (50.609%) in intersection
    sed -e 's/^/    #\t/' fb.dipOrd1.windowmaskerSdust.clean.txt
    #   778854291 bases of 2158502098 (36.083%) in intersection
    sed -e 's/^/    #\t/' fb.dipOrd1.rmsk.windowmaskerSdust.txt
    #   294876541 bases of 2158502098 (13.661%) in intersection

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    #   /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Dipodomys ordii - does not exist in this file
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add the following section:
# dipOrd1 (Kangaroo rat)
dipOrd1.serverGenome = /hive/data/genomes/dipOrd1/dipOrd1.2bit
dipOrd1.clusterGenome = /scratch/data/dipOrd1/dipOrd1.2bit
dipOrd1.ooc = /scratch/data/dipOrd1/dipOrd1.11.ooc
dipOrd1.lift = no
dipOrd1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
dipOrd1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
dipOrd1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
dipOrd1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
dipOrd1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
dipOrd1.refseq.mrna.native.load = no
dipOrd1.refseq.mrna.xeno.load = yes
dipOrd1.genbank.mrna.xeno.load = yes
dipOrd1.genbank.est.native.load = no
dipOrd1.downloadDir = dipOrd1
dipOrd1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding dipOrd1 kangaroo rat" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for dipOrdNames Dipodomys ordii" \
        src/lib/gbGenome.c
#	Kangaroo rat - Dipodomys ordii - Jul 2008
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S dipOrd1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial dipOrd1 &
    #	var/build/logs/2012.05.03-16:19:09.dipOrd1.initalign.log
    #   real    6078m6.247s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad dipOrd1 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.08-15:08:30.dbload.log
    #   real    38m3.733s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add dipOrd1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added dipOrd1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to OPN5 gene displays  (DONE - 2012-07-20 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_7358:2,192-33,995" where name="dipOrd1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-20 - Hiram)
    mkdir /hive/data/genomes/dipOrd1/pushQ
    cd /hive/data/genomes/dipOrd1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl dipOrd1 2> stderr.txt | grep -v transMap > dipOrd1.sql

    scp -p dipOrd1.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < dipOrd1.sql

############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
#########################################################################
# LIFTOVER TO dipOrd2 (TBD - 2017-12-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/dipOrd1/bed/blat.dipOrd2.2017-12-31
    cd /hive/data/genomes/dipOrd1/bed/blat.dipOrd2.2017-12-31
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/dipOrd1/jkStuff/dipOrd1.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         dipOrd1 dipOrd2) > do.log 2>&1
    # real    559m26.237s

    # verify the convert link on the test browser is now active from dipOrd1 to
    # dipOrd2

########################################################################
