# for emacs: -*- mode: sh; -*-

# Pika "Ochotona princeps"

#	http://www.ncbi.nlm.nih.gov/genome/771
#	http://www.ncbi.nlm.nih.gov/bioproject/74593
#	http://www.ncbi.nlm.nih.gov/bioproject/12210 - chrMt - Italy

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-07-11)
    ssh kkstore05
    mkdir /cluster/store12/ochPri2
    ln -s /cluster/store12/ochPri2 /cluster/data
    mkdir /cluster/data/ochPri2/broad
    cd /cluster/data/ochPri2/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/pika/OchPri2.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/pika/OchPri2.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/pika/OchPri2.0/assembly.quals.gz 
    md5sum ass* > assembly.md5sum

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin ochPri2.qual.qac

    # wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/pika/OchPri2.0/BasicStats.out

#### No BasicStats.out

   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 187558

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh kkstore05
    cd /cluster/data/ochPri2
cat << _EOF_ >ochPri2.config.ra
# Config parameters for makeGenomeDb.pl:
db ochPri2
clade mammal
genomeCladePriority 35
scientificName Ochotona princeps
commonName Pika
assemblyDate Jul. 2008
assemblyLabel Broad Institute ochPri2 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/ochPri2/broad/assembly.bases.gz
agpFiles /cluster/data/ochPri2/broad/assembly.agp
qualFiles /cluster/data/ochPri2/broad/ochPri2.qual.qac
dbDbSpeciesDir pika
_EOF_

# use 'screen' make sure on kkstore05
    makeGenomeDb.pl -verbose=2 ochPri2.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl -d' returns to previous shell
cut -f 2 chrom.sizes | ave stdin
# Q1 1249.000000
# median 4932.000000
# Q3 9905.250000
# average 18371.833534
# min 600.000000
# max 1977105.000000
# count 187558
# total 3445784354.000000
# standard deviation 52187.432301


#########################################################################
# REPEATMASKER (DONE braney 2008-07-29)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/ochPri2/bed/repeatMasker
    cd /cluster/data/ochPri2/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/ochPri2/bed/repeatMasker \
        ochPri2 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh pk
    para time

# Completed: 8024 of 8024 jobs
# CPU time in finished jobs:   20582841s  343047.36m  5717.46h  238.23d  0.653 y
# IO & Wait Time:                172016s    2866.93m    47.78h    1.99d  0.005 y
# Average job time:                2587s      43.11m     0.72h    0.03d
# Longest finished job:            8109s     135.15m     2.25h    0.09d
# Submission to last job:        121861s    2031.02m    33.85h    1.41d

    time nice -n +19 featureBits ochPri2 rmsk > fb.ochPri2.rmsk.txt 2>&1 &
    # 219204536 bases of 1923624051 (11.395%) in intersection

    # RepeatMasker and lib version from do.log:
    #    Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC   RELEASE 20080611;  


#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-07-29)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/ochPri2/bed/simpleRepeat
    cd /cluster/data/ochPri2/bed/simpleRepeat
    # 
    doSimpleRepeat.pl -buildDir=/cluster/data/ochPri2/bed/simpleRepeat \
	ochPri2 > do.log 2>&1 &

    #### When done
    ssh pk
    para time

# Completed: 70 of 70 jobs
# CPU time in finished jobs:      51330s     855.49m    14.26h    0.59d  0.002 y
# IO & Wait Time:                   191s       3.19m     0.05h    0.00d  0.000 y
# Average job time:                 736s      12.27m     0.20h    0.01d
# Longest finished job:            7673s     127.88m     2.13h    0.09d
# Submission to last job:          7905s     131.75m     2.20h    0.09d

    featureBits ochPri2 simpleRepeat
    # 181386495 bases of 1923624051 (9.429%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/ochPri2
    twoBitMask ochPri2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ochPri2.2bit

    twoBitToFa ochPri2.2bit stdout | faSize stdin

# 3445784354 bases (1522160303 N's 1923624051 real 1703533517 upper 220090534
# lower) in 187558 sequences in 1 files
# Total size: mean 18371.8 sd 52187.6 min 600 (scaffold_187557) max 1977105
# (scaffold_0) median 4932
# N count: mean 8115.7 sd 20418.0
# U count: mean 9082.7 sd 30990.6
# L count: mean 1173.5 sd 4397.6
# %6.39 masked total, %11.44 masked real

    twoBitToFa ochPri2.rmsk.2bit stdout | faSize stdin
# 3445784354 bases (1522160303 N's 1923624051 real 1704790340 upper 218833711
# lower) in 187558 sequences in 1 files
# Total size: mean 18371.8 sd 52187.6 min 600 (scaffold_187557) max 1977105
# (scaffold_0) median 4932
# N count: mean 8115.7 sd 20418.0
# U count: mean 9089.4 sd 31003.0
# L count: mean 1166.8 sd 4384.0
# %6.35 masked total, %11.38 masked real

    # Link to it from /gbdb
    ln -s /cluster/data/ochPri2/ochPri2.2bit /gbdb/ochPri2/ochPri2.2bit

    # mkdir /san/sanvol1/scratch/ochPri2
    cp /cluster/data/ochPri2/ochPri2.2bit /san/sanvol1/scratch/ochPri2
    cp /cluster/data/ochPri2/chrom.sizes /san/sanvol1/scratch/ochPri2

############################################################################
#  ochPri2 - Pika - Ensembl Genes version 51  (DONE - 2008-12-03 - hiram)
    ssh kolossus
    cd /hive/data/genomes/ochPri2
    cat << '_EOF_' > ochPri2.ensGene.ra
# required db variable
db ochPri2
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly
skipInvalid yes
# ignore the single gene that has an invalid structure from Ensembl:
# 10995: ENSOPRT00000002716 no exonFrame on CDS exon 2
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 ochPri2.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/ochPri2/bed/ensGene.51
    featureBits ochPri2 ensGene
    # 25200422 bases of 1923624051 (1.310%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/ochPri2/bed/ensGene.51

############################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/ochPri2/bed/cpgIslands
    cd /hive/data/genomes/ochPri2/bed/cpgIslands
    time doCpgIslands.pl ochPri2 > do.log 2>&1
    #   fixing broken command in script:
    time ./doLoadCpg.csh
    #   real    2m10.760s
    # continuing:
    time doCpgIslands.pl -continue=cleanup ochPri2 > cleanup.log 2>&1
    #   real    41m18.291s

    cat fb.ochPri2.cpgIslandExt.txt
    #   22070916 bases of 1923624051 (1.147%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/ochPri2/bed/genscan
    cd /hive/data/genomes/ochPri2/bed/genscan
    time doGenscan.pl ochPri2 > do.log 2>&1
    # recovering from power failure, kluster run had just finished
    # and it had just started on makeBed:
    time ./doMakeBed.csh
    #   real    329m1.050s
    # continuing:
    time doGenscan.pl -continue=load ochPri2 > load.log 2>&1
    #   real    12m8.926s
    # failed out of inodes on hive:
    time doGenscan.pl -continue=cleanup ochPri2 > cleanup.log 2>&1
    #   real    35m52.560s

    cat fb.ochPri2.genscan.txt
    #   76450061 bases of 1923624051 (3.974%) in intersection
    cat fb.ochPri2.genscanSubopt.txt
    #   102217862 bases of 1923624051 (5.314%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S ochPri2
    mkdir /hive/data/genomes/ochPri2/bed/windowMasker
    cd /hive/data/genomes/ochPri2/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev ochPri2 > do.log 2>&1
    #   real    904m39.415s
    # fixing a broken command in the doLoad step:
    time ./lastLoad.csh
    #   real    7m10.582s
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -continue=cleanup -workhorse=hgwdev -buildDir=`pwd` \
        -dbHost=hgwdev ochPri2 > cleanup.log 2>&1
    #   real    1m45.497s

    cat fb.ochPri2.windowmaskerSdust.beforeClean.txt
    #   2224659037 bases of 3445784354 (64.562%) in intersection
    sed -e 's/^/    #\t/' fb.ochPri2.windowmaskerSdust.clean.txt
    #   702498734 bases of 3445784354 (20.387%) in intersection
    sed -e 's/^/    #\t/' fb.ochPri2.rmsk.windowmaskerSdust.txt
    #   79974550 bases of 3445784354 (2.321%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1923624051 / 2897316137 \) \* 1024
    #	( 1923624051 / 2897316137 ) * 1024 = 679.867483

    # round up to 700

    cd /hive/data/genomes/ochPri2
    time blat ochPri2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/ochPri2.11.ooc -repMatch=700
    #   Wrote 19533 overused 11-mers to jkStuff/ochPri2.11.ooc
    #   real    0m51.997s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" ochPri2 | sort | uniq -c
    #   530221 yes
#    cd /hive/data/genomes/ochPri2/jkStuff
#    gapToLift ochPri2 ochPri2.nonBridged.lift -bedFile=ochPri2.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' ochPri2.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Ochotona princeps       1       0       0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add ochPri2 just after ce2
# ochPri2 (Pika)
ochPri2.serverGenome = /hive/data/genomes/ochPri2/ochPri2.2bit
ochPri2.clusterGenome = /hive/data/genomes/ochPri2/ochPri2.2bit
ochPri2.ooc = /hive/data/genomes/ochPri2/jkStuff/ochPri2.11.ooc
ochPri2.lift = no
ochPri2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
ochPri2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
ochPri2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
ochPri2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
ochPri2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
ochPri2.refseq.mrna.native.load = no
ochPri2.refseq.mrna.xeno.load = yes
ochPri2.genbank.mrna.xeno.load = yes
ochPri2.genbank.est.native.load = no
ochPri2.downloadDir = ochPri2
ochPri2.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding ochPri2 pika" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for micMurNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S ochPri2           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial ochPri2 &
    #   var/build/logs/2012.06.08-09:50:30.ochPri2.initalign.log
    #   real    2405m45.707s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ochPri2 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-13:42:34.dbload.log
    #   real    10m29.923s

    # enable daily alignment and update of hgwdev (DONE - 2012-06-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add ochPri2 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added ochPri2." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_6491:75719-81052" where name="ochPri2";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/ochPri2/pushQ
    cd /hive/data/genomes/ochPri2/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl ochPri2 2> stderr.txt | grep -v transMap > ochPri2.sql
    #   real    3m37.306s

    scp -p ochPri2.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/ochPri2.sql"

############################################################################
# construct liftOver to ochPri3 (DONE - 2014-04-17 - Hiram)
    screen -S ochPri3	# manage this longish running job in a screen
    mkdir /hive/data/genomes/ochPri2/bed/blat.ochPri3.2014-04-17
    cd /hive/data/genomes/ochPri2/bed/blat.ochPri3.2014-04-17
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/ochPri2/jkStuff/ochPri2.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev ochPri2 ochPri3
    #   real    0m1.838s
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=ku \
	-ooc=/hive/data/genomes/ochPri2/jkStuff/ochPri2.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev ochPri2 ochPri3 > do.log 2>&1

    # verify this file exists:
    #	/gbdb/ochPri2/liftOver/ochPri2ToOchPri3.over.chain.gz
    # and try out the conversion on genome-test from ochPri2 to ochPri3

############################################################################
