# for emacs: -*- mode: sh; -*-

# Tarsius syrichta
# (original doc had incorrect name of Tarsier syrichta)

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRT00

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    ssh kolossus
    mkdir /hive/data/genomes/tarSyr1
    ln -s /cluster/store12/tarSyr1 /cluster/data
    mkdir /cluster/data/tarSyr1/broad
    cd /cluster/data/tarSyr1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/tarsier/Tarsyr1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/tarsier/Tarsyr1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/tarsier/Tarsyr1.0/assembly.quals.gz 
    md5sum ass* > assembly.md5sum

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin tarSyr1.qual.qac

   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 656709


#########################################################################
# Create .ra file and run makeGenomeDb.pl  (working... braney)
    ssh hgwdev
    screen
    cd /cluster/data/tarSyr1
cat << _EOF_ >tarSyr1.config.ra
# Config parameters for makeGenomeDb.pl:
db tarSyr1
clade mammal
genomeCladePriority 35
scientificName  Tarsier syrichta
commonName Tarsier
assemblyDate Aug. 2008
assemblyLabel Broad Institute tarSyr1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/tarSyr1/broad/assembly.bases.gz
agpFiles /cluster/data/tarSyr1/broad/assembly.agp
qualFiles /cluster/data/tarSyr1/broad/tarSyr1.qual.qac
dbDbSpeciesDir tarsier
_EOF_

# use 'screen'; make sure you are on kkstore05
    makeGenomeDb.pl -workhorse kolossus -verbose=2 tarSyr1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl-d' returns to previous shell
cut -f 2 chrom.sizes | ave stdin
# Q1 7169.500000
# median 13298.000000
# Q3 55788.500000
# average 866164.007952
# min 3002.000000
# max 88675666.000000
# count 3144
# total 2723219641.000000
# standard deviation 5316243.688364


#########################################################################
# REPEATMASKER (not done)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/tarSyr1/bed/repeatMasker
    cd /cluster/data/tarSyr1/bed/repeatMasker
    /cluster/bin/scripts/doRepeatMasker.pl -species "tarsier" \
	-buildDir=/cluster/data/tarSyr1/bed/repeatMasker    \
	tarSyr1 > do.log 2>&1 &

# new parasol, lots of crashes, no times

    /cluster/bin/scripts/doRepeatMasker.pl -species "tarsier" \
	-continue cat -buildDir=/cluster/data/tarSyr1/bed/repeatMasker    \
	tarSyr1 > do2.log 2>&1 &


    time nice -n +19 featureBits tarSyr1 rmsk > fb.tarSyr1.rmsk.txt 2>&1 &
    # 1154651023 bases of 2771976320 (41.654%) in intersection


#########################################################################
# SIMPLE REPEATS TRF (not done)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/tarSyr1/bed/simpleRepeat
    cd /cluster/data/tarSyr1/bed/simpleRepeat
    # 
    doSimpleRepeat.pl -buildDir=/cluster/data/tarSyr1/bed/simpleRepeat \
	tarSyr1 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
    # Completed: 51 of 51 jobs
    # CPU time in finished jobs:      24985s     416.41m     6.94h    0.29d
    # 0.001 y
    # IO & Wait Time:                   101s       1.69m     0.03h    0.00d
    # 0.000 y
    # Average job time:                 492s       8.20m     0.14h    0.01d
    # Longest finished job:            3887s      64.78m     1.08h    0.04d
    # Submission to last job:          3911s      65.18m     1.09h    0.05d

    featureBits tarSyr1 simpleRepeat
    # 50851363 bases of 2771976320 (1.834%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/tarSyr1
    twoBitMask tarSyr1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed tarSyr1.2bit

    twoBitToFa tarSyr1.2bit stdout | faSize stdin
#    3183347966 bases (411371646 N's 2771976320 real 1619032005 upper
#    1152944315 lower) in 659020 sequences in 1 files
#    Total size: mean 4830.4 sd 8873.5 min 600 (scaffold_659019) max 329427
#    (scaffold_0) median 1645
#    N count: mean 624.2 sd 1398.1
#    U count: mean 2456.7 sd 5229.3
#    L count: mean 1749.5 sd 2909.8
#    %36.22 masked total, %41.59 masked real

    twoBitToFa tarSyr1.rmsk.2bit stdout | faSize stdin
# 3183347966 bases (411371646 N's 2771976320 real 1619660149 upper 1152316171
# lower) in 659020 sequences in 1 files
# Total size: mean 4830.4 sd 8873.5 min 600 (scaffold_659019) max 329427
# (scaffold_0) median 1645
# N count: mean 624.2 sd 1398.1
# U count: mean 2457.7 sd 5230.7
# L count: mean 1748.5 sd 2908.4
# %36.20 masked total, %41.57 masked real


    # Link to it from /gbdb
    ln -s /cluster/data/tarSyr1/tarSyr1.2bit /gbdb/tarSyr1/tarSyr1.2bit

    # mkdir /san/sanvol1/scratch/tarSyr1
    cp /cluster/data/tarSyr1/tarSyr1.2bit /san/sanvol1/scratch/tarSyr1
    cp /cluster/data/tarSyr1/chrom.sizes /san/sanvol1/scratch/tarSyr1

#########################################################################
## Repeat Masker (DONE - 2008-10-15 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/tarSyr1/bed/repeatMasker
    cd /hive/data/genomes/tarSyr1/bed/repeatMasker
    doRepeatMasker.pl -workhorse=hgwdev -bigClusterHub=swarm \
	-species=tarsier -buildDir=`pwd` tarSyr1 > do.log 2>&1 &
    #	real    810m18.865s
    #	failed in doMask.csh due to no tarSyr1.unmasked.2bit file ?
    #	which was there at the beginning, now it is gone ?
    #	the broken doRepeatMasker.pl script removed it due to hive confusion
    #	went back to the jkStuff/makeUnmasked script and re-ran part of that.
    #	then ran the doMask.csh script to finish that, and continuing:
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-continue=install -workhorse=hgwdev -bigClusterHub=swarm \
        -species=tarsier -buildDir=`pwd` tarSyr1 > install.log 2>&1 &
    #	real    26m23.595s
    twoBitToFa tarSyr1.rmsk.2bit stdout | faSize stdin
# 3179905132 bases (411368789 N's 2768536343 real 1613280363 upper 1155255980
# lower) in 656709 sequences in 1 files
# %36.33 masked total, %41.73 masked real

###########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-16 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/tarSyr1/bed/simpleRepeat
    cd /hive/data/genomes/tarSyr1/bed/simpleRepeat
    # 
    time  $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/hive/data/genomes/tarSyr1/bed/simpleRepeat \
	tarSyr1 > do.log 2>&1 &
    #	real    382m17.104s
    cat fb.simpleRepeat
    #	50848145 bases of 2768536343 (1.837%) in intersection

    #	after RM run is done, add this mask:
    cd /hive/data/genomes/tarSyr1
    rm tarSyr1.2bit
    twoBitMask tarSyr1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed tarSyr1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa tarSyr1.2bit stdout | faSize stdin > tarSyr1.2bit.faSize.txt
# 3179905132 bases (411368789 N's 2768536343 real 1612652870 upper 1155883473
# lower) in 656709 sequences in 1 files
# %36.35 masked total, %41.75 masked real

    #	link to gbdb
    ln -s `pwd`/tarSyr1.2bit /gbdb/tarSyr1

###########################################################################
# prepare for kluster runs (DONE - 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	tarSyr1: 2768536343
    # thus: 1024 * 2768536343/2881421696 = 983
    #	rounding up to 1000 for a bit more conservative masking
    cd /hive/data/genomes/tarSyr1
    time blat tarSyr1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=tarSyr1.11.ooc -repMatch=1000
    #	Wrote 31947 overused 11-mers to tarSyr1.11.ooc
    #	real    2m30.155s
    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/tarSyr1
    cp -p tarSyr1.2bit chrom.sizes tarSyr1.11.ooc \
	/hive/data/staging/data/tarSyr1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute tarSyr1 (NCBI project 20335, ABRT000000000)" where name="tarSyr1";' hgcentraltest

############################################################################
#  tarSyr1 - Tarsier - Ensembl Genes version 51  (DONE - 2008-12-04 - hiram)
    ssh kkr14u02
    cd /hive/data/genomes/tarSyr1
    cat << '_EOF_' > tarSyr1.ensGene.ra
# required db variable
db tarSyr1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#       names that are not in the UCSC assembly
skipInvalid yes
# ignore the 2,819 genes that do not translate properly to UCSC # coordinates
#       out of 43,529 genes
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 tarSyr1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/tarSyr1/bed/ensGene.51
    featureBits tarSyr1 ensGene
    # 20086184 bases of 2768536343 (0.726%) in intersection

    # *** All done!  (through the 'makeDoc' step)
    # *** Steps were performed in /hive/data/genomes/tarSyr1/bed/ensGene.51

############################################################################
# Swap lastz from Mm10 (DONE - 2012-03-12 - Hiram)
    # original alignment on Mm10
    cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10
    cat fb.mm10.chainTarSyr1Link.txt 
    #	651517559 bases of 2652783500 (24.560%) in intersection

    # and this swap:
    mkdir /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap
    cd /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    746m30.852s
    cat fb.tarSyr1.chainMm10Link.txt 
    #	691746721 bases of 2768536343 (24.986%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/tarSyr1/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# cpgIsland - (DONE - 2012-04-21 - Hiram)
    mkdir /hive/data/genomes/tarSyr1/bed/cpgIslands
    cd /hive/data/genomes/tarSyr1/bed/cpgIslands
    time doCpgIslands.pl tarSyr1 > do.log 2>&1
    #   real    664m57.705s
    # there is a problem with this command:
#cut -f1 cpgIsland.bed | sort -u | awk '{print length($0)}' | sort -rn | head -1
    # replace the 'head -1' with "sed -n -e '1,1 p'"
    # finished manually with C set at 15:
    time ./doLoadCpg.csh
    #   real    3m25.459s
    # continuing:
    time doCpgIslands.pl -continue=cleanup tarSyr1
    #   real    34m36.015s

    cat fb.tarSyr1.cpgIslandExt.txt
    #   9784198 bases of 2768536343 (0.353%) in intersection

##############################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/tarSyr1/bed/genscan
    cd /hive/data/genomes/tarSyr1/bed/genscan
    time doGenscan.pl tarSyr1 > do.log 2>&1
    #   real    1086m30.217s

    cat fb.tarSyr1.genscan.txt
    #   49393140 bases of 2768536343 (1.784%) in intersection
    cat fb.tarSyr1.genscanSubopt.txt
    #   46556603 bases of 2768536343 (1.682%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-03 - Hiram)
    screen -S tarSyr1
    mkdir /hive/data/genomes/tarSyr1/bed/windowMasker
    cd /hive/data/genomes/tarSyr1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev tarSyr1 > do.log 2>&1
    #   real    2741m46.799s

    sed -e 's/^/    #\t/' fb.tarSyr1.windowmaskerSdust.beforeClean.txt
    #   1510322967 bases of 3179905132 (47.496%) in intersection
    sed -e 's/^/    #\t/' fb.tarSyr1.windowmaskerSdust.clean.txt
    #   1098954178 bases of 3179905132 (34.559%) in intersection
    sed -e 's/^/    #\t/' fb.tarSyr1.rmsk.windowmaskerSdust.txt
    #   669452844 bases of 3179905132 (21.053%) in intersection

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - 2012-06-22 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Tarsius syrichta	8	0	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add tarSyr1
# tarSyr1 (Tarsier)
tarSyr1.serverGenome = /hive/data/genomes/tarSyr1/tarSyr1.2bit
tarSyr1.clusterGenome = /scratch/data/tarSyr1/tarSyr1.2bit
tarSyr1.ooc = /scratch/data/tarSyr1/tarSyr1.11.ooc
tarSyr1.lift = no
tarSyr1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
tarSyr1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
tarSyr1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
tarSyr1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
tarSyr1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
tarSyr1.refseq.mrna.native.load = no
tarSyr1.refseq.mrna.xeno.load = yes
tarSyr1.genbank.mrna.xeno.load = yes
tarSyr1.genbank.est.native.load = yes
tarSyr1.downloadDir = tarSyr1
tarSyr1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding tarSyr1 tarsier" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for tarSyrNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S tarSyr1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial tarSyr1 &
    #   var/build/logs/2012.06.08-10:01:20.tarSyr1.initalign.log
    #   real    4309m23.845s
    # after finishing the kluster run manually:
# Completed: 650332 of 650332 jobs
# CPU time in finished jobs:    2156583s   35943.06m   599.05h   24.96d  0.068 y
# IO & Wait Time:               3057671s   50961.18m   849.35h   35.39d  0.097 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest finished job:             210s       3.50m     0.06h    0.00d
# Submission to last job:        207812s    3463.53m    57.73h    2.41d

    time nice -n +19 ./bin/gbAlignStep -initial -continue=finish tarSyr1 &
    #   var/build/logs/2012.06.16-12:07:23.tarSyr1.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad tarSyr1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.22-14:55:19.dbload.log
    #   real    64m45.767s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add tarSyr1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added tarSyr1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_66368:1-2388" where name="tarSyr1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/tarSyr1/pushQ
    cd /hive/data/genomes/tarSyr1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl tarSyr1 2> stderr.txt | grep -v transMap > tarSyr1.sql
    #   real    4m6.158s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/tarSyr1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/tarSyr1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/tarSyr1/bbi/quality.bw
# WARNING: tarSyr1 does not have seq
# WARNING: tarSyr1 does not have extFile
# WARNING: tarSyr1 does not have estOrientInfo

    scp -p tarSyr1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/tarSyr1.sql"

############################################################################
# LIFTOVER TO tarSyr2 (DONE - 2014-12-11 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/tarSyr1/bed/blat.tarSyr2.2014-12-11
    cd /hive/data/genomes/tarSyr1/bed/blat.tarSyr2.2014-12-11
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/tarSyr1/tarSyr1.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         tarSyr1 tarSyr2) > do.log 2>&1
    # real    11534m13.107s
    # 8 days of run time, the tarSyr2 to tarSyr1 is taking even longer

    # verify the convert link on the test browser is now active from tarSyr1 to
    # tarSyr2

#########################################################################
