# for emacs: -*- mode: sh; -*-

# Vicugna pacos

#	http://www.ncbi.nlm.nih.gov/bioproject/30567
#	http://www.ncbi.nlm.nih.gov/genome/905
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRR01

#########################################################################
# DOWNLOAD SEQUENCE  (DONE braney 2008-10-07)
    ssh kolossus
    screen
    mkdir /hive/data/genomes/vicPac1
    ln -s /hive/data/genomes/vicPac1 /cluster/data
    mkdir /cluster/data/vicPac1/broad
    cd /cluster/data/vicPac1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/VicPac1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/VicPac1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/alpaca/VicPac1.0/assembly.quals.gz 

    qaToQac assembly.quals.gz stdout | \
	qacAgpLift assembly.agp stdin vicPac1.qual.qac

    cut -f 1 assembly.agp | uniq -c | wc -l 
# Number of scaffolds: 298413

#########################################################################
# Create .ra file and run makeGenomeDb.pl (working braney...
    cd /cluster/data/vicPac1
cat << _EOF_ >vicPac1.config.ra
# Config parameters for makeGenomeDb.pl:
db vicPac1
clade mammal
genomeCladePriority 35
scientificName  Vicugna vicugna
commonName Alpaca
assemblyDate Jul. 2008
assemblyLabel Broad Institute vicPac1 
orderKey 233.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/vicPac1/broad/assembly.bases.gz
agpFiles /cluster/data/vicPac1/broad/assembly.agp
qualFiles /cluster/data/vicPac1/broad/vicPac1.qual.qac
dbDbSpeciesDir alpaca
_EOF_

# use 'screen' make sure on kkstore05
    makeGenomeDb.pl -workhorse kolossus -verbose=2 vicPac1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl -d' returns to previous shell
cut -f 2 chrom.sizes | ave stdin
# Q1 1042.000000
# median 1372.000000
# Q3 2292.000000
# average 9926.690888
# min 555.000000
# max 5516956.000000
# count 298413
# total 2962253608.000000
# standard deviation 67785.419160

#########################################################################
# REPEATMASKER (not done)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/vicPac1/bed/repeatMasker
    cd /cluster/data/vicPac1/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/vicPac1/bed/repeatMasker \
        vicPac1 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh pk
    para time

# Completed: 7141 of 7141 jobs
# CPU time in finished jobs:   19375689s  322928.14m  5382.14h  224.26d  0.614 y
# IO & Wait Time:                185390s    3089.84m    51.50h    2.15d  0.006 y
# Average job time:                2739s      45.65m     0.76h    0.03d
# Longest finished job:           13925s     232.08m     3.87h    0.16d
# Submission to last job:        175075s    2917.92m    48.63h    2.03d

    time nice -n +19 featureBits vicPac1 rmsk > fb.vicPac1.rmsk.txt 2>&1 &
    # 614774346 bases of 1923010363 (31.969%) in intersection

    # RepeatMasker and lib version from do.log:
    #    Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC   RELEASE 20080611;     

#########################################################################
# SIMPLE REPEATS TRF (not done)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/vicPac1/bed/simpleRepeat
    cd /cluster/data/vicPac1/bed/simpleRepeat
    # had to change genus/species to "vicugna genus" in dummyRun
    doSimpleRepeat.pl -buildDir=/cluster/data/vicPac1/bed/simpleRepeat \
	vicPac1 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
# Completed: 60 of 60 jobs
# CPU time in finished jobs:      71877s    1197.95m    19.97h    0.83d  0.002 y
# IO & Wait Time:                  2811s      46.85m     0.78h    0.03d  0.000 y
# Average job time:                1245s      20.75m     0.35h    0.01d
# Longest finished job:           12819s     213.65m     3.56h    0.15d
# Submission to last job:         13133s     218.88m     3.65h    0.15d

    featureBits vicPac1 simpleRepeat
    # 77931003 bases of 1923010363 (4.053%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/vicPac1
    twoBitMask vicPac1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed vicPac1.2bit

    twoBitToFa vicPac1.2bit stdout | faSize stdin
# 2962350360 bases (1039339997 N's 1923010363 real 1305704427 upper 617305936
# lower) in 298420 sequences in 1 files
# Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956
(scaffold_0) median 1372
# N count: mean 3482.8 sd 21549.2
# U count: mean 4375.4 sd 33856.2
# L count: mean 2068.6 sd 15148.2
# %20.84 masked total, %32.10 masked real

    twoBitToFa vicPac1.rmsk.2bit stdout | faSize stdin

# 2962350360 bases (1039339997 N's 1923010363 real 1309436724 upper 613573639
# lower) in 298420 sequences in 1 files
# Total size: mean 9926.8 sd 67784.7 min 600 (scaffold_298419) max 5516956
# (scaffold_0) median 1372
# N count: mean 3482.8 sd 21549.2
# U count: mean 4387.9 sd 33863.9
# L count: mean 2056.1 sd 15140.4
# %20.71 masked total, %31.91 masked real

    ln -s /cluster/data/vicPac1/vicPac1.2bit /gbdb/vicPac1/vicPac1.2bit

    cp /cluster/data/vicPac1/vicPac1.2bit /san/sanvol1/scratch/vicPac1
    cp /cluster/data/vicPac1/chrom.sizes /san/sanvol1/scratch/vicPac1

#########################################################################
## Repeat Masker (DONE - 2008-10-16 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/vicPac1/bed/repeatMasker
    cd /hive/data/genomes/vicPac1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` vicPac1 > do.log 2>&1 &
    #	real    292m6.556s

    twoBitToFa vicPac1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2962253608 bases (1039343173 N's 1922910435 real 1306779321 upper 616131114
# lower) in 298413 sequences in 1 files
# %20.80 masked total, %32.04 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-17 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/vicPac1/bed/simpleRepeat
    cd /hive/data/genomes/vicPac1/bed/simpleRepeat
    # 
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/vicPac1/bed/simpleRepeat vicPac1 > do.log 2>&1 &
    #	real    74m17.044s
    cat fb.simpleRepeat
    #	77925712 bases of 1922910435 (4.052%) in intersection

    #	after RM run is done, add this mask:
    cd /hive/data/genomes/vicPac1
    rm vicPac1.2bit
    twoBitMask vicPac1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed vicPac1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa vicPac1.2bit stdout | faSize stdin > vicPac1.2bit.faSize.txt
# 2962253608 bases (1039343173 N's 1922910435 real 1303047359 upper 619863076
# lower) in 298413 sequences in 1 files
# %20.93 masked total, %32.24 masked rea

    #	link to gbdb
    ln -s `pwd`/vicPac1.2bit /gbdb/vicPac1

###########################################################################
# prepare for kluster runs (DONE _ 2008-10-22 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	vicPac1: 1922910435
    # thus: 1024 * 1922910435/2881421696 = 683
    #	rounding up to 700 for a more conservative masking
    cd /hive/data/genomes/vicPac1
    time blat vicPac1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=vicPac1.11.ooc -repMatch=700
    #	Wrote 25830 overused 11-mers to vicPac1.11.ooc
    #	real    2m8.728s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/vicPac1
    cp -p vicPac1.2bit chrom.sizes vicPac1.11.ooc \
	/hive/data/staging/data/vicPac1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# fix the scientific name (DONE - 2008-10-22 - Hiram)
    hgsql -e 'update dbDb set
scientificName="Vicugna pacos" where name="vicPac1";' hgcentraltest

############################################################################
#  vicPac1 - Alpaca - Ensembl Genes version 51  (DONE - 2008-12-04 - hiram)
    ssh swarm
    cd /hive/data/genomes/vicPac1
    cat << '_EOF_' > vicPac1.ensGene.ra
# required db variable
db vicPac1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly
skipInvalid yes
# ignore the 53 genes that do not translate properly to UCSC coordinates
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 vicPac1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/vicPac1/bed/ensGene.51
    featureBits vicPac1 ensGene
    # 17730015 bases of 1922910435 (0.922%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/vicPac1/bed/ensGene.51

############################################################################
# SWAP mm10 lastz (DONE - 2012-03-19 - Hiram)
    # original alignment to mm10:
    cat /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/fb.mm10.chainVicPac1Link.txt 
    #	600477253 bases of 2652783500 (22.636%) in intersection

    # and this swap
    mkdir /hive/data/genomes/vicPac1/bed/blastz.mm10.swap
    cd /hive/data/genomes/vicPac1/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    159m21.952s
    cat fb.vicPac1.chainMm10Link.txt
    #	610885692 bases of 1922910435 (31.769%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/vicPac1/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/vicPac1/bed/cpgIslands
    cd /hive/data/genomes/vicPac1/bed/cpgIslands
    time doCpgIslands.pl vicPac1 > do.log 2>&1
    #   real    897m19.342s
    # fixing broken command in the script:
    time ./doLoadCpg.csh
    #   real    2m37.853s
    time doCpgIslands.pl -continue=cleanup vicPac1 > cleanup.log 2>&1
    #   real    119m41.201s

    cat fb.vicPac1.cpgIslandExt.txt
    #   9241028 bases of 1922910435 (0.481%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/vicPac1/bed/genscan
    cd /hive/data/genomes/vicPac1/bed/genscan
    time doGenscan.pl vicPac1 > do.log 2>&1
    # recovering from power failure, kluster run had just finished
    # and it had just started on makeBed:
    time ./doMakeBed.csh
    #   real    445m44.260s
    # continuing:
    time doGenscan.pl -continue=load vicPac1 > load.log 2>&1
    #   real    40m21.604s
    cat fb.vicPac1.genscan.txt
    #   46607076 bases of 1922910435 (2.424%) in intersection
    cat fb.vicPac1.genscanSubopt.txt
    #   54656455 bases of 1922910435 (2.842%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S vicPac1
    mkdir /hive/data/genomes/vicPac1/bed/windowMasker
    cd /hive/data/genomes/vicPac1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev vicPac1 > do.log 2>&1
    #   real    1042m31.380s
    # fixing a broken command in the doLoad step:
    time ./lastLoad.csh
    #   real    9m4.895s

    sed -e 's/^/    #\t/' fb.vicPac1.windowmaskerSdust.beforeClean.txt
    #   1672360288 bases of 2962253608 (56.456%) in intersection
    sed -e 's/^/    #\t/' fb.vicPac1.windowmaskerSdust.clean.txt
    #   633017115 bases of 2962253608 (21.369%) in intersection
    sed -e 's/^/    #\t/' fb.vicPac1.rmsk.windowmaskerSdust.txt
    #   266161799 bases of 2962253608 (8.985%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-07 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1922910435 / 2897316137 \) \* 1024
    #	( 1922910435 / 2897316137 ) * 1024 = 679.615269

    # round up to 700

    cd /hive/data/genomes/vicPac1
    time blat vicPac1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/vicPac1.11.ooc -repMatch=700
    #   Wrote 25830 overused 11-mers to jkStuff/vicPac1.11.ooc
    #   Done making jkStuff/vicPac1.11.ooc

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" vicPac1 | sort | uniq -c
    #   422879 yes
#    cd /hive/data/genomes/vicPac1/jkStuff
#    gapToLift vicPac1 vicPac1.nonBridged.lift -bedFile=vicPac1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' vicPac1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-07 - 2012-06-22 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Vicugna pacos	164	7286	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add vicPac1

# vicPac1 (alpaca)
vicPac1.serverGenome = /hive/data/genomes/vicPac1/vicPac1.2bit
vicPac1.clusterGenome = /hive/data/genomes/vicPac1/vicPac1.2bit
vicPac1.ooc = /hive/data/genomes/vicPac1/jkStuff/vicPac1.11.ooc
vicPac1.lift = no
vicPac1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
vicPac1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
vicPac1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
vicPac1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
vicPac1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
vicPac1.refseq.mrna.native.load = no
vicPac1.refseq.mrna.xeno.load = yes
vicPac1.genbank.mrna.native.load = no
vicPac1.genbank.mrna.xeno.load = no
vicPac1.genbank.est.native.load = yes
vicPac1.downloadDir = vicPac1
vicPac1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding vicPac1 alpaca" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for vicPacNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S vicPac1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial vicPac1 &
    #   var/build/logs/2012.06.16-19:10:31.vicPac1.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad vicPac1 &
    #   var/dbload/hgwdev/logs/2012.06.22-12:27:12.dbload.log
    #   real    39m1.879s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add vicPac1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added vicPac1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#####################################################################
#  SWAP bosTau7 LASTZ (DONE - 2012-06-19 - Chin)
    #   original alignment done at bosTau7.txt
    cd /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19
    cat fb.bosTau7.chainVicPac1Link.txt
    #   1259182233 bases of 2804673174 (44.896%) in intersection

    # Create link
    cd /hive/data/genomes/bosTau7/bed
    ln -s lastzVicPac1.2012-06-19 lastz.vicPac1

    #   and the swap
    mkdir /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap
    cd /hive/data/genomes/vicPac1/bed/blastz.bosTau7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/bosTau7/bed/lastzVicPac1.2012-06-19/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    717m22.295s
    cat fb.vicPac1.chainBosTau7Link.txt
    #  1309266730 bases of 1922910435 (68.088%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/vicPac1/bed
    ln -s blastz.bosTau7.swap lastz.bosTau7

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_5479:34441-51753" where name="vicPac1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/vicPac1/pushQ
    cd /hive/data/genomes/vicPac1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl vicPac1 2> stderr.txt | grep -v transMap > vicPac1.sql
    #   real    3m49.201s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/vicPac1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/vicPac1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/vicPac1/bbi/quality.bw
# WARNING: vicPac1 does not have seq
# WARNING: vicPac1 does not have extFile
# WARNING: vicPac1 does not have estOrientInfo

    scp -p vicPac1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/vicPac1.sql"

############################################################################
# construct liftOver to vicPac2 (DONE - 2013-06-10 - Hiram)
    screen -S vicPac2	# manage this longish running job in a screen
    mkdir /hive/data/genomes/vicPac1/bed/blat.vicPac2.2013-06-10
    cd /hive/data/genomes/vicPac1/bed/blat.vicPac2.2013-06-10
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/vicPac1/jkStuff/vicPac1.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev vicPac1 vicPac2
    #   real    0m1.838s
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/vicPac1/jkStuff/vicPac1.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev vicPac1 vicPac2 > do.log 2>&1
    #	real    3064m58.156s

    # verify this file exists:
    #	/gbdb/vicPac1/liftOver/vicPac1ToVicPac2.over.chain.gz
    # and try out the conversion on genome-test from vicPac1 to vicPac2

############################################################################
