
# Nematostella vectensis DOE JGI Nemve1

# $Id: nemVec1.txt,v 1.7 2008/08/11 23:51:00 kord Exp $
# $Source: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/nemVec1.txt,v $

#######################################################################
# CREATE BUILD DIRECTORY (DONE 2008-Jul-01 - Kord) 

# what store?
ssh hgwdev
df -h            
# Filesystem            Size  Used Avail Use% Mounted on
# /dev/sda3             9.7G  5.9G  3.3G  65% /
# /dev/sda1              99M   36M   58M  39% /boot
# none                   16G     0   16G   0% /dev/shm
# /dev/sda5              20G   15G  3.5G  81% /scratch
# /dev/sdd1             1.5T  626G  767G  45% /data
# /dev/sde1             2.0T  1.8T  120G  94% /data/mysql
# kkhome-10:/export/cluster/home
#                       766G  415G  291G  59% /cluster/home
# ...
# kkstore06-10:/export/cluster/store3
#                       2.3T  1.9T  302G  87% /cluster/store3
# ...
ssh kkstore06
mkdir /cluster/store3/nemVec1
ln -s /cluster/store3/nemVec1 /cluster/data/nemVec1

#######################################################################
# DOWNLOAD THE ASSEMBLY (DONE - 2008-Jul-05 - Kord)
ssh kkstore06
cd /cluster/data/nemVec1
mkdir jkStuff
mkdir bed

mkdir /cluster/data/nemVec1/downloads
cd /cluster/data/nemVec1/downloads

# GenBank WGS sequences and Scaffolds:
# http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=ABAV01000000
# WGS         ABAV01000001-ABAV01059149
# WGS_SCAFLD  DS469507-DS480310

# JGI supplied contigs
# http://genome.jgi-psf.org/Nemve1/Nemve1.download.ftp.html
# Unmasked
wget --timestamp \
ftp://ftp.jgi-psf.org/pub/JGI_data/Nematostella_vectensis/v1.0/assembly/Nemve1.fasta.gz
faSize Nemve1.fasta.gz
# 356613585 bases (59221913 N's 297391672 real 297391672 upper 0 lower) in 10804 sequences in 1 files
# Total size: mean 33007.6 sd 149445.1 min 626 (scaffold_6071) max 3256212 (scaffold_1) median 6708
# N count: mean 5481.5 sd 10915.3
# U count: mean 27526.1 sd 141670.8
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

# Masked
wget --timestamp \
ftp://ftp.jgi-psf.org/pub/JGI_data/Nematostella_vectensis/v1.0/assembly/Nemve1.allmasked.gz
faSize Nemve1.allmasked.gz
# 356613585 bases (59221913 N's 297391672 real 216553990 upper 80837682 lower) in 10804 sequences in 1 files
# Total size: mean 33007.6 sd 149445.1 min 626 (scaffold_6071) max 3256212 (scaffold_1) median 6708
# N count: mean 5481.5 sd 10915.3
# U count: mean 20043.9 sd 107371.3
# L count: mean 7482.2 sd 35162.7
# %22.67 masked total, %27.18 masked real

# Tandemly repeated
wget --timestamp \
ftp://ftp.jgi-psf.org/pub/JGI_data/Nematostella_vectensis/annotation/v1.0/NvTRjug.fasta.gz
faSize NvTRjug.fasta.gz
# 14414 bases (0 N's 14414 real 14414 upper 0 lower) in 10 sequences in 1 files
# Total size: mean 1441.4 sd 2104.8 min 191 (GTGTTTGTGGTGTTTTjuggernaut) max 7162 (AAAAAAAAATCGAACAjuggernaut) median 786
# N count: mean 0.0 sd 0.0
# U count: mean 1441.4 sd 2104.8
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

#######################################################################
# run the makeGenomeDb procedure to create the db and unmasked sequence
# (DONE - 2008-Jul-14 - Kord)

# create a new genomeClade?
# Nope - Jim doesn't want one-browser clades in the menu
# So, it's into "other" for Nematostella until Hydra magnipapillata is done

# generate a fake AGP for use in downstream tools
ssh kkstore06
cd /cluster/data/nemVec1/downloads
hgFakeAgp -minContigGap=50 Nemve1.fasta.gz nemVec1.agp

cat << _EOF_ > nemVec1.config.ra
# Config parameters for makeGenomeDb.pl:
db nemVec1 
# new clade? cnidarian 
clade other
genomeCladePriority 18
scientificName Nematostella vectensis
commonName Starlet sea anemone
assemblyDate Jun. 2007
assemblyLabel US DOE JGI-PGF Nemve1
orderKey 227
mitoAcc none
fastaFiles /cluster/data/nemVec1/downloads/Nemve1.fasta.gz
agpFiles   /cluster/data/nemVec1/downloads/nemVec1.agp
# qualFiles /dev/null
dbDbSpeciesDir other
_EOF_

# Run only through the AGP step first, to make sure things are sane.
# stderr is merged into stdout *before* the pipe so that tee records error
# messages too; a "2>&1" placed after tee only redirects tee's own stderr.
nice time -p makeGenomeDb.pl nemVec1.config.ra -stop agp 2>&1 \
    | tee -a jkStuff/makeGenomeDb.ajp

# now, continuing to make the Db and all
time -p nice -n +19 makeGenomeDb.pl nemVec1.config.ra -continue db \
    >jkStuff/makeGenomeDb.log 2>&1
# real 223.58
# user 0.20
# sys 0.16


cd /cluster/data/nemVec1/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/other/nemVec1
# Updated the html
# description.html /cluster/data/nemVec1/html/{trackDb.ra,gap.html,gold.html}

 cd ../.. 
 # (to trackDb/) and
# - edit makefile to add nemVec1 to DBS.
# - (if necessary) cvs add other
# - cvs add other/nemVec1
# - cvs add other/nemVec1/*.{ra,html}
# - cvs ci -m "Added nemVec1 to DBS." makefile
# - cvs ci -m "Initial descriptions for nemVec1." other/nemVec1
# - (if necessary) cvs ci other
# - Run make update DBS=nemVec1 and make alpha when done.
# - (optional) Clean up /cluster/data/nemVec1/TemporaryTrackDbCheckout
# -  cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.

#######################################################################
# REPEATMASKER (DONE - 2008-Jul-16 - Kord)

ssh kkstore06
screen
mkdir /cluster/data/nemVec1/bed/repeatMasker
cd /cluster/data/nemVec1/bed/repeatMasker
time -p nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \
    -buildDir=/cluster/data/nemVec1/bed/repeatMasker nemVec1 > do.log 2>&1 &

# versions from the do.log
# grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
#      Jun 13 2008 (open-3-2-5) version of RepeatMasker
# grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
# CC   RELEASE 20080611;  

ssh pk
cd /cluster/data/nemVec1/bed/repeatMasker/run.cluster
para time > para.time 2>&1
exit

ssh kkstore06
cd /cluster/data/nemVec1/bed/repeatMasker
cat run.cluster/para.time
# 879 jobs in batch
# 29017 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# Completed: 879 of 879 jobs
# CPU time in finished jobs:    3593738s   59895.63m   998.26h   41.59d  0.114 y
# IO & Wait Time:                 44003s     733.38m    12.22h    0.51d  0.001 y
# Average job time:                4138s      68.97m     1.15h    0.05d
# Longest finished job:            9437s     157.28m     2.62h    0.11d
# Submission to last job:         75500s    1258.33m    20.97h    0.87d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

# continue the script after fixing crashed and failed cluster jobs
date > run.cluster/run.time
time -p nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \
    -buildDir=/cluster/data/nemVec1/bed/repeatMasker \
    -continue cat nemVec1 > do-cat.log 2>&1 &

#  *** All done!
#  *** Steps were performed in /cluster/data/nemVec1/bed/repeatMasker
#  *** Please check the log file to see if hgLoadOut had warnings that we
#      should pass on to Arian and Robert.
#  *** Please also spot-check some repeats in the browser, run twoBitToFa on
#      a portion of nemVec1.2bit to make sure there's some masking, etc.

#######################################################################
# SIMPLE REPEATS (DONE - 2008-Jul-16 - Kord)
ssh kkstore06
screen
ssh memk parasol status
# CPUs free: 20
# CPUs busy: 0
# CPUs dead: 8
# Jobs running:  0
# Jobs waiting:  0
# Jobs finished: 6334
# Jobs crashed:  88
# Spokes free: 30
# Spokes busy: 0
# Spokes dead: 0
# Active batches: 0
# Total batches: 26
# Active users: 0
# Total users: 7
# Days up: 27.866007
# Version: 11
mkdir /cluster/data/nemVec1/bed/simpleRepeat
cd /cluster/data/nemVec1/bed/simpleRepeat
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=memk \
    -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do.log 2>&1 &
# generated a permissions error on mkdir /san/sanvol1
# So, I'll run it on kk with the new parasol version (12.08)
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kk \
    -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do.log 2>&1 &
#real    342m39.774s

wc -l /san/sanvol1/scratch/nemVec1/TrfPart/00*/00*.lst
#    24 /san/sanvol1/scratch/nemVec1/TrfPart/000/000.lst
#    40 /san/sanvol1/scratch/nemVec1/TrfPart/001/001.lst
#    62 /san/sanvol1/scratch/nemVec1/TrfPart/002/002.lst
#   102 /san/sanvol1/scratch/nemVec1/TrfPart/003/003.lst
#   201 /san/sanvol1/scratch/nemVec1/TrfPart/004/004.lst
#  1016 /san/sanvol1/scratch/nemVec1/TrfPart/005/005.lst
#  5428 /san/sanvol1/scratch/nemVec1/TrfPart/006/006.lst
#  3931 /san/sanvol1/scratch/nemVec1/TrfPart/007/007.lst
# 10804 total

ssh kk
cd /cluster/data/nemVec1/bed/simpleRepeat/run.cluster
para time > para.time 2>&1
exit
cd /cluster/data/nemVec1/bed/simpleRepeat
cat run.cluster/para.time
# 8 jobs in batch
# 2 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# Completed: 7 of 8 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      11930s     198.84m     3.31h    0.14d  0.000 y
# IO & Wait Time:                  9131s     152.18m     2.54h    0.11d  0.000 y
# Average job time:                3009s      50.15m     0.84h    0.03d
# Longest finished job:            9775s     162.92m     2.72h    0.11d
# Submission to last job:         20268s     337.80m     5.63h    0.23d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

ssh kk
cd /cluster/data/nemVec1/bed/simpleRepeat/run.cluster
para crashed
# 8 jobs in batch
# 2 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# ./TrfRun.csh /san/sanvol1/scratch/nemVec1/TrfPart/003/003.lst.bed

# generate new list
cp para.time para.time.orig
cp jobList jobList.orig
# remove the other jobs from the list and recreate batch
rm para.bookmark
para make jobList

# pick up where we left off...
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kk \
    -continue trf \
    -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do3.log 2>&1 &

ssh pk
cd /cluster/data/nemVec1/bed/simpleRepeat/run.cluster
para time
# 8 jobs in batch
# 0 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# Completed: 8 of 8 jobs
# CPU time in finished jobs:      13551s     225.86m     3.76h    0.16d  0.000 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                1692s      28.20m     0.47h    0.02d
# Longest finished job:            2353s      39.22m     0.65h    0.03d
# Submission to last job:          2357s      39.28m     0.65h    0.03d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

# pick up where we left off... the right way...
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kk \
    -continue filter \
    -buildDir=/cluster/data/nemVec1/bed/simpleRepeat nemVec1 > do4.log 2>&1 &


#######################################################################
# MASK SEQUENCE WITH RM+TRF (DONE - 2008-Jul-16 - Kord)

ssh kolossus
cd /cluster/data/nemVec1
twoBitMask nemVec1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed nemVec1.2bit
# ignoring extra BED column warning
twoBitToFa nemVec1.2bit stdout | faSize stdin
# 356613585 bases (59221913 N's 297391672 real 257024600 upper 40367072 lower) in 10804 sequences in 1 files
# Total size: mean 33007.6 sd 149445.1 min 626 (scaffold_6071) max 3256212 (scaffold_1) median 6708
# N count: mean 5481.5 sd 10915.3
# U count: mean 23789.8 sd 125342.3
# L count: mean 3736.3 sd 17397.9
# %11.32 masked total, %13.57 masked real

# set the symlink on hgwdev to /gbdb/nemVec1
ssh hgwdev
cd /gbdb/nemVec1
ln -s /cluster/data/nemVec1/nemVec1.2bit .

#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2008-Aug-01 - Kord)

ssh kkstore06
cd /cluster/data/nemVec1

# Human uses 1024
# nemVec1 is 0.357/3.1 = 0.115 human
# 0.115 * 1024 = 118 (fr2.txt uses 128)
blat nemVec1.2bit /dev/null /dev/null -tileSize=11 \
    -makeOoc=jkStuff/11.ooc -repMatch=128
# Wrote 15776 overused 11-mers to jkStuff/11.ooc

# As a note...
# A previous run with -repMatch=300 
# Wrote 1377 overused 11-mers to jkStuff/11.ooc

#   copy all of this stuff to the Iservers
ssh kkr1u00
mkdir -p /iscratch/i/nemVec1
cd /iscratch/i/nemVec1
cp -p /cluster/data/nemVec1/nemVec1.2bit .
cp -p /cluster/data/nemVec1/jkStuff/11.ooc .
cp -p /cluster/data/nemVec1/chrom.sizes .
exit
cp -p jkStuff/11.ooc /san/sanvol1/scratch/nemVec1

#######################################################################
# GENBANK AUTO-UPDATE ( DONE - 2008-Aug-11 - Kord )

# align with latest genbank process
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup

# edit etc/genbank.conf to add nemVec1 

# nemVec1 (Starlet Sea Anemone)
nemVec1.serverGenome = /cluster/data/nemVec1/nemVec1.2bit
nemVec1.clusterGenome = /san/sanvol1/scratch/nemVec1/nemVec1.2bit
nemVec1.ooc = /san/sanvol1/scratch/nemVec1/11.ooc
# pslCDnaFilter lines should have ordered instead of finished 
# (only human and mouse are finished)
nemVec1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
nemVec1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
nemVec1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
nemVec1.maxIntron = 100000
nemVec1.lift = no
# Per Mark: only load if >10^3 available
# check /cluster/data/genbank/data/organism.lst
# organism                mrnaCnt    estCnt  refSeqCnt
# Nematostella vectensis  147        163314  0
nemVec1.genbank.est.native.load = yes
nemVec1.genbank.est.xeno.load = no
nemVec1.genbank.mrna.native.load = no
nemVec1.genbank.mrna.xeno.load = yes 
nemVec1.refseq.mrna.xeno.load = yes
nemVec1.refseq.mrna.native.load = no
nemVec1.perChromTables = no
nemVec1.downloadDir = nemVec1

cvs com -m "Added nemVec1" etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update

ssh genbank
screen      #   use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 ./bin/gbAlignStep -initial nemVec1 &
# var/build/logs/2008.08.01-16\:40\:20.nemVec1.initalign.log
# genbank 2008.08.01-17:13:29 nemVec1.initalign: command: ./bin/gbAlignStep -initial nemVec1
# no species defined for database "nemVec1"; edit gbGenome.c to add definition

# update gbGenome.c
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank/
cvsup
vi src/lib/gbGenome.c
# added:
# static char *nemVecNames[] = {"Nematostella vectensis", NULL};
#     {"nemVec", nemVecNames},
# Commit using the path relative to the genbank/ directory we are in;
# the file edited above is src/lib/gbGenome.c.
cvs com -m "nemVec" src/lib/gbGenome.c
make install-server
exit

# delete previous failed run
rm -rf /cluster/data/genbank/build/work/initial.nemVec1/align

time nice -n +19 ./bin/gbAlignStep -initial nemVec1 &
# logFile: var/build/logs/2008.08.08-15:13:58.nemVec1.initalign.log
# real    51m43.328s
# user    37m44.638s
# sys     11m55.862s
# updated job database on disk
# total sick machines: 7 failures: 25
# Sick batch! Correct problem and then run para clearSickNodes.
# command failed: ssh -x pk cd /cluster/data/genbank/build/work/initial.nemVec1/align\; para make 
# -maxPush=200000 align.jobs </dev/null at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251.
# command failed: gbAlignRun -workdir=/cluster/data/genbank/build/work/initial.nemVec1/align  at /
# cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251.

ssh pk 
cd /cluster/genbank/genbank/build/work/initial.nemVec1/align
para check
# 41202 jobs in batch
# 21831 jobs (including everybody's) in Parasol queue or running.
# Sick Batch: consecutive crashes (32) >= sick batch threshold (25)
# Checking finished jobs
# unsubmitted jobs: 35835
# crashed: 32
# total jobs in batch: 41202
# total sick machines: 7 failures: 25
# Sick batch! Correct problem and then run para clearSickNodes.

# It looks like all are sick nodes
para problems | egrep failure | awk '{print $3}' | sort | uniq -c
# 32 crash
para try
# 41202 jobs in batch
# 21559 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# updated job database on disk
# Pushed Jobs: 10

# Can't open /iscratch/i/nemVec1/nemVec1.2bit to read: No such file or directory

para problem | egrep 2bit | egrep Can | wc -l
#  41202 jobs in batch
#  20652 jobs (including everybody's) in Parasol queue or running.
#  Checking finished jobs
#  52

ssh hgwdev
ls -l /iscratch/i/nemVec1/nemVec1.2bit
# -rw-rw-r--  1 kord protein 91277763 Jul 16 21:25 /iscratch/i/nemVec1/nemVec1.2bit

# rsync to the iservers - this takes a long time these days
# iservers not available?

# Using /san/sanvol1/scratch/nemVec1/
cp /cluster/data/nemVec1/nemVec1.2bit /san/sanvol1/scratch/nemVec1/
cp /cluster/data/nemVec1/jkStuff/11.ooc /san/sanvol1/scratch/nemVec1/
ssh kkr10u26.kilokluster.ucsc.edu "ls -l /san/sanvol1/scratch/nemVec1/"
# total 89708
# -rw-rw-r--  1 kord protein    63112 Aug 11 10:10 11.ooc
# drwxrwxr-x  4 kord protein    38912 Jul 16 10:29 RMPart
# drwxrwxr-x  4 kord protein     2048 Jul 16 21:01 TrfPart
# -rw-rw-r--  1 kord protein 91277763 Aug 11 10:10 nemVec1.2bit

# Updated kent/src/hg/makeDb/genbank/etc/genbank.conf with new paths (and updated
# doc above)
cd /cluster/home/kord/kent/src/hg/makeDb/genbank
vi etc/genbank.conf
cvs com -m "updated nemVec1 2bit and ooc path" etc/genbank.conf 
make etc-update

ssh genbank
screen
cd /cluster/data/genbank
egrep nemVec etc/genbank.conf
# nemVec1 (Starlet Sea Anemone)
# nemVec1.serverGenome = /cluster/data/nemVec1/nemVec1.2bit
# nemVec1.clusterGenome = /san/sanvol1/scratch/nemVec1/nemVec1.2bit
# nemVec1.ooc = /san/sanvol1/scratch/nemVec1/11.ooc
# ...
rm -rf /cluster/data/genbank/build/work/initial.nemVec1/align
time nice -n +19 ./bin/gbAlignStep -initial nemVec1 &
# logFile: var/build/logs/2008.08.11-10:15:07.nemVec1.initalign.log
# real    52m22.977s
# user    38m37.753s
# sys 12m28.625s
# updated job database on disk
# total sick machines: 3 failures: 10
# Sick batch! Correct problem and then run para clearSickNodes.
# command failed: ssh -x pk cd /cluster/data/genbank/build/work/initial.nemVec1/align\; para make -maxPush=200000 align.jobs </dev/null at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251.
# command failed: gbAlignRun -workdir=/cluster/data/genbank/build/work/initial.nemVec1/align  at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 251.
para problems
# nill
para clearSickNodes
# Told hub to clear sick nodes
para try
parasol list batches
#user     run   wait   done crash pri max cpu  ram  plan min batch
# kord        1      0      9    65  10  -1   1  2.0g    0   0 /cluster/genbank/genbank/build/work/initial.nemVec1/align/

para check
# 41526 jobs in batch
# 243011 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# unsubmitted jobs: 41516
# ranOk: 10
# total jobs in batch: 41526

para push
# 41526 jobs in batch
# 242964 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# ...

parasol list batches
#user     run   wait   done crash pri max cpu  ram  plan min batch
# kord      390  38499   2637    65  10  -1   1  2.0g  390   0 /cluster/genbank/genbank/build/work/initial.nemVec1/align/
para check
# 41526 jobs in batch
# 242557 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# ...
# ranOk: 41526
# total jobs in batch: 41526

# when the blat runs are done, finish up the gbAlignStep
time nice -n +19 ./bin/gbAlignStep -initial -continue=finish nemVec1 &
# logFile: var/build/logs/2008.08.11-11:55:31.nemVec1.initalign.log
# real    54m55.941s
# user    38m11.286s
# sys     15m36.247s
# genbank 2008.08.11-12:50:27 nemVec1.initalign: finish

ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad nemVec1
# logFile: var/dbload/hgwdev/logs/2008.08.11-13:08:47.dbload.log
# real    22m17.933s
# user    9m14.590s
# sys     3m20.555s

# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup 
# add nemVec1 to:
    etc/align.dbs
    etc/hgwdev.dbs
cvs ci -m "Added N. vectensis" \
    etc/align.dbs etc/hgwdev.dbs
make etc-update

#######################################################################
# make downloads (DONE 2008-Aug-11 Kord)

# verify all tracks have html pages for their trackDb entries
cd /cluster/data/nemVec1
ln -s /cluster/data/nemVec1/bed/repeatMasker/nemVec1.fa.out
nice time /cluster/bin/scripts/makeDownloads.pl nemVec1

#   Edit these files
#       /cluster/data/nemVec1/goldenPath/database/README.txt
#       /cluster/data/nemVec1/goldenPath/bigZips/README.txt

#######################################################################
# GENSCAN PREDICTIONS (DONE 2008-Aug-11 - Kord) 

ssh hgwdev
mkdir /cluster/data/nemVec1/bed/genscan
cd /cluster/data/nemVec1/bed/genscan

mkdir gtf pep subopt
cvs co hg3rdParty/genscanlinux

# generate hard-masked sequence
ssh kkstore06
cd /cluster/data/nemVec1/bed/genscan
zcat /cluster/data/nemVec1/goldenPath/bigZips/nemVec1.fa.gz \
| maskOutFa stdin hard nemVec1.hardmask.fa

# split into 2Mb files
mkdir split
cd split
# Run in the foreground: the scan below must not start until faSplit has
# finished writing every piece, otherwise genome.list can miss files.
faSplit about ../nemVec1.hardmask.fa 2000000 split

# generate file list and check that no files are completely masked
cd ..
# NUL-delimited find output keeps unusual file names intact.
find ./split -name "*fa" -print0 \
  | while IFS= read -r -d '' fa; do
      # Ignore FASTA header lines so a sequence name containing A/C/G/T
      # cannot make a fully masked file look unmasked.
      if grep -v '^>' -- "$fa" | grep -q '[ACGT]'; then
        printf '%s\n' "$fa" >> genome.list
      fi
    done
wc -l genome.list 
# 157 genome.list

# use kk
ssh kk
parasol status
# CPUs total: 62
# CPUs free: 62
# CPUs busy: 0
# Nodes total: 31
# ...
cd /cluster/data/nemVec1/bed/genscan
# Write the job template that is later passed to gensub2 as "gsub".
# The here-doc delimiter is quoted so the shell does not try to run
# $(path1) / $(root1) as command substitutions; they must reach the file
# verbatim as template placeholders.
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
_EOF_

# ended up using pk instead of kk
ssh pk
# The fresh login on pk starts in $HOME; go back to the genscan directory,
# where genome.list and gsub were written, before building the job list.
cd /cluster/data/nemVec1/bed/genscan
/parasol/bin/gensub2 genome.list single gsub jobList
para create jobList
# Checking input files
# .......
# 157 jobs written to /cluster/store3/nemVec1/bed/genscan/batch

para try
# 157 jobs in batch
# 218392 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# updated job database on disk
# Pushed Jobs: 10

para check
# 157 jobs in batch
# 218209 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# unsubmitted jobs: 147
# ranOk: 10
# total jobs in batch: 157

para push
# 157 jobs in batch
# 218193 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# .......
# updated job database on disk
# Pushed Jobs: 147

para check
# 157 jobs in batch
# 217861 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# ......................
# ranOk: 157
# total jobs in batch: 157

para time > para.time 2>&1
cat para.time 
# 157 jobs in batch
# 217819 jobs (including everybody's) in Parasol queue or running.
# Checking finished jobs
# Completed: 157 of 157 jobs
# CPU time in finished jobs:       8635s     143.92m     2.40h    0.10d  0.000 y
# IO & Wait Time:                   499s       8.31m     0.14h    0.01d  0.000 y
# Average job time:                  58s       0.97m     0.02h    0.00d
# Longest finished job:             101s       1.68m     0.03h    0.00d
# Submission to last job:           303s       5.05m     0.08h    0.00d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

# Concatenate
ssh kkstore06
cd /cluster/data/nemVec1/bed/genscan
cat gtf/*gtf > genscan.gtf
cat pep/*pep > genscan.pep
cat subopt/*.bed > genscanSubopt.bed

# Load into the database
ssh hgwdev
cd /cluster/data/nemVec1/bed/genscan
ldHgGene -gtf nemVec1 genscan genscan.gtf
# Reading genscan.gtf
# Read 30239 transcripts in 183271 lines in 1 files
#   30239 groups 7812 seqs 1 sources 1 feature types
# 30239 gene predictions

hgPepPred nemVec1 generic genscanPep genscan.pep
# Processing genscan.pep

hgLoadBed nemVec1 genscanSubopt genscanSubopt.bed
# Reading genscanSubopt.bed
# Loaded 131648 elements of size 6
# Sorted
# Creating table definition for genscanSubopt
# Saving bed.tab
# Loading nemVec1

featureBits nemVec1 genscan
# 43681106 bases of 297398104 (14.688%) in intersection
featureBits nemVec1 genscanSubopt
# 22102365 bases of 297398104 (7.432%) in intersection
featureBits nemVec1 genscan rmsk
# 630 bases of 297398104 (0.000%) in intersection


#######################################################################
# Updating the description with an image (DONE 2008-Aug-11 - Kord)

# checking out the browser CVS tree
ssh hgwdev
cvs co browser/images

# add the image
cd browser/images
mv ~/Nematostella_vectensis.jpg .
# The file now lives in this directory, so refer to it by its local name
# (the ~/ path no longer exists after the mv).  -kb marks it as binary so
# CVS does not apply keyword expansion or line-ending conversion to the JPEG.
cvs add -kb Nematostella_vectensis.jpg
cvs com Nematostella_vectensis.jpg

# update the description page
cd ~/kent/src/hg/makeDb/trackDb
# description.html for this assembly is under other/nemVec1/, not in
# trackDb/ itself, so commit it by its path relative to this directory.
cvs com other/nemVec1/description.html


