##################################################

#
#  danRer6 = Danio rerio Zv8   Dec. 2008
#
#  by Galt Barber 2009-06-17
#
# Danio Rerio (zebrafish) from Sanger, version Zv8 (released 2008-12-12)
#  Project website:
#    http://www.sanger.ac.uk/Projects/D_rerio/
#  Assembly notes:
#    http://www.sanger.ac.uk/Projects/D_rerio/
#    http://www.sanger.ac.uk/Projects/D_rerio/Zv8_assembly_information.shtml
###############################################
# set up main genome directory

    ssh hgwdev
    cd /hive/data/genomes
    mkdir danRer6
    cd danRer6


###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2009-06-17, galt)

    cd /hive/data/genomes/danRer6
    mkdir download
    cd download

    # get sequence from Sanger  
    # (See below, they later give us corrected files at another location)

    wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv8release'

    # matches ftp site sizes	
    ls -l --block-size=K
-rw-rw-r-- 1 galt protein       1K Apr  9 15:56 README
-rw-rw-r-- 1 galt protein    3999K Dec 12  2008 Zv8_chr.agp
-rw-rw-r-- 1 galt protein 1553865K Dec 12  2008 Zv8_contigs.fa
-rw-rw-r-- 1 galt protein    6475K Dec 12  2008 Zv8_scaffold.agp
-rw-rw-r-- 1 galt protein 1470539K Dec 12  2008 Zv8_scaffolds.fa
-rw-rw-r-- 1 galt protein 1471257K Apr  9 15:55 Zv8_toplevel.fa

    # Zv8_chr.agp maps contigs (in Zv8_contigs.fa) to chr1-25 
    # Oddly enough, the contig names are 
    # all in the form of scaffoldName.1, ... 
    # so that any original contig name is lost.

    # Zv8_scaffold.agp maps contigs to scaffolds, 
    #  producing Zv8_scaffolds.fa

    # Not entirely sure yet about Zv8_toplevel.fa
    #  but it seems to be more scaffolds only.
    # I think only the configs.fa is useful fasta.

    # There is nothing that maps the scaffolds to
    # the chroms except that they gave the contig names
    # like scaffold1.1 etc.  This connection via name 
    # parts is not official agp structure, it's a hack.

    # People want to see the detailed internal gap structure
    # of the scaffolds even inside their chroms, so that
    # creates a minor problem.

    # Additionally, we want to include the chroms for sure,
    # but we want to also include any other scaffolds not
    # already included in the chroms. I am going to write
    # a c program to merge the chrom and scaffold agps
    # so that we don't duplicate scaffolds that are in 
    # the chroms.  It will have to rely on the name 
    # hack they provide to connect scaffold to chrom.

    # This agp/fasta structure has existed at least since Zv7
    # and I don't know if Sanger or others use it more 
    # generally.

    # I created a new program agpSangerUnfinished in kent/src/hg
    # because Sanger wants to use unfinished clones but can't
    # officially report it that way, so they hack their agp file
    # but they don't change contigs.fa to match.
    # compile it
    cd kent/src/hg/agpSangerUnfinished
    make
    rehash

    cd /hive/data/genomes/danRer6/download

    # fix unfinished names and coordinates
    agpSangerUnfinished Zv8_chr.agp Zv8_contigs.fa Zv8_chr.fix.agp
    agpSangerUnfinished Zv8_scaffold.agp Zv8_contigs.fa Zv8_scaffold.fix.agp

    # I created a new program agpMergeChromScaf in kent/src/hg
    # as mentioned above for merging scaffold data 
    # nog alreadhromScaf Zv8_
    # compile it
    cd kent/src/hg/agpMergeChromScaf
    make
    rehash

    cd /hive/data/genomes/danRer6/download
    agpMergeChromScaf Zv8_chr.fix.agp Zv8_scaffold.fix.agp danRer6.agp

    # Now make final fasta from fixed/merged agp
    agpAllToFaFile danRer6.agp Zv8_contigs.fa danRer6.fa

    # check it out!
    faToTwoBit danRer6.fa danRer6.2bit
    checkAgpAndFa danRer6.agp danRer6.2bit >& checkAgpAndFa.log
    tail -1 checkAgpAndFa.log
#All AGP and FASTA entries agree - both files are valid

-------------------------------------------------------------------
#RE-DOING with their corrected clone names:  (done 2009-06-24 galt)


ftp://ftp.sanger.ac.uk/pub/zfish/jt8/rt

"
I should stress that the sequence content of the FASTA file has not changed
from the versions you
already have; I have just altered the headers for the first set of clones
mentioned in the previous
email. Only the contig-level FASTA file is affected; the scaffold-level and
top-level FASTA files are
not concerned with the names of clones, so they remain as before.
"

    # move old version out of the way
    cd /hive/data/genomes/danRer6
    mv download download0
    mkdir download
    cd download

    # get sequence from Sanger
    # James Torrance <jt8@sanger.ac.uk> 
    # created this version of Zv8 with fixed clone-ids.
    wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.sanger.ac.uk/pub/zfish/jt8/rt'

[hgwdev:download> ll
-rw-rw-r-- 1 galt protein   1051781 Jun 23 17:23 Zv8_chr.agp.gz
-rw-rw-r-- 1 galt protein 468741052 Jun 23 22:43 Zv8_contigs.fa.gz
-rw-rw-r-- 1 galt protein   1683358 Jun 23 17:23 Zv8_scaffold.agp.gz
-rw-rw-r-- 1 galt protein       155 Jun 23 22:44 md5sum.dat

[hgwdev:download> cat md5sum.dat 
1bb31f1f3290038066f74680a854ff69  Zv8_chr.agp.gz
c4cc4c9574721ce57fde0c79357a4a13  Zv8_contigs.fa.gz
fc4f9261b77fd177096f0cc9c8718a8d  Zv8_scaffold.agp.gz

[hgwdev:download> md5sum *.gz
1bb31f1f3290038066f74680a854ff69  Zv8_chr.agp.gz
c4cc4c9574721ce57fde0c79357a4a13  Zv8_contigs.fa.gz
fc4f9261b77fd177096f0cc9c8718a8d  Zv8_scaffold.agp.gz
# they match! no corruption during transfer


    # decompress
    gunzip *.gz

[hgwdev:download> ll
-rw-rw-r-- 1 galt protein    4094225 Jun 23 17:23 Zv8_chr.agp
-rw-rw-r-- 1 galt protein 1591157262 Jun 23 22:43 Zv8_contigs.fa
-rw-rw-r-- 1 galt protein    6630120 Jun 23 17:23 Zv8_scaffold.agp

    # repeat all of the above steps from the first section
    # (after the wget) down to where it says
#All AGP and FASTA entries agree - both files are valid


#######################################################################
# MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS 
# (DONE, 2009-06-24, galt)
# CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION 
# AS FOR DANRER5, GENE vdac2 (DONE, 2007-09-10, hartera)

    # To find the accession number for chrM, go to http://www.ncbi.nih.gov/ and search
    # Nucleotide for "Danio mitochondrion genome". 
    # That shows accession: NC_002333.2, 16596 bp
    # wget for mitochondron genome so:
    # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=NC_002333.2&retmode=text
    ssh hgwdev
    cd /hive/data/genomes/danRer6/
    cat << 'EOF' > danRer6.config.ra
db danRer6
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Dec. 2008
assemblyLabel Wellcome Trust Sanger Institute, Zv8 assembly (CAAK00000000.5)
orderKey 448
genomeCladePriority 90
dbDbSpeciesDir zebrafish
mitoAcc NC_002333.2
agpFiles /hive/data/genomes/danRer6/download/danRer6.agp
fastaFiles /hive/data/genomes/danRer6/download/danRer6.fa
taxId 7955
'EOF'    
 # << keep emacs coloring happy

    ssh hgwdev
    cd /hive/data/genomes/danRer6/

    # makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
    # entries and loads the gold and gap tables. Creates the GC
    # percent, short match and restriction enzyme tracks. It also creates
    # a danRer6 directory in the trackDb/zebrafish directory with a trackDb.ra 
    # file and a template for the assembly descripton, and for the 
    # gold and gap track tracks 

    # Run makeGenomeDb.pl
    makeGenomeDb.pl danRer6.config.ra > & makeGenomeDb.pl.out &

    # Follow the directions at the end of the log 
    tail -20 makeGenomeDb.log

#----
#NOTES -- STUFF THAT YOU WILL HAVE TO DO --
#cd /hive/data/genomes/danRer6/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebrafish/danRer6
#
#Search for '***' notes in each file in and make corrections (sometimes the
#files used for a previous assembly might make a better template):
#  description.html /hive/data/genomes/danRer6/html/{trackDb.ra,gap.html,gold.html}
#
#Then cd ../.. (to trackDb/) and
# - edit makefile to add danRer6 to DBS.
# - (if necessary) cvs add zebrafish
# - cvs add zebrafish/danRer6
# - cvs add zebrafish/danRer6/*.{ra,html}
# - cvs ci -m "Added danRer6 to DBS." makefile
# - cvs ci -m "Initial descriptions for danRer6." zebrafish/danRer6
# - (if necessary) cvs ci zebrafish
# - Run make update DBS=danRer6 and make alpha when done.
# - (optional) Clean up /hive/data/genomes/danRer6/TemporaryTrackDbCheckout
# - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
#----


    cd /hive/data/genomes/danRer6/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebrafish/danRer6
    # save in case
    mkdir orig
    cp * orig/
    # start with version from danRer5
    cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/description.html .
    cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/gap.html .
    cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/gold.html .
    # edit them for danRer6
    vi description.html
    vi gap.html
    vi gold.html

    cd ../.. 
    vi makefile    # - edit makefile to add danRer6 to DBS.
    cvs add zebrafish/danRer6
    cvs add zebrafish/danRer6/*.{ra,html}
    cvs ci -m "Added danRer6 to DBS." makefile
    cvs ci -m "Initial descriptions for danRer6." zebrafish/danRer6
    make update DBS=danRer6
    make alpha  DBS=danRer6
    
   
    # (2009-06-24, galt), Change the default Browser position in dbDb
    # so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for 
    # danRer5.
    ssh hgwdev
    hgsql -h genome-testdb \
 	-e 'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" where name = "danRer6";' \
	hgcentraltest


    # temporarily add in a 2bit file so we can run the browser:
    ln -s `pwd`/danRer6.unmasked.2bit /gbdb/danRer6/danRer6.2bit 
    

###########################################################
# Repeatmasking  (DONE 2009-06-26 galt)
# 
#

    ssh hgwdev
    cd /hive/data/genomes/danRer6/

    screen

    # repeat mask
    doRepeatMasker.pl danRer6 > & doRepeatMasker.pl.out &

    # about 12 hours.

#/cluster/bin/scripts/extractNestedRepeats.pl danRer6.fa.out
#RepeatMasker bug?: Undefined id, line 330298 of input:
#  261  27.7  3.1  1.2  Zv8_NA4692   12290   12450   (10778) C ERV1-N3-I_DR-int LTR (824) 3767   3604
#At least one ID was missing (see warnings above) -- please report to Robert
#Hubley.  -continue at your disgression.

    # Seems to only be one line that is a problem, and it may even have
    # been duplicated, so remove the problem line from danRer6.fa.out
#261  27.7  3.1  1.2  Zv8_NA4692   12290   12450   (10778) C ERV1-N3-I_DR-int LTR                  (824) 3767   3604
#267  24.9  1.8  1.2  Zv8_NA4692   12300   12313   (10915) C ERV1-N3-I_DR-int LTR                  (824) 3722   3604 298373
    # keep the second of these lines, remove the first (line 330298) which is missing the last column.

    # since danRer6.nestedRepeats.bed did get created,
    #  and doCat.csh appears to have finished,
    # I am going to ignore this error and continue

    doRepeatMasker.pl -continue mask  \
      -buildDir /hive/data/genomes/danRer6/bed/RepeatMasker.2009-06-26/ danRer6 \
      >> & doRepeatMasker.pl.out &

    cat bed/RepeatMasker.2009-06-26/faSize.rmsk.txt 
    #1512402306 bases (5525484 N's 1506876822 real 729112148 upper 777764674 lower)
    #in 11724 sequences in 1 files


    [hgwdev:danRer6> featureBits -countGaps danRer6 rmsk
    #777997590 bases of 1512402306 (51.441%) in intersection

    # simple repeat masker trf
    doSimpleRepeat.pl danRer6 > & doSimpleRepeat.pl.out &
    #[1]    Exit 25   doSimpleRepeat.pl danRer6 >& doSimpleRepeat.pl.out
    #oops messed up on chrM producing no output, liftUp produced nothing
    #this is not a problem, and the bug is in the automated process,
    #they should either tolerate nothing to do and create an empty output,
    #or just filter chrM out of the input.

    doSimpleRepeat.pl -continue filter  \
      -buildDir /hive/data/genomes/danRer6/bed/simpleRepeat.2009-06-29/ danRer6 \
      >> & doSimpleRepeat.pl.out &

    featureBits -countGaps danRer6 simpleRepeat
    #111749881 bases of 1512402306 (7.389%) in intersection

    # make final masked .2bit
    twoBitMask danRer6.rmsk.2bit -add bed/simpleRepeat.2009-06-29/trfMask.bed danRer6.2bit
    #Warning: BED file bed/simpleRepeat.2009-06-29/trfMask.bed has >=13 fields
    #which means it might contain block coordinates, but this program uses only the
    #first three fields (the entire span -- no support for blocks).
    #(this seems to be normal/ok)

############################################################################
#  prepare cluster data (DONE - 2009-07-06 - Galt)
    
    ssh hgwdev
    cd /hive/data/genomes/danRer6

    # create gbdb symlink
    rm /gbdb/danRer6/danRer6.2bit
    ln -s `pwd`/danRer6.2bit /gbdb/danRer6/

    calc '1024*1.14/2.88'
    #405

    blat danRer6.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=405
    #Wrote 71088 overused 11-mers to 11.ooc

    mkdir /hive/data/staging/data/danRer6
    cp -p danRer6.2bit /hive/data/staging/data/danRer6/
    cp -p 11.ooc /hive/data/staging/data/danRer6/
    cp -p chrom.sizes /hive/data/staging/data/danRer6/

    # ask admin to sync this directory: /hive/data/staging/data/danRer6/
    #       to the kluster nodes /scratch/data/danRer6/

############################################################################
# running cpgIsland (DONE - 2009-07-07 - Galt)
    
    ssh hgwdev
    cd /hive/data/genomes/danRer6

    mkdir bed/cpgIsland
    cd bed/cpgIsland
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    # comment out the following two lines if it compiles cleanly
    # some day  (there were some other fixups too, adding include lines)
    sed -i -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c
    make
    #warning: incompatible implicit declaration of built-in function
    # ignore the warnings
    cd ../../ 
    ln -s hg3rdParty/cpgIslands/cpglh.exe

    # make hardmasked fasta files
    mkdir -p hardMaskedFa
    bash
    cut -f1 ../../chrom.sizes | while read C
    do
    echo ${C}
    twoBitToFa ../../danRer6.2bit:$C stdout \
	| maskOutFa stdin hard hardMaskedFa/${C}.fa
    done

    #exit bash 
    exit

    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
    #LOOP
    ./runOne $(root1) {check out line results/$(root1).cpg}
    #ENDLOOP
    '_EOF_'
    # << happy emacs

    cat << '_EOF_' > runOne
    #!/bin/csh -fe
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
    '_EOF_'
    # << happy emacs

    chmod +x runOne
    mkdir results

    pk
    cd /hive/data/genomes/danRer6/bed/cpgIsland

    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check
    para push
    para time
    #Completed: 11724 of 11724 jobs
    #CPU time in finished jobs:        111s       1.85m     0.03h    0.00d  0.000 y
    #IO & Wait Time:                 73830s    1230.50m    20.51h    0.85d  0.002 y
    #Average job time:                   6s       0.11m     0.00h    0.00d
    #Longest finished job:              41s       0.68m     0.01h    0.00d
    #Submission to last job:          1141s      19.02m     0.32h    0.01d

    exit # return to hgwdev
    # should be in /hive/data/genomes/danRer6/bed/cpgIsland

    # Transform cpglh output to bed +
    catDir results | awk '{ \
    $2 = $2 - 1; \
    width = $3 - $2; \
    printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", \
       $1, $2, $3, $5,$6, width, \
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); \
    }' > cpgIsland.bed

    # took around 2 minutes

    hgLoadBed danRer6 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed

    #Loaded 13361 elements of size 10

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-07-09 braney )
    # bash  if not using bash shell already

    cd /cluster/data/danRer6
    mkdir /cluster/data/danRer6/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  danRer6.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  danRer6.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y 
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 1513

    mkdir -p /cluster/data/danRer6/bed/tblastn.hg18KG
    cd /cluster/data/danRer6/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 1513 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)

# 36727/(250000/1513) = 222.271804

   mkdir -p kgfa
   split -l 222 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   wc kg.lst
# 166  166 2158 kg.lst

   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/danRer6/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer6/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then                  
            mv $3.tmp $3     
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0               
    fi                      
fi                         
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    exit 
    
    ssh swarm
    cd /cluster/data/danRer6/bed/tblastn.hg18KG
    gensub2 query.lst kg.lst blastGsub blastSpec
    para create blastSpec
#    para try, check, push, check etc.

    para time
# Completed: 251158 of 251158 jobs
# CPU time in finished jobs:    8587545s  143125.76m  2385.43h   99.39d  0.272 y
# IO & Wait Time:                885281s   14754.68m   245.91h   10.25d  0.028 y
# Average job time:                  38s       0.63m     0.01h    0.00d
# Longest finished job:            1167s      19.45m     0.32h    0.01d
# Submission to last job:          9852s     164.20m     2.74h    0.11d

    ssh swarm
    cd /cluster/data/danRer6/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 166 of 166 jobs
# CPU time in finished jobs:     107801s    1796.69m    29.94h    1.25d  0.003 y
# IO & Wait Time:                 23945s     399.08m     6.65h    0.28d  0.001 y
# Average job time:                 794s      13.23m     0.22h    0.01d
# Longest finished job:           25056s     417.60m     6.96h    0.29d
# Submission to last job:         25085s     418.08m     6.97h    0.29d


    cd /cluster/data/danRer6/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 43305 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /cluster/data/danRer6/bed/tblastn.hg18KG
    hgLoadPsl danRer6 blastHg18KG.psl

    # check coverage
    featureBits danRer6 blastHg18KG 
# 19115900 bases of 1506896106 (1.269%) in intersection

    featureBits danRer6 blastHg18KG refGene  -enrichment
# blastHg18KG 1.269%, refGene 1.827%, both 0.659%, cover 51.97%, enrich 28.45x

    rm -rf blastOut
#end tblastn

############################################################################
# create lift file on unBridged gaps for genbank splits (2009-07-07 - Hiram)
    mkdir /hive/data/genomes/danRer6/bed/gap
    cd /hive/data/genomes/danRer6/bed/gap
    gapToLift danRer6 danRer6.unBridged.lift -bedFile=unBridged.lift.bed
    cp -p danRer6.unBridged.lift ../../jkStuff/
    cp -p danRer6.unBridged.lift /hive/data/staging/data/danRer6/

    # ask admin to sync this directory: /hive/data/staging/data/danRer6/
    #       to the kluster nodes /scratch/data/danRer6/

###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE, 2009-07-07, galt)
    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    cvs update -dP
    # edit etc/genbank.conf to add danRer6 and commit to CVS
    # done already for first run so no need to update.
# danRer6 (zebrafish)
danRer6.serverGenome = /hive/data/genomes/danRer6/danRer6.2bit
danRer6.clusterGenome = /scratch/data/danRer6/danRer6.2bit
danRer6.ooc = /scratch/data/danRer6/11.ooc
danRer6.lift = /hive/data/genomes/danRer6/jkStuff/danRer6.unBridged.lift
danRer6.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer6.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer6.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer6.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer6.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
danRer6.downloadDir = danRer6
danRer6.perChromTables = no
danRer6.refseq.mrna.xeno.load  = yes

    # end of section added to etc/genbank.conf
    cvs commit -m "Added danRer6." etc/genbank.conf
    # also added the perChromTables line afterwards. This means that there
    # will not be one table per chromosome. Would be too many tables as 
    # there are 5010 scaffolds.
    # update /cluster/data/genbank/
    make etc-update

    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # danRer genome information

    ssh genbank
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial danRer6 &

    #var/build/logs/2009.07.07-20:23:34.danRer6.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad danRer6 &

    #var/dbload/hgwdev/logs/2009.07.08-09:59:18.dbload.log

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add danRer6 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added danRer6." etc/align.dbs etc/hgwdev.dbs
    make etc-update


#########################################################################
#  BLATSERVERS ENTRY (DONE - 2009-07-08 - Galt)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("danRer6", "blat14", "17790", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("danRer6", "blat14", "17791", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# Making download files (DONE - 2009-07-08 - Galt)

    ssh hgwdev
    cd /hive/data/genomes/danRer6

    ln -s bed/RepeatMasker.2009-06-26/danRer6.fa.out .
    cd bed
    ln -s simpleRepeat.2009-06-29 simpleRepeat
    cd ..

    makeDownloads.pl danRer6 >& downloads.log

# *** Please take a look at the downloads for danRer6 using a web browser.
# *** The downloads url is: http://hgwdev.soe.ucsc.edu/goldenPath/danRer6. 
# *** Edit each README.txt to resolve any notes marked with "***":
#     /hive/data/genomes/danRer6/goldenPath/database/README.txt
#     /hive/data/genomes/danRer6/goldenPath/bigZips/README.txt
#     (The htdocs/goldenPath/danRer6/*/README.txt "files" are just links to those.)

############################################################################
# Make Self Chain (DONE - 2008-07-23)

    ssh hgwdev

    mkdir /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30
    cd /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30

    twoBitInfo ../../danRer6.2bit stdout | grep chr > chromOnly.len
    
    cat << '_EOF_' > DEF
# zebrafish vs self
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
# lastz does not like the O= and E= lines in the matrix file
#       this copy has that removed from /scratch/data/scratch/human_chimp.v2.q
BLASTZ_Q=/hive/data/genomes/hg19/bed/lastzHg19Haps.2009-03-09/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
# other parameters from hg18 vs venter1 lastz on advice from Webb
BLASTZ_K=10000
BLASTZ_Y=15000
BLASTZ_T=2

BASE=/hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30

# TARGET: Zebrafish danRer6
SEQ1_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ1_LEN=$BASE/chromOnly.len
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Zebrafish danRer6
SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ2_LEN=$BASE/chromOnly.len
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

TMPDIR=/scratch/tmp
'_EOF_'

    screen # use screen to manage this long-running job
    cd /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30
    time nice doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
        -workhorse=hgwdev \
        -stop=net -smallClusterHub=memk -bigClusterHub=swarm >& do.log &
    # This took 26 hours to finish.

    time nice doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
        -continue=load -debug -workhorse=hgwdev \
        -stop=load -smallClusterHub=memk -bigClusterHub=swarm >& load.debug.log &

    # load.debug.log indicates load step would do
    #  axtChain/loadUp.csh which would do:
    #   hgLoadChain -tIndex danRer6 chainSelf danRer6.danRer6.all.chain.gz

    # Manually include the -normScore parameter for self-chains.
    cd /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30/axtChain
    hgLoadChain -normScore -tIndex danRer6 chainSelf danRer6.danRer6.all.chain.gz

    # FYI For self-chains, we don't need to do the other steps 
    # that doBlastzChainNet.pl would normally do.

    # add to zebrafish/danRer6/trackDb.ra
track chainSelf
shortLabel Self Chain
longLabel Self Chained Alignments
group varRep
chromosomes
chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25,chrM
priority 400
visibility hide
color 100,50,0
altColor 255,240,200
chainColor Normalized Score
spectrum on
type chain danRer6
otherDb danRer6

    # Finish chainSelf downloads
    cd /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30/axtChain
    md5sum danRer6.danRer6.all.chain.gz >> ../md5sum.txt
    cd ..
    cp /usr/local/apache/htdocs/goldenPath/hg19/vsSelf/README.txt .
    vi README.txt
    mkdir /usr/local/apache/htdocs/goldenPath/danRer6/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/danRer6/vsSelf
    ln -s /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30/axtChain/danRer6.danRer6.all.chain.gz .
    ln -s /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30/README.txt .
    ln -s /hive/data/genomes/danRer6/bed/lastzSelf.2009-07-30/md5sum.txt .

#######################################################################
# Finish danRer6.2bit in downloads (done 2009-07-31)
#
#   NOTE this should not need to be done in future
#   because Hiram added the functionality to makeDownloads.pl
#
    ssh hgwdev
    cd /hive/data/genomes/danRer6/
    ln -s /hive/data/genomes/danRer6/danRer6.2bit /usr/local/apache/htdocs/goldenPath/danRer6/bigZips/
    vi README
    md5sum danRer6.2bit >> /usr/local/apache/htdocs/goldenPath/danRer6/bigZips/md5sum.txt

#######################################################################
# LASTZ/CHAIN/NET swap tetNig2 (DONE - 2009-09-15,18 - Hiram)
    # original alignment to tetNig2
    cd /hive/data/genomes/tetNig2/bed/lastzDanRer6.2009-09-15
    cat fb.tetNig2.chainDanRer6Link.txt 
    #	70626082 bases of 302314788 (23.362%) in intersection

    #	and the swap to here, danRer6:
    mkdir /hive/data/genomes/danRer6/bed/blastz.tetNig2.swap
    cd /hive/data/genomes/danRer6/bed/blastz.tetNig2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/tetNig2/bed/lastzDanRer6.2009-09-15/DEF \
	-swap -tRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    26m40.310s
    cat fb.danRer6.chainTetNig2Link.txt 
    #	93349443 bases of 1506896106 (6.195%) in intersection

#######################################################################
# LASTZ/CHAIN/NET swap oryLat2 (DONE - 2009-09-24 - Hiram)
    #	the first lastz
    cd /hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13
    cat fb.oryLat2.chainDanRer6Link.txt
    #	123547945 bases of 700386597 (17.640%) in intersection

    #	and this swap
    mkdir /hive/data/genomes/danRer6/bed/blastz.oryLat2.swap
    cd /hive/data/genomes/danRer6/bed/blastz.oryLat2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-noLoadChainSplit -tRepeats=windowmaskerSdust \
	-smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    53m31.962s
    cat fb.danRer6.chainOryLat2Link.txt 
    #	147597031 bases of 1506896106 (9.795%) in intersection

#######################################################################
# Creating the pushQ entry (DONE - 2009-10-06 - Galt)
    mkdir /hive/data/genomes/danRer6/pushQ
    cd /hive/data/genomes/danRer6/pushQ
    makePushQSql.pl danRer6 > danRer6.pushQ.sql
    #WARNING: danRer6 does not have seq
    #WARNING: danRer6 does not have extFile
    #WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
    #supporting and genbank tables) which tracks to assign these tables to:
    #  ensGtp
    #  ensPep

    ssh hgwbeta hgsql -h mysqlbeta qapushq < /hive/data/genomes/danRer6/pushQ/danRer6.pushQ.sql

    # Then, look at file(s) named in each of the following
    # wiggle tables and added these to the files section of qapushq.danRer6
    # queue table:
          hgsql danRer6 -e 'select distinct(file) from gc5Base'

    # Inside the pushQ application, I manually added 
    # ensGtp and ensPep to the ensGene track tables
    # /gbdb/danRer6/wib/gc5Base.wib to gc5Base files

    # added danRer6 to all.joiner both $gbd and $chainDest




#########################################################################
# LASTZ Mouse mm9 (DONE - 2009-12-17 - Galt)
    mkdir /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17
    cd /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17

    cat << '_EOF_' > DEF
# zebrafish vs. mouse
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer6
SEQ1_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ1_LEN=/scratch/data/danRer6/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=0
SEQ1_LIMIT=40

# QUERY: Mouse mm9
SEQ2_DIR=/scratch/data/mm9/mm9.2bit
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LIMIT=5

BASE=/hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	>& do.log &

    cat fb.danRer6.chainMm9Link.txt 
    #   77099032 bases of 1506896106 (5.116%) in intersection

#########################################################################
# LASTZ Stickleback gasAcu1 (DONE - 2009-12-21 - Galt)
    mkdir /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21
    cd /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21

    cat << '_EOF_' > DEF
# zebrafish vs. stickleback
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer6
SEQ1_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ1_LEN=/scratch/data/danRer6/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=0
SEQ1_LIMIT=40

# QUERY: Stickleback gasAcu1
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	>& do.log &

    cat fb.danRer6.chainGasAcu1Link.txt 
    #   147479454 bases of 1506896106 (9.787%) in intersection

#########################################################################
# LASTZ Xenopus xenTro2 (DONE - 2009-12-22 - Galt)
    mkdir /hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22
    cd /hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22

    cat << '_EOF_' > DEF
# zebrafish vs. xenopus
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish danRer6
SEQ1_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ1_LEN=/scratch/data/danRer6/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=0
SEQ1_LIMIT=40

# QUERY: Xenopus xenTro2
SEQ2_DIR=/scratch/data/xenTro2/xenTro2.2bit
SEQ2_LEN=/scratch/data/xenTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LIMIT=50

BASE=/hive/data/genomes/danRer6/bed/lastzXenTro2.2009-12-22
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	>& do.log &

    cat fb.danRer6.chainXenTro2Link.txt 
    #   100078259 bases of 1506896106 (6.641%) in intersection

############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13


##########################################################################
# MAKE PER-CHROM FASTA DOWNLOADS (DONE 2010-01-25 galt)
    ssh hgwdev
    mkdir /cluster/data/danRer6/goldenPath/chromosomes
    cd /cluster/data/danRer6/goldenPath/chromosomes
    awk '$1 !~ /^chr/ {print $1;}' ../../chrom.sizes > scafs.lst
    awk '$1 ~ /^chr/ {print $1;}' ../../chrom.sizes > chroms.lst
    foreach chr (`cat chroms.lst`)
      twoBitToFa ../../danRer6.2bit stdout -seq=$chr \
      | gzip -c > $chr.fa.gz
    end
    twoBitToFa ../../danRer6.2bit stdout -seqList=scafs.lst \
    | gzip -c > unmappedScaffolds.fa.gz
    rm chroms.lst scafs.lst
    md5sum *.gz > md5sum.txt
    # Make README.txt
    ln -s /cluster/data/danRer6/goldenPath/chromosomes \
      /usr/local/apache/htdocs/goldenPath/danRer6/chromosomes


#########################################################################
## 6-Way Multiz (DONE - 2010-02-01 - Galt)
    ssh hgwdev
    mkdir /cluster/data/danRer6/bed/multiz6way
    cd /cluster/data/danRer6/bed/multiz6way

    wget -O 46way.nh http://hgdownload.soe.ucsc.edu/goldenPath/hg19/multiz46way/46way.corrected.nh

# ((Zebrafish,
# (Stickleback,
# Tetraodon)),
# (X_tropicalis,
# (Mouse,
# Human))) 


    #	All distances remain as specified in the 46way.nh
    /cluster/bin/phast/tree_doctor --prune-all-but danRer6,gasAcu1,tetNig2,xenTro2,mm9,hg19 46way.nh > 6way.nh

#(((hg19:0.131183,mm9:0.352605):0.525173,xenTro2:0.852482):0.300396,((tetNig2:0.416610,gasAcu1:0.372371):0.322824,danRer6:0.731166):0.155214);

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for /usr/local/apache/htdocs/images/phylo/danRer6_6way.gif
    #   See below, I redo this on species names.
    wget -O /usr/local/apache/htdocs/images/phylo/danRer6_6way.gif \
'http://genome.ucsc.edu/cgi-bin/phyloGif?hgsid=151159276&phyloGif_width=200&phyloGif_height=320&boolshad.phyloGif_branchLengths=0&boolshad.phyloGif_lengthLegend=0&boolshad.phyloGif_branchLabels=0&phyloGif_branchDecimals=2&phyloGif_branchMultiplier=1&phyloGif_underscores=on&boolshad.phyloGif_underscores=0&boolshad.phyloGif_htmlPage=0&phyloGif_tree=(((Human%3A0.131183%2CMouse%3A0.352605)%3A0.525173%2CX.tropicalis%3A0.852482)%3A0.300396%2C((Tetraodon%3A0.416610%2CStickleback%3A0.372371)%3A0.322824%2CZebrafish%3A0.731166)%3A0.155214)%3B%0D%0A&phyloGif_submit=submit'

    /cluster/bin/phast/all_dists 6way.nh > 6way.distances.txt
    #	Use this output to create the table below
    grep -i danRer6 6way.distances.txt | sort -k3,3n

#gasAcu1 danRer6 1.426361
#tetNig2 danRer6 1.470600
#hg19    danRer6 1.843132
#xenTro2 danRer6 2.039258
#mm9     danRer6 2.064554


# hiram says these should already have been created 
# and are in the build dirs already.

featureBits danRer6 chainGasAcu1Link
#147479454 bases of 1506896106 (9.787%) in intersection
featureBits gasAcu1 chainDanRer6Link
#114451338 bases of 446627861 (25.626%) in intersection

featureBits danRer6 chainTetNig2Link
#93349443 bases of 1506896106 (6.195%) in intersection
featureBits tetNig2 chainDanRer6Link
#70626082 bases of 302314788 (23.362%) in intersection

featureBits danRer6 chainHg19Link
#96424507 bases of 1506896106 (6.399%) in intersection
featureBits hg19 chainDanRer6Link
#88391631 bases of 2897316137 (3.051%) in intersection

featureBits danRer6 chainXenTro2Link
#100078259 bases of 1506896106 (6.641%) in intersection
featureBits xenTro2 chainDanRer6Link
#92089833 bases of 1359412157 (6.774%) in intersection

featureBits danRer6 chainMm9Link
#77099032 bases of 1506896106 (5.116%) in intersection
featureBits mm9 chainDanRer6Link
#73444297 bases of 2620346127 (2.803%) in intersection

#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#

#                         featureBits chainLink measures
#                                           chainDanRer6Link  chain linearGap
#    distance                    on danRer6  on other     minScore
#  1  1.43 - stickleback gasAcu1   (% 9.79) (% 25.63)       2000     loose
#  2  1.47 - tetraodon tetNig2     (% 6.20) (% 23.36)       2000     loose
#  3  1.84 - human hg19            (% 6.40) (%  3.05)       2000     loose
#  4  2.04 - xenopus xenTro2       (% 6.64) (%  6.77)       2000     loose
#  5  2.06 - mouse mm9             (% 5.12) (%  2.80)       2000     loose

# None of this concern for distances matters in building the first step, the
# maf files.

    # fix missing symlinks
    cd /hive/data/genomes/danRer6/bed/
    ln -s lastzMm9.2009-12-17/ lastz.mm9
    ln -s lastzXenTro2.2009-12-22/ lastz.xenTro2
    ln -s blastz.hg19.swap/ lastz.hg19
    ln -s blastz.tetNig2.swap/ lastz.tetNig2

    cd /hive/data/genomes/danRer6/bed/multiz6way

    #	bash shell syntax here ...
    export H=/cluster/data/danRer6/bed
    mkdir mafLinks
    for G in gasAcu1 tetNig2 hg19 xenTro2 mm9
    do
	mkdir mafLinks/$G
	if [ ! -d ${H}/lastz.${G}/mafNet ]; then
	echo "missing directory lastz.${G}/mafNet"
		exit 255
	fi
	ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    # location for cluster run
    #/hive/data/genomes/danRer6/bed/multiz6way/mafLinks/ 

    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	6way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst


    # this one is simple enough that it doesn't need a cluster run.

    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = danRer6
set binDir = /hive/data/genomes/danRer6/bed/multiz6way/penn
rm -fr tmp
mkdir -p tmp
cd tmp
foreach s (`cat ../species.lst`)
    if ($s != $db) then
        zcat ../mafLinks/$s/$db.$s.net.maf > $db.$s.sing.maf
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=. E=$db "`cat ../tree.nh`" $db.*.sing.maf ../multiz6way.maf
cd ..
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

    screen
    ./autoMultiz > & autoMultiz.log
    # took 3 hours, log shows no errors.

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/danRer6/bed/multiz6way
    mkdir /gbdb/danRer6/multiz6way
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/multiz6way.maf \
	/gbdb/danRer6/multiz6way
    time nice hgLoadMaf danRer6 multiz6way
    #	Loaded 2228353 mafs in 1 files from /gbdb/danRer6/multiz6way
    #	43 sec

    time nice hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 danRer6 multiz6waySummary multiz6way.maf
    #	Created 786724 summary blocks from 4850023 components 
    #		and 2228353 mafs from multiz6way.maf
    #	49 sec

    wc -l multiz6way*.tab
    #  2228353 multiz6way.tab
    #   786724 multiz6waySummary.tab
    #  3015077 total
    rm multiz6way*.tab

    #	Create tree image for details page
    #	You can get a better image from the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    cp 6way.nh treeImage.nh
    #	Edit treeImage.nh to make the names as desired

# (
# (zebrafish,(tetraodon,stickleback)),
# (X._tropicalis,(mouse,human))
# );

    #	Submit this treeImage.nh to the phyloGif program

    # Organism Image
    pushd $HOME/browser/images/phylo
    wget -O danRer6_6way.gif \
	'http://hgwdev.soe.ucsc.edu/cgi-bin/phyloGif?hgsid=1831836&phyloGif_width=200&phyloGif_height=320&boolshad.phyloGif_branchLengths=0&boolshad.phyloGif_lengthLegend=0&boolshad.phyloGif_branchLabels=0&phyloGif_branchDecimals=2&phyloGif_branchMultiplier=1&phyloGif_underscores=on&boolshad.phyloGif_underscores=0&boolshad.phyloGif_htmlPage=0&phyloGif_tree=(%0D%0A(zebrafish%2C(tetraodon%2Cstickleback))%2C%0D%0A(X._tropicalis%2C(mouse%2Chuman))%0D%0A)%3B&phyloGif_submit=submit'
    #  check this .jpg file into the browser doc source tree images directory
    cvs add -kb danRer6_6way.gif 
    cvs commit -m 'organism tree for danRer6 6way' danRer6_6way.gif 
    cp -p danRer6_6way.gif /usr/local/apache/htdocs/images/phylo/
    popd


############################################################################
# GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2010-02-08 - Galt)

    ssh hgwdev
    mkdir /hive/data/genomes/danRer6/bed/multiz6way/anno
    cd /hive/data/genomes/danRer6/bed/multiz6way/anno
    screen
    mafAddIRows ../multiz6way.maf /hive/data/genomes/danRer6/danRer6.2bit multiz6way.maf
    # 5 mins

    # Load into database
    rm /gbdb/danRer6/multiz6way/multiz6way.maf   # remove old symlink
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/anno/multiz6way.maf \
	/gbdb/danRer6/multiz6way
    time nice hgLoadMaf danRer6 multiz6way

    #	Loaded 2644222 mafs in 1 files from /gbdb/danRer6/multiz6way
    #	2 mins 

    time nice hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 danRer6 multiz6waySummary multiz6way.maf
    #	Created 786724 summary blocks from 4850023 components 
    #   and 2644222 mafs from multiz6way.maf
    #   1 mins

    wc -l multiz6way*.tab
    #  2644222 multiz6way.tab
    #   786724 multiz6waySummary.tab
    #  3430946 total
    rm multiz6way*.tab


#########################################################################
# Adding automatic generation of upstream files (DONE - 2010-02-09 - Galt)
    # edit src/hg/makeDb/genbank/etc/genbank.conf to add:
danRer6.upstreamGeneTbl = refGene
danRer6.upstreamMaf = multiz6way /hive/data/genomes/danRer6/bed/multiz6way/species.lst

#########################################################################
# MULTIZ6WAY DOWNLOADABLES (DONE - 2010-02-09 - Galt)

    # create some downloads
    mkdir -p /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf
    cd /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf
    time cp -p ../../anno/*.maf .
    # 1 mins
    time gzip --rsyncable *.maf
    # 11 mins
    time md5sum *.gz > md5sum.txt
    # 1 mins

    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way/maf
    cd /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way/maf
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf/* .

#############################################################################
## create upstream refGene maf files
    cd /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf
    # bash script
#!/bin/sh
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits danRer6 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags danRer6 multiz6way \
                stdin stdout \
                -orgs=/hive/data/genomes/danRer6/bed/multiz6way/species.lst \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done

    # make downloads
    cd /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/6way.nh
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/downloads/README.txt

    cd /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way/maf
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf/up*.gz .
    md5sum up*.gz >> md5sum.txt


#######################################################################
# MULTIZ6WAY MAF FRAMES (DONE - 2010-02-16 - galt)
    ssh hgwdev
    mkdir /cluster/data/danRer6/bed/multiz6way/frames
    cd /cluster/data/danRer6/bed/multiz6way/frames
    mkdir genes

#danRer6 refGene
#gasAcu1 ensGene
#tetNig2 ensGene
#hg19    knownGene
#xenTro2 refGene
#mm9     knownGene
    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using refGene for danRer6 xenTro2
    # using ensGene for gasAcu1 and tetNig2
    # bash syntax
    bash
    for qDB in danRer6 xenTro2 gasAcu1 tetNig2
    do
      if [ $qDB = "gasAcu1" -o $qDB = "tetNig2" ]; then
        geneTbl=ensGene
      else
	geneTbl=refGene
      fi
      echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      rm -f $tmpExt
    done

    # using knownGene for mm9 hg19
    # genePreds; (must keep only the first 10 columns for knownGene)
    for qDB in mm9 hg19
    do
        geneTbl=knownGene
      echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      rm -f $tmpExt
    done

    ###
    exit # end bash shell
    
    ssh hgwdev
    cd /cluster/data/danRer6/bed/multiz6way/frames
    cat ../anno/multiz6way.maf | nice genePredToMafFrames danRer6 stdin stdout danRer6 genes/danRer6.gp.gz gasAcu1 genes/gasAcu1.gp.gz tetNig2 genes/tetNig2.gp.gz hg19 genes/hg19.gp.gz mm9 genes/mm9.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip > multiz6way.mafFrames.gz

    ssh hgwdev
    cd /cluster/data/danRer6/bed/multiz6way/frames
    nice hgLoadMafFrames danRer6 multiz6wayFrames multiz6way.mafFrames.gz

############################################################################
# CREATE LIFTOVER CHAINS FROM danRer6 TO danRer5 (DONE 2010-04-02 galt)
    doSameSpeciesLiftOver.pl -debug danRer6 danRer5
    cd /hive/data/genomes/danRer6/bed/blat.danRer5.2010-04-02
    doSameSpeciesLiftOver.pl danRer6 danRer5 >& do.log & tail -f do.log


#########################################################################
# Phylogenetic tree from 6-way (DONE 2010-05-20 galt)
# 	We need one tree for all chroms

    cd /hive/data/genomes/danRer6/bed/multiz6way/
    mkdir mafSplit
    cd mafSplit
    mafSplit -byTarget -useFullSequenceName /dev/null . ../multiz6way.maf
    # got 6660 mafs named after their chrom/scaff .maf
    # although there are over 11,000 chroms and scaffolds,
    # some are too small or have nothing aligning.

    mkdir /hive/data/genomes/danRer6/bed/multiz6way/4d
    cd /hive/data/genomes/danRer6/bed/multiz6way/4d

    # danRer6 cannot be too picky, dropping this clause: 
    #   refSeqStatus.status='Reviewed' 
    hgsql danRer6 -Ne \
    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and mol='mRNA'" \
	| cut -f 2-20 | egrep -E -v "chrM" \
	> refSeqReviewed.gp
    wc -l refSeqReviewed.gp
    # 15261 refSeqReviewed.gp
    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
    wc -l refSeqReviewedNR.gp
    # 14609 refSeqReviewedNR.gp

    ssh memk
    mkdir /hive/data/genomes/danRer6/bed/multiz6way/4d/run
    cd /hive/data/genomes/danRer6/bed/multiz6way/4d/run
    mkdir ../mfa

# whole chrom mafs version, using new version of 
# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
#	mjhubisz at gmail.com

    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set r = "/hive/data/genomes/danRer6/bed/multiz6way"
set c = $1
set infile = $r/mafSplit/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
if ("$NL" != "0") then
    set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
    $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
else
    echo "" > $r/4d/run/$outfile
endif
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    ls -1S /hive/data/genomes/danRer6/bed/multiz6way/mafSplit/*.maf | \
        egrep -E -v "chrM" \
	| sed -e "s#.*multiz6way/mafSplit/##" \
	> maf.list

    cat << '_EOF_' > template
#LOOP
4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template stdout | tac > jobList
    para create jobList
    para try ... check ... push ... etc
    para time
# Completed: 23 of 23 jobs
# CPU time in finished jobs:       9032s     150.53m     2.51h    0.10d  0.000 y
# IO & Wait Time:                   672s      11.21m     0.19h    0.01d  0.000 y
# Average job time:                 422s       7.03m     0.12h    0.00d
# Longest finished job:             860s      14.33m     0.24h    0.01d
# Submission to last job:          1210s      20.17m     0.34h    0.01d

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/danRer6/bed/multiz6way/4d
    # but first clean out junk 1-byte leftovers from above process.
    cd mfa
    find -type f -size 1c | xargs -iX rm X
    find -type f -size 0c | xargs -iX rm X
    cd ..
    #want comma-less species.lst
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
	--aggregate "`cat ../species.lst`" mfa/*.mfa | sed s/"> "/">"/ \
	> 4d.all.mfa

    # fix order in case it helps:
    #((danRer6,(tetNig2,gasAcu1)),(xenTro2,(mm9,hg19)))

    # use phyloFit to create tree model (output is phyloFit.mod)
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
	--EM --precision MED --msa-format FASTA --subst-mod REV \
	--tree ../tree-commas.nh 4d.all.mfa


#############################################################################
# phastCons 6-way (WORKING - 2010-05-13 - Galt)
    #	was unable to split the full chrom MAF files, now working on the
    #	maf files as they were split up during multiz

    # split 6way mafs into 10M chunks and generate sufficient statistics 
    # files for # phastCons

    ssh swarm
    mkdir -p /hive/data/genomes/danRer6/bed/multiz6way/cons/msa.split/2010-05-18
    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/msa.split/2010-05-18
    mkdir ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/danRer6/bed/multiz6way/mafSplit/$c.maf
set WINDOWS = /hive/data/genomes/danRer6/bed/multiz6way/cons/msa.split/2010-05-18/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	do the easy ones first to see some immediate results
    ls -1S -r ../../../mafSplit | sed -e "s/.maf//" > maf.list

    gensub2 maf.list single template jobList
    para -ram=8g create jobList
    para try ... check ... etc

# Completed: 6660 of 6660 jobs
# CPU time in finished jobs:        520s       8.67m     0.14h    0.01d  0.000 y
# IO & Wait Time:                 47106s     785.10m    13.08h    0.55d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest finished job:              41s       0.68m     0.01h    0.00d
# Submission to last job:           667s      11.12m     0.19h    0.01d
# Estimated complete:                 0s       0.00m     0.00h    0.00d


    # some scaffolds were too small to produce output. 
    # expected 6660, found 4273.
    cd ss
    find -type f | wc -l
    #4273
    cd ..

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh swarm
    mkdir -p /hive/data/genomes/danRer6/bed/multiz6way/cons/run.cons
    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/run.cons

    #	there are going to be only one phastCons run using
    #	this same script.  It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is:
    #	all 

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set c = $1
set cX = $1:r
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/danRer6/bed/multiz6way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
set useGrp = "$grp.mod"
ln -s $ssSrc/msa.split/2010-05-18/ss/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls -1S ../msa.split/2010-05-18/ss/*/* | sed -e 's/.ss$//' > ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/danRer6/bed/multiz6way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/phyloFit.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 4273 of 4273 jobs
# CPU time in finished jobs:       2323s      38.72m     0.65h    0.03d  0.000 y
# IO & Wait Time:                 68627s    1143.78m    19.06h    0.79d  0.002 y
# Average job time:                  17s       0.28m     0.00h    0.00d
# Longest finished job:              37s       0.62m     0.01h    0.00d
# Submission to last job:           338s       5.63m     0.09h    0.00d


    # create Most Conserved track
    ssh hgwdev
    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/all

    cat bed/*/*.bed | sort -k1,1 -k2,2n \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    	> tmpMostConserved.bed
    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    # load into database
    nice hgLoadBed danRer6 phastConsElements6way mostConserved.bed
    # Loaded 881975 elements of size 5
    # Saving bed.tab
    # start -1, end 426 out of range in findBin (max is 512M)
[hgwdev:all> cat mostConserved.bed | gawk '{if ($3 == 426) print $0 }'
Zv8_NA5813      -1      426     lod=93  380
# there's only one record like this with chromStart == -1 

it is the first record of a set
[hgwdev:all> grep Zv8_NA5813 mostConserved.bed
Zv8_NA5813      -1      426     lod=93  380
Zv8_NA5813      520     666     lod=41  286
Zv8_NA5813      742     832     lod=28  242

[hgwdev:all> grep Zv8_NA5813 tmpMostConserved.bed
Zv8_NA5813      -1      426     lod=93  93
Zv8_NA5813      520     666     lod=41  41

[hgwdev:all> cat bed/Zv8_NA5813/Zv8_NA5813.0-6928.bed
Zv8_NA5813      -1      426     Zv8_NA5813.1    93      +
Zv8_NA5813      520     666     Zv8_NA5813.1    41      +
Zv8_NA5813      742     832     Zv8_NA5813.2    28      +

# I have manually patched mostConserved.bed changing the -1 to 0 for 
# Zv8_NA5813      -1      426     lod=93  93
# and re-ran the same load command above this time without error.

# also manually patched pp/Zv8_NA5813/Zv8_NA5813.0-6928.pp
# and pp/Zv8_NA6933/Zv8_NA6933.0-2411.pp
# so that it has in first line start=1 instead of start=0

    # Try for 5% overall cov, and 70% CDS cov 
    featureBits danRer6 -enrichment refGene:cds phastConsElements6way
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    #   refGene:cds 1.244%, phastConsElements6way 12.225%, 
    #   both 1.029%, cover 82.75%, enrich 6.77x

    # hg19 for comparison
    #	refGene:cds 1.187%, phastConsElements46way 5.065%,
    #	both 0.884%, cover 74.46%, enrich 14.70x

# add to zebrafish/danRer6/trackDb.ra track phastConsElements6way
# so that it can use the standard existing html page
html ../../phastConsElements  


    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/all
    mkdir downloads

    cat pp/*/*.pp | gzip > downloads/phastCons6way.wigFix.gz

    # make downloads
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/danRer6/phastCons6way/
    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/danRer6/phastCons6way/
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/cons/all/downloads/* .
    md5sum *.gz >> md5sum.txt
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/cons/README.txt
    ln -s /hive/data/genomes/danRer6/bed/multiz6way/cons/all/all.mod
    popd

    #	encode those files into wiggle data
    zcat downloads/phastCons6way.wigFix.gz \
	| wigEncode stdin phastCons6way.wig phastCons6way.wib
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    du -hsc *.wi?
    # 453M    phastCons6way.wib
    # 97M     phastCons6way.wig
    # 549M    total


    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons6way.wib /gbdb/danRer6/multiz6way/phastCons6way.wib
    nice hgLoadWiggle -pathPrefix=/gbdb/danRer6/multiz6way danRer6 \
	phastCons6way phastCons6way.wig

  # use to set trackDb.ra entries for wiggle min and max

    wigTableStats.sh danRer6 phastCons6way
# db.table             min max mean   count     sumData     stdDev   viewLimits
#danRer6.phastCons6way   0 1 0.712241 237215280 1.68955e+08 0.337377 viewLimits=0:1




    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -db=danRer6 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons6way >& histogram.data 

    #	create plot of histogram:

#orig set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer6 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# CREATE LIFTOVER CHAINS FROM danRer6 TO danRer7 (DONE - 2010-12-21 - Hiram)
    doSameSpeciesLiftOver.pl -debug \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	danRer6 danRer7
    cd /hive/data/genomes/danRer6/bed/blat.danRer7.2010-12-21
    time doSameSpeciesLiftOver.pl \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	danRer6 danRer7 > do.log 2>&1 &
    #	real    126m51.141s

#########################################################################

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load

# elsevier
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
ssh swarm 
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd

