# for emacs: -*- mode: sh; -*-

# Caenorhabditis elegans
# Washington University School of Medicine GSC and Sanger Institute WS180

# We don't have the resources to build a complete browser on this -- 
# make ce4 <-> ce5 <-> ce6 liftOver files and compGeno tracks for modENCODE,
# and a skeletal browser for checking the compGeno tracks.

#  $Id: ce5.txt,v 1.3 2009/01/26 20:27:39 angie Exp $

#########################################################################
# DOWNLOAD SEQUENCE (DONE 10/10/07 angie)
    ssh kkstore06
    mkdir /cluster/store3/worm/ce5
    ln -s /cluster/store3/worm/ce5 /cluster/data/ce5
    cd /cluster/data/ce5
    mkdir downloads
    cd downloads
    wget --timestamping -m -np -nd \
      "ftp://ftp.sanger.ac.uk/pub/wormbase/WS180/CHROMOSOMES/"
    # All we really want is the CHR*.agp and CHR*.fa.gz -- clean up a 
    # bit & fetch more selectively next time.
    rm chrom* *_masked.dna.gz *.gff.gz CHROMOSOME_MtDNA.CSHL.gff


#########################################################################
# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE 10/10/07 angie)
    ssh kkstore06
    cd /cluster/data/ce5/downloads
    # Fix fasta names:
    zcat CHR*.dna.gz \
    | sed -e '/^$/ d;  s/^>CHROMOSOME_MtDNA/>chrM/;  s/^>CHROMOSOME_/>chr/;' \
    | gzip -c > UCSC.fa.gz
    faSize -detailed UCSC.fa.gz
#chrI    15072421
#chrII   15279316
#chrIII  13783681
#chrIV   17493785
#chrM    13794
#chrV    20919568
#chrX    17718851
    # Make sure we get the same sizes from this command:
    zcat CHR*.dna.gz | sed -e '/^$/ d;' | faSize -detailed stdin

    # Fix AGP names:
    sed -e 's/^/chr/' CHR*.agp \
    > UCSC.agp
    # And add a fake mitochondrial AGP entry for the sake of downstream
    # tools (I made sure the GenBank sequence is identical to given):
    echo "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp


#########################################################################
# CHECK AGP VS. FA, MAKE UNMASKED 2BIT (DONE 10/10/07 angie)
    ssh kkstore06
    cd /cluster/data/ce5
    cat << '_EOF_' > ce5.config.ra
# Config parameters for makeGenomeDb.pl:
db ce5
clade worm
genomeCladePriority 10
scientificName Caenorhabditis elegans
commonName C. elegans
assemblyDate Aug. 2007
assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS180
orderKey 827
mitoAcc none
fastaFiles /cluster/data/ce5/downloads/UCSC.fa.gz
agpFiles   /cluster/data/ce5/downloads/UCSC.agp
# qualFiles /dev/null
dbDbSpeciesDir worm
'_EOF_'
    # << emacs

    mkdir jkStuff
    nice makeGenomeDb.pl ce5.config.ra -stop agp \
      >& jkStuff/makeGenomeDb.log & tail -f jkStuff/makeGenomeDb.log
    # Note: file date on ce5.unmasked.2bit is 10/11 -- original was moved
    # aside to ce5.unmasked.lowercase.2bit, and then ce5.unmasked.2bit was
    # regenerated to test a slight tweak to makeGenomeDb.pl: use -noMask
    # with faToTwoBit when making .unmasked, in case we get lower-cased
    # sequence as in this case... which looks like all-masked to many progs.


#########################################################################
# REPEATMASKER (DONE (through mask) 10/10/07, DONE 1/14/09 angie)
    ssh kkstore06
    cd /cluster/data/ce5
    doRepeatMasker.pl ce5 -debug
    cd bed/RepeatMasker.2007-10-10
    doRepeatMasker.pl ce5 -stop mask >& do.log & tail -f do.log
    # 1/14/09: Load the table (need it for netClass)
    cd /hive/data/genomes/ce5/bed/RepeatMasker.2007-10-10
    doRepeatMasker.pl ce5 -continue install -buildDir `pwd` \
      -workhorse=kolossus -fileServer=kolossus \
      >>& do.log & tail -f do.log
    


#########################################################################
# SIMPLE REPEATS (DONE (through filter) 10/11/07, DONE 1/14/09 angie)
    ssh kkstore06
    cd /cluster/data/ce5
    doSimpleRepeat.pl ce5 -debug -verbose=3
    cd bed/simpleRepeat.2007-10-11
    doSimpleRepeat.pl ce5 -verbose=3 -stop filter >& do.log & tail -f do.log
    # Took less than 1/2 hour (small sequence, just did a single run).
    # Use the suggested commands at the end of do.log below...
    # 1/14/09: Load the table (need it for netClass)
    cd /hive/data/genomes/ce5/bed/simpleRepeat.2007-10-11
    doSimpleRepeat.pl ce5 -verbose=3 -continue load -buildDir `pwd` \
      -workhorse=kolossus -fileServer=kolossus \
      >>& do.log & tail -f do.log


#########################################################################
# MASK SEQUENCE WITH RM+TRF (DONE 10/11/07 angie)
    # Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed,
    # now it's time to combine the masking into the final ce5.2bit,
    # following the instructions at the end of doSimpleRepeat's output.
    ssh kolossus
    cd /cluster/data/ce5
    twoBitMask ce5.rmsk.2bit -add bed/simpleRepeat.2007-10-11/trfMask.bed \
      ce5.2bit
    # Ignore warning about extra BED columns.
    

#########################################################################
# INSTALL 2BIT AND 11.OOC FOR BLAT/LIFTOVER (DONE 10/11/07 angie)
# REDONE 12/15/08 --> jkStuff/11.ooc because /cluster/bluearc is no more.
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome 
    # size from featureBits. So we would use 34, but that yields a very
    # high number of tiles to ignore, especially for a small more compact 
    # genome.  Bump that up a bit to be more conservative.
    ssh kkstore06
    cd /cluster/data/ce5
    rsync -av ce5.2bit /cluster/bluearc/ce5/
    blat ce5.2bit /dev/null /dev/null -makeOoc=/cluster/bluearc/ce5/11.ooc \
      -repMatch=100
#Wrote 8502 overused 11-mers to /cluster/bluearc/ce5/11.ooc


#########################################################################
# CE5->CE4 LIFTOVER (DONE 10/11/07 angie)
    ssh kkstore06
    cd /cluster/data/ce5
    doSameSpeciesLiftOver.pl ce5 ce4 -debug
    cd bed/blat.ce4.2007-10-11
    doSameSpeciesLiftOver.pl ce5 ce4 -verbose 3 >& do.log &
    tail -f do.log


#########################################################################
# CE5->CE6 LIFTOVER (DONE 12/15/08 angie)
    cd /cluster/data/ce5
    doSameSpeciesLiftOver.pl ce5 ce6 -debug
    cd bed/blat.ce6.2008-12-15
    doSameSpeciesLiftOver.pl ce5 ce6 >& do.log & tail -f do.log


#########################################################################
# MAKE ACTIVE=0 ENTRY IN DBDB (DONE 10/11/07 angie)
    ssh kkstore06
    cd /cluster/data/ce5
    makeGenomeDb.pl ce5.config.ra -debug -continue dbDb -stop dbDb
    # Edit dbDbInsert.sql so active=0 (and fix the -debug defaultPos
    # that refers to chr1 --> chrI), then execute as in makeGenomeDb's
    # #DEBUG# output:
    ssh -x hgwdev hgsql -h genome-testdb -A -N hgcentraltest \
      < /cluster/data/ce5/dbDbInsert.sql


#########################################################################
# MAKE SKELETAL BROWSER DB (DONE 1/13/09 angie)
    cd /hive/data/genomes/ce5
    makeGenomeDb.pl ce5.config.ra -continue db \
      >>& jkStuff/makeGenomeDb.log & tail -f jkStuff/makeGenomeDb.log
    # Followed instructions at end for editing and adding ce5 to trackDb/...
    # Used descriptions from ce6.


#########################################################################
# BLASTZ caePb2 (DONE 1/14/09 angie)
    # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return
    screen
    mkdir /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13
    cd /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13

    cat << '_EOF_' > DEF
# ce5 vs caePb2
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans ce5
SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit
SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. PB2801 (brenneri) caePb2
SEQ2_DIR=/hive/data/genomes/caePb2/caePb2.2bit
SEQ2_LEN=/hive/data/genomes/caePb2/chrom.sizes
SEQ2_CTGDIR=/hive/data/staging/data/caePb2/caePb2.supercontigs.2bit
SEQ2_CTGLEN=/hive/data/staging/data/caePb2/caePb2.supercontigs.sizes
SEQ2_LIFT=/hive/data/staging/data/caePb2/caePb2.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # Would use memk for smallClusterHub, but it can't see /hive
    doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \
      -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \
      -qRepeats=windowmaskerSdust \
       >& do.log & tail -f do.log


#########################################################################
# BLASTZ caePb2 (DONE 1/14/09 angie)
    # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return
    screen
    mkdir /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13
    cd /hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13

    cat << '_EOF_' > DEF
# ce5 vs caePb2
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans ce5
SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit
SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. PB2801 (brenneri) caePb2
SEQ2_DIR=/hive/data/genomes/caePb2/caePb2.2bit
SEQ2_LEN=/hive/data/genomes/caePb2/chrom.sizes
SEQ2_CTGDIR=/hive/data/staging/data/caePb2/caePb2.supercontigs.2bit
SEQ2_CTGLEN=/hive/data/staging/data/caePb2/caePb2.supercontigs.sizes
SEQ2_LIFT=/hive/data/staging/data/caePb2/caePb2.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/ce5/bed/blastzCaePb2.2009-01-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # Would use memk for smallClusterHub, but it can't see /hive
    doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \
      -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \
      -qRepeats=windowmaskerSdust \
       >& do.log & tail -f do.log
    ln -s blastzCaePb2.2009-01-13 /hive/data/genomes/ce5/bed/blastz.caePb2


#########################################################################
# BLASTZ caeRem3 (DONE 1/14/09 angie)
    # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return
    screen
    mkdir /hive/data/genomes/ce5/bed/blastzCaeRem3.2009-01-14
    cd /hive/data/genomes/ce5/bed/blastzCaeRem3.2009-01-14

    cat << '_EOF_' > DEF
# ce5 vs caeRem3
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans ce5
SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit
SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. remanei caeRem3
SEQ2_DIR=/hive/data/genomes/caeRem3/caeRem3.2bit
SEQ2_LEN=/hive/data/genomes/caeRem3/chrom.sizes
SEQ2_CTGDIR=/hive/data/staging/data/caeRem3/caeRem3.supercontigs.2bit
SEQ2_CTGLEN=/hive/data/staging/data/caeRem3/caeRem3.supercontigs.sizes
SEQ2_LIFT=/hive/data/staging/data/caeRem3/caeRem3.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/ce5/bed/blastzCaeRem3.2009-01-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # Would use memk for smallClusterHub, but it can't see /hive
    doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \
      -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \
       >& do.log & tail -f do.log
    ln -s blastzCaeRem3.2009-01-14 /hive/data/genomes/ce5/bed/blastz.caeRem3


#########################################################################
# BLASTZ cb3 (DONE 1/14/09 angie)
    # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return
    screen
    mkdir /hive/data/genomes/ce5/bed/blastzCb3.2009-01-14
    cd /hive/data/genomes/ce5/bed/blastzCb3.2009-01-14
    cat << '_EOF_' > DEF
# ce5 vs cb3
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans ce5
SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit
SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. briggsae cb3
SEQ2_DIR=/hive/data/genomes/cb3/cb3.rmskTrf.2bit
SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/ce5/bed/blastzCb3.2009-01-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # Would use memk for smallClusterHub, but it can't see /hive
    doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \
      -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \
       >& do.log & tail -f do.log
    ln -s blastzCb3.2009-01-14 /hive/data/genomes/ce5/bed/blastz.cb3


#########################################################################
# BLASTZ priPac1 (DONE 1/14/09 angie)
    # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return
    screen
    mkdir /hive/data/genomes/ce5/bed/blastzPriPac1.2009-01-14
    cd /hive/data/genomes/ce5/bed/blastzPriPac1.2009-01-14

    cat << '_EOF_' > DEF
# ce5 vs priPac1
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans ce5
SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit
SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: P. pacificus priPac1
SEQ2_DIR=/hive/data/genomes/priPac1/priPac1.TrfWM.2bit
SEQ2_LEN=/hive/data/genomes/priPac1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/priPac1/trfWM.maskedContigs/priPac1.TrfWM.supers.2bit
SEQ2_CTGLEN=/hive/data/genomes/priPac1/trfWM.maskedContigs/priPac1.TrfWM.supers.sizes
SEQ2_LIFT=/hive/data/genomes/priPac1/trfWM.maskedContigs/super.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/ce5/bed/blastzPriPac1.2009-01-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # Would use memk for smallClusterHub, but it can't see /hive
    doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \
      -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \
       >& do.log & tail -f do.log
    ln -s blastzPriPac1.2009-01-14 /hive/data/genomes/ce5/bed/blastz.priPac1


#########################################################################
# BLASTZ caeJap1 (DONE 1/14/09 angie)
    # use screen: Ctrl-a d to leave, screen -ls, screen -r NNNNN to return
    screen
    mkdir /hive/data/genomes/ce5/bed/blastzCaeJap1.2009-01-14
    cd /hive/data/genomes/ce5/bed/blastzCaeJap1.2009-01-14

    cat << '_EOF_' > DEF
# ce5 vs caeJap1
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans ce5
SEQ1_DIR=/hive/data/genomes/ce5/ce5.2bit
SEQ1_LEN=/hive/data/genomes/ce5/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. remanei caeJap1
SEQ2_DIR=/hive/data/genomes/caeJap1/caeJap1.2bit
SEQ2_LEN=/hive/data/genomes/caeJap1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/caeJap1/maskedContigs/caeJap1.TrfWM.supers.2bit
SEQ2_CTGLEN=/hive/data/genomes/caeJap1/maskedContigs/caeJap1.TrfWM.supers.sizes
SEQ2_LIFT=/hive/data/genomes/caeJap1/maskedContigs/supers.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/ce5/bed/blastzCaeJap1.2009-01-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # Would use memk for smallClusterHub, but it can't see /hive
    doBlastzChainNet.pl `pwd`/DEF -verbose=2 -bigClusterHub=swarm \
      -smallClusterHub=swarm -workhorse=kolossus -fileServer=kolossus \
      -qRepeats=windowmaskerSdust \
       >& do.log & tail -f do.log
    ln -s blastzCaeJap1.2009-01-14 /hive/data/genomes/ce5/bed/blastz.caeJap1


############################################################################
# MULTIZ6WAY (DONE 1/15/09 angie)
    mkdir /hive/data/genomes/ce5/bed/multiz6way
    cd /hive/data/genomes/ce5/bed/multiz6way
    # Tree copied from ce6.txt, see comments there on its derivation:
    cat << '_EOF_' > nematodes.nh
    ((((((((((briggsae_cb3:0.005,remanei_caeRem3:0.003):0.004,
        brenneri_caePb2:0.013):0.002,elegans_ce5:0.003):0.001,
            japonica_caeJap1:0.023):0.024,ps1010:0.027):0.014,
                df5070:0.056):0.009, plicata:0.102):0.099,sb341:0.060):0.038,
                (df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8)
'_EOF_'
    # << emacs

    # filter that for the six species in use here
    /cluster/bin/phast/x86_64/tree_doctor \
--prune-all-but briggsae_cb3,remanei_caeRem3,brenneri_caePb2,elegans_ce5,japonica_caeJap1,pristiochus_priPac1 \
      -R elegans_ce5 \
      nematodes.nh > 6way.nh
#(((((briggsae_cb3:0.005000,remanei_caeRem3:0.003000):0.004000,brenneri_caePb2:0.013000):0.002000,elegans_ce5:0.003000):0.001000,japonica_caeJap1:0.023000):0.384000,pristiochus_priPac1:0.800000);

    # rearrange to get Ce5 on top for the construction of the tree image
    # in the 6-way description page.  This results in something like:
    cat <<EOF > 6way_for_phyloGif.nh
(((C._elegans_ce5:0.003000,
    (C._brenneri_caePb2:0.013000,
        (C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000)
                :0.002000):0.001000,
            C._japonica_caeJap1:0.023000):0.384000,
                P._pristiochus_priPac1:0.800000);
EOF

    /cluster/bin/phast/x86_64/all_dists 6way.nh > 6way.distances.txt
    grep -i ce5 6way.distances.txt | sort -k3,3n
    # Use this output for reference, and use the calculated
    # distances in the table below to order the organisms and check
    # the button order on the browser.
    # And if you can fill in the table below entirely, you have
    # succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                           chainCe5Link   chain   linearGap
#    distance                       on ce5      on other   minScore
#  1 0.0120  - remanei_caeRem3      41.593%       n/a      1000     loose
#  2 0.0140  - briggsae_cb3         42.154%       n/a      1000     loose
#  3 0.0180  - brenneri_caePb2      40.553%       n/a      1000     loose
#  4 0.0270  - japonica_caeJap1     26.639%       n/a      1000     loose
#  5 1.1880  - pristiochus_priPac1   6.113%       n/a      1000     loose
    # Coverage is about 0.015% lower than for ce6 for each of those, but only 
    # a few bases in the genome assembly actually changed ce5 -> ce6.  
    # Maybe it's a lastz vs. blastz thing??

    cd /hive/data/genomes/ce5/bed/multiz6way
    # bash shell syntax here ...
    export H=/hive/data/genomes/ce5/bed
    mkdir mafLinks
    for G in caeRem3 cb3 caePb2 priPac1 caeJap1
    do
        mkdir -p mafLinks/$G
        if [ ! -d ${H}/blastz.${G}/mafNet ]; then
        echo "missing directory ${H}/blastz.${G}/mafNet"
        fi
        ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    # these are x86_64 binaries
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    # the autoMultiz cluster run
    ssh swarm
    cd /hive/data/genomes/ce5/bed/multiz6way/

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        6way.nh > tree-commas.nh
    sed 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir maf run
    cd run
    # NOTE: you need to set the db properly in this script
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = ce5
set c = $1
set maf = $2
set binDir = /hive/data/genomes/$db/bed/multiz6way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/$db/bed/multiz6way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
        continue
    endif
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /hive/data/genomes/ce5/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs

    awk '{print $1}' /hive/data/genomes/ce5/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    para push -maxPush=1
    para check ... push ... etc ...
#Completed: 7 of 7 jobs
#CPU time in finished jobs:       1693s      28.21m     0.47h    0.02d  0.000 y
#IO & Wait Time:                   106s       1.77m     0.03h    0.00d  0.000 y
#Average job time:                 257s       4.28m     0.07h    0.00d
#Longest finished job:             374s       6.23m     0.10h    0.00d
#Submission to last job:           395s       6.58m     0.11h    0.00d

    # back on non-para-hub machine:
    cd /hive/data/genomes/ce5/bed/multiz6way
    time catDir maf > multiz6way.maf
#real    0m12.721s
#-rw-rw-r--  1 angie protein 342023168 Jan 15 19:23 multiz6way.maf

    # before getting to the annotation, load this up so we can get
    # a first view of the track.  This will be replaced with the annotated
    # mafs
    ssh hgwdev
    cd /hive/data/genomes/ce5/bed/multiz6way
    mkdir /gbdb/ce5/multiz6way
    ln -s /hive/data/genomes/ce5/bed/multiz6way/multiz6way.maf /gbdb/ce5/multiz6way

    # this load creates a large file, do that on local disk:
    cd /scratch/tmp
    time hgLoadMaf ce5 multiz6way
#Loaded 364189 mafs in 1 files from /gbdb/ce5/multiz6way
#7.676u 0.420s 0:10.35 78.1%     0+0k 0+0io 0pf+0w

    time hgLoadMafSummary -minSize=10000 -mergeGap=500 \
        -maxSize=50000 ce5 multiz6waySummary /gbdb/ce5/multiz6way/multiz6way.maf
#Created 124469 summary blocks from 1042942 components and 364189 mafs from /gbdb/ce5/multiz6way/multiz6way.maf
#21.210u 2.196s 0:24.49 95.5%    0+0k 0+0io 0pf+0w

    # remove the temporary .tab files:
    rm multiz6*.tab


############################################################################
# LIFTOVER SANGER GENES FROM CE6 (DONE 1/15/09 angie)
    ssh hgwdev
    mkdir /hive/data/genomes/ce5/bed/sangerGene.liftOver
    cd /hive/data/genomes/ce5/bed/sangerGene.liftOver
    hgsql ce6 -NBe 'select * from sangerGene' \
    | cut -f 1-10 \
    | liftOver -genePred stdin -minBlocks=0.75 \
      /hive/data/genomes/ce6/bed/liftOver/ce6ToCe5.over.chain.gz \
      sangerGeneLO.gp unmapped.txt
    ldHgGene ce5 sangerGeneLO -predTab sangerGeneLO.gp
#30296 gene predictions


############################################################################
# PHASTCONS6WAY (DONE 1/16/09 angie)
    # make per-chrom fasta:
    cd /hive/data/genomes/ce5
    for C in `cut -f1 chrom.sizes | sed -e s/chr//`
do
    echo "twoBitToFa -seq=chr${C} ce5.2bit ${C}/chr${C}.fa"
    twoBitToFa -seq=chr${C} ce5.2bit ${C}/chr${C}.fa
done

    mkdir /hive/data/genomes/ce5/bed/multiz6way/phastCons
    cd /hive/data/genomes/ce5/bed/multiz6way/phastCons

    # Create SS files
    mkdir ss
    cd ss
    time for C in `cut -f1 /hive/data/genomes/ce5/chrom.sizes`
    do
      if [ -s /hive/data/genomes/ce5/bed/multiz6way/maf/${C}.maf ]; then
        mkdir ${C}
        echo msa_split $C
        chrN=${C/chr/}
        chrN=${chrN/_random/}
        /cluster/bin/phast/$MACHTYPE/msa_split \
            /hive/data/genomes/ce5/bed/multiz6way/maf/${C}.maf \
            --refseq /hive/data/genomes/ce5/${chrN}/${C}.fa \
            --in-format MAF --windows 4000000,0 --between-blocks 5000 \
            --out-format SS --out-root ${C}/${C}
      fi
    done
#real    1m22.701s

    # Using the ave.cons.mod and ave.noncons.mod from 
    # /hive/data/genomes/ce6/bed/multiz6way/cons because ce5 and ce6
    # are almost identical.  See ce6.txt for derivation.
    sed -e 's/ce6/ce5/g' \
      /hive/data/genomes/ce6/bed/multiz6way/cons/ave.cons.mod > ave.cons.mod
    sed -e 's/ce6/ce5/g' \
      /hive/data/genomes/ce6/bed/multiz6way/cons/ave.noncons.mod > ave.noncons.mod
    # cons:
ALPHABET: A C G T 
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.308500 0.191500 0.191500 0.308500 
RATE_MAT:
  -0.875904    0.175256    0.458101    0.242547 
   0.282331   -1.200805    0.180928    0.737546 
   0.737985    0.180928   -1.200354    0.281440 
   0.242547    0.457829    0.174703   -0.875079 
TREE: (((((cb3:0.087550,caeRem3:0.073515):0.020860,caePb2:0.090766):0.029808,ce5:0.094770):0.038388,caeJap1:0.109779):0.126548,priPac1:0.126548);
    # noncons:
ALPHABET: A C G T 
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.308500 0.191500 0.191500 0.308500 
RATE_MAT:
  -0.875904    0.175256    0.458101    0.242547 
   0.282331   -1.200805    0.180928    0.737546 
   0.737985    0.180928   -1.200354    0.281440 
   0.242547    0.457829    0.174703   -0.875079 
TREE: (((((cb3:0.328951,caeRem3:0.273909):0.078262,caePb2:0.339349):0.111927,ce5:0.355503):0.144928,caeJap1:0.413391):0.478831,priPac1:0.478831);

    ssh swarm
    # Create cluster dir to do main phastCons run
    mkdir /hive/data/genomes/ce5/bed/multiz6way/phastCons/run
    cd /hive/data/genomes/ce5/bed/multiz6way/phastCons/run
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    # This job is I/O intensive in its output files, thus it is all
    # working over in /scratch/tmp/
    cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
   --expected-lengths 15 --target-coverage 0.55 --quiet \
        --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # << emacs
    chmod +x doPhast

    # root1 == chrom name, file1 == ss file name without .ss suffix
    cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
    # << emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
#Completed: 29 of 29 jobs
#CPU time in finished jobs:        308s       5.13m     0.09h    0.00d  0.000 y
#IO & Wait Time:                   129s       2.15m     0.04h    0.00d  0.000 y
#Average job time:                  15s       0.25m     0.00h    0.00d
#Longest finished job:              25s       0.42m     0.01h    0.00d
#Submission to last job:           666s      11.10m     0.18h    0.01d

    # combine predictions and transform scores to be in 0-1000 interval
    # it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /hive/data/genomes/ce5/bed/multiz6way/phastCons/run
    # The sed's and the sort get the file names in chrom,start order
    # You might like to verify it is correct by first looking at the
    # list it produces:
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g"
    # if that looks right (== in chrom and numerical order), then let it run:
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
        | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
        | /cluster/bin/scripts/lodToBedScore /dev/stdin \
            > ../phastConsElements6way.bed

    # Figure out how much is actually covered by the bed file as so:
    # Get the non-n genome size from faSize on all chroms:
    faSize /hive/data/genomes/ce5/*/chr*.fa
#100281416 bases (0 N's 100281416 real 86678741 upper 13602675 lower) in 7 sequences in 7 files
#Total size: mean 14325916.6 sd 6728307.7 min 13794 (chrM) max 20919568 (chrV) median 15279316
#%13.56 masked total, %13.56 masked real

    # The 100281416 comes from the non-n genome as counted above.
    cd ..
    awk '{sum+=$3-$2} \
END{printf "%% %.2f = 100.0*%d/100281416\n",100.0*sum/100281416,sum}' \
        phastConsElements6way.bed
#% 29.44 = 100.0*29519094/100281416
    # --expected-length 15 --target-coverage 0.55

    # Aiming for %70 coverage in the following featureBits measurement on CDS,
    # but as in ce6, will accept 61%:
    featureBits ce5 -enrichment sangerGeneLO:cds phastConsElements6way.bed
#sangerGeneLO:cds 25.324%, phastConsElements6way.bed 29.436%, both 15.448%, cover 61.00%, enrich 2.07x

    # Load most conserved track into database
    ssh hgwdev
    cd /hive/data/genomes/ce5/bed/multiz6way/phastCons
    # the copy was already done above
    # cp -p /hive/data/genomes/ce5/bed/multiz6way/phastCons/run/phastConsElements6way.bed .
    time hgLoadBed ce5 phastConsElements6way phastConsElements6way.bed
#Loaded 592415 elements of size 5
#2.121u 0.172s 0:13.55 16.9%     0+0k 0+0io 0pf+0w

    # should measure the same as above
    featureBits ce5 -enrichment sangerGeneLO:cds phastConsElements6way
#sangerGeneLO:cds 25.324%, phastConsElements6way 29.436%, both 15.448%, cover 61.00%, enrich 2.07x

    # Create merged posterier probability file and wiggle track data files
    ssh kolossus
    cd /hive/data/genomes/ce5/bed/multiz6way/phastCons/run
    # prepare compressed copy of ascii data values for downloads
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p ../phastCons6wayScores

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=../phastCons6wayScores/${C}.scores.gz
    echo "========================== ${C} ${D}"
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
            gzip > ${out}
done
'_EOF_'
    # << emacs
    chmod +x gzipAscii.sh
    time nice ./gzipAscii.sh
#68.633u 0.954s 1:12.04 96.5%    0+0k 0+0io 0pf+0w

    # use those file to encode the data for the track:
    cd ..
    zcat phastCons6wayScores/chr*.data.gz \
    | wigEncode stdin phastCons6way.wig phastCons6way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
#-rw-rw-r--  1 angie protein 56581969 Jan 16 15:00 phastCons6way.wib
#-rw-rw-r--  1 angie protein  8810835 Jan 16 15:00 phastCons6way.wig

    ssh hgwdev
    # Make link for download of score files:
    ln -s /hive/data/genomes/ce5/bed/multiz6way/phastCons/phastCons6wayScores \
      /usr/local/apache/htdocs/goldenPath/ce5/
    # Load gbdb and database with wiggle.
    cd /hive/data/genomes/ce5/bed/multiz6way/phastCons
    ln -s `pwd`/phastCons6way.wib /gbdb/ce5/wib/phastCons6way.wib
    hgLoadWiggle ce5 phastCons6way phastCons6way.wig
    #  Create histogram to see relative and cumulative frequency of scores:
    hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
            -db=ce5 phastCons6way >& histogram.data
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Worm Ce5 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << emacs
    display histo.png &


############################################################################
# PHASTCONS6WAY PARAM RE-ESTIMATION EXPERIMENT (DONE 1/23/09 angie)
# I wanted to see whether following consEntropy's advice for expected-lengths,
# and increasing target-coverage to get closer to 68.8%, would make for good
# results, but it didn't.  Including this here just for the record -- original
# phastCons6way from above is not changed.
    # Start with ce6 model and params, but use consEntropy's results
    # to tune the parameters:
    mkdir /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned
    sed -e 's/ce6/ce5/g' \
      /hive/data/genomes/ce6/bed/multiz6way/cons/ave.cons.mod > ce6.ave.cons.mod
    sed -e 's/ce6/ce5/g' \
      /hive/data/genomes/ce6/bed/multiz6way/cons/ave.noncons.mod > ce6.ave.noncons.mod
    /cluster/bin/phast/${MACHTYPE}/consEntropy --NH 9.78 0.55 15 ave.{,non}cons.mod
#( Solving for new omega: 15.000000 76.253180 42.469155 35.389619 34.810297 34.805819 )
#Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667, nu=0.081481
#Relative entropy: H=1.172307 bits/site
#Expected min. length: L_min=6.108564 sites
#Expected max. length: L_max=5.516260 sites
#Phylogenetic information threshold: PIT=L_min*H=7.161112 bits
#Recommended expected length: omega=34.805819 sites (for L_min*H=9.780000)
    # Adam commented in dm1.txt that entropy would decrease with a large 
    # increase in coverage.  We need to boost the coverage from 61% to 
    # closer to the target of 68.8% -- not large but not tiny.  Also,
    # the computed PT is 7.16, already lower than the target of 9.78
    # (those targets help to make the results more comparable across 
    # species groups).  So try increasing both target-coverage and 
    # expected-length from ce6's:
    /cluster/bin/phast/${MACHTYPE}/consEntropy --NH 9.78 0.75 40 ave.{,non}cons.mod
#( Solving for new omega: 40.000000 67.304149 61.371424 61.145983 )
#Transition parameters: gamma=0.750000, omega=40.000000, mu=0.025000, nu=0.075000
#Relative entropy: H=1.172307 bits/site
#Expected min. length: L_min=7.137874 sites
#Expected max. length: L_max=7.010810 sites
#Phylogenetic information threshold: PIT=L_min*H=8.367780 bits
#Recommended expected length: omega=61.145983 sites (for L_min*H=9.780000)
    # Well, with the increased coverage, it is recommending a *very* long
    # expected-length... but the model figures into this, too.  Use those
    # params to re-estimate the model.  

    # FIRST ITERATION: use ce6's ave.*.mod and the params tweaked according
    # to consEntropy's output above.
    ssh swarm
    mkdir /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.estimate1
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.estimate1
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
/cluster/bin/phast/phastCons $1 ../ave.noncons.mod --gc 0.383 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 40 --target-coverage 0.75 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
    # << emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (../../phastCons/ss/chr*/*.ss)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    para create jobList 
    para push -maxPush=1, check, shove etc.
    para time
#Completed: 29 of 29 jobs
#CPU time in finished jobs:       2728s      45.47m     0.76h    0.03d  0.000 y
#IO & Wait Time:                   159s       2.64m     0.04h    0.00d  0.000 y
#Average job time:                 100s       1.66m     0.03h    0.00d
#Longest finished job:             155s       2.58m     0.04h    0.00d
#Submission to last job:           190s       3.17m     0.05h    0.00d
    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kolossus
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.estimate1
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: (((((cb3:0.089855,caeRem3:0.089789):0.023180,caePb2:0.095488):0.030832,ce5:0.100172):0.042051,caeJap1:0.123745):0.132125,priPac1:0.132125);
#ave.noncons.mod:TREE: (((((cb3:0.310133,caeRem3:0.305886):0.079966,caePb2:0.328879):0.106520,ce5:0.345476):0.145426,caeJap1:0.426104):0.455636,priPac1:0.455636);
    /cluster/bin/phast/${MACHTYPE}/consEntropy --NH 9.78 0.75 40 ave.{,non}cons.mod
#( Solving for new omega: 40.000000 68.550455 62.213959 61.964188 )
#Transition parameters: gamma=0.750000, omega=40.000000, mu=0.025000, nu=0.075000
#Relative entropy: H=1.053667 bits/site
#Expected min. length: L_min=7.887543 sites
#Expected max. length: L_max=7.970539 sites
#Phylogenetic information threshold: PIT=L_min*H=8.310843 bits
#Recommended expected length: omega=61.964188 sites (for L_min*H=9.780000)
    # Hmmm... recommended expected-length went up a tad and PIT decreased 
    # a smidge, so if anything those measures seemed to worsen with
    # re-estimation.  Well, let's see what happens to coverage if we run with
    # the new model and 0.75 40 anyway.
    cp ave.{,non}cons.mod ..

    ssh swarm
    mkdir /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores
    mkdir ppRaw bed
    cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../../phastCons/ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
   --expected-lengths 40 --target-coverage 0.75 --quiet \
        --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # << emacs
    chmod +x doPhast

    # root1 == chrom name, file1 == ss file name without .ss suffix
    cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
    # << emacs

    # Create parasol batch and run it
    ls -1 ../../phastCons/ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
    gensub2 in.list single template jobList
    para create jobList
    para try, check, shove, time
#Completed: 29 of 29 jobs
#CPU time in finished jobs:        298s       4.96m     0.08h    0.00d  0.000 y
#IO & Wait Time:                    90s       1.50m     0.03h    0.00d  0.000 y
#Average job time:                  13s       0.22m     0.00h    0.00d
#Longest finished job:              19s       0.32m     0.01h    0.00d
#Submission to last job:           115s       1.92m     0.03h    0.00d

    # combine predictions and transform scores to be in 0-1000 interval
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores
    # The sed's and the sort get the file names in chrom,start order
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
    | sort -k7,7 -k9,9n \
    | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore /dev/stdin \
        > ../phastConsElements6way.bed
    # As Hiram found when trying even 0.60 as target-cov, there are many
    # warnings about negative scores.
    cd ..

    # Aiming for %70 coverage in the following featureBits measurement on CDS,
    # but as in ce6, will accept 61%:
    featureBits ce5 -enrichment sangerGeneLO:cds phastConsElements6way.bed
#sangerGeneLO:cds 25.324%, phastConsElements6way.bed 34.253%, both 16.522%, cover 65.24%, enrich 1.90x
    # OK, 65.2% is closer to 68.8% than 61%.
    # What is the actual distribution of conserved element sizes like?
    awk '{print $3 - $2;}' phastConsElements6way.bed | sort -n > sizes.txt
    # Average length:
    awk '{TOTAL += $1; COUNT++; }  END {printf "%d / %d = %.2f\n", TOTAL, COUNT, TOTAL/COUNT; }' \
      sizes.txt
#34349436 / 423298 = 81.15
    # Median:
    tail -n +`expr 423298 / 2` sizes.txt | head -2
#51
#51
    textHistogram sizes.txt -binSize=10
#Maximum value 3811.000000
    # slowly decaying exponential peaking in the 10-19 bin.
    # Repeated those measurements in ../phastCons -- avg. 49.8, median 31,
    # bigger peak at 10-19 and faster decay in the histogram.  
    # So expected-lengths definitely has an effect there... actual lengths
    # are bigger than expected, and really don't want to be as low as 15.  
    # hg18 17way avg=66.3, median=38.  dm3 15way avg=43.05, median=26.
    # ce2 phastConsElementsPaper (undoc'd): avg=269, median=175 -- but that
    # might have been with only a 2-way alignment w/briggsae. 

    # Load up wiggle and elements to visually compare with original run:
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned/run.scores
    cat << '_EOF_' > catScores.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p ../phastCons6wayScoresT1

for D in ppRaw/chr*
do
    C=${D/ppRaw\/}
    out=../phastCons6wayScoresT1/${C}.scores.gz
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
        | sort -k7,7 -k9,9n \
        | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat
done
'_EOF_'
    # << emacs
    chmod +x catScores.sh
    ./catScores.sh \
    | wigEncode stdin phastCons6wayT1.{wig,wib}
    mv phastCons6wayT1.{wig,wib} ..
#Converted stdin, upper limit 1.00, lower limit 0.00
    ssh hgwdev
    # Load gbdb and database with wiggle.
    cd /hive/data/genomes/ce5/bed/multiz6way/phastConsTuned
    ln -s `pwd`/phastCons6wayT1.wib /gbdb/ce5/wib/
    hgLoadWiggle ce5 phastCons6wayT1 phastCons6wayT1.wig
    hgLoadBed ce5 phastConsElements6wayT1 phastConsElements6way.bed
    # They look overinflated -- I see elements extending into gaps,
    # amplified score-humps over gaps (don't know why those are there at all).

    # Bottom line -- stick with the ce6 model and params from the original 
    # run above!


############################################################################
# ANNOTATE MULTIZ6WAY MAF AND RELOAD TABLE (DONE 1/16/09 angie)
    ssh swarm
    mkdir /hive/data/genomes/ce5/bed/multiz6way/anno
    cd /hive/data/genomes/ce5/bed/multiz6way/anno
    # these actually already all exist from previous multiple alignments
    # if there are done you will see an ls of them
    # or else the twoBitInfo command will be echoed, run it if you want
    for DB in `cat ../species.lst`
do
    CDIR="/hive/data/genomes/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
        echo "creating ${DB}.N.bed"
        echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
        ls -og ${CDIR}/${DB}.N.bed
    fi
done

    mkdir maf run
    cd run
    rm -f sizes nBeds
    for DB in `sed -e "s/ce5 //" ../../species.lst`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /hive/data/genomes/ce5/bed/multiz6way
set c = $1
cat $dir/maf/${c}.maf | nice \
    mafAddIRows -nBeds=nBeds stdin /hive/data/genomes/ce5/ce5.2bit $2
'_EOF_'
    # << emacs
    chmod +x doAnno.csh


    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs

    cut -f1 /hive/data/genomes/ce5/chrom.sizes > chrom.list
    gensub2 chrom.list single template jobList
#Completed: 7 of 7 jobs
#CPU time in finished jobs:         31s       0.52m     0.01h    0.00d  0.000 y
#IO & Wait Time:                    35s       0.58m     0.01h    0.00d  0.000 y
#Average job time:                   9s       0.16m     0.00h    0.00d
#Longest finished job:              12s       0.20m     0.00h    0.00d
#Submission to last job:            19s       0.32m     0.01h    0.00d

    # Load anno/maf
    ssh hgwdev
    cd /hive/data/genomes/ce5/bed/multiz6way/anno/maf
    mkdir -p /gbdb/ce5/multiz6way/anno/maf
    ln -s /hive/data/genomes/ce5/bed/multiz6way/anno/maf/*.maf \
      /gbdb/ce5/multiz6way/anno/maf
    time /cluster/bin/x86_64/hgLoadMaf \
        -pathPrefix=/gbdb/ce5/multiz6way/anno/maf ce5 multiz6way
#Loaded 418471 mafs in 7 files from /gbdb/ce5/multiz6way/anno/maf
#7.204u 0.610s 0:25.32 30.8%     0+0k 0+0io 4pf+0w

    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh kolossus
    cd /hive/data/genomes/ce5/bed/multiz6way/anno/maf
    cat *.maf \
    | hgLoadMafSummary ce5 -minSize=30000 -mergeGap=1500 \
        -maxSize=200000 -test multiz6waySummary stdin
#Created 45472 summary blocks from 1042942 components and 418471 mafs from stdin
#-rw-rw-r--  1 angie protein 2097539 Jan 16 15:39 multiz6waySummary.tab

    ssh hgwdev
    cd /hive/data/genomes/ce5/bed/multiz6way/anno/maf
    hgLoadSqlTab ce5 multiz6waySummary \
      ~/kent/src/hg/lib/mafSummary.sql multiz6waySummary.tab

    # Create per-chrom individual maf files for downloads
    ssh kolossus
    cd /hive/data/genomes/ce5/bed/multiz6way
    mkdir mafDownloads
    time for M in anno/maf/chr*.maf
    do
        B=`basename $M`
        nice cp -p ${M} mafDownloads/${B}
        nice gzip mafDownloads/${B}
        echo ${B} done
    done

    ssh hgwdev
    cd /hive/data/genomes/ce5/bed/multiz6way
    # There isn't any refGene table, using sangerGeneLO instead
#!/bin/sh
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits -verbose=2 ce5 \
        sangerGeneLO:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+//' | sort -k1,1 -k2,2n \
        | mafFrags ce5 multiz6way stdin stdout -orgs=species.lst \
        | gzip -c > mafDownloads/sangerGeneLO.upstream${S}.maf.gz
    echo "done sangerGeneLO.upstream${S}.maf.gz"
done
    cd mafDownloads
    md5sum *.gz > md5sum.txt

    # deliver to downloads
    mkdir /usr/local/apache/htdocs/goldenPath/ce5/multiz6way
    ln -s /hive/data/genomes/ce5/bed/multiz6way/mafDownloads/* \
      /usr/local/apache/htdocs/goldenPath/ce5/multiz6way/


#######################################################################
# MULTIZ6WAY MAF FRAMES (DONE 1/16/09 angie)
    ssh hgwdev
    mkdir /hive/data/genomes/ce5/bed/multiz6way/frames
    cd /hive/data/genomes/ce5/bed/multiz6way/frames
    mkdir genes
    #------------------------------------------------------------------------
    # get the genes for all genomes
    # using sangerGeneLO for ce5
    # using blastCe6SG for cb3, caePb2, caeRem3, caeJap1 priPac1
    # (Note the Ce6, not Ce5!  But the genes are the same as in sangerGeneLO)
    for qDB in ce5 cb3 caePb2 caeRem3 caeJap1 priPac1
    do
      if [ $qDB = "cb3" -o $qDB = "caePb2" -o $qDB = "caeRem3" -o $qDB = "caeJap1" -o $qDB = "priPac1" ]; then
        geneTbl=blastCe6SG
          echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
          hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
                > /scratch/tmp/$qDB.tmp.psl
            mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \
          | genePredSingleCover stdin stdout | gzip -2c \
            > /scratch/tmp/$qDB.tmp.gz
          rm -f /scratch/tmp/$qDB.tmp.psl
          mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      elif [ $qDB = "ce5" ]; then
        geneTbl=sangerGeneLO
          echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
          hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \
          | genePredSingleCover stdin stdout | gzip -2c \
            > /scratch/tmp/$qDB.tmp.gz
          mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      fi
    done

    ssh kolossus
    cd /hive/data/genomes/ce5/bed/multiz6way/frames
    cat ../maf/*.maf \
    | genePredToMafFrames ce5 stdin stdout \
        ce5 genes/ce5.gp.gz         cb3 genes/cb3.gp.gz \
        caePb2 genes/caePb2.gp.gz   caeRem3 genes/caeRem3.gp.gz \
        caeJap1 genes/caeJap1.gp.gz priPac1 genes/priPac1.gp.gz \
    | gzip > multiz6way.mafFrames.gz

    ssh hgwdev
    cd /hive/data/genomes/ce5/bed/multiz6way/frames
    hgLoadMafFrames ce5 multiz6wayFrames multiz6way.mafFrames.gz


#########################################################################
#########################################################################
