# for emacs: -*- mode: sh; -*-


# This file describes how to make the browser database for the
# worm C. briggsae
# Currently 2003-05-29 this file is in a HIGH state of flux as it is being
# worked out.

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-----------+----------------------+
#  | tableName | type                 |
#  +-----------+----------------------+
#  | twinscan  | genePred twinscanPep |
#  +-----------+----------------------+


DOWNLOAD SEQUENCE (DONE 2003-05-20 - Hiram)

    # next machine
    ssh eieio
    mkdir -p /cluster/store5/worm/cb1/sanger
    cd /cluster/store5/worm/cb1/sanger
    wget -o cb1.fetch.log -r -l1 --no-directories \
	ftp://ftp.sanger.ac.uk/pub/wormbase/cbriggsae/cb25.agp8/
#   Takes about eight minutes
#   These files seem to have not been updated since July 2002
#   Check what you received:
    ls
# cb1.fetch.log                    cb25.agp8.gff.tar.gz
# cb25.agp8.agp.gz                 cb25.agp8.reads.placed.gz
# cb25.agp8.contigs.fasta.gz       cb25.agp8.supercontigs.fasta.gz
# cb25.agp8.contigs.fasta.qual.gz  cb25.agp8.supercontigs.fasta.qual.gz
# cb25.agp8.fasta.gz               README

#   get out of this download data directory, and create your home symlink
#	shortcut
    cd ..
    ln -s /cluster/store5/worm/cb1 ~/cb1
    cd ~/cb1
    # unzip the small contigs and ultra contigs
    #	Will need both of those later
    zcat sanger/cb25.agp8.contigs.fasta.gz > contigs.fa
    zcat sanger/cb25.agp8.fasta.gz > ultra.fa
    # create an artifical chrUn agp file:  (does 1000 N gaps)
    rm -f ultra.agp
    rm -f ultra.lft
    scaffoldFaToAgp ultra.fa
    # gap size is 1000, total gaps: 578
    # chrom size is 109025926
    # writing ultra.agp
    # writing ultra.lft
    mkdir Un
    mv ultra.agp Un/chrUn.agp
    mv ultra.lft Un/chrUn.lft

    #---------------  NEW GAP STUFF ------------------------------
    # liftup sanger contig agp file to chrUn coordinates
    zcat sanger/cb25.agp8.agp.gz > Un/cb25.agp8.agp
    cd Un
    # NOTE: silly liftUp requires files to have same name as chrom
    # NOTE: the generated agp is no longer needed, save it anyway
    mv chrUn.agp chrUn.ultra.agp 
    liftUp -gapsize=1000 chrUn.agp chrUn.lft warn cb25.agp8.agp
    mv chrUn.agp chrUn.contig.agp
    #hgGoldGapGl -noGl cb1 ~/cb1 Un

    #---------------  END NEW GAP STUFF ------------------------------

    # hgGoldGapGl is an unusual command.  It is given a directory
    # argument and a sub directory argument.  It puts those two together
    # and then looks in that pathname for directory names of 1 or 2
    # characters, and then looks in those subdirectories for agp files.
    # it all comes from the structure of the hg* directory structure
    # and how the human chromosomes are split into their contigs
    #  SO, to handle hgGoldGapGl:
    mkdir Un/Un
    cd Un/Un
    ln -s ../*.agp .

    #  Will put together the chrUn.fa later with the masked contigs

#  Populate /iscratch/i for masking runs
#	For some types of operations we want the smaller contigs
#	For others we want the ultracontigs.  Since they are both
#	relatively small (about 100 M bases) we can have them both
#	exist out there
    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Cbriggsae
    mkdir -p /iscratch/i/worms/Cbriggsae/contigs
    mkdir -p /iscratch/i/worms/Cbriggsae/ultras
    cd ~/cb1
    cp -p contigs.fa /iscratch/i/worms/Cbriggsae
    cp -p ultra.fa /iscratch/i/worms/Cbriggsae
    faSplit sequence contigs.fa 1000 /iscratch/i/worms/Cbriggsae/contigs/c
    ~hiram/bin/i386/faSplit byname ultra.fa /iscratch/i/worms/Cbriggsae/ultras
    # I don't see Brian's faSplit byname function in the
    # /cluster/bin/i386/faSplit yet ?  (It's version is 19 July 2002)
    ~kent/bin/iSync

#  With /iscratch/i/worms/Cbriggsae populated, ready for RepeatMasker run
    # next machine
    ssh kk
    cd ~/cb1
    mkdir -p RMRun.ultras/out
    cd RMRun.ultras
    cat << '_EOF_' > RMWorm
#!/bin/csh -fe
#
#   $1 is full pathname to a contig .fa file
#   this directory . is the location to return results to

set bname = `basename $1`
/bin/mkdir -p /tmp/$bname
/bin/cp $1 /tmp/$bname/
pushd .
cd /tmp/$bname
/scratch/hg/RepeatMasker/RepeatMasker -ali -s -el $bname
popd
/bin/cp /tmp/$bname/$bname.out ./out
if (-e /tmp/$bname/$bname.align) /bin/cp /tmp/$bname/$bname.align ./out
# /bin/cp /tmp/$2*.masked ../masked/
rm -f /tmp/$bname/$bname
rm -f /tmp/$bname/$bname.*
rmdir /tmp/$bname
'_EOF_'

    chmod +x RMWorm

    cat << '_EOF_' > gsub
#LOOP
/cluster/store5/worm/cb1/RMRun.ultras/RMWorm {check in exists+ $(path1)}
#ENDLOOP
'_EOF_'

    ls /iscratch/i/worms/Cbriggsae/ultras/*.fa > briggsae.lst
    echo "" > dummy.lst
    gensub2 briggsae.lst dummy.lst gsub spec
    para create spec
    para try
    
# SIMPLE REPEAT [TRF] TRACK (DONE 2003-05-29 - Hiram)
    # Assuming ultra contings are already in
    # /iscratch/i/worms/Cbriggsae/ultras from RepeatMasker setup above

    # Create cluster parasol job:
    # next machine
    ssh kk
    mkdir -p ~/cb1/bed/simpleRepeat
    cd ~/cb1/bed/simpleRepeat
    mkdir trf
    ls -1S /iscratch/i/worms/Cbriggsae/ultras/*.fa > genome.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'

    echo "" > dummy.lst
    gensub2 genome.lst dummy.lst gsub spec
    para create spec
    para try
    para check
    para push
    ... etc ...

# PROCESS SIMPLE REPEATS INTO MASK (DONE 2003-05-29 - Hiram)
    # next machine
    ssh eieio
    cd ~/cb1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
	awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

    #  When cluster run is done, combine into one:
#XXXX    cat trf/*.bed > simpleRepeat.bed
    
    # Load into the database:
    # next machine
#XXXX    ssh hgwdev
#XXXX    cd ~/cb1/bed/simpleRepeat
#XXXX    /cluster/bin/i386/hgLoadBed cb1 simpleRepeat simpleRepeat.bed \
#XXXX      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql

# Create Soft and Hard masks from RepeatMaster and TRF outputs:
#  and rebuild the nib files
    # next machine
    ssh eieio
    mkdir ~/cb1/ultras
    mkdir ~/cb1/ultraNibs
    cd ~/cb1/ultras
    ~hiram/bin/i386/faSplit byname ../ultra.fa .
    mkdir ~/cb1/softMask
    cd ~/cb1/softMask
    foreach c (../ultras/*.fa)
        set b = $c:t
        set r = $b:r
        echo ${c} ${b} ${r}
	/cluster/bin/i386/maskOutFa ${c} ../RMRun.ultras/out/${b}.out \
		${b} -soft
	/cluster/bin/i386/maskOutFa ${b} \
		../bed/simpleRepeat/trfMask/${r}.bed \
		${b} -softAdd
	faToNib -softMask ${b} ../ultraNibs/${r}.nib
    end
    # combine them all to be the query sequence for blastz runs
    cat *.fa > ../allUltras.soft.fa

    #  With masked nib files ready, prepare cluster for blastz runs
    # next machine
    ssh kkr1u00
    cd ~/cb1
    mkdir -p /iscratch/i/worms/Cbriggsae/trfFa
    cp -p allUltras.soft.fa /iscratch/i/worms/Cbriggsae/trfFa
    mkdir -p /iscratch/i/worms/Cbriggsae/rmsk
    cd ~/cb1/RRun.ultras/out
    cp -p *.out /iscratch/i/worms/Cbriggsae/rmsk
    ~kent/bin/iSync

# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX  WORKING ON THIS 2003-06-10 XXXX
#   Un directory was created above from the scaffoldToAgp
    # next machine
    ssh eieio
    cd ~/cb1/Un
    /cluster/home/hiram/bin/i386/agpToFa -simpleMultiMixed chrUn.agp \
	chrUn chrUn.fa ../allUltras.soft.fa
    # $ faSize chrUn.fa
    # 109025926 bases (3588679 N's 105437247 real) in 1 sequences in 1 files
    # create nib file:
    cd ~/cb1
    mkdir nib
    /cluster/bin/i386/faToNib -softMask Un/chrUn.fa nib/chrUn.nib
    # Writing 109025926 bases in 54512971 bytes


# CREATING DATABASE (DONE 2003-06-10 - Hiram)

    # Create the database.
    # next machine
    ssh hgwdev
    echo 'create database cb1' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING
    # !!!
        echo 'drop database cb1' | hgsql cb1
    # Use df to make sure there is at least 5 gig free on
    df -h /var/lib/mysql

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-06-10 - Hiram)
    # next machine
    ssh hgwdev
    #  the following command copies all the data from the table
    #   grp in the database rn1 to our new database cb1
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql cb1
    # if you need to delete that table:   !!! WILL DELETE ALL grp data
    # !!!
        echo 'drop table grp;' | hgsql cb1

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 2003-05-20 - Hiram)

    # Make symbolic links from /gbdb/cb1/nib to the real nibs.
    # next machine
    # next machine
    ssh hgwdev
    mkdir -p /gbdb/cb1/nib
    ln -s  /cluster/store5/worm/cb1/nib/chrUn.nib /gbdb/cb1/nib

    # Load /gbdb/cb1/nib paths into database and save size info.
    hgsql cb1  < ~/kent/src/hg/lib/chromInfo.sql
    # if you need to delete that table:   !!! DELETES ALL DATA IN TABLE
    # !!!
        echo 'drop table chromInfo;' | hgsql cb1
    cd ~/cb1
    hgNibSeq -preMadeNib cb1 /gbdb/cb1/nib Un/chrUn.fa
# Processing Un/chrUn.fa to /gbdb/cb1/nib/chrUn.nib
# 109025926 total bases
#   Verify the hgNibSeq load functioned OK: 
    echo "select chrom,size from chromInfo" | hgsql -N cb1 > chrom.sizes
    cat chrom.sizes
#   Typical contents of chrom.sizes:
# chrUn   109025926

# Set up relational mrna tables.
    hgLoadRna new cb1
    # that created a bunch of tables.  If you need to delete them:
        echo 'drop table author; \
drop table cds; drop table cell; drop table description; \
drop table development; drop table extFile; drop table geneName; \
drop table history; drop table keyword; drop table library; drop table
mrna; \
drop table mrnaClone; drop table organism; drop table productName; \
drop table seq; drop table sex; drop table source; drop table tissue;' \
        | hgsql cb1
    # OR - hgLoadRna drop cb1

# MAKE GCPERCENT (DONE 2003-06-10 - Hiram)
    # next machine
     ssh hgwdev
     mkdir -p /cluster/store5/worm/cb1/bed/gcPercent
     cd /cluster/store5/worm/cb1/bed/gcPercent
     hgsql cb1  < ~/kent/src/hg/lib/gcPercent.sql
    #  If you need to delete that table created
        echo 'drop table gcPercent;' | hgsql cb1;
     hgGcPercent cb1 ../../nib
     hgGoldGapGl -noGl cb1 /cluster/store5/worm/cb1 Un
     # hgGoldGapGl looked in /cluster/store5/worm/cb1/Un for a 1 or 2
     # character directory name, then below that found the agp file !


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CB1 (DONE 2002-06-10 - Hiram)
    # next machine
     ssh hgwdev
    echo 'insert into defaultDb values("C. briggsae", "cb1");' \
      | hgsql -h genome-testdb hgcentraltest
    #  If you need to delete that entry:
        echo 'delete from defaultDb where name="cb1";' \
        | hgsql -h genome-testdb hgcentraltest
    # Note: for next assembly, set scientificName column to 
    # "Caenorhabditis briggsae"
    echo 'insert into dbDb values("cb1", "July 2002", \
        "/gbdb/cb1/nib", "Worm", "chrUn:74980670-74998831", 1, 10, \
        "C. briggsae");' \
        | hgsql -h genome-testdb hgcentraltest
    #  If you need to delete that entry:
        echo 'delete from dbDb where name="cb1";' \
        | hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add cb1 in all the right places and do
    make update
    # before you make alpha you must commit your trackDb/makefile
    # and any other trackDb/worm files
    cvs commit makefile
    make alpha

# CREATE REPEAT TRACKS (2003-06-10 - DONE - Hiram)

    ssh eieio
    cd ~/cb1

    # merge and lift up the repeatmasker output files to chrom
    # coordinates
    liftUp Un/chrUn.fa.out Un/chrUn.lft warn RMRun.ultras/out/*.out

    # load into the database (chrUn_rmsk table)
    ssh hgwdev
    hgLoadOut cb1 Un/chrUn.fa.out

    # lift the simple repeat output to chrom coordinates
    ssh eieio
    cd ~/cb1
    cd bed/simpleRepeat
    liftUp simpleRepeat.bed ~/cb1/Un/chrUn.lft warn trf/*.bed

    # load into the database (simpleRepeat table)
    ssh hgwdev
    cd ~/cb1/bed/simpleRepeat
    hgLoadBed cb1 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Loaded 32763 elements

# MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 2003-06-12 - Hiram)
    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Cbriggsae/mRNA
    mkdir -p /iscratch/i/worms/Cbriggsae/EST
    cd /iscratch/i/worms/Cbriggsae/mRNA
    # there are 33 sequences in mrna.fa, break up byname, each one to a file
    #	(the 100 argument is necessary, but ignored)
    faSplit byname \
	/cluster/store5/mrna.134/org/Caenorhabditis_briggsae/mrna.fa 100
    cd /iscratch/i/worms/Cbriggsae/EST
    # there are 2424 ESTs.  (The 100 argument is necessary but ignored)
    faSplit byname \
	/cluster/store5/mrna.134/org/Caenorhabditis_briggsae/est.fa 100
    ~kent/bin/iSync

    # next machine, small cluster is fine, there are few tiny jobs
    ssh kkr1u00
    cd ~/cb1/bed
    mkdir mrna est
    cd mrna
    mkdir psl
    ls -1S /iscratch/i/worms/Cbriggsae/bothMasksNib/chr*.nib > worm.lst
    ls -1S /iscratch/i/worms/Cbriggsae/mRNA/*.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 worm.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push		# there are only 33 jobs
    ... etc ...
# Average job time:                  22s       0.36m     0.01h    0.00d
# Longest job:                       25s       0.42m     0.01h    0.00d


    # cluster run done
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_mrna.psl \
      /dev/null
    pslSortAcc nohead chrom /tmp all_mrna.psl

    # load mrna tables
    # next machine
    ssh hgwdev
    cd ~/cb1/bed/mrna/chrom
    # names must be chr*_mrna.psl, there is only one
    mv chrUn.psl chrUn_mrna.psl
    # this load command rewrites these table entirely if they exist
    hgLoadPsl -noTNameIx cb1 *.psl
    mkdir /gbdb/cb1/mrna.134
    ln -s /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/mrna.fa \
	/gbdb/cb1/mrna.134
    # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
    hgLoadRna add -type=mRNA cb1 /gbdb/cb1/mrna.134/mrna.fa \
	/cluster/store5/mrna.134/org/Caenorhabditis_briggsae/mrna.ra
    cd ~/cb1/bed/mrna
    hgLoadPsl cb1 all_mrna.psl

    # /iscratch/i was setup with est.fa file above

    # Let's try the bluearc server
    ssh kk
    cd ~/cb1/bed/est
    mkdir psl
    ls -1S /cluster/bluearc/iscratch/i/worms/Cbriggsae/bothMasksNib/chr*.nib > worm.lst
    # names are too long for an ls all the way over in /cluster/bluearc
    ln -s /cluster/bluearc/iscratch/i/worms/Cbriggsae/EST cbaEST
    ls -1S cbaEST/* > est.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 worm.lst est.lst gsub spec
    para create spec
    para try
    para check
    para push
# Average job time:                  80s       1.33m     0.02h    0.00d
# Longest job:                      537s       8.95m     0.15h    0.01d
    #  I had trouble getting the cluster to finish these jobs
    #  They would appear to be running, but when I checked on them, they
    #  were not accumulating any CPU time.  Had to stop and re-push a
    #  number of times to get them all to complete

    # cluster run done
    # next machine
    ssh hgwdev
    cd ~/cb1/bed/est
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_est.psl \
      /dev/null
    pslSortAcc nohead chrom /tmp all_est.psl
    cd chrom
    # name must be chr*_est.psl
    mv chrUn.psl chrUn_est.psl
    # this load command rewrites these table entirely if they exist
    hgLoadPsl -noTNameIx cb1 *.psl

    ln -s /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/est.fa \
	/gbdb/cb1/mrna.134
    # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
    hgLoadRna add -mrnaType=EST -type=EST cb1 /gbdb/cb1/mrna.134/est.fa \
	/cluster/store5/mrna.134/org/Caenorhabditis_briggsae/est.ra
    cd ~/cb1/bed/est
    hgLoadPsl cb1 all_est.psl -nobin

# END ====  MAKING AND STORING mRNA AND EST ALIGNMENTS

# BEGINNING OF Ce1 BLASTZ   (DONE 2003-6-17 - Hiram)

    # next machine
    ssh kk
    mkdir -p ~/cb1/bed/blastzCe1
    cd ~/cb1/bed/blastzCe1
    cp ~angie/hummus/DEF.cb1-ce1.2003-06-17 DEF
#	source the DEF file which was carefully written as above.
    . ./DEF
    mkdir -p $BASE/run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    # that created xdir.sh and joblist run/j
    sh $BASE/xdir.sh
    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    cd $BASE/run
    # now edit j because it is not correct:
    sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
    wc -l j*
    head j2
    # *** make sure the j2 edits are OK, then use it:
    mv j2 j
    # para create will create the file: 'batch' for the cluster run
    para create j
    para try
    para check
    para push
    ... etc ...
Average job time:                1485s      24.75m     0.41h    0.02d
Longest job:                     3821s      63.68m     1.06h    0.04d

    # When that cluster run is done, results are in $BASE/raw/chr*/*
    # continuing with ~angie/hummus/Notes:
    # --- normalize and single_cov
    cd ~/cb1/bed/blastzCe1
    #   source the DEF file again in case you are coming back to this
    . ./DEF
    # a new run directory
    mkdir -p $BASE/run.1
    # another obscure script creates a new job list:
    ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
    cd $BASE/run.1
    # the job list is once again incorrect, edit it:
    sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
    wc -l j*
    head j2
    # make sure the edited j2 is OK, then use it:
    mv j2 j
    para create j
    para try
    para push;para check; ... etc ...
# Average job time:                 412s       6.86m     0.11h    0.00d
# Longest job:                      585s       9.75m     0.16h    0.01d

    # Translate the .lav files created by the end of ~angie/hummus Notes
    # into axt files
    ssh eieio
    set base="/cluster/store5/worm/cb1/bed/blastzCe1"
    set seq1_dir="/cluster/store5/worm/cb1/nib"
    set seq2_dir="/cluster/store5/worm/ce1/nib"
    set tbl="blastzCe1"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | /cluster/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | /cluster/bin/i386/axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      nice /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # load these blastz results
    # next machine
    ssh hgwdev
    cd ~/cb1/bed/blastzCe1/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx cb1 chr*_*.psl
# trackDb/worm/cb1/trackDb.ra entry:
#   track blastzCe1
#   shortLabel briggsae Blastz
#   longLabel Blastz C briggsae
#   group compGeno
#   priority 159
#   visibility dense
#   color 0,0,0
#   altColor 50,128,50
#   spectrum on
#   type psl xeno cb1

# CHAINING Briggsae blastz
    # next machine, small cluster is good for these tiny jobs
    ssh kkr1u00
    cd /cluster/store5/worm/cb1/bed/blastzCe1
    mkdir axtChain
    cd axtChain
    mkdir run1
    cd run1
    ls -1S ../../axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > doChain
#!/bin/csh
    axtChain $1  /cluster/store5/worm/cb1/nib /cluster/store5/worm/ce1/nib $2 > $3
'_EOF_'
    chmod a+x doChain

    mkdir out chain
    gensub2 input.lst single gsub spec
    para create spec
    para try
Average job time:                3175s      52.92m     0.88h    0.04d
Longest job:                     3175s      52.92m     0.88h    0.04d

    # now on the cluster server, sort chains
    # next machine
    ssh eieio
    cd ~/cb1/bed/blastzCe1/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd ~/cb1/bed/blastzCe1/axtChain/chain
    hgLoadChain cb1 chrUn_ce1Chain chrUn.chain

# END Blastz C Elegans
# 2004-05-07
# changed priorities and names for Ce1 blastz and chain tracks 
# trackDb/worm/cb1/trackDb.ra entry:
# track blastzCe1
# shortLabel Ce1 Blastz
# longLabel Blastz C. elegans (Ce1)
# group compGeno
# priority 156.2
# visibility dense
# color 0,0,0
# altColor 50,128,50
# spectrum on
# type psl xeno ce1
# otherDb ce1

# track ce1Chain
# shortLabel Ce1 Chain
# longLabel Chained elegans(Ce1)/briggsae Alignments
# group compGeno
# priority 156.1
# visibility dense
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type chain ce1
# otherDb ce1
                 
# C. elegans (Ce2) Blastz (DONE, 2004-05-06, hartera)
    # next machine
    ssh kkr1u00
    # blastz requires lineage-specific repeats but there are none for the worms
    # so create empty files for each chromsome and iSync
    mkdir -p /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
    cd /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
    # create empty chrUn.out.spec file if there is not one already here

    mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
    cd /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
    # create empty chrI.out.spec and cp to chrN.out.spec for chrII, chrIII, chrIV, chrV, chrX, chrM if not there alreday
    iSync

    ssh kk
    mkdir -p /cluster/data/cb1/bed/blastzCe2.2004-05-05
    cd /cluster/data/cb1/bed/
    ln -s blastzCe2.2004-05-05 blastzCe2
    cd blastzCe2
    
cat << '_EOF_' > DEF
# C. briggsae vs. C. elegans
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386
                                                                                
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1
#  when SMSK=/dev/null
BLASTZ_ABRIDGE_REPEATS=0
                                                                                
# TARGET
SEQ1_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
SEQ1_SMSK=/iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
# FLAG not currently used
SEQ1_FLAG=-worm
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
                                                                                
# QUERY
SEQ2_DIR=/iscratch/i/worms/Celegans2/nib
# RMSK not currently used
SEQ2_RMSK=/iscratch/i/worms/Celegans2/rmsk
SEQ2_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
# FLAG not currently used
SEQ2_FLAG=-worm
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
                                                                                
BASE=/cluster/store5/worm/cb1/bed/blastzCe2
                                                                                
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
                                                                                
#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    # Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.cb1-ce2.2004-05-05
 
    mkdir /cluster/data/cb1/jkStuff
    # Need shell scripts from ce2 to do cluster runs
    mv /cluster/data/ce2/jkStuff/BlastZ*.sh /cluster/data/cb1/jkStuff/

    # prepare first cluster run
    ssh kk
    cd /cluster/data/cb1/bed/blastzCe2
    source DEF
    /cluster/data/cb1/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# para time
# Completed: 154 of 154 jobs
# CPU time in finished jobs:     181038s    3017.31m    50.29h    2.10d  0.006 y
# IO & Wait Time:                  2433s      40.54m     0.68h    0.03d  0.000 y
# Average job time:                1191s      19.86m     0.33h    0.01d
# Longest job:                     3365s      56.08m     0.93h    0.04d
# Submission to last job:          5981s      99.68m     1.66h    0.07d

    #   Second cluster run to convert the .out's to .lav's
    cd /cluster/data/cb1/bed/blastzCe2
    source DEF
    /cluster/data/cb1/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# para time
# Completed: 11 of 11 jobs
# CPU time in finished jobs:        973s      16.21m     0.27h    0.01d  0.000 y
# IO & Wait Time:                    57s       0.96m     0.02h    0.00d  0.000 y
# Average job time:                  94s       1.56m     0.03h    0.00d
# Longest job:                      125s       2.08m     0.03h    0.00d
# Submission to last job:           125s       2.08m     0.03h    0.00d

    #   Third cluster run to convert lav's to axt's
    cd /cluster/data/cb1/bed/blastzCe2
    source DEF
    /cluster/data/cb1/jkStuff/BlastZ_run2.sh
    cd run.2
    para try
    para check
# para time
# CPU time in finished jobs:        240s       4.00m     0.07h    0.00d  0.000 y
# IO & Wait Time:                   359s       5.99m     0.10h    0.00d  0.000 y
# Average job time:                 599s       9.98m     0.17h    0.01d
# Longest job:                      599s       9.98m     0.17h    0.01d
# Submission to last job:           599s       9.98m     0.17h    0.01d

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/cb1/bed/blastzCe2
    mkdir -p pslChrom
    set tbl = "blastzCe2"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
 
   # Load database tables
    ssh hgwdev
    cd /cluster/data/cb1/bed/blastzCe2/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx cb1 chrUn_blastzCe2.psl
    # alignments have gone from 1668769 for Ce1 to 1256139 for Ce2

# trackDb/worm/cb1/trackDb.ra entry:
# track blastzCe2
# shortLabel Elegans Blastz
# longLabel Blastz C. elegans
# group compGeno
# priority 155.2
# visibility dense
# color 0,0,0
# altColor 50,128,50
# spectrum on
# type psl xeno ce2
# otherDb ce2

# CHAINING C. elegans blastz for Ce2 [DONE, 2004-05-06, hartera]
    # next machine
    ssh kk
    mkdir -p /cluster/data/cb1/bed/blastzCe2/axtChain/run1
    cd /cluster/data/cb1/bed/blastzCe2/axtChain/run1
    mkdir out chain
    
    ls -1S /cluster/data/cb1/bed/blastzCe2/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
                                                                                
    cat << '_EOF_' > doChain
#!/bin/csh
    axtFilter $1 | axtChain stdin \
        /iscratch/i/worms/Cbriggsae/bothMasksNib \
        /iscratch/i/worms/Celegans2/nib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    # only 1 job so all done by para try
    para try
    para check
# para time
# CPU time in finished jobs:       1649s      27.48m     0.46h    0.02d  0.000 y
# IO & Wait Time:                    15s       0.26m     0.00h    0.00d  0.000 y
# Average job time:                1664s      27.73m     0.46h    0.02d
# Longest job:                     1664s      27.73m     0.46h    0.02d
# Submission to last job:          1664s      27.73m     0.46h    0.02d

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/cb1/bed/blastzCe2/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    # User 48.840u
    # System 3.530s
    # Elapsed Real 0:54.94
 
    time chainSplit chain all.chain
    # User 49.300u
    # System 3.700s
    # Elapsed Real 0:55.55

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/cb1/bed/blastzCe2/axtChain/chain
    hgLoadChain cb1 chrUn_chainCe2 chrUn.chain

# trackDb/worm/cb1/trackDb.ra entry:
# track chainCe2
# shortLabel Elegans Chain
# longLabel Chained elegans/briggsae Alignments
# group compGeno
# priority 155.1
# visibility dense
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type chain ce2

# ADD TWINSCAN PREDICTIONS (DONE, 2004-11-30, hartera)
    # Provided by Chaochun Wei: wei@cse.wustl.edu
    # These are gene predictions produced by Iscan (new version of Twinscan)
    # C. briggsae/C. elegans (version WS98) homology used for predictions.
    ssh eieio
    mkdir -p /cluster/data/cb1/bed/twinscan
    cd /cluster/data/cb1/bed/twinscan
    wget --timestamp \
         http://genes.cs.wustl.edu/predictions/worm/C_briggsae/cb25.agp8_11_19_2004/cb25.agp8_11_19_2004.tgz
    tar xvzf cb25.agp8_11_19_2004.tgz
    # Add '.a' to end of protein fasta id's, to match gtf transcript_id's:
    cp /dev/null twinscanPep.fa
    perl -wpe 's/^(>\S+).*/$1.a/' chr_ptx/*.ptx > twinscanPep.fa
    # need to do lift up from contigs to chrUn
    cat ./chr_gtf/cb*.gtf >> cbAll.gtf
    liftUp cbAll.lifted.gtf /cluster/data/cb1/Un/chrUn.lft warn cbAll.gtf
    # Got 1156 lifts in /cluster/data/cb1/Un/chrUn.lft
    # load.
    ssh hgwdev
    cd /cluster/data/cb1/bed/twinscan
    ldHgGene -gtf -genePredExt cb1 twinscan cbAll.lifted.gtf
    # Read 25001 transcripts in 175327 lines in 1 files
    # 25001 groups 1 seqs 1 sources 3 feature types
    # 25001 gene predictions
    hgPepPred cb1 generic twinscanPep twinscanPep.fa
# Add track to trackDb.ra for cb1
   
#######################################################################
# TWINSCAN CLEANUP (DONE, 2007-06-25, hartera)
    ssh kkstore02
    cd /cluster/store5/worm/cb1/bed/twinscan
    # remove cbAll.gtf as same as in chr_gtf directory
    rm cbAll.gtf
    rm *.tab
    # gzip other files
    gzip chr_gtf/*.gtf chr_ptx/*.ptx chr_tx/*.tx
    
#######################################################################
# Ce2 BLASTZ CLEANUP (DONE, 2007-06-25, hartera)
    ssh kkstore02
    cd /cluster/store5/worm/cb1/bed/blastzCe2.2004-05-05/pslChrom
    rm psl.tab
    gzip chrUn_blastzCe2.psl
    cd /cluster/store5/worm/cb1/bed/blastzCe2/lav/chrUn
    gzip *.lav
    cd /cluster/store5/worm/cb1/bed/blastzCe2/axtChain
    gzip all.chain
    # this is the same as the chrUn.chain since there is only one chrom
    # so the chain directory can be removed. 
    rm -r chain
    cd run1/chain/
    gzip chrUn.chain
    cd ..
    rm -r err
    rm batch batch.bak para.results
    cd /cluster/store5/worm/cb1/bed/blastzCe2/axtChrom
    gzip chrUn.axt
     

#######################################################################
# redo genbank alignments, as update was never enabled. (2007-11-15 markd)

    cd kent/src/hg/makeDb/genbank
    # edit etc/genbank.conf and add
    cb1.refseq.mrna.xeno.load = yes
    cvs commit etc
    make etc-update

    # do alignment
    ssh genbank
    cd /cluster/data/genbank
    ((nice ./bin/gbAlignStep -initial cb1)|&mail markd&)

    # the next day..
    ssh hgwdev
    cd /cluster/data/genbank
    ((nice ./bin/gbDbLoadStep -drop -initialLoad cb1)|&mail markd&)
