# for emacs: -*- mode: sh; -*-


# This file describes how to make the browser database for the
# worm C. elegans
# Currently 2003-05-28 this file is in a state of flux as it is being
# worked out.

DOWNLOAD SEQUENCE (DONE 2003-05-12 - Hiram)

    # next machine
    ssh eieio
    mkdir -p /cluster/store5/worm/ce1/sanger
    mkdir -p /cluster/store5/worm/ce1/tmp
    cd /cluster/store5/worm/ce1/sanger
    wget -o ce1.fetch.log -r -l1 --no-directories \
	ftp://ftp.sanger.ac.uk/pub/wormbase/current_release/CHROMOSOMES/
#   Takes about five minutes
#   This current_release is updated every two weeks.  Although the
#	sequence is quite stable at this time and changes very little.
#   Check that you have some valid files. Should be chroms I II III IV V X
#	in dna, gff and agp formats
    ls -ogrt
#   get out of this download data directory, and create your home symlink
#	shortcut
    cd ..
    ln -s /cluster/store5/worm/ce1 ~/ce1
    cd ~/ce1
#   There is a Mitochondria sequence.  It can be found at NCBI
#	This search sequence at NCBI in 'nucleotide' selects one
#	entry: X54252:
#	Caenorhabditis Elegans[Organism] AND mitochondria AND MTCE
#	I'd like to figure out a wget command to fetch this, but doing
#	that manually via the WEB browser is not such a big deal.  And
#	besides, once it is fetched, it is stable (haven't seen it
#	change in years) so you can
#	merely copy from a previous Ce build.  A copy has been manually
#	placed in ~/ce1/ncbi/x54252.fa
#	

#   translate to unzipped .fa, all upper case, and
#	rename the agp files so hgGoldGap can find them
    mkdir sangerFa
    sed -e "s/^>.*/>chrM/" ./ncbi/x54252.fa > sangerFa/chrM.fa
    foreach f (I II III IV V X)
	echo -n "${f} "
	zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \
		sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa
    	mkdir sangerFa/${f}
        ln -s ~/ce1/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp
    end
    # hgGoldGap does not handle dir names over 2 characters:
    mv sangerFa/III sangerFa/3

# CREATE AN .agp FILE FOR ChrM (DONE, 2004-04-19, hartera)

    ssh hgwdev
    cd /cluster/data/ce1/sanger
    cat << '_EOF_' > CHROMOSOME_M.agp
M       1       13794   1       F       X54252.1      1       13794   +
'_EOF_'
    mkdir -p /cluster/data/ce1/sangerFa/M
    ln -s /cluster/data/ce1/sanger/CHROMOSOME_M.agp \
           /cluster/data/ce1/sangerFa/M/chrM.agp
    
# CREATING DATABASE (DONE 2003-05-12 - Hiram)

    # Create the database.
    # next machine
    ssh hgwdev
    echo 'create database ce1' | hgsql ''
    # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
	echo 'drop database ce1' | hgsql ce1
    # Use df to ake sure there is at least 5 gig free on 
    df -h /var/lib/mysql
# Before loading data:
# Filesystem            Size  Used Avail Use% Mounted on
# /dev/sda1             472G  389G   58G  87% /var/lib/mysql


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-05-12 - Hiram)
    # next machine
    ssh hgwdev
    #  the following command copies all the data from the table
    #	grp in the database rn1 to our new database ce1
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql ce1
    # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
	echo 'drop table grp;' | hgsql ce1

# PREPARE Split contigs into 100,000 bp chunks for cluster runs
    # next machine
    ssh eieio
    cd ~/ce1
    rm -fr ./split
    mkdir split
    foreach f (I II III IV V X M)
	mkdir split/$f
        faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft
    end
# 151 pieces of 151 written
# 153 pieces of 153 written
# 138 pieces of 138 written
# 175 pieces of 175 written
# 210 pieces of 210 written
# 178 pieces of 178 written
# 1 pieces of 1 written
    cat split/*.lft > liftAll.lft

    # copy them to /iscratch/i for cluster rsync
    # next machine
    ssh kkr1u00
    cd ~/ce1/split
    foreach c (I II III IV V X M)
	echo -n "${c} "
	mkdir -p /iscratch/i/worms/Celegans/unmaskedSplit/${c}
        cp -p ${c}/c*.fa /iscratch/i/worms/Celegans/unmaskedSplit/${c}
    end
    ~kent/bin/iSync

# Run RepeatMasker on the chromosomes (DONE 2003-06-04 - Hiram)
    # next machine
    ssh kk
    cd ~/ce1
    mkdir RMRun
    cd RMRun
    # create the script to use for the RepeatMasker run
    cat << '_EOF_' > RMWorm
#!/bin/csh -fe
#
#       This is a slight rearrangement of the
#       /cluster/bin/scripts/RMWorm used in makeHg15.doc
#       The results here need to go to a different location 
#       $1 == chrom name: I II III IV V X M
#       $2 == directory where split contig .fa is found
#       $3 == name of contig .fa file
cd $1
pushd .
cd $2
/bin/mkdir -p /tmp/$3/$1
/bin/cp $3 /tmp/$3/$1
cd /tmp/$3/$1
/scratch/hg/RepeatMasker/RepeatMasker -s -el -ali $3
popd
/bin/cp /tmp/$3/$1/$3.out ./
if( -e /tmp/$3/$1/$3.align ) /bin/cp /tmp/$3/$1/$3.align ./
# /bin/cp /tmp/$2*.masked ../masked/
/bin/rm -r /tmp/$3/$1
/bin/rmdir --ignore-fail-on-non-empty /tmp/$3
'_EOF_'
    chmod +x RMWorm

    # create job list
    rm -f RMJobs
    foreach c (I II III IV V X M)
	mkdir ~/ce1/RMRun/${c}
        foreach t ( /iscratch/i/worms/Celegans/unmaskedSplit/$c/c*.fa )
	    set d = $t:h
	    set f = $t:t
	    echo ./RMWorm ${c} ${d} ${f} \
	    '{'check out line+ ~/ce1/RMRun/$c/${f}.out'}' \
	    >> RMJobs
	end
    end
    para create RMJobs
    para try
    para check
    # when they are finished, liftUp and load the .out files into the database:
    # next machine
    ssh eieio
    cd ~/ce1/RMRun
    foreach c (I II III IV V X M)
	liftUp chr${c}.fa.out ../split/chr${c}.lft warn ${c}/*.fa.out
    end
    
    # next machine
    ssh hgwdev
    cd ~/ce1/RMRun
    hgLoadOut ce1 chr*.fa.out
    # Noticed one error in this load:
    # Processing chrI.fa.out
    # Strange perc. field -0.6 line 791 of chrI.fa.out


# SIMPLE REPEAT [TRF] TRACK (DONE 2003-06-05 - Hiram)
    # ensure chr*.fa files exist on /iscratch/i
    # next machine
    ssh kkr1u00
    cp -p /cluster/store5/worm/ce1/sangerFa/*.fa \
	/iscratch/i/worms/Celegans/unmaskedFa
    ~kent/bin/iSync

    # Create cluster parasol job:
    # next machine
    ssh kk
    mkdir -p ~/ce1/bed/simpleRepeat
    cd ~/ce1/bed/simpleRepeat
    mkdir trf
    ls -1S /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > genome.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'

    echo "" > dummy.lst
    gensub2 genome.lst dummy.lst gsub spec
    para create spec
    para try    # there are only 7, so this runs them all
    para check
Average job time:                 468s       7.80m     0.13h    0.01d
Longest job:                      963s      16.05m     0.27h    0.01d

    #  When cluster run is done, combine into one:
    cat trf/*.bed > simpleRepeat.bed
    
    # Load into the database:
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/simpleRepeat
    /cluster/bin/i386/hgLoadBed ce1 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    # Loaded 28590 elements

# PROCESS SIMPLE REPEATS INTO MASK (DONE 2003-06-05 - Hiram)
    # next machine
    ssh eieio
    cd ~/ce1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
	awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

# Create Soft and Hard masks from RepeatMaster and TRF outputs:
#  and rebuild the nib files
    # next machine
    ssh eieio
    cd ~/ce1
    mkdir softMask
    cd ~/ce1
    foreach c (I II III IV V X M)
        echo -n "masking chr${c} "
	/cluster/bin/i386/maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
		softMask/chr${c}.fa -soft
	/cluster/bin/i386/maskOutFa softMask/chr${c}.fa \
		bed/simpleRepeat/trfMask/chr${c}.bed \
		softMask/chr${c}.fa -softAdd
		faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
    end
    # output:
    # masking chrI Writing 15080473 bases in 7540245 bytes
    # masking chrII Writing 15279303 bases in 7639660 bytes
    # masking chrIII Writing 13783268 bases in 6891642 bytes
    # masking chrIV Writing 17493790 bases in 8746903 bytes
    # masking chrV Writing 20922238 bases in 10461127 bytes
    # masking chrX Writing 17705013 bases in 8852515 bytes
    # masking chrM Writing 13794 bases in 6905 bytes

    # unknown if hardMasks are actually needed for anything
    mkdir hardMask
    foreach c (I II III IV V X M)
        echo "masking chr${c}"
	/cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
		hardMask/chr${c}.fa
    end

    #  With masked nib files ready, prepare cluster for blastz runs
    # next machine
    ssh kkr1u00
    cd ~/ce1/softMask
    mkdir -p /iscratch/i/worms/Celegans/bothMasksFa
    mkdir -p /iscratch/i/worms/Celegans/bothMasksNib
    cp -p *.fa /iscratch/i/worms/Celegans/bothMasksFa
    cd ~/ce1/nib
    cp -p c*.nib /iscratch/i/worms/Celegans/bothMasksNib
    ~kent/bin/iSync

STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 2003-05-12 - Hiram)

    # Make symbolic links from /gbdb/ce1/nib to the real nibs.
    # next machine
    ssh hgwdev
    mkdir -p /gbdb/ce1/nib
    foreach f (/cluster/store5/worm/ce1/nib/*.nib)
      ln -s $f /gbdb/ce1/nib
    end
    cd ~/ce1/tmp
    # Load /gbdb/ce1/nib paths into database and save size info.
    hgsql ce1  < ~/kent/src/hg/lib/chromInfo.sql
    # if you need to delete that table:   !!! DELETES ALL DATA IN TABLE !!!
	echo 'drop table chromInfo;' | hgsql ce1
    hgNibSeq -preMadeNib ce1 /gbdb/ce1/nib sangerFa/chr*.fa
# Typical output:
# Processing chrI.fa to /gbdb/ce1/nib/chrI.nib
# Processing chrII.fa to /gbdb/ce1/nib/chrII.nib
# Processing chrIII.fa to /gbdb/ce1/nib/chrIII.nib
# Processing chrIV.fa to /gbdb/ce1/nib/chrIV.nib
# Processing chrM.fa to /gbdb/ce1/nib/chrM.nib
# Processing chrV.fa to /gbdb/ce1/nib/chrV.nib
# Processing chrX.fa to /gbdb/ce1/nib/chrX.nib
# 100277879 total bases

#   Verify the hgNibSeq load functioned OK:
    echo "select chrom,size from chromInfo" | hgsql -N ce1 > chrom.sizes
    cat chrom.sizes
#   Typical contents of chrom.sizes:
chrI    15080473
chrII   15279303
chrIII  13783268
chrIV   17493790
chrM    13794
chrV    20922238
chrX    17705013

    # Set up relational mrna tables.
    hgLoadRna new ce1
    # that created a bunch of tables.  If you need to delete them:
	hgLoadRna drop ce1

MAKE GCPERCENT AND GAP tracks (DONE 2003-05-12 - Hiram)
    # next machine
     ssh hgwdev
     mkdir -p /cluster/store5/worm/ce1/bed/gcPercent
     cd /cluster/store5/worm/ce1/bed/gcPercent
     hgsql ce1  < ~/kent/src/hg/lib/gcPercent.sql
    #  If you need to delete that table created
	echo 'drop table gcPercent;' | hgsql ce1;
     hgGcPercent ce1 ../../nib
    # hgGoldGap does not handle dir names over 2 characters
    #	directory III has been moved to 3 when all this was created above
     hgGoldGapGl -noGl ce1 /cluster/store5/worm/ce1 sangerFa
    # there are no gaps in these chromosomes.  They are all making
    # empty tables except for chrM.  We tried running the browser
    # with these empty tables removed, but then it complained when
    # we tried to display the assembly track.  The browser needs to
    # be fixed.  In the meantime, for completeness, and to allow
    # featureBits to function, create an empty chrM_gap:
    cat << '_EOF_' | hgsql ce1
CREATE TABLE chrM_gap (
     bin smallint not null,
     chrom varchar(255) not null,
     chromStart int unsigned not null,
     chromEnd int unsigned not null,
     ix int not null,
     n char(1) not null,
     size int unsigned not null,
     type varchar(255) not null,
     bridge varchar(255) not null,
     UNIQUE(chromStart),
     UNIQUE(chromEnd)
);
'_EOF_'
    #  hgGoldGapGl needs to be fixed so it can create a gold track
    #	and not need to create a gap track.  And the browser needs to
    #	be fixed so it can display the assembly track without the need
    #	for the gap tables to exist.

# REMAKE GAP TRACKS - ADD POSITIONS OF Ns TO ChrN_gap TABLES 
# (DONE, 2004-04-30, hartera)
# Add Ns to gap tables as for Ce2, this is described on Gap details page
    # next machine
    ssh hgwdev
    mkdir -p /cluster/data/ce1/bed/gap
    cd /cluster/data/ce1/bed/gap
    # finds motifs and finds location of gaps as part of output
    foreach c (I II III IV V X M)
        findMotif -chr=chr${c} -verbose=4 -motif=gcatg \
            /gbdb/ce1/nib >& chr${c}Bed.stderr
    end
    grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed
    # All the gap tables are empty so 
    # Load in gap.bed
    # Need to add extra fields to gap.bed file
                                                                                
    cat << '_EOF_' > /cluster/data/ce1/scripts/createGapFile.pl
#!/usr/bin/perl -w
use strict;
                                                                                
my $oldchr = "";
while (<STDIN>) {
  my @fields = split(/\t/);
  my $chr = $fields[0];
  if ($chr ne $oldchr) {
     open(OUT, ">$chr"."_gap.bed");
     $oldchr = $chr;
  }
  print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
}
'_EOF_'

    perl /cluster/data/ce1/scripts/createGapFile.pl < gap.bed
    # load into relevant tables
    foreach c (I II III IV V X M)
        hgLoadBed -tab -oldTable ce1 chr${c}_gap chr${c}_gap.bed
    end

# MAKE chrM_gold TABLE FROM NEW chrM.agp FILE CREATED ABOVE 
# (DONE, 2004-04-19, hartera)
    # next machine
    ssh hgwdev
    cd /cluster/data/ce1/
    hgGoldGapGl -noGl -chrom=chrM ce1 /cluster/data/ce1 sangerFa

MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE1 (DONE 2002-05-12 - Hiram)
    # next machine
     ssh hgwdev
    echo 'insert into defaultDb values("C. elegans", "ce1");' \
      | hgsql -h genome-testdb hgcentraltest
    #  If you need to delete that entry:
	echo 'delete from defaultDb where name="ce1";' \
        | hgsql -h genome-testdb hgcentraltest
    # Note: for next assembly, set scientificName column to 
    # "Caenorhabditis elegans"

    echo 'insert into dbDb values("ce1", "May 2003", \
	"/gbdb/ce1/nib", "Worm", "chrII:14642289-14671631", 1, 10, \
	"C. elegans");' \
	| hgsql -h genome-testdb hgcentraltest
    #  If you need to delete that entry:
	echo 'delete from dbDb where name="ce1";' \
	| hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add ce1 in all the right places and do
    make update
    make alpha
    cvs commit makefile

MAKE SANGER GENE TRACK (DONE - 2003-05-19 - Hiram)

    # next machine
    ssh eieio
    mkdir -p /cluster/store5/worm/ce1/bed/sangerGene
    cd /cluster/store5/worm/ce1/bed/sangerGene
    # the perl trims these files down to a reasonable size.  As they are
    #	they cause ldHgGene to hangup due to memory size.
    foreach f (I II III IV V X)
	echo -n "${f} "
	zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
	sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
	-e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
	-e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
	-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
	perl -ne '@a = split; \
        print if($a[1] =~ /curated|DNA|RNA/i && \
        $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
    end
    echo
    #  check file sizes, should be reasonable
    ls -ogrt
# [hiram@eieio sangerGene]$ ls -ogrt
# total 22656
# -rw-r--r--    1 hiram     3495873 May 19 11:52 chrI.gff
# -rw-r--r--    1 hiram     3670984 May 19 11:54 chrII.gff
# -rw-r--r--    1 hiram     3264917 May 19 11:56 chrIII.gff
# -rw-r--r--    1 hiram     3642256 May 19 11:59 chrIV.gff
# -rw-r--r--    1 hiram     4704564 May 19 12:02 chrV.gff
# -rw-rw-r--    1 hiram     3787721 May 19 12:03 chrX.gff

    # now load database with those transformed gff files
    # next machine
    ssh hgwdev
    cd /cluster/store5/worm/ce1/bed/sangerGene
    ldHgGene ce1 sangerGene *.gff
# Read 37330 transcripts in 411148 lines in 6 files
#  37330 groups 6 seqs 11 sources 6 feature types
#  22128 gene predictions

    # if you need to delete that table:
	echo 'drop table sangerGene' | hgsql ce1

# MAKING AND STORING mRNA AND EST ALIGNMENTS (WORKING 2003-05-28 - Hiram)
    # next machine
    ssh kkr1u00
    cd /iscratch/i/worms/Celegans
    mkdir mRNA
    # there are 1644 sequences in mrna.fa, the 1000000 makes 1644 files
    faSplit sequence /iscratch/i/mrna.134/Caenorhabditis_elegans/mrna.fa \
		1000000 mRNA/m
    ~kent/bin/iSync

    # next machine
    ssh kk
    cd ~/ce1/bed
    mkdir mrna est
    cd mrna
    mkdir psl
    ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
    ls -1S /iscratch/i/worms/Celegans/mRNA/m*.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 worm.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    ... etc ...
    #  These jobs are a bit on the short side.  I wonder if the mrna
    #  fa's could be put together in larger chunks ?
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest job:                       63s       1.05m     0.02h    0.00d


    # cluster run done
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_mrna.psl \
      /dev/null
    pslSortAcc nohead chrom /tmp all_mrna.psl

    # load mrna tables
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/mrna/chrom
    # names must be chr*_mrna.psl
    foreach d (I II III IV V X M)
	rm -f chr${d}_mrna.psl
	mv chr${d}.psl chr${d}_mrna.psl
    end
    # this load command rewrites these table entirely if they exist
    hgLoadPsl ce1 *.psl
    mkdir /gbdb/ce1/mrna.134
    ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa \
	/gbdb/ce1/mrna.134
    # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
    cd ~/ce1/bed/mrna
    hgLoadRna add -type=mRNA ce1 /gbdb/ce1/mrna.134/mrna.fa \
	/cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.ra
    hgLoadPsl ce1 all_mrna.psl

    # setup /iscratch/i with the est.fa file for blat run
    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Celegans/EST
    cd /iscratch/i/worms/Celegans/EST
    # create about a 200 files to provide about 1400 jobs to the cluster
    #	(these jobs are very small despite this small number of est files)
    faSplit sequence \
	/cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa 200 e
    ~kent/bin/iSync

    # next machine
    ssh kk
    cd ~/ce1/bed/est
    mkdir psl
    ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
    ls -1S /iscratch/i/worms/Celegans/EST/e*.fa > est.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 worm.lst est.lst gsub spec
    para create spec
    para try
    para check
    para push
    # These 1358 jobs are pretty short:
# Average job time:                  43s       0.71m     0.01h    0.00d
# Longest job:                       77s       1.28m     0.02h    0.00d

    # cluster run done
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/est
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_est.psl \
      /dev/null
    pslSortAcc nohead chrom /tmp all_est.psl
    cd chrom
    # names must be chr*_est.psl
    foreach d (I II III IV V X M)
	rm -f chr${d}_est.psl
	mv chr${d}.psl chr${d}_est.psl
    end
    # this load command rewrites these table entirely if they exist
    hgLoadPsl ce1 *.psl

    ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa \
	/gbdb/ce1/mrna.134
    # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
    cd ~/ce1/bed/est
    hgLoadRna add -mrnaType=EST -type=EST ce1 /gbdb/ce1/mrna.134/est.fa \
	/cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.ra
    hgLoadPsl ce1 all_est.psl -nobin

MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE1 (DONE - 2003-05-29 - Hiram)
    # next machine
    ssh hgwdev
    # DNA port is "0", trans prot port is "1"
    echo 'insert into blatServers values("ce1", "blat10", "17784", "0"); \
          insert into blatServers values("ce1", "blat10", "17785", "1");' \
      | hgsql -h genome-testdb hgcentraltest
        # if you need to delete those entries
	echo 'delete from blatServers where db="ce1";' \
        | hgsql -h genome-testdb hgcentraltest
        # to check the entries:
	echo 'select * from blatServers where db="ce1";' \
        | hgsql -h genome-testdb hgcentraltest

PRODUCING FUGU FISH ALIGNMENTS  (DONE 2003-06-05 - Hiram)
    # Assumes masked NIBs have been prepared as above
    # and Fugu pieces are already on kluster /iscratch/i.
    # next machine
    ssh kk
    mkdir -p ~/ce1/bed/blatFugu
    cd ~/ce1/bed/blatFugu
    mkdir psl
    ls -1S /iscratch/i/fugu/*.fa > fugu.lst
    ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -q=dnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 worm.lst fugu.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check
# Average job time:                 237s       3.95m     0.07h    0.00d
# Longest job:                     1326s      22.10m     0.37h    0.02d

    # want names chrI_blatFugu
    #  When cluster run is done, sort alignments
    # next machine
    ssh eieio
    cd ~/ce1/bed/blatFugu
    foreach d (I II III IV V X M)
        echo -n "${d} "
	pslCat psl/chr${d}_*.psl | \
	pslSortAcc nohead chrom temp stdin
	rm -f chrom/chr${d}_blatFugu.psl
	mv chrom/chr${d}.psl chrom/chr${d}_blatFugu.psl
    end

    # next machine
    ssh hgwdev
    cd ~/ce1/bed/blatFugu/chrom
    hgLoadPsl ce1 chr*_blatFugu.psl

    # Make fugu /gbdb/ symlink and load Fugu sequence data.
    mkdir /gbdb/ce1/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/ce1/fuguSeq
    # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
    cd ~/ce1/bed/blatFugu
    hgLoadRna addSeq ce1 /gbdb/ce1/fuguSeq/fugu_v3_mask.fasta

# RUN Waba alignment with briggsae  (DONE - 2003-06-02 - Hiram)
    # prepare contigs from C. briggsae
    #   Assumes C. briggsae data has been downloaded according to
    #   makeCb1.doc
    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Cbriggsae/waba_contigs
    cd /iscratch/i/worms/Cbriggsae/waba_contigs
    faSplit sequence ../contigs.fa 2000 c
    ~kent/bin/iSync
    
    # next machine
    ssh kk
    mkdir ~/ce1/bed/waba/out
    cd ~/ce1/bed/waba
    ls /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst
    ls /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > elegans.lst
    echo "" > dummy.lst
    # create scripts to be used here
    cat << '_EOF_' > wabaRun
#!/bin/csh -fe
#
#	$1 - full pathname to a briggsae contig
#	$2 - file path to an elegans chrom.fa
#	$3 - result file full pathname
#
set f = $1:t
set chr = $2:t
set d = $chr:r
mkdir -p /tmp/$d/$f
cp $1 /tmp/$d/$f
pushd .
cd /tmp/$d/$f
set t = $f:r
/cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1
/cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2
/cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab
cp $t.wab $3
popd
rm -f /tmp/$d/$f/$t.*
rmdir --ignore-fail-on-non-empty /tmp/$d/$f
rmdir --ignore-fail-on-non-empty /tmp/$d
'_EOF_'
    chmod +x wabaRun

    echo
    cat << '_EOF_' > gsub
#LOOP
/cluster/store5/worm/ce1/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce1/bed/waba/out/$(root2)/$(root1).wab}
#ENDLOOP
'_EOF_'

    gensub2 briggsae.lst elegans.lst gsub spec
    para create spec
    para try
    para check
    para push
    ... etc ...
    # one of the jobs takes quite a while.  Most of the others are OK:
# Average job time:                2308s      38.46m     0.64h    0.03d
# Longest job:                    47442s     790.70m    13.18h    0.55d

    # next machine
    ssh hgwdev
    # you may need to build hgWaba
    cd ~/kent/src/hg/makeDb/hgWaba
    make
    # hgWaba.c has been adjusted to avoid the squeezed assert by
    #  ignoring them.  That may want to be looked at some day.
    #  (The waba cluster run has a number of errors too.  waba is not
    #	perfect, but it is as good as the Intronerator was...)

    cd ~/ce1/bed/waba
    # one of the outputs produces a result that won't process properly
    # with hgWaba.  Move it out of the way
    mv out/chrII/c0911.wab out/chrII_c0911.wab.broken
 
    mkdir Load
    cd Load
    #  The cat through the pipe to hgWaba will avoid making large files
    # 	that are not needed.
    cat << '_EOF_' > loadEm.sh
#!/bin/sh
#
for c in I II III IV V X M
do
	echo -n "${c} "
        cat /cluster/store5/worm/ce1/bed/waba/out/chr${c}/c*.wab |
        ~/bin/i386/hgWaba ce1 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
done

exit 0
'_EOF_'

    # run it to load the waba track:
    ./loadEm.sh
    # remove garbage temp file:
    rm full_waba.tab
    rm chrom_waba.tab

    # worm/ce1/trackDb.ra entry:
#   track cbrWaba
#   shortLabel Briggsae Waba
#   longLabel C. briggsae Waba Alignments
#   group genes
#   priority 50
#   visibility dense
#   color 140,0,200
#   altColor 210,140,250

# BEGINNING OF Cb1 BLASTZ

    # next machine
    ssh kkr1u00
    #  These nibs were created in the combined RepeatMasker and TRF above
    mkdir -p /iscratch/i/worms/Celegans/rmsk
    cd /iscratch/i/worms/Celegans/rmsk
    cp -p /cluster/store5/worm/ce1/RMRun/chr*.fa.out .
    ~kent/bin/iSync
  
#	These blastz sections usually start by saying:
#		FIRST FOLLOW INSTRUCTIONS IN ~angie/hummus/Notes
#	This Notes file is the actual blastz run.
#	But I found those notes a bit obscure, for clarity I have
#	reproduced the exact steps here.
#	The initial DEF file has been created for this.
#	It is in ~angie/hummus/DEF.ce1-cb1.2003-05-29
#	which will blastz soft masked Cbriggsae ultra contigs
#	against the soft masked Celegans chroms

#	1. created the DEF file for this particular blastz run
#	    The TARGET files are the masked NIBS created above, in:
#	SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib
#	    and the repeatmasker *.out files in:
#	SEQ1_RMSK=/iscratch/i/worms/Celegans/rmsk
#	    There is no lineage specific repeats for the worms as far as
#	    I can tell, thus:
#	SEQ1_SMSK=/dev/null
#	    This requires:
#	BLASTZ_ABRIDGE_REPEATS=0
#	    The QUERY sequence are the repeat and trf soft masked chrUn.fa
#	    nib file from the briggsae processing:
#	SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
#	    And the repeat masker run from the briggsae processing:
#	SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
#	    No lineage specific repeats:
#	SEQ2_SMSK=/dev/null
#
#	    Our base of operations:
#	BASE=/cluster/store5/worm/ce1/bed/blastzCb1
#

#	Reproduced here are the instructions as specified in the
#	hummus/Notes file.
#	Evidently historical archives of the DEF files exist in
#	in ~angie/hummus/DEF* and they are copied out of there to
#	to the file: $BASE/DEF for the actual run.  Don't worry, none of
#	this seems to make any sense.
	# next machine
	ssh kk
	cd ~/ce1/bed/blastzCb1
#	source the DEF file which was carefully written as above.
	. ./DEF
	mkdir -p $BASE/run
	~angie/hummus/make-joblist $DEF > $BASE/run/j
	# that created xdir.sh and joblist run/j
	sh $BASE/xdir.sh
	# xdir.sh makes a bunch of result directories in $BASE/raw/
	# based on chrom name and CHUNK size
        cd $BASE/run
	# now edit j because it is not correct:
	sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
	wc -l j*
	head j2
	# *** make sure the j2 edits are OK, then use it:
	mv j2 j
	# para create will create the file: 'batch' for the cluster run
	para create j
	para try
	para check
	para push
	... etc ...
Average job time:                1222s      20.36m     0.34h    0.01d
Longest job:                     3772s      62.87m     1.05h    0.04d

	# When that cluster run is done, results are in $BASE/raw/chr*/*
	# continuing with ~angie/hummus/Notes:
	# --- normalize and single_cov
	cd ~/ce1/bed/blastzCb1
	#   source the DEF file again in case you are coming back to this
	. ./DEF
	# a new run directory
	mkdir -p $BASE/run.1
	# another obscure script creates a new job list:
	~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
 	cd $BASE/run.1
	# the job list is once again incorrect, edit it:
 	sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
 	wc -l j*
 	head j2
	# make sure the edited j2 is OK, then use it:
 	mv j2 j
 	para create j
 	para try
	para push;para check; ... etc ...
Average job time:                  65s       1.08m     0.02h    0.00d
Longest job:                      136s       2.27m     0.04h    0.00d


    # Translate the .lav files created by the end of ~angie/hummus Notes
    # into axt files
    ssh eieio
    set base="/cluster/store5/worm/ce1/bed/blastzCb1"
    set seq1_dir="/cluster/store5/worm/ce1/nib"
    set seq2_dir="/cluster/store5/worm/cb1/nib"
    set tbl="blastzCb1"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | /cluster/bin/i386/axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # load these blastz results
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/blastzCb1/pslChrom
    /cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl
    # trackDb/worm/ce1/trackDb.ra entry:
#   track blastzCb1
#   shortLabel briggsae Blastz
#   longLabel Blastz C briggsae
#   group compGeno
#   priority 159
#   visibility dense
#   color 0,0,0
#   altColor 50,128,50
#   spectrum on
#   type psl xeno cb1

# CHAINING Briggsae blastz
    # next machine, small cluster is good for these tiny jobs
    ssh kkr1u00
    cd /cluster/store5/worm/ce1/bed/blastzCb1
    mkdir axtChain
    cd axtChain
    mkdir run1
    cd run1
    ls -1S ../../axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > doChain
#!/bin/csh
    axtChain $1  /cluster/store5/worm/ce1/nib /cluster/store5/worm/cb1/nib $2 > $3
'_EOF_'
    chmod a+x doChain

    mkdir out chain
    gensub2 input.lst single gsub spec
    para create spec
    para try
Average job time:                 303s       5.04m     0.08h    0.00d
Longest job:                     1006s      16.77m     0.28h    0.01d

    # now on the cluster server, sort chains
    # next machine
    ssh eieio
    cd ~/ce1/bed/blastzCb1/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/blastzCb1/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain ce1 ${c}_cb1Chain $i
        echo done $c
    end

    # Create the nets  XXXX STOPPED HERE 2003-06-11
    # next machine
    ssh eieio
    cd ~/ce1/bed/blastzCb1/axtChain
    mkdir preNet
    cd chain
    

# MAKING AND STORING refSeq ALIGNMENTS (WORKING 2003-06-02 - Hiram)
# MAKING AND STORING refSeq ALIGNMENTS - hgRefSeqMrna load REDONE (2004-05-18, hartera)
# hgRefSeqStatus - refSeqStatus table rebuilt with updated loc2ref
# (2004-06-16, hartera)

    # next machine
    ssh kkr1u00
    mkdir -p /iscratch/i/worms/Celegans/refSeq
    cd /iscratch/i/worms/Celegans
    # there are about 20000 sequences in the refSeq.fa file,
    #	split them into about 300 files
    faSplit sequence \
  /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \
		300 refSeq/r
    ~kent/bin/iSync

    # next machine
    ssh kk
    cd ~/ce1/bed
    mkdir -p refSeq/psl
    cd refSeq
    # for reference:
    ln -s \
 /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.* .
    ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
    ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/refSeq/r*.fa > refSeq.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 worm.lst refSeq.lst gsub spec
    para create spec
    para try
    para check
    para push  ... etc ...
# Average job time:                  55s       0.92m     0.02h    0.00d
# Longest job:                      626s      10.43m     0.17h    0.01d

    # when cluster run done
    # next machine
    ssh eieio
    cd ~/ce1/bed/refSeq
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_refSeq.psl \
      /dev/null
    pslSortAcc nohead chrom /tmp all_refSeq.psl
    pslCat -dir chrom > refSeqAli.psl

    # load refSeq tables
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/refSeq
    hgLoadPsl ce1 -tNameIx refSeqAli.psl
    ln -s \
 /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \
	/gbdb/ce1/mrna.134
    # ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
    cd ~/ce1/bed/refSeq
    hgLoadRna add -type=refSeq ce1 /gbdb/ce1/mrna.134/refSeq.fa \
 /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra
    # Reload refLink table as LocusLink ID column is all 0 and protein
    # accessions missing. (DONE, 2004-05-12, hartera)
    # RefSeq accessions for ce1 appear not to be in 
    # /cluster/store5/mrna.134/refSeq/041003/loc2ref so could not also
    # map the LocusLink ID to OMIM ID through mim2loc
    # Dowload the latest loc2ref and mim2loc from NCBI and put in
    # /cluster/store5/mrna.134/refSeq/051204/
    # Previous error in loading refPep was because there were duplicate
    # accessions in hs.faa once the suffix of .1 and .2 were removed
    # A search in GenBank showed that NP_002900.1 has been replaced 
    # by NP_002900.2 so removed the former from hs.faa
    # (2004-05-18, hartera) Deleted contents of refPep table since
    # hs.faa contains only human sequences. No C. elegans peptide sequences
    # available from this NCBI Build 134 so added protein translation of genomic DNA 
    # on the RefSeq genes details pages. 
    hgRefSeqMrna ce1 /gbdb/ce1/mrna.134/refSeq.fa \
 /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra \
      all_refSeq.psl \
      /cluster/store5/mrna.134/refSeq/051204/loc2ref \
      /cluster/store5/mrna.134/refSeq/041003/hs.faa \
      /cluster/store5/mrna.134/refSeq/051204/mim2loc
    # Missing locusLinkIds for 184 of 20661
    # Missing protein accessions for 184 of 20661 
    # Add RefSeq status info (updated 2004-06-16 as mrnaAcc values not found
    # in refLink.mrnaAcc column). Only load in those for C. elegans (Taxonomy
    # ID is 6239)
    cd /cluster/data/ce1/bed/refSeq
    hgRefSeqStatus -taxId=6239 ce1 \
         /cluster/store5/mrna.134/refSeq/051204/loc2ref

    # Create precomputed join of refFlat and refGene:
    echo 'CREATE TABLE refFlat \
          (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \
          SELECT refLink.name as geneName, refGene.* \
          FROM refLink,refGene \
          WHERE refLink.mrnaAcc = refGene.name' \
    | hgsql ce1

# END - MAKING AND STORING refSeq ALIGNMENTS

# BEGINNING OF Self BLASTZ

#	DEF file here is the same as the blastzCb1/DEF file, with
#	the following changes:
#	SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib
#	SEQ2_DIR=/iscratch/i/worms/Celegans/bothMasksNib
#	SEQ2_RMSK=/iscratch/i/worms/Celegans/rmsk
#	SEQ2_CHUNK=10000000
#	SEQ2_LAP=10000
#	BASE=/cluster/store5/worm/ce1/bed/blastzSelf

#	Reproduced here are the instructions as specified in the
#	hummus/Notes file.
#	Evidently historical archives of the DEF files exist in
#	in ~angie/hummus/DEF* and they are copied out of there to
#	to the file: $BASE/DEF for the actual run.  Don't worry, none of
#	this seems to make any sense.
	# next machine
    ssh kk
    mkdir -p ~/ce1/bed/blastzSelf
    cd ~/ce1/bed/blastzSelf
    . ./DEF
    mkdir -p $BASE/run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    # that created xdir.sh and joblist run/j
    sh $BASE/xdir.sh
    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    cd $BASE/run
    # now edit j because it is not correct:
    sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
    wc -l j*
    head j2
    # *** make sure the j2 edits are OK, then use it:
    mv j2 j
    # para create will create the file: 'batch' for the cluster run
    para create j
    para try
    para check
    para push
	... etc ...
Average job time:                2228s      37.14m     0.62h    0.03d
Longest job:                    15281s     254.68m     4.24h    0.18d
    # That longest job time is not correct.  I was having trouble with
    # the cluster during that run.

    # When that cluster run is done, results are in $BASE/raw/chr*/*
    # continuing with ~angie/hummus/Notes:
    # --- normalize and single_cov
    cd ~/ce1/bed/blastzSelf
    #   source the DEF file again in case you are coming back to this
    . ./DEF
    # a new run directory
    mkdir -p $BASE/run.1
    # another obscure script creates a new job list:
    ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
    cd $BASE/run.1
    # the job list is once again incorrect, edit it:
    sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
    wc -l j*
    head j2
    # make sure the edited j2 is OK, then use it:
    mv j2 j
    para create j
    para try
    para push;para check; ... etc ...
Average job time:                 241s       4.01m     0.07h    0.00d
Longest job:                      459s       7.65m     0.13h    0.01d


    # Translate the .lav files created by the end of ~angie/hummus Notes
    # into axt files
    ssh eieio
    set base="/cluster/store5/worm/ce1/bed/blastzSelf"
    set seq1_dir="/cluster/store5/worm/ce1/nib"
    set seq2_dir="/cluster/store5/worm/ce1/nib"
    set tbl="blastzSelf"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | /cluster/bin/i386/axtSort stdin $out
      popd
    end

    #  DropOverlap cleans out a bit more than DropSelf:
    cd $base
    mkdir DropOverlap
    foreach c (M I II III IV V X)
	echo "dropping chr${c}"
	/cluster/bin/i386/axtDropOverlap axtChrom/chr${c}.axt \
		DropOverlap/chr${c}.axt
    end

    # Translate the sorted axt files into psl:
    mkdir -p pslChrom
    foreach f (DropOverlap/chr*.axt)
      set c=$f:t:r
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # load these blastz results  (they already have the correct names
    #		chr*_blastzSelf.psl from axtToPsl above)
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/blastzSelf/pslChrom
    /cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl
    # trackDb/worm/ce1/trackDb.ra entry:
#   track blastzSelf
#   shortLabel Blastz Self
#   longLabel Blastz C elegans (self)
#   group compGeno
#   priority 159
#   visibility dense
#   color 0,0,0
#   altColor 50,128,50
#   spectrum on
#   type psl xeno ce1

# CHAINING Self blastz
    # next machine, small cluster is good for these tiny jobs
    ssh kkr1u00
    cd /cluster/store5/worm/ce1/bed/blastzSelf
    mkdir -p axtChain/run1
    cd /cluster/store5/worm/ce1/bed/blastzSelf/axtChain/run1
    ls -1S /cluster/store5/worm/ce1/bed/blastzSelf/DropOverlap/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    #  Nibs are on /cluster/bluearc now
    cat << '_EOF_' > doChain
#!/bin/csh
    axtChain $1  /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib /iscratch/i/worms/Celegans/bothMasksNib $2 > $3
'_EOF_'
    chmod a+x doChain

    mkdir out chain
    gensub2 input.lst single gsub spec
    para create spec
    para try
    para push ... etc ...
Average job time:                 502s       8.37m     0.14h    0.01d
Longest job:                      874s      14.57m     0.24h    0.01d

    # now on the cluster server, sort chains
    # next machine
    ssh eieio
    cd ~/ce1/bed/blastzSelf/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/blastzSelf/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain ce1 ${c}_chainSelf $i
        echo done $c
    end
#  Loading 845504 chains into ce1.chrI_chainSelf
#  Loading 845307 chains into ce1.chrII_chainSelf
#  Loading 1152790 chains into ce1.chrIII_chainSelf
#  Loading 859566 chains into ce1.chrIV_chainSelf
#  Loading 2 chains into ce1.chrM_chainSelf
#  Loading 1114939 chains into ce1.chrV_chainSelf
#  Loading 604293 chains into ce1.chrX_chainSelf


#  END of Blastz Self


LOAD SOFTBERRY GENES (DONE - 2003-08-14 - Hiram)
     mkdir -p /cluster/data/ce1/bed/softberry
     cd /cluster/data/ce1/bed/softberry

     wget \
	ftp://www.softberry.com/pub/SC_CEL_MAY03/Soft_fgenesh_Ce03.tar.gz
     tar xvzf Soft_fgenesh_Ce03.tar.gz

     ldHgGene ce1 softberryGene chr*.gff
# Read 19112 transcripts in 276202 lines in 6 files
#   19112 groups 6 seqs 1 sources 4 feature types
# 19112 gene predictions
     hgPepPred ce1 softberry *.protein
     hgSoftberryHom ce1 *.protein

    # Add entry to worm/ce1/trackDb.ra and add specific
    # softberryGene.html file

CREATE ORFTOGENE TABLE (Used by hgGene and hgNear)
   mkdir /cluster/data/ce1/bed/orfToGene
   cd !$
   wget ftp://ftp.wormbase.org/pub/wormbase/GENE_DUMPS/gene_names.txt
   awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
   hgCeOrfToGene ce1 gene_names.txt sangerGene orfToGene

CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
   mkdir /cluster/data/ce1/bed/steinHelp
# Mail Lincoln Stein and ask for mappings from wormPep to
# SwissProt and from ORF to wormPep to description.  Place
# these files in the steinHelp directory:
#	seqPepDesc.ace  - contains ORF/WormPep/Description
#	spWp.ace - contains WormPep/SwissProt
# Then go to the AQL search page at
# http://www.wormbase.org/db/searches/aql_query.
# and create the following files in that directory:
#    wbGeneClass.txt - associates 3 letter first part of worm gene
#             name with a brief description.  Produced with query:
#             select l,l->Description  from l in class Gene_Class
#    wbConcise.txt - associates worm gene name with a several
#             sentence description.  Produces with query:
#             select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
    hgWormLinks spWp.ace seqPepDesc.ace wbConcise.txt wbGeneClass.txt ../orfToGene/orfToGene sangerLinks.txt
    hgsql ce1 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
    hgsql -e \
	'load data local infile "sangerLinks.txt" into table sangerLinks.sql' \
	ce1

TWINSCAN GENE PREDICTIONS (DONE - 2004-01-05 - Hiram)
    mkdir /cluster/data/ce1/bed/twinscan
    cd /cluster/data/ce1/bed/twinscan
    wget --timestamping \
http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS100/12-15-2003/Celegans_WS100_12-15-03.tgz

    tar xvzf Celegans_WS100_12-15-03.tgz
    rm -r chr_tx
    # clean up chrom field of GTF files
    foreach c (I II III IV V X)
	set f = chr_gtf/CHROMOSOME_${c}.pred_gtf
	echo ${f}
	sed -e "s/CHROMOSOME_/chr/g" ${f} | \
	sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \
		chr_gtf/chr${c}-fixed.gtf
    end

    # pare down protein FASTA header to id and add missing .1:
    foreach c (I II III IV V X)
	set f = chr_ptx/CHROMOSOME_${c}.ptx
	echo ${f}
	perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \
		${f} > chr_ptx/chr${c}-fixed.fa
    end
    ldHgGene ce1 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
#    Read 20889 transcripts in 146518 lines in 6 files
#      20889 groups 206 seqs 1 sources 3 feature types
#    20889 gene predictions
    hgPepPred ce1 generic twinscanPep chr_ptx/chr*-fixed.fa

# MYTOUCH FIX - Jen - 2006-01-25
  sudo mytouch ce1 twinscanPep 0404031400.00

# MRNAORIENT, creating tables for clusterRna operation
#		(working 2004-02-10 - Hiram)
    mkdir -p /cluster/bluearc/ce1/bed/est
    mkdir -p /cluster/bluearc/ce1/bed/mrna
    cd /cluster/bluearc/ce1/bed/est
    cp -p /cluster/data/ce1/bed/est/chrom/*.psl .
    pslSortAcc nohead contig /tmp/psl *.psl
    cd /cluster/bluearc/ce1/bed/mrna
    cp -p /cluster/data/ce1/bed/mrna/chrom/*.psl .
    pslSortAcc nohead contig /tmp/psl *.psl
    mkdir mRNA
    faSplit sequence /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa 1000000 mRNA/m
    mkdir -p /cluster/data/ce1/bed/mrnaOrientInfo/oi
    cd /cluster/data/ce1/bed/mrnaOrientInfo
    ls -1S /cluster/bluearc/ce1/bed/mrna/contig/*.psl > psl.lst
    ls -1S /iscratch/i/worms/Celegans/mrna.134/mrna.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/mrna.fa {check out line+ oi/$(root1).tab}
#ENDLOOP
'_EOF_'
    ssh kk
    cd /cluster/data/ce1/bed/mrnaOrientInfo
    gensub2 psl.lst mrna.lst gsub jobList
    para create jobList
    para try			# there are only six jobs, this runs them

    cat oi/*.tab > mrnaOrientInfo.bed
    hgLoadBed ce1 mrnaOrientInfo mrnaOrientInfo.bed \
      -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql

    mkdir -p /cluster/data/ce1/bed/estOrientInfo/oi
    cd /cluster/data/ce1/bed/estOrientInfo
    ls -1S /cluster/bluearc/ce1/bed/est/contig/*.psl > psl.lst
    ls -1S /iscratch/i/worms/Celegans/mrna.134/est.fa > est.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/est.fa {check out line+ oi/$(root1).tab}
#ENDLOOP
'_EOF_'
    ssh kkr1u00
    cd /cluster/data/ce1/bed/estaOrientInfo
    gensub2 psl.lst est.lst gsub jobList
    para create jobList
    para try			# there are only six jobs, this runs them

    cat oi/*.tab > estOrientInfo.bed
    bedSort estOrientInfo.bed estOrientInfo.bed
    hgLoadBed ce1 estOrientInfo estOrientInfo.bed \
      -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql

    mkdir /cluster/data/ce1/bed/rnaCluster
    cd /cluster/data/ce1/bed/rnaCluster
    foreach f (I II III IV V X)
	echo "clusterRna ce1 /dev/null chr${f}.bed -chrom=${f}"
	clusterRna ce1 /dev/null chr${f}.bed -chrom=chr${f}
    end
    hgLoadBed ce1 rnaCluster *.bed


    hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab
    (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce1 | egrep -v chromStart | cut -f2\-13 > $_.ce1.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh

    mkdir -p /cluster/data/ce1/bed/altSplice/agx
    cd /cluster/data/ce1/bed/altSplice

    for c in I II III IV V X M
    do
    hgsql -e "create table chr${c}_intronEst as select chr${c}_est.* from chr${c}_est,estOrientInfo where (chr${c}_est.qName = estOrientInfo.name) and (estOrientInfo.intronOrientation != 0)" ce1
    done

    for c in I II III IV V X M
    do
    hgsql -e "alter table chr${c}_intronEst add index bin (bin);" ce1
    hgsql -e "alter table chr${c}_intronEst add index tStart (tStart);" ce1
    hgsql -e "alter table chr${c}_intronEst add index qName (qName(12));" ce1
    hgsql -e "alter table chr${c}_intronEst add index tEnd (tEnd);" ce1
    echo "done chr${c}"
    done

    hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab
    perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce1 -beds=../rnaCluster/$_.ce1.rnaCluster.bed -agxOut=agx/$_.ce1.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
    # these jobs are no big deal on the worm, just run this file:
    chmod +x altSplice.para.spec
    mkdir agx
    ./altSplice.para.spec
    cat agx/*.agx > altSplice.agx
    agxToBed altSplice.agx altSplice.bed

    cat << '_EOF_' >> altGraphX.sql
CREATE TABLE altGraphX (
  bin smallint unsigned not null, 
  tName varchar(255) NOT NULL default '',
  tStart int(11) NOT NULL default '0',
  tEnd int(11) NOT NULL default '0',
  name varchar(255) NOT NULL default '',
  id int(10) unsigned NOT NULL auto_increment,
  strand char(2) NOT NULL default '',
  vertexCount int(10) unsigned NOT NULL default '0',
  vTypes longblob NOT NULL,
  vPositions longblob NOT NULL,
  edgeCount int(10) unsigned NOT NULL default '0',
  edgeStarts longblob NOT NULL,
  edgeEnds longblob NOT NULL,
  evidence longblob NOT NULL,
  edgeTypes longblob NOT NULL,
  mrnaRefCount int(11) NOT NULL default '0',
  mrnaRefs longblob NOT NULL,
  mrnaTissues longblob NOT NULL,
  mrnaLibs longblob NOT NULL,
  PRIMARY KEY  (id),
  KEY tName (tName(16),tStart,tEnd)
);
'_EOF_'
    # << emacs

    hgLoadBed -sqlTable=altGraphX.sql ce1 altGraphX altSplice.agx

    # trackDb entry:
	track altGraphX
	shortLabel Alt-Splice
	longLabel Alternative Splicing
	group rna
	priority 74.1
	visibility pack
	type bed 3
	canPack off

    #	copy frame based view to goldenPath 2006-01-12 - Hiram
    cd /usr/local/apache/htdocs/goldenPath
    ln -s ceMay2003 ce1
    cd /cluster/data/ce1/bed/altSites
    mkdir /usr/local/apache/htdocs/goldenPath/ce1/altGraphX
    sed -e "s#http://hgwdev-sugnet.soe.ucsc.edu##" links.html \
	> /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/links.html

    sed -e \
    "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
	-e "s/NM_005487/chrII:14689493-14690232/" \
	-e "s#http://hgwdev-sugnet.soe.ucsc.edu##" \
	 frame.html \
	    > /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/frame.html

#  GC 5 BASE WIGGLE TRACK (DONE - 2004-02-13 - Hiram)
    #    working on eieio, the fileserver for this data.
    # was named gc20KBase to start with, became a 5 base dataset later
    #	the generic trackDb html file is named gc20KBase.html
    ssh eieio
    mkdir /cluster/data/ce1/bed/gc20KBase
    cd /cluster/data/ce1/bed/gc20KBase
    mkdir wigData5 dataLimits5
for n in ../../nib/*.nib
do
    c=`basename ${n} | sed -e "s/.nib//"`
    C=`echo $c | sed -e "s/chr//"`
    echo -n "working on ${c} - ${C} ... "
    hgGcPercent -chr=${c} -doGaps \
	-file=stdout -win=5 ce1 ../../nib | grep -w GC | awk '
{
    bases = $3 - $2
    perCent = $5/10.0
    printf "%d\t%.1f\n", $2+1, perCent
}' | wigAsciiToBinary -dataSpan=5 -chrom=${c} \
	-wibFile=wigData5/gc5Base_${C} -name=${C} stdin > dataLimits5/${c}
echo "done ${c}"
done

    #	data is compete, load track on hgwdev
    ssh hgwdev
    cd /cluster/data/ce1/bed/gc20KBase
    hgLoadWiggle ce1 gc20KBase wigData5/*.wig
    #	create symlinks for .wib files
    mkdir /gbdb/ce1/wib
    ln -s `pwd`/wigData5/*.wib /gbdb/ce1/wib
    #	the trackDb entry
track gc20KBase
shortLabel GC Percent
longLabel GC Percent in 5 base windows
group map
priority 23.5
visibility hide
autoScale Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
windowingFunction Mean
color 0,128,255
altColor 255,128,0
viewLimits 30:70
type wig 0 100


#  miRNA track (DONE - 2004-03-02 - Hiram)
    #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
    #	and Michel.Weber@ibcg.biotoul.fr
    #	notify them if this assembly updates to renew this track
    mkdir /cluster/data/ce1/bed/miRNA
    cd /cluster/data/ce1/bed/miRNA
    wget --timestamping \
    "http://www.sanger.ac.uk/Software/Rfam/mirna/data/ucsc_cel_mir_track.txt"
    grep "^chr" ucsc_cel_mir_track.txt | sed -e "s/ /\t/g" > worm.mir.bed
    hgLoadBed ce1 miRNA worm.mir.bed
    # entry in trackDb/trackDb.ra already there for same track in Hg16
    #	see makeHg16.doc for more comments on this


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
    # File emailed from Xinwei She <xws@u.washington.edu>
    mkdir /cluster/data/ce1/bed/genomicSuperDups
    cd /cluster/data/ce1/bed/genomicSuperDups
    sed -e 's/CHROMOSOME_/chr/g; s/\t_\t/\t-\t/;' \
      ce1_genomicSuperDup.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed ce1 genomicSuperDups stdin \
      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql


