# for emacs: -*- mode: sh; -*-


# Caenorhabditis remanei -- 
# 
# WashU's genome v2 PCAP assembly.

# DOWNLOAD SEQUENCE (DONE 3/9/05 Andy)
   # I'm using the assembly as non-singleton supercontigs.  The fasta 
   # files in the WashU FTP site don't use genbank accessions.  Joanne 
   # Nelson provided an AGP file describing an assembly using the genbank 
   # sequence.  So I just went to genbank to get the sequence through their
   # Taxonomy browser:   
   #    http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=31234
   # In the Entrez records section, click on Nucleotide.  Then on the new web page,
   # select FASTA in the "Display" box, and "File" in the "Send to" box.  The FASTA
   # file downloads to /tmp/sequences.fasta
   # Stage everything under store10 and expose it through the usual
   # /cluster/data/caeRem1 path.
   mkdir -p /cluster/store10/caeRem1/downloads
   ln -s /cluster/store10/caeRem1 /cluster/data/caeRem1
   cd /cluster/store10/caeRem1/downloads
   cp /tmp/sequences.fasta .
   # Get the file from Joanne Nelson and clean it up (the AGP records in the file start
   # with AAGD00 instead of AAGD01 like in genbank):
   wget ftp://genome.wustl.edu/private/joanne/remanei/remanei_pcap.contigs.agp_no_singletons.gz
   zcat remanei_pcap.contigs.agp_no_singletons.gz | sed 's/AAGD00/AAGD01/' > remanei.agp
   # Clean up the genbank fasta and grab the records in the AGP file.
   # (Drops blank lines and reduces ">gi...gb|ACC|..." headers to ">ACC".)
   sed -e '/^$/d; s/^>gi.*gb|\(.*\)|.*$/>\1/' sequences.fasta > genbank.fa
   # The raw download was copied in as sequences.fasta (not sequences.fa),
   # so that is the name that has to be removed here.
   rm sequences.fasta remanei_pcap.contigs.agp_no_singletons.gz
   # Column 6 of the AGP component lines is the accession; keep each one once.
   grep AAGD remanei.agp | cut -f6 | sort -u > accs.txt
   faSomeRecords genbank.fa accs.txt remanei.contigs.fa
   agpAllToFaFile -sizes=supercontigs remanei.agp remanei.contigs.fa remanei.unmasked.fa
   faSize remanei.unmasked.fa
#143407347 bases (5365830 N's 138041517 real 138041517 upper 0 lower) in 3584 sequences in 1 files
#Total size: mean 40013.2 sd 199471.6 min 2021 (SuperCont6529) max 5925111 (SuperCont0) median 5701
#N count: mean 1497.2 sd 2951.0
#U count: mean 38516.0 sd 197042.1
#L count: mean 0.0 sd 0.0

# CREATING DATABASE (DONE 3/10/05 Andy)
    # Create the database.
    ssh hgwdev
    hgsql '' -e 'create database caeRem1'

# LOAD GAP & GOLD TABLES FROM AGP (DONE 3/10/2005 Andy)
    ssh hgwdev
    cd /cluster/data/caeRem1/downloads
    hgGoldGapGl -noGl caeRem1 remanei.agp
    # For some reason, the indices did not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    # *** Andy's note: the same thing happened in this assembly too.
    hgsql caeRem1 -e 'analyze table gold; analyze table gap;'

# REPEATMASKER (DONE 3/10/05 Andy)
   # Split files for the run:
   ssh kksilo
   cd /cluster/data/caeRem1/downloads
   mkdir ../unmaskedSplits
   faSplit -outDirDepth=1 -lift=../unmaskedSplits/splits.lft gap remanei.unmasked.fa 500000 ../unmaskedSplits/
   # copy to iscratch
   ssh kkr1u00
   mkdir /iscratch/i/caeRem1
   cd /iscratch/i/caeRem1
   cp -r /cluster/data/caeRem1/unmaskedSplits .
   iSync
   ssh kk
   cd /cluster/data/caeRem1
   mkdir RMRun RMOut jkStuff
   cat << "_EOF_" > jkStuff/RMApis
#!/bin/bash
# RMApis -- run RepeatMasker on one split FASTA chunk of caeRem1.
#
# Usage:   RMApis <.../PART/FILE.fa>
#          Only the last two path components of $1 are used: FILE.fa is read
#          from /iscratch/i/caeRem1/unmaskedSplits/PART/.
# Output:  /cluster/data/caeRem1/RMOut/PART/FILE.fa.out
# Returns: non-zero if staging the input or copying the result back fails.

: "${1:?usage: RMApis path/to/PART/FILE.fa}"

file=$(basename "$1")
wholedir=$(dirname "$1")
part=$(basename "$wholedir")
scratch="/tmp/caeRem1/$part"

# Abort early on staging failures so RepeatMasker never runs in the wrong
# directory.
cd /cluster/data/caeRem1/RMOut || exit 1
mkdir -p "$scratch" || exit 1
cp "/iscratch/i/caeRem1/unmaskedSplits/$part/$file" "$scratch" || exit 1
pushd "$scratch" || exit 1
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec worm "$file"
popd
mkdir -p "$part"
cp "$scratch/$file.out" "./$part/$file.out"
status=$?
# Remove the staged input as well as the RepeatMasker by-products; otherwise
# the scratch directory is never empty and the rmdir calls below are no-ops.
rm -fr "$scratch/$file" "$scratch/$file".*
rmdir --ignore-fail-on-non-empty "$scratch"
rmdir --ignore-fail-on-non-empty /tmp/caeRem1
exit "$status"
_EOF_
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMApis
    cd RMRun
    find ../unmaskedSplits/ -name '*.fa' > files.lst   
    cat << "_EOF_" > gsub
#LOOP
../jkStuff/RMApis $(path1) {check in line+ $(path1)} {check out line+ ../RMOut/$(lastDir1)/$(root1).fa.out}
#ENDLOOP
_EOF_
    gensub2 files.lst single gsub RMJobs
    para create RMJobs
    para try
    para push
    para time
#3718 jobs in batch
#126046 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 3718 of 3718 jobs
#CPU time in finished jobs:    1183137s   19718.95m   328.65h   13.69d  0.038 y
#IO & Wait Time:                 18806s     313.44m     5.22h    0.22d  0.001 y
#Average job time:                 323s       5.39m     0.09h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            4496s      74.93m     1.25h    0.05d
#Submission to last job:         10211s     170.18m     2.84h    0.12d
    
    # Lift results.
    ssh kksilo
    cd /cluster/data/caeRem1/RMOut
    # Append the body of every per-chunk RepeatMasker .out file (line 5
    # onward, i.e. without its leading header lines) to RMOut/tmp.
    # "tail -n +5" replaces the obsolete "tail +5" spelling, which current
    # tail implementations reject.
    for dir in *; do
       # Skip anything that is not a chunk directory instead of cd-ing blindly.
       [ -d "$dir" ] || continue
       for file in "$dir"/*; do
          tail -n +5 "$file" >> tmp
       done
    done
    head -n 4 0/000.fa.out | cat - tmp > rmsk.out
    liftUp rmsk.lifted.out ../unmaskedSplits/splits.lft warn rmsk.out
    rm tmp rmsk.out
    
    # Load results
    ssh hgwdev
    cd /cluster/data/caeRem1/RMOut
    hgLoadOut caeRem1 rmsk.lifted.out
    hgsql caeRem1 -e 'rename table rmsk_rmsk to rmsk'
    hgsql caeRem1 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(11), bin); \
                  create index genoStart on rmsk (genoName(11), genoStart); \
                  create index genoEnd   on rmsk (genoName(11), genoEnd);' 

# SIMPLE REPEATS (TRF) (DONE 3/10/2005 Andy)
    ssh kksilo
    mkdir -p /cluster/data/caeRem1/bed/simpleRepeat
    cd /cluster/data/caeRem1/bed/simpleRepeat
    nice trfBig ../../downloads/remanei.unmasked.fa trf.bed -bed -tempDir=/tmp > trf.log &
    # Load this into the database as so
    ssh hgwdev
    cd /cluster/data/caeRem1/bed/simpleRepeat
    hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \
       caeRem1 simpleRepeat trf.bed

# MAKE MASKED 2BIT (DONE 3/10/2005 Andy)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    # Work in this assembly's simpleRepeat directory: the maskOutFa step below
    # reads ../bed/simpleRepeat/trfMask.bed relative to caeRem1/maskedFa.
    cd /cluster/data/caeRem1/bed/simpleRepeat
    # Column 5 of trf.bed is compared against 12 (the "period <= 12" filter).
    awk '{if ($5 <= 12) print;}' trf.bed > trfMask.bed
    cd ../../
    mkdir maskedFa
    cd maskedFa/
    # First pass masks with the filtered TRF intervals; second pass adds the
    # lifted RepeatMasker output on top of the same file.
    maskOutFa -soft ../downloads/remanei.unmasked.fa ../bed/simpleRepeat/trfMask.bed remanei.softMasked.fa 
    maskOutFa -softAdd remanei.softMasked.fa ../RMOut/rmsk.lifted.out remanei.softMasked.fa
    maskOutFa remanei.softMasked.fa hard remanei.hardMasked.fa
    faToTwoBit remanei.softMasked.fa caeRem1.2bit
    # faToTwoBit remanei.hardMasked.fa caeRem1.hardMasked.2bit
    mv *.2bit ../
    cd ../
    # Load into database
    hgsql caeRem1 < ~/kent/src/hg/lib/chromInfo.sql
    twoBitInfo caeRem1.2bit chrom.sizes.1
    awk '{printf("%s\t%s\t/gbdb/caeRem1/caeRem1.2bit\n", $1, $2)}' chrom.sizes.1 > chrom.sizes
    rm chrom.sizes.1
    echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql caeRem1

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 3/30/2005 Andy)
    # Copy all the data from the table "grp" 
    # in an existing database to the new database
    ssh hgwdev
    hgsql caeRem1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 3/30/2005 Andy)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
             defaultPos, active, orderKey, genome, scientificName, \
             htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("caeRem1", "March 2005", "/gbdb/caeRem1", "C. remanei", \
             "SuperCont12:1-12000", 1, 80, \
             "C. remanei", \
             "Caenorhabditis remanei", "/gbdb/caeRem1/html/description.html", \
             0, 0, "WUGSC");' \
      | hgsql -N hgcentraltest
    echo 'update defaultDb set name="caeRem1" where genome="C. remanei"' \
      | hgsql -N hgcentraltest
    hgsql hgcentraltest -e 'INSERT INTO genomeClade (genome,clade,priority) \
values ("C. remanei", "worm",30);'

    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs update
    # Edit trackDb/makefile to add caeRem1 to the DBS variable.
    mkdir worm/caeRem1
    # Create a simple worm/caeRem1/description.html file.  In the initial case it's just empty.
    cd worm
    cvs add caeRem1
    cvs add caeRem1/description.html
    make update DBS=caeRem1 ZOO_DBS=
    # go public on genome-test
    cvs ci -m "Added caeRem1 (another worm)." makefile
    cvs ci -m "Empty initial trackDb.ra and description.html for caeRem1" worm
    # in a clean, updated tree's kent/src/hg/makeDb/trackDb:
    make alpha

# SOME GBDB STUFF (DONE 3/30/2005 Andy)
    mkdir -p /gbdb/caeRem1
    mkdir /cluster/data/caeRem1/html
    ln -s /cluster/data/caeRem1/html /gbdb/caeRem1/html
    ln -s /cluster/data/caeRem1/caeRem1.2bit /gbdb/caeRem1/caeRem1.2bit

# MAKE GC5BASE WIGGLE TRACK (DONE 3/30/2005 Andy)
    ssh hgwdev
    mkdir /cluster/data/caeRem1/bed/gc5Base
    cd /cluster/data/caeRem1/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 caeRem1 \
       /cluster/data/caeRem1 | wigEncode stdin gc5Base.wig gc5Base.wib
    mkdir /gbdb/caeRem1/wib
    ln -s `pwd`/gc5Base.wib /gbdb/caeRem1/wib
    hgLoadWiggle -pathPrefix=/gbdb/caeRem1/wib caeRem1 gc5Base gc5Base.wig

## MAKE DOWNLOADABLE FILES (DONE 3/30/2005 Andy)
    ssh hgwdev
    cd /cluster/data/caeRem1
    mkdir zips
    zip -j zips/allOut.zip RMOut/rmsk.lifted.out
    zip -j zips/allFa.zip maskedFa/*.fa
    zip -j zips/allTrf.zip bed/simpleRepeat/trfMask.bed    
    zip -j zips/allAgp.zip downloads/remanei.agp
    cd maskedFa/
    maskOutFa remanei.softMasked.fa hard remanei.hardMasked.fa
    cd ../
    zip -j zips/allFaMasked.zip maskedFa/remanei.hardMasked.fa
    mkdir -p /usr/local/apache/htdocs/goldenPath/caeRem1
    cd /usr/local/apache/htdocs/goldenPath/caeRem1
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cd bigZips/
    cp -p /cluster/data/caeRem1/zips/*.zip .
    md5sum *.zip > md5sum.txt

# MAKE 11.OOC FILE FOR BLAT (DONE 3/30/2005 Andy)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh hgwdev
    mkdir -p /panasas/store/caeRem1
    blat /cluster/data/caeRem1/caeRem1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/panasas/store/caeRem1/11.ooc -repMatch=100
#Wrote 10174 overused 11-mers to /panasas/store/caeRem1/11.ooc

# PRODUCING GENSCAN PREDICTIONS (DONE 3/30/2005 Andy)
    ssh hgwdev
    # Make hard-masked scaffolds and split up for processing:
    cd /cluster/data/caeRem1
    mkdir hardMaskedFaSplits
    cd maskedFa/
    faSplit -outDirDepth=1 -lift=../hardMaskedFaSplits/hardMasked.lft gap remanei.hardMasked.fa 1000000 .
    mv ? ../hardMaskedFaSplits
    mv ../hardMaskedFaSplits /panasas/store/caeRem1
    mkdir /cluster/data/caeRem1/bed/genscan
    cd /cluster/data/caeRem1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to put their output files in
    pushd /cluster/bluearc
    mkdir -p caeRem1/genscan
    cd caeRem1/genscan
    mkdir gtf pep subopt
    popd
    # List every split FASTA chunk into chunks.lst, which gensub2 reads below.
    # The redirect is required: without it find treats "chunks.lst" as an
    # extra expression operand and the list file is never written.
    find /panasas/store/caeRem1/hardMaskedFaSplits -name '*.fa' > chunks.lst
    # gensub2 template: one gsBig job per chunk.  The \$ escapes keep the
    # unquoted here-document from expanding $(path1)/$(root1) in the shell.
    cat << _EOF_ > gsub
#LOOP
gsBig {check in line+ \$(path1)} {check out line /cluster/bluearc/caeRem1/genscan/gtf/\$(root1).gtf} -trans={check out line /cluster/bluearc/caeRem1/genscan/pep/\$(root1).pep} -subopt={check out line /cluster/bluearc/caeRem1/genscan/subopt/\$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
_EOF_

    # OK if this needs to be redone, then it should be noted that I created 3626 files
    # in each of the /cluster/bluearc/caeRem1/genscan/* dirs, and I should've partitioned.    

    # << this line keeps emacs coloring happy
    gensub2 chunks.lst single gsub jobList
    ssh kk
    cd /cluster/data/caeRem1/bed/genscan
    para create jobList
    para try
    para check
    para push 
#3626 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 3625 of 3626 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:       8081s     134.69m     2.24h    0.09d  0.000 y
#IO & Wait Time:                  9851s     164.18m     2.74h    0.11d  0.000 y
#Average job time:                   5s       0.08m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             190s       3.17m     0.05h    0.00d
#Submission to last job:           690s      11.50m     0.19h    0.01d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=200000" instead of "-window=2400000".
    ssh kolossus
    cd /cluster/data/caeRem1/bed/genscan
    gsBig /panasas/store/caeRem1/hardMaskedFaSplits/9/2649.fa \
       /cluster/bluearc/caeRem1/genscan/gtf/2649.gtf \
       -trans=/cluster/bluearc/caeRem1/genscan/pep/2649.pep \
       -subopt=/cluster/bluearc/caeRem1/genscan/subopt/2649.bed \
       -exe=hg3rdParty/genscanlinux/genscan \
       -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=200000

    # Lift and cat things
    ssh hgwdev
    cd /cluster/data/caeRem1/bed/genscan
    mv /cluster/bluearc/caeRem1/genscan/* .
    cat gtf/*.gtf > /tmp/genscan.gtf
    liftUp genscan.gtf /panasas/store/caeRem1/hardMaskedFaSplits/hardMasked.lft warn /tmp/genscan.gtf
    cat subopt/*.bed > /tmp/genscanSubopt.bed
    liftUp genscanSubopt.bed /panasas/store/caeRem1/hardMaskedFaSplits/hardMasked.lft warn /tmp/genscanSubopt.bed
    cat pep/*.pep > genscan.pep
    # Clean up
    rm -rf pep/ gtf/ subopt/
    rm -rf /panasas/store/caeRem1/hardMaskedFaSplits 
    # Load into the database:
    cd /cluster/data/caeRem1/bed/genscan
    ldHgGene -gtf caeRem1 genscan genscan.gtf
    hgPepPred caeRem1 generic genscanPep genscan.pep
    hgLoadBed caeRem1 genscanSubopt genscanSubopt.bed


# GENBANK mRNA AND EST COUNTS (DONE 3/30/2005 Andy)
    # Go to the latest GenBank full release dir and get an idea of how
    # many mRNAs and ESTs there are to align.
    ssh eieio
    cd /cluster/data/genbank/data/processed/genbank.146.0/full
    grep remanei mrna.gbidx | wc -l
#21
    grep remanei est.*gbidx | wc -l
#0
    # Hmmmmm... I'm not sure if that's enough sequence to justify an EST track.
    # At this point I'll just start with the mRNA and refgene mRNAs

# AUTO UPDATE GENBANK MRNA RUN  (IN PROGRESS 3/30/2005 Andy)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .

# caeRem1 (C. remanei)
caeRem1.genome = /panasas/store/caeRem1/caeRem1.2bit
caeRem1.lift = no
caeRem1.refseq.mrna.native.load = yes
caeRem1.genbank.mrna.xeno.load = yes
caeRem1.genbank.est.xeno.load = no
caeRem1.downloadDir = caeRem1
caeRem1.perChromTables = no

    cvs commit -m 'caeRem1 added to genbank update.' etc/genbank.conf 
    # Edit src/align/gbBlat to add /panasas/store/caeRem1/11.ooc
    cvs commit -m '' src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server
    ssh `fileServer /cluster/data/genbank/`
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial caeRem1 &
    tail -f [its logfile]
    # Load results:
#    ssh hgwdev
#    cd /cluster/data/genbank
#    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad caeRem1 &
#    featureBits caeRem1 all_mrna
#    featureBits caeRem1 xenoMrna
#    # Clean up:
#    rm -rf work/initial.caeRem1

    # At this point, I'm not adding ESTs.  But if we do put them in later,
    # it'll be better to use the -drop option to gbDbLoadStep and do the
    # mRNA all over again too.

# BRIAN-STYLE HUMAN PROTEINS TRACK (HG17) (IN PROGRESS 3/30/2005 Andy)
    ssh kolossus
    mkdir -p /cluster/data/caeRem1/blastDb
    cd /cluster/data/caeRem1/blastDb
    pushd ../maskedFa/
    mkdir ../faSplits
    faSplit -outDirDepth=1 -lift=../faSplits/liftAll.lft gap remanei.softMasked.fa 500000 .
    mv ? ../faSplits/
    popd 
    for i in `seq 0 9`; do
      echo $i
      mkdir $i
      pushd $i
        ln -s ../../faSplits/$i/*.fa .
	for j in *.fa; do 
           /iscratch/i/blast/formatdb -i $j -p F
        done
      popd
    done

    find . -name '*.fa' | xargs rm
    rm ?/*.log
    cd ../
    cp -r blastDb/ /panasas/store/caeRem1

    mkdir -p /cluster/data/caeRem1/bed/tblastn.hg17KG
    cd /cluster/data/caeRem1/bed/tblastn.hg17KG
    find /panasas/store/caeRem1/blastDb -name '*.nsq' | sed "s/\.nsq//" > query.lst

    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(350000/3718) = 447.817166
    mkdir -p /cluster/bluearc/caeRem1/bed/tblastn.hg17KG/kgfa
    split -l 448 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/caeRem1/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/caeRem1/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/caeRem1/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/caeRem1/bed/tblastn.hg17KG/blastOut 
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh

    cat << _EOF_ > blastGsub
#LOOP
blastSome \$(path1) {check in line \$(path2)} {check out exists blastOut/\$(root2)/q.\$(root1).psl } 
#ENDLOOP
_EOF_


    cat << _EOF_ > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=\`basename \$2\`
f=/tmp/\`basename \$3\`.\$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e \$eVal -p tblastn -d \$1 -i \$2 -o \$f.8
then
        mv \$f.8 \$f.1
        break;
fi
done
if test -f  \$f.1
then
if /cluster/bin/i386/blastToPsl \$f.1 \$f.2
then
	liftUp -nosort -type=".psl" -nohead \$f.3 /cluster/data/caeRem1/jkStuff/liftAll.lft warn \$f.2  
	liftUp -nosort -type=".psl" -pslQ -nohead \$3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn \$f.3
	if pslCheck -prot \$3.tmp
	then
	    mv \$3.tmp \$3
	    rm -f \$f.1 \$f.2 
	fi
        exit 0
    fi
fi
rm -f \$f.1 \$f.2 \$3.tmp \$f.8
exit 1
_EOF_

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/caeRem1/bed/tblastn.hg17KG
    para create blastSpec

#######################################################################
# sangerGeneWS140 - mapping Ce3 sangerGene onto this sequence
#	(DONE - 2005-08-17 - Hiram)
#
    ssh pk
    mkdir /san/sanvol1/scratch/worms/caeRem1/blastDb
    cd /san/sanvol1/scratch/worms/caeRem1/blastDb
    faSplit sequence /cluster/data/caeRem1/maskedFa/remanei.softMasked.fa 10 c_

    #	And construct the blast database
    ssh kkr10u01
    for i in *.fa
    do
    /cluster/bluearc/blast2211x86_64/bin/formatdb -i $i -p F
    done
    exit

    mkdir -p /cluster/data/caeRem1/bed/tblastn.sangerGene
    cd /cluster/data/caeRem1/bed/tblastn.sangerGene

    ls -1S /san/sanvol1/scratch/worms/caeRem1/blastDb/*.fa > query.lst
    ls -1S /san/sanvol1/scratch/worms/ce3/splitPep/*.fa > pep.lst

    mkdir blastOut
    for i in `cat pep.lst`; do mkdir blastOut/`basename $i .fa`; done

    cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check in line+ $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    #	happy emacs

    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast2211x86_64/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        if pslCheck -prot $f.2
        then
            cp -p $f.2 $3
            rm -f $f.1 $f.2
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $f.8
exit 1
'_EOF_'
    #	happy emacs
    chmod +x blastSome
    
    gensub2 query.lst pep.lst gsub jobList
    para create jobList
    para try; push; check ... etc ...
# Completed: 8170 of 8170 jobs
# CPU time in finished jobs:     119476s    1991.26m    33.19h    1.38d  0.004 y
# IO & Wait Time:                 21472s     357.87m     5.96h    0.25d  0.001 y
# Average job time:                  17s       0.29m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              33s       0.55m     0.01h    0.00d
# Submission to last job:          1258s      20.97m     0.35h    0.01d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7000 stdin ../c.`basename $1`.psl)
'_EOF_'
    #	happy emacs
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/wp???? > chain.lst
    gensub2 chain.lst single chainGsub chainJobs
    para create chainJobs
    para try; push; check ... etc ...
# Completed: 817 of 817 jobs
# CPU time in finished jobs:         40s       0.67m     0.01h    0.00d  0.000 y
# IO & Wait Time:                  2092s      34.87m     0.58h    0.02d  0.000 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:               6s       0.10m     0.00h    0.00d
# Submission to last job:            97s       1.62m     0.03h    0.00d
    # Filter the per-protein chained PSL files (c.wpNNNN.psl) and merge the
    # survivors into ../preblastSangerGene.psl.
    # NOTE(review): the column meanings below assume the standard PSL layout
    # (1=matches, 11=qSize, 12=qStart, 13=qEnd, 14=tName, 16=tStart,
    # 17=tEnd) -- confirm against the PSL format description.
    ssh kkstore01 
    cd /cluster/data/caeRem1/bed/tblastn.sangerGene/blastOut
    for i in wp????
    do
        # c60: keep rows where (col13 - col12) / col11 exceeds 0.6
        # (presumably: aligned query span covers > 60% of the query).
        awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
        # u: reverse-numeric sort on the leading column, then pslUniq.
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        # m60: of the c60 rows, keep those where col1 / col11 exceeds 0.60.
        awk "((\$1 / \$11) ) > 0.60 {print}" c60.$i.psl > m60.$i.psl
        # progress indicator
        echo $i
    done

    # Merge the u.* and m60* sets, order by columns 14, 16, 17 and drop
    # adjacent duplicate lines.
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n \
	| uniq  > ../preblastSangerGene.psl
XXXX
    cd ..
#  haven't tried this yet - 2005-08-12 - should be interesting, need a
#  psl file indicating where the sangerGene's are on Ce3 and alias name list
    blatDir=/cluster/data/hg16/bed/blat.hg16KG
    protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl
XXXX

    ssh hgwdev
    cd /cluster/data/caeRem1/bed/tblastn.sangerGene
    hgLoadPsl -table=blastSangerGene caeRem1 preblastSangerGene.psl
    
    # clean up
    ssh kkstore01
    # Remove the blastOut directory created for THIS run under caeRem1
    # (the cb2 path here was a copy-paste leftover from another assembly).
    # Only delete if the cd succeeded, so a bad path cannot remove some
    # other directory's blastOut.
    cd /cluster/data/caeRem1/bed/tblastn.sangerGene && rm -rf blastOut

############################################################################
#  BLATSERVERS ENTRY (DONE - 2005-08-31 - Hiram)
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caeRem1", "blat16", "17788", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caeRem1", "blat16", "17789", "0", "1");' \
	    hgcentraltest

############################################################################
#	preparing goldenPath downloads (WORKING 2005-12-14 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/caeRem1/goldenPath
    cd /cluster/data/caeRem1/goldenPath
    mkdir bigZips
    cp -p ../maskedFa/remanei.softMasked.fa bigZips/superContigs.masked.fa
    gzip bigZips/superContigs.masked.fa
    cp -p ../RMOut/rmsk.lifted.out bigZips/rmsk.out
    gzip bigZips/rmsk.out

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/caeRem1/bigZips
    cd /usr/local/apache/htdocs/goldenPath/caeRem1/bigZips
    ln -s /cluster/data/caeRem1/goldenPath/bigZips/* .

