# for emacs: -*- mode: sh; -*-

#	Caenorhabditis PB2801
#	This is the actual name, it is sometimes known as:
#	Caenorhabditis n. sp. PB2801
#	From Wash U GSC
#	http://genome.wustl.edu/genome.cgi?GENOME=Caenorhabditis+n.+sp.+PB2801
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABEG02


#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-05-13 - Hiram)
    ssh kkstore01
    mkdir /cluster/store1/caePb2
    ln -s /cluster/store1/caePb2 /cluster/data/caePb2

    mkdir /cluster/data/caePb2/downloads
    cd /cluster/data/caePb2/downloads

    wget --timestamping \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/Caenorhabditis_PB2801-6.0.1/ASSEMBLY"

    wget --timestamping \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/Caenorhabditis_PB2801-6.0.1/README" -O README.caePb2

    wget --timestamping -m -nd -np \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/Caenorhabditis_PB2801-6.0.1/output/"

    #	Going to manipulate the given supercontigs.agp.gz file to produce a
    #	chrUn.agp.gz file, although this ends up being too detailed.  A second
    #	super.lift and gold.tab will also be produced to get the chrUn_gold
    #	table loaded with appropriate supercontigs without all the bridged
    #	gaps.  This first chrUn.agp.gz file will be used with the initial
    #	build so that the gap table will be constructed properly with all the
    #	bridged gaps.
    faCount contigs.fa.gz > faCount.contigs.fa.txt
    mkdir ../jkStuff
    cat << '_EOF_' > ../jkStuff/mkChrUnAgp.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;

open (FH,'zcat supercontigs.agp.gz|') or
        die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
        my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) =
            split('\s+',$line);
        $end = $start + $gapLen - 1;
        printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
                    $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo;
        $start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) =
        split('\s+',$line);
        my $ctgLen = $ctgEnd - $ctgStart + 1;
        my $cLen = $cEnd - $cStart + 1;
        die "not matching start, end:\n$line" if ($ctgLen != $cLen);
        if (!$firstTime) {
            if ($ctgName ne $curContig) {
                $end = $start + $scaffoldGapSize - 1;
                printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
                    $start, $end, $itemCount++, $scaffoldGapSize;
                $start = $end + 1;
            }
        } else {
            $firstTime = 0;
        }
        $end = $start + $ctgLen - 1;
        printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
                $start, $end, $itemCount++, $WN, $name,
                $cStart, $cEnd, $strand;
        $start = $end + 1;
        $curContig = $ctgName;
    }
}
close (FH);
'_EOF_'
# << happy emacs
    chmod +x ../jkStuff/mkChrUnAgp.pl
    ../jkStuff/mkChrUnAgp.pl | gzip > chrUn.agp.gz
    qaToQac contigs.fa.qual.gz stdout \
        | qacAgpLift chrUn.agp.gz stdin chrUn.qac

##########################################################################
## Initial database construction and beginning tables
##	(DONE - 2008-05-13 - Hiram)

    ssh kkstore01
    cd /cluster/data/caePb2
    cat << '_EOF_' > caePb2.config.ra
# Config parameters for makeGenomeDb.pl:
db caePb2
clade worm
genomeCladePriority 66
scientificName Caenorhabditis brenneri
commonName C. brenneri
assemblyDate Apr. 2008
assemblyLabel Washington University School of Medicine GSC Caenorhabditis brenneri(PB2801) 6.0.1
orderKey 839
mitoAcc none
fastaFiles /cluster/data/caePb2/downloads/contigs.fa.gz
agpFiles /cluster/data/caePb2/downloads/chrUn.agp.gz
# qualFiles none
dbDbSpeciesDir worm
'_EOF_'
# << happy emacs

    time makeGenomeDb.pl -verbose=2 caePb2.config.ra > makeGenomeDb.out 2>&1
    #	real    3m24.338s

##########################################################################
##  Fixup chrUn_gold table, and add ctgPos2 table

    ssh kkstore01
    cd /cluster/data/caePb2/downloads

    #	create a ctgPos2.tab table
    cat << '_EOF_' > ../jkStuff/mkScafAgp.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;
my $scafEnd = 1;

open (CT,">ctgPos2.tab") or die "can not write to ctgPos2.tab";

open (FH,'zcat supercontigs.agp.gz|') or
        die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
        my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) =
            split('\s+',$line);
        $end = $start + $gapLen - 1;
        $scafEnd += $gapLen;
        printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
                    $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo;
        $start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) =
        split('\s+',$line);
        my $ctgLen = $ctgEnd - $ctgStart + 1;
        my $cLen = $cEnd - $cStart + 1;
        die "not matching start, end:\n$line" if ($ctgLen != $cLen);
        if (!$firstTime) {
            if ($ctgName ne $curContig) {
                $end = $start + $scaffoldGapSize - 1;
                printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
                    $start, $end, $itemCount++, $scaffoldGapSize;
                $start = $end + 1;
                $scafEnd = $cStart - 1;
            }
        } else {
            $firstTime = 0;
            $scafEnd = 0;
        }
        $scafEnd += $ctgLen;
        my $fragStart = $scafEnd - $ctgLen + 1;
        $end = $start + $ctgLen - 1;
        printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
                $start, $end, $itemCount++, $WN, $ctgName,
                $fragStart, $scafEnd, $strand;
        printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n",
                $name, $ctgLen, $start-1, $end;
        $start = $end + 1;
        $curContig = $ctgName;
    }
}
close (FH);
close (CT);
'_EOF_'
# << happy emacs
    chmod +x ../jkStuff/mkScafAgp.pl
    nice -n +19 ../jkStuff/mkScafAgp.pl

    ##	and load that ctgPos2 table:
    ssh hgwdev
    cd /cluster/data/caePb2/downloads
    hgLoadSqlTab caePb2 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab

    ##  and the super.lift to get supercontigs lifted to chrUn
    ssh kkstore01
    cd /cluster/data/caePb2/downloads
    
    #	beware of the chrUnSize specified below
    cat << '_EOF_' > ../jkStuff/mkSuperLift.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;
my $scafEnd = 1;
my $superStart = 0;
my $superEnd = 1;
my $superLength = 0;
my $chrUnSize = 194283334;
my $chrUnName = "chrUn";

open (GL,">gold.tab") or die "Can not write to gold.tab";

open (FH,'zcat supercontigs.agp.gz|') or
        die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
        my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) =
            split('\s+',$line);
        $end = $start + $gapLen - 1;
        $scafEnd += $gapLen;
#       printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
#                   $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo;
        $start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) =
        split('\s+',$line);
        my $ctgLen = $ctgEnd - $ctgStart + 1;
        my $cLen = $cEnd - $cStart + 1;
        die "not matching start, end:\n$line" if ($ctgLen != $cLen);
        if (!$firstTime) {
            if ($ctgName ne $curContig) {
                $superLength = $superEnd - $superStart;
                $end = $start + $scaffoldGapSize - 1;
#               printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
#                   $start, $end, $itemCount++, $scaffoldGapSize;
                printf "%d\t%s\t%d\t%s\t%d\n",
                        $superStart, $curContig, $superLength, $chrUnName,
                                $chrUnSize;
                printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n",
                        $chrUnName, $superStart, $superEnd, $itemCount++,
                        $curContig, $superLength;
                $start = $end + 1;
                $scafEnd = $cStart - 1;
                $superStart = $start - 1;
            }
        } else {
            $firstTime = 0;
            $scafEnd = 0;
        }
        $scafEnd += $ctgLen;
        my $fragStart = $scafEnd - $ctgLen + 1;
        $end = $start + $ctgLen - 1;
        $superEnd = $end;
#       printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
#               $start, $end, $itemCount++, $WN, $ctgName,
#               $fragStart, $scafEnd, $strand;
#       printf SL "%s\t%d\tchrUn\t%d\t%d\tW\n",
#               $name, $ctgLen, $start-1, $end;
        $start = $end + 1;
        $curContig = $ctgName;
    }
}
close (FH);
$superLength = $superEnd - $superStart;
printf "%d\t%s\t%d\t%s\t%d\n",
    $superStart, $curContig, $superLength, $chrUnName, $chrUnSize;
printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n",
        $chrUnName, $superStart, $superEnd, $itemCount++,
        $curContig, $superLength;
close (GL);
'_EOF_'
# << happy emacs
    chmod +x ../jkStuff/mkSuperLift.pl

    nice -n +19 ../jkStuff/mkSuperLift.pl > super.lift

    ##	and load this new gold table

    ssh hgwdev
    cd /cluster/data/caePb2/downloads
    sed -e "s/agpFrag/chrUn_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \
	> chrUn_gold.sql
    # edit that .sql file to add the bin column
    #	 bin smallint(6) NOT NULL default '0',

    hgLoadBed -sqlTable=chrUn_gold.sql caePb2 chrUn_gold gold.tab

##############################################################################
## WindowMasker masking (DONE - 2008-05-13 - Hiram)
    ssh kolossus
    mkdir /cluster/data/caePb2/bed/WindowMasker
    cd /cluster/data/caePb2/bed/WindowMasker
    time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \
	caePb2 -buildDir=/cluster/data/caePb2/bed/WindowMasker \
	    -workhorse=kolossus > doWM.out 2>&1 &
    #	real    16m34.556s
    ## load result to prep for cleaning
    ssh hgwdev
    cd /cluster/data/caePb2/bed/WindowMasker
    hgLoadBed caePb2 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 1346187 elements of size 3

    featureBits caePb2 windowmaskerSdust
    #	73264992 bases of 170473138 (42.977%) in intersection

    #	eliminate the gaps from the masking (WM bug)
    featureBits caePb2 -not gap -bed=notGap.bed
    #	170473138 bases of 170473138 (100.000%) in intersection

    time nice -n +19 featureBits caePb2 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	49454796 bases of 170473138 (29.010%) in intersection

    #	reload track to get it clean
    hgLoadBed caePb2 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 1343764 elements of size 4
    featureBits caePb2 windowmaskerSdust
    #	49454796 bases of 170473138 (29.010%) in intersection

##############################################################################
## simpleRepeat masking (DONE - 2008-05-13 - Hiram)
    ssh kolossus
    mkdir /cluster/data/caePb2/bed/simpleRepeat
    cd /cluster/data/caePb2/bed/simpleRepeat
    nice -n +19 twoBitToFa ../../caePb2.unmasked.2bit stdout \
	| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
		-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
    #	about 30 minutes
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    ssh hgwdev
    cd /cluster/data/caePb2/bed/simpleRepeat
    nice -n +19 hgLoadBed caePb2 simpleRepeat \
      simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    featureBits caePb2 simpleRepeat
    #	5083084 bases of 170473138 (2.982%) in intersection

##############################################################################
## combine trf mask with windowmasker (DONE - 2008-05-13 - Hiram)
    ssh kkstore01
    cd /cluster/data/caePb2
    zcat bed/WindowMasker/cleanWMask.bed.gz \
	| twoBitMask caePb2.unmasked.2bit stdin \
	    -type=.bed caePb2.cleanWMSdust.2bit
    cat bed/simpleRepeat/trfMask.bed \
        | twoBitMask -add -type=.bed caePb2.cleanWMSdust.2bit stdin caePb2.2bit
    #	can safely ignore the warning about BED file >= 13 fields
    #	check it
    twoBitToFa caePb2.2bit stdout | faSize stdin
    #	194283334 bases (23810210 N's 170473124 real 120922179 upper
    #	49550945 lower) in 1 sequences in 1 files
    #	%25.50 masked total, %29.07 masked real

##############################################################################
## create masked contigs for blastz (DONE - 2008-05-13 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/caePb2/maskedContigs
    cd /cluster/data/caePb2/maskedContigs
    #	copy over lft2BitToFa.pl from /cluster/data/caePb1/jkStuff
    cp -p /cluster/data/caePb1/jkStuff/lft2BitToFa.pl ../jkStuff
    ln -s ../downloads/super.lift .
    time nice -n +19 ../jkStuff/lft2BitToFa.pl ../caePb2.2bit super.lift \
	> super.fa
    #	real    1m55.813s
    #	verify it is OK, should have same numbers of real, upper and lower as
    #	measured in the caePb2.2bit, only N's are different:
    faSize super.fa
# 190774334 bases (20301210 N's 170473124 real 120922179 upper 49550945 lower) in 3510 sequences in 1 files
# Total size: mean 54351.7 sd 179245.2 min 327 (Contig8561) max 4147112 (Contig0) median 7338
# %25.97 masked total, %29.07 masked real
    #	in fact, the missing N's should be:
    #	(3510-1)*1000 = 23810210-20301210 = 3509000

    faToTwoBit super.fa caePb2.supercontigs.2bit
    twoBitInfo caePb2.supercontigs.2bit stdout | sort -k2nr \
        > caePb2.supercontigs.sizes
    #	Copy this 2bit file and sizes over to /san/sanvol1/scratch/worms/caePb2

    #  the lft2BitToFa.pl script is:
    cat << '_EOF_'
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;
    
sub usage()
{
printf "usage: %s <file.2bit> <file.lft> [more_files.lft]\n",  basename($0);
printf "\tfasta output is to stdout, therefore route stdout to result file\n";
exit 255;
}   
    
my $argc = scalar(@ARGV);

usage if ($argc < 2);

my $twoBitFile = shift;
    
while (my $liftFile = shift)
{   
open (FH,"<$liftFile") or die "Can not open $liftFile";
while (my $line=<FH>)
    {
    chomp $line;
    my ($start, $contig, $length, $chrom, $chrom_length) = split('\s',$line);
    my $cmd=sprintf("twoBitToFa $twoBitFile:%s:%d-%d stdout",
        $chrom, $start, $start+$length);
    print `$cmd | sed -e "s#^>.*#>$contig#"`;
    }
close (FH);
}
'_EOF_'
# << happy emacs

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2008-05-13 - Hiram)
    # Use -repMatch=62 (based on size -- for human we use 1024, and 
    # C. PB2801 size is ~6.1% of human judging by gapless caePb2 vs. hg18 
    # genome sizes from featureBits.  So we would use 62, but that yields a very
    # high number of tiles to ignore, especially for a small more compact 
    # genome.  Bump that up a bit to be more conservative.

    ssh kolossus
    cd /cluster/data/caePb2
    blat caePb2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/11.ooc -repMatch=120
    #	Wrote 8586 overused 11-mers to jkStuff/11.ooc
    cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/caePb2

############################################################################
## Default position (DONE - 2008-05-13 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrUn:117559143-117563800"
	where name="caePb2";' hgcentraltest

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-05-13 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add caePb2 just before caeRem2

# caePb2 (C. brenneri)
caePb2.serverGenome = /cluster/data/caePb2/caePb2.2bit
caePb2.clusterGenome = /iscratch/i/worms/caePb2/caePb2.2bit
caePb2.ooc = /iscratch/i/worms/caePb2/11.ooc
caePb2.lift = /iscratch/i/worms/caePb2/caePb2.supercontigs.lift
caePb2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
caePb2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
caePb2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
caePb2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
caePb2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
caePb2.refseq.mrna.native.load = no
caePb2.refseq.mrna.xeno.load  = yes
caePb2.refseq.mrna.xeno.loadDesc = yes
caePb2.genbank.mrna.xeno.load = yes
caePb2.genbank.est.native.load = yes
caePb2.genbank.est.native.loadDesc = no
caePb2.downloadDir = caePb2
caePb2.perChromTables = no

    cvs ci -m "Added caePb2." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial caePb2 &
    #	logFile: var/build/logs/2008.06.16-10:03:05.caePb2.initalign.log
    #	real    85m23.353s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caePb2
    #	real    16m2.684s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add caePb2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added caePb2 - C. brenneri" etc/align.dbs etc/hgwdev.dbs
    make etc-update

##########################################################################
## Creating downloads (DONE - 2008-05-13 - Hiram)
    #	There is only one chrom, make its trfMaskChrom file exist
    ssh hgwdev
    mkdir /cluster/data/caePb2/bed/simpleRepeat/trfMaskChrom
    cd /cluster/data/caePb2/bed/simpleRepeat/trfMaskChrom
    ## symlink didn't work here, the symlink ended up in the tar file
    cp -p ../trfMask.bed ./chrUn.bed
    cd  /cluster/data/caePb2
    #	there is no RM file
    touch Un/chrUn.fa.out
    /cluster/bin/scripts/makeDownloads.pl caePb2 
    ## *!* EDIT THE README.txt FILES *!* ##

##########################################################################
## Creating pushQ (DONE - 2008-05-13 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/caePb2/pushQ
    cd /cluster/data/caePb2/pushQ
    /cluster/bin/scripts/makePushQSql.pl -noGenbank caePb2 \
	> caePb2.sql 2> stderr.out

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-05-13 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caePb2", "blat5", "17786", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caePb2", "blat5", "17787", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
#  Fixup assembly date after confirmation from John Spieth
#	2008-05-27 - DONE - Hiram
    ssh hgwdev
    hgsql -e 'update dbDb set description="Feb. 2008" where name="caePb2"' \
	hgcentraltest

#########################################################################
# ELEGANS (ce6) PROTEINS TRACK (DONE -  Hiram - 2008-06-11)
    ssh kkstore01

    #	breaking up this target genome into manageable pieces
    #	using the contigs so we don't have genes spanning non-bridged gaps
    mkdir /cluster/data/caePb2/blastDb
    cd /cluster/data/caePb2
    twoBitToFa -noMask maskedContigs/caePb2.supercontigs.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	3545 pieces of 3545 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    ##	copy to san for kluster access
    mkdir -p /san/sanvol1/scratch/worms/caePb2/blastDb
    cd /san/sanvol1/scratch/worms/caePb2/blastDb
    rsync -a --progress --stats /cluster/data/caePb2/blastDb/ .

    ## create the query protein set
    mkdir -p /cluster/data/caePb2/bed/tblastn.ce6SG
    cd /cluster/data/caePb2/bed/tblastn.ce6SG
    echo  /san/sanvol1/scratch/worms/caePb2/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 3545 query.lst

   # we want around 80000 jobs
   calc `wc /cluster/data/ce6/bed/blat.ce6SG/ce6SG.psl | awk "{print \\\$1}"`/\(80000/`wc query.lst | awk "{print \\\$1}"`\)

    #	23741/(80000/3545) = 1052.023062

   mkdir -p /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/sgfa
   split -l 1000 /cluster/data/ce6/bed/blat.ce6SG/ce6SG.psl \
	/cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/sgfa/sg
   ln -s /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/sgfa sgfa
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir -p /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut
   ln -s /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cd /cluster/data/caePb2/bed/tblastn.ce6SG
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
CBIN=/cluster/bin/i386
DB=caePb2
BLASTMAT=/scratch/data/blast229/data
SCR="/scratch/tmp/${DB}"
g=`/bin/basename $2`
D=`/bin/basename $1`
inputFile=${SCR}/${D}_${g}
export CBIN DB BLASTMAT SCR g D
/bin/mkdir -p ${SCR}
/bin/cp -p $1.* ${SCR}
/bin/cp -p $2 ${inputFile}
/bin/cp -p /cluster/store1/worm/${DB}/blastDb.lft ${SCR}/${D}_blastDb.lft
/bin/cp -p /cluster/store3/worm/ce6/bed/blat.ce6SG/protein.lft \
        ${SCR}/${D}_protein.lft
f=${SCR}/`/bin/basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast229/blastall -M BLOSUM80 -m 0 -F no \
        -e $eVal -p tblastn -d ${SCR}/${D} -i ${inputFile} -o $f.8
then
        /bin/mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if ${CBIN}/blastToPsl $f.1 $f.2
    then
        ${CBIN}/liftUp -nosort -type=".psl" -nohead $f.3 \
            ${SCR}/${D}_blastDb.lft carry $f.2 > /dev/null
        ${CBIN}/liftUp -nosort -type=".psl" -pslQ -nohead ${SCR}/$3.tmp \
            ${SCR}/${D}_protein.lft warn $f.3 > /dev/null
        if ${CBIN}/pslCheck -prot ${SCR}/$3.tmp
        then
            /bin/rm -f $3
            /bin/mv ${SCR}/$3.tmp $3
            /bin/rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/${D}.* ${inputFile} \
                ${SCR}/${D}_protein.lft ${SCR}/${D}_blastDb.lft
            /bin/rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
/bin/rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/${D}.* ${inputFile} \
        ${SCR}/${D}_protein.lft ${SCR}/${D}_blastDb.lft
/bin/rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh kk
    cd /cluster/data/caePb2/bed/tblastn.ce6SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
#    para try, check, push, check etc.
# Completed: 85080 of 85080 jobs
# CPU time in finished jobs:    9631747s  160529.12m  2675.49h  111.48d  0.305 y
# IO & Wait Time:                401660s    6694.33m   111.57h    4.65d  0.013 y
# Average job time:                 118s       1.97m     0.03h    0.00d
# Longest finished job:            1032s      17.20m     0.29h    0.01d
# Submission to last job:         75506s    1258.43m    20.97h    0.87d

    # do the cluster run for chaining
    ssh kk
    mkdir /cluster/data/caePb2/bed/tblastn.ce6SG/chainRun
    cd /cluster/data/caePb2/bed/tblastn.ce6SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut/c.`basename $1`.psl)
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /cluster/data/caePb2/bed/tblastn.ce6SG/chainRun
    para create jobList
    # para maxNode 30 there are only 24 jobs
    para try, check, push, check etc.
# Completed: 24 of 24 jobs
# CPU time in finished jobs:        497s       8.28m     0.14h    0.01d  0.000 y
# IO & Wait Time:                  2371s      39.52m     0.66h    0.03d  0.000 y
# Average job time:                 120s       1.99m     0.03h    0.00d
# Longest finished job:             125s       2.08m     0.03h    0.00d
# Submission to last job:           161s       2.68m     0.04h    0.00d

    ssh kkstore01
    cd /cluster/data/caePb2/bed/tblastn.ce6SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> /cluster/data/caePb2/bed/tblastn.ce6SG/blastCe6SG.psl
    cd ..
    pslCheck blastCe6SG.psl
    #	checked: 35496 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /cluster/data/caePb2/bed/tblastn.ce6SG
    liftUp -nohead -type=.psl stdout ../../jkStuff/caePb2.supercontigs.lift \
	error blastCe6SG.psl | sort -k14,14 -k16,16n \
	| hgLoadPsl -table=blastCe6SG caePb2 stdin
    hgLoadPsl caePb2 blastCe6SG.psl

    # check coverage
    featureBits caePb2 blastCe6SG 
    #	23871394 bases of 170473138 (14.003%) in intersection
    featureBits caePb1 blastCe4SG
    #	22988044 bases of 175247318 (13.117%) in intersection

    featureBits ce6 sangerGene
    #	28134889 bases of 100281426 (28.056%) in intersection

    ssh kkstore01
    cd /cluster/data/caePb2/bed/tblastn.ce6SG
    rm -rf blastOut sgfa
    rm -rf /cluster/bluearc/worms/caePb2/bed/tblastn.ce6SG

###########################################################################
# SWAP LASTZ ce9 (DONE - 2010-09-23 - Hiram)
    # original alignment
    cd /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20
    cat fb.ce9.chainCaePb2Link.txt
    #	40793086 bases of 100286004 (40.677%) in intersection

    #	running swap
    mkdir /hive/data/genomes/caePb2/bed/blastz.ce9.swap
    cd /hive/data/genomes/caePb2/bed/blastz.ce9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20/DEF \
	-workhorse=hgwdev -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 &
    #	real    2m36.529s
    cat fb.caePb2.chainCe9Link.txt
    #	55084649 bases of 170473138 (32.313%) in intersection

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20/DEF \
	-workhorse=hgwdev -qRepeats=windowmaskerSdust \
	-continue=syntenicNet -syntenicNet \
	-bigClusterHub=pk -smallClusterHub=memk -swap > synNet.log 2>&1 &
    #	real    0m42.368s

###########################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-08 - Hiram)
    cd /hive/data/genomes/caePb2
    mv blastDb blastDb.2008-06-16
    mv blastDb.lft blastDb.lft.2008-06-16
    mkdir blastDb
    twoBitToFa caePb2.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	212 pieces of 212 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/caePb2/bed/tblastn.ce9SG
    cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG
    echo  /hive/data/genomes/caePb2/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 212 query.lst

   # we want around 50000 jobs
    calc `wc -l /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl  | cut -d" " -f1`/\(50000/`wc -l query.lst | cut -d" " -f1`\)
    #	28103/(50000/212) = 119.156720

   mkdir sgfa
   split -l 100 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
DB=caePb2
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
    para try, check, push, check etc.
# Completed: 59784 of 59784 jobs
# CPU time in finished jobs:     577335s    9622.26m   160.37h    6.68d  0.018 y
# IO & Wait Time:                192124s    3202.06m    53.37h    2.22d  0.006 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest finished job:              29s       0.48m     0.01h    0.00d
# Submission to last job:         15337s     255.62m     4.26h    0.18d

    # do the cluster run for chaining
    ssh swarm
    mkdir /hive/data/genomes/caePb2/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/caePb2/bed/tblastn.ce9SG/blastOut/c.$b.psl
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /hive/data/genomes/caePb2/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG/chainRun
    para create jobList
    para try, check, push, check etc.
# Completed: 282 of 282 jobs
# CPU time in finished jobs:       9777s     162.95m     2.72h    0.11d  0.000 y
# IO & Wait Time:                  9172s     152.86m     2.55h    0.11d  0.000 y
# Average job time:                  67s       1.12m     0.02h    0.00d
# Longest finished job:             781s      13.02m     0.22h    0.01d
# Submission to last job:           839s      13.98m     0.23h    0.01d

    cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> ../blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 41278 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/caePb2/bed/tblastn.ce9SG
    hgLoadPsl caePb2 blastCe9SG.psl

    # check coverage
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits cb3 blastCe9SG
    #	18490367 bases of 108433446 (17.052%) in intersection
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits ce9 sangerGene
    #	28689552 bases of 100286004 (28.608%) in intersection

    rm -rf blastOut

#########################################################################
