# for emacs: -*- mode: sh; -*-

# Lizard - Anolis carolinensis - Broad Institute 1.0

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2007-02-16 - Hiram)
    ssh kkstore05
    mkdir /cluster/store12/anoCar1
    ln -s /cluster/store12/anoCar1 /cluster/data/anoCar1
    mkdir /cluster/data/anoCar1/downloads
    cd /cluster/data/anoCar1/downloads
    foreach f (assembly.agp \
               BasicAssemblyOneLiner.out \
               ForDistribution.command \
               assembly.bases.gz \
               assembly.links \
               assembly.quals.gz \
               source)
      wget --timestamping \
        ftp://ftp.broad.mit.edu/pub/assemblies/reptiles/lizard/AnoCar1.0/$f
    end
    faSize assembly.bases.gz 
# 1741478929 bases (0 N's 1741478929 real 1741478929 upper 0 lower)
#	in 50470 sequences in 1 files
    #	Discovered later that this quals file needs to be lifted
    qaToQac assembly.quals.gz stdout \
	| qacAgpLift assembly.agp stdin scaffold.lifted.qac

    ##	Calculate N50
    #	Find total length in sequence
    sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | tail -1
    #	7233 1781602899 340 scaffold_7232
    #	half of 1781602899 is 890801449
    #	run this again, scanning the list until the sum reaches 890801449
    sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | less
    #	204 889215106 2440512 scaffold_201
    #	205 891654231 2439125 scaffold_205

#########################################################################
## Create .ra file and run makeGenomeDb.pl
    ssh hgwdev
    cd /cluster/data/anoCar1
    cat << '_EOF_' >anoCar1.config.ra
# Config parameters for makeGenomeDb.pl:
db anoCar1
clade vertebrate
genomeCladePriority 66
scientificName Anolis carolinensis
commonName Lizard
assemblyDate Jan. 2007
assemblyLabel Broad Institute AnoCar (1.0)
orderKey 440
mitoAcc none
fastaFiles /cluster/data/anoCar1/downloads/assembly.bases.gz
agpFiles /cluster/data/anoCar1/downloads/assembly.agp
qualFiles /cluster/data/anoCar1/downloads/scaffold.lifted.qac
dbDbSpeciesDir lizard
'_EOF_'

    time makeGenomeDb.pl -verbose=2 anoCar1.config.ra > makeGenomeDb.out 2>&1
    #	broken down during the quals step since assembly.quals.gz needed
    #	to be lifted.  do the qaToQac | qacAgpLift sequence, fixup the
    #	specification above for qualFiles, and finish off the quals loading:
    ssh kkstore05
    cd /cluster/data/anoCar1/bed/qual
    qacToWig -fixed ../../downloads/scaffold.lifted.qac stdout \
	| wigEncode stdin qual.wig qual.wib
    ssh hgwdev
    cd /cluster/data/anoCar1/bed/qual
    time nice -n +19 hgLoadWiggle \
	-pathPrefix=/gbdb/anoCar1/wib anoCar1 quality qual.wig

    ##	continue
    ssh hgwdev
    cd /cluster/data/anoCar1
    time makeGenomeDb.pl -verbose=2 -continue=dbDb anoCar1.config.ra \
	> makeDbDb.out 2>&1

    ## better orderKey to get Lizard between frog and fish
    hgsql -e 'update dbDb set orderKey="440" where name="anoCar1";' \
	hgcentraltest
    ## fixup that number in the .ra file as mentioned above, was 375
##########################################################################
## Photograph - permission to use obtained from R. Steven Rainwater
##		(DONE - 2007-02-16 - Hiram)
    ## Fetch photo from:
    wget --timestamping \
	"http://rainwaterreptileranch.org/steve/photos/herps/anole2.jpeg" \
	-O R.Steven.Rainwater.Anole.Lizard.jpg
    convert -quality 80 -sharpen 0 -crop "168x272+264+37" \
	R.Steven.Rainwater.Anole.Lizard.jpg Anolis_carolinensis.jpg

################################################
## WINDOWMASKER (Working - 2007-02-16 - Hiram)
    ssh kkstore05
    cd /cluster/data/anoCar1/bed/
    time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl anoCar1 \
      -workhorse=kolossus > wmRun.log 2>&1 &
    #	real    172m58.813s

    # Save the log
    mv wmRun.log WindowMasker.2007-02-16

    # Masking statistics
    cd WindowMasker.2007-02-18
    twoBitToFa anoCar1.wmsk.2bit stdout | faSize stdin
    #	1781602899 bases (40123970 N's 1741478929 real
    #	1009685313 upper 731793616 lower) in 7233 sequences in 1 files
    twoBitToFa anoCar1.wmsk.sdust.2bit stdout | faSize stdin
    #	1781602899 bases (40123970 N's 1741478929 real
    #	1000477327 upper 741001602 lower) in 7233 sequences in 1 files

    ssh hgwdev
    hgLoadBed -strict anoCar1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 8354004 elements of size 3

    #	why does featureBits show more bases masked than what faSize
    #	measured as lower case ?  Because this counts the masked sequence
    #	in the gaps, and the faSize doesn't have the gaps.
    time nice -n +19 featureBits anoCar1 windowmaskerSdust
    #	781125572 bases of 1741478929 (44.854%) in intersection

    time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust
    #	781125572 bases of 1781602899 (43.844%) in intersection

    #	Curiously, WM overlaps gaps ?
    time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust gap
    #	40123970 bases of 1781602899 (2.252%) in intersection
    #	741001602 + 40123970 == 781125572 

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-02-16 - Hiram)
#                      (dropped chromEnd index 2007-05-10 - kuhn)
    ssh kolossus
    mkdir /cluster/data/anoCar1/bed/simpleRepeat
    cd /cluster/data/anoCar1/bed/simpleRepeat
    time nice -n 19 twoBitToFa ../../anoCar1.unmasked.2bit stdout \
	| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
	    -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
    #	real    164m34.988s
    #	user    159m32.859s
    #	sys     4m31.649s

    ssh kkstore05
    cd /cluster/data/anoCar1/bed/simpleRepeat
    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # Load unfiltered repeats into the database:
    ssh hgwdev
    nice -n +19 hgLoadBed anoCar1 simpleRepeat \
      /cluster/data/anoCar1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 569322 elements of size 16
    nice -n +19 featureBits anoCar1 simpleRepeat
    #	50290171 bases of 1741478929 (2.888%) in intersection

#########################################################################
## Add TRF mask to WindowMasker masked sequence, and fixup the bogus
##	window masked N's  (DONE - 2007-02-17 - Hiram)
    ssh kkstore05
    cd /cluster/data/anoCar1/bed/WindowMasker.2007-02-16
    ##	Curious, twoBitMask would not accept stdout for its output 2bit
    twoBitMask anoCar1.wmsk.sdust.2bit \
      -add ../simpleRepeat/trfMask.bed tmp.2bit
    twoBitToFa tmp.2bit stdout \
	| sed -e "s/n/N/g" | faToTwoBit stdin anoCar1.2bit
    ## check it:
    twoBitToFa anoCar1.2bit stdout | faSize stdin
    #	1781602899 bases (40123970 N's 1741478929 real
    #	1000242640 upper 741236289 lower) in 7233 sequences in 1 files
    ## trfMask contributes:
    awk '{sum+=$3-$2} END{print sum}' ../simpleRepeat/trfMask.bed
    #	16534332
    ## and we measured wmsk.sdust before: 741001602
    16534332 + 741001602

#########################################################################
## BLASTZ Hg18 swap (DONE - 2007-02-18 - Hiram)
    ##	the original blastz to hg18 measured
    time nice -n +19 featureBits hg18 chainAnoCar1Link \
	> fb.hg18.chainAnoCar1Link.txt 2>&1
    #	real    2m28.318s
    #	137554843 bases of 2881515245 (4.774%) in intersection

    ssh kkstore05
    mkdir /cluster/data/anoCar1/bed/blastz.hg18.swap
    cd /cluster/data/anoCar1/bed/blastz.hg18.swap
    time doBlastzChainNet.pl \
	/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -bigClusterHub=pk -swap > swap.log 2>&1 &

    time nice -n +19 featureBits anoCar1 chainHg18Link \
	> fb.anoCar1.chainHg18Link.txt 2>&1  
    #	real    3m16.810s
    #	112434396 bases of 1741478929 (6.456%) in intersection

#########################################################################
# GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram)
    ssh kkstore05
    #	Create a 2bit file all hard masked
    cd /cluster/data/anoCar1
    time nice -n +10 twoBitToFa anoCar1.2bit stdout \
	| maskOutFa stdin hard stdout \
	    | faToTwoBit stdin anoCar1.hard.2bit
    #	make sure it still has all the unmasked sequence in it:
    nice -n +19 twoBitToFa anoCar1.hard.2bit stdout | faSize stdin
    #	1781602899 bases (781360259 N's 1000242640 real
    #	1000242640 upper 0 lower) in 7233 sequences in 1 files

    nice -n +19 twoBitToFa anoCar1.2bit stdout | faSize stdin
    #	1781602899 bases (40123970 N's 1741478929 real
    #	1000242640 upper 741236289 lower) in 7233 sequences in 1 files
    #	same number of total bases, the lowers have become Ns:
    #	781360259 == 741236289 + 40123970
    #	the lower "reals" disappear from the "real" count:
    #	1000242640 == 1741478929 - 741236289

    #	And, make sure there aren't any sequences in this lot that have
    #	become all N's with no sequence left in them:
    twoBitToFa anoCar1.hard.2bit stdout \
	| faCount stdin > anoCar1.hard.faCount
    #	181 scaffolds end up with less than 100 bases of sequence left
    egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
	| sort -k2nr | awk '{if ($2 < 100) { print }}' | wc -l
    #	181
    #	leaving 7.052 scaffolds with more than 100 bases of seqence left:
    egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
	| sort -k2nr | awk '{if ($2 >= 100) { print }}' | wc -l
    #	7052
    #	make a list of those to extract their sequence:
    egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
	| sort -k2nr | awk '{if ($2 >= 100) { print $1 }}' \
	| sort > hard.genscan.list

    twoBitToFa -seqList=hard.genscan.list anoCar1.hard.2bit genscan.hard.fa
    #	What do we have left to work with:
    faSize genscan.hard.fa
    #	1780575355 bases (780338843 N's 1000236512 real
    #	1000236512 upper 0 lower) in 7052 sequences in 1 files


    #	creating 4,000,000 sized chunks, the largest scaffolds remain
    #	in single pieces, the scaffolds smaller than 4,000,000 are grouped
    #	into 4,000,000 sized fasta files.  You don't want to break these
    #	things up because genscan will be doing its own internal 2.4 million
    #	window on these pieces, and the gene names are going to be
    #	constructed from the sequence name in these fasta files.  The
    #	gene names are much better when they are this simple scaffoldN.M
    #	numbering scheme.
    mkdir genscanSplit
    faSplit about genscan.hard.fa 4000000 genscanSplit/c_

    ssh hgwdev
    mkdir /cluster/data/anoCar1/bed/genscan
    cd /cluster/data/anoCar1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/anoCar1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1S /cluster/data/anoCar1/genscanSplit/c_*.fa > chunk.list

    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 chunk.list single template jobList
    para create jobList
    para try, check, push, check, ...
# Completed: 319 of 319 jobs
# CPU time in finished jobs:      39037s     650.62m    10.84h    0.45d  0.001 y
# IO & Wait Time:                  1015s      16.91m     0.28h    0.01d  0.000 y
# Average job time:                 126s       2.09m     0.03h    0.00d
# Longest finished job:             459s       7.65m     0.13h    0.01d
# Submission to last job:          3495s      58.25m     0.97h    0.04d

    # cat results into single files
    ssh kkstore05
    cd /cluster/data/anoCar1/bed/genscan
    cat gtf/c_*.gtf > genscan.gtf
    cat subopt/c_*.bed > genscanSubopt.bed
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/anoCar1/bed/genscan
    ldHgGene anoCar1 -gtf genscan genscan.gtf
    #	Read 28102 transcripts in 184045 lines in 1 files
    #	28102 groups 4190 seqs 1 sources 1 feature types
    #	28102 gene predictions

    hgPepPred anoCar1 generic genscanPep genscan.pep
    hgLoadBed -strict anoCar1 genscanSubopt genscanSubopt.bed
    #	Loaded 286327 elements of size 6

    #	check the numbers
    time nice -n +19 featureBits anoCar1 genscan
    #	31394034 bases of 1741478929 (1.803%) in intersection

#########################################################################
## BLASTZ GasAcu1/Stickleback swap (DONE - 2007-02-19 - Hiram)
    ##	the original blastz to gasAcu1 measured
    time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \
	> fb.gasAcu1.chainAnoCar1Link.txt 2>&1
    #	real    0m51.499s
    #	56386298 bases of 446627861 (12.625%) in intersection

    ssh kkstore05
    mkdir /cluster/data/anoCar1/bed/blastz.gasAcu1.swap
    cd /cluster/data/anoCar1/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log 2>&1 &

    time nice -n +19 featureBits anoCar1 chainGasAcu1Link \
	> fb.anoCar1.chainGasAcu1Link.txt 2>&1  
    #	real    1m14.245s
    #	54464074 bases of 1741478929 (3.127%) in intersection


###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-02-19)
    ssh kkstore05
    bash # if not using bash shell already

    mkdir /cluster/data/anoCar1/blastDb
    cd /cluster/data/anoCar1
    twoBitToFa anoCar1.unmasked.2bit temp.fa
    faSplit sequence temp.fa 500 blastDb/
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    mkdir -p /san/sanvol1/scratch/anoCar1/blastDb
    cd /cluster/data/anoCar1/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/anoCar1/blastDb
    done

    mkdir -p /cluster/data/anoCar1/bed/tblastn.hg18KG
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/anoCar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 496 query.lst

   # we want around 200000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(200000/496) = 91.082960

   mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa
   split -l 90 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/anoCar1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    # back to bash
    exit 
    
    ssh pk
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 202864 of 202864 jobs
# CPU time in finished jobs:   14747966s  245799.43m  4096.66h  170.69d  0.468 y
# IO & Wait Time:               1561940s   26032.34m   433.87h   18.08d  0.050 y
# Average job time:                  80s       1.34m     0.02h    0.00d
# Longest finished job:             722s      12.03m     0.20h    0.01d
# Submission to last job:         93277s    1554.62m    25.91h    1.08d

    ssh kkstore05
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    exit
    chmod +x chainOne
    ls -1dS /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.
# Completed: 409 of 409 jobs
# CPU time in finished jobs:       2911s      48.52m     0.81h    0.03d  0.000 y
# IO & Wait Time:                 65231s    1087.18m    18.12h    0.75d  0.002 y
# Average job time:                 167s       2.78m     0.05h    0.00d
# Longest finished job:             287s       4.78m     0.08h    0.00d
# Submission to last job:          2318s      38.63m     0.64h    0.03d

    ssh kkstore05
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/anoCar1/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG
    hgLoadPsl anoCar1 blastHg18KG.psl
    # check coverage
    featureBits anoCar1 blastHg18KG 
# 21571582 bases of 1741478929 (1.239%) in intersection
    
    ssh kkstore05
    rm -rf /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
#end tblastn

#########################################################################
## BLASTZ galGal3/Chicken swap (DONE - 2007-02-19 - Hiram)
    ##	the original blastz to galGal3 measured
    time nice -n +19 featureBits galGal3 chainAnoCar1Link \
	> fb.galGal3.chainAnoCar1Link.txt 2>&1
    #	real    0m43.752s
    #	106743952 bases of 1042591351 (10.238%) in intersection

    ssh kkstore05
    mkdir /cluster/data/anoCar1/bed/blastz.galGal3.swap
    cd /cluster/data/anoCar1/bed/blastz.galGal3.swap

    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/anoCar1/bed/blastz.galGal3.swap
    time nice -n +19 featureBits anoCar1 chainGalGal3Link \
	> fb.anoCar1.chainGalGal3Link.txt 2>&1
    #	real    1m8.359s
    #	109074507 bases of 1741478929 (6.263%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-02-19 - Hiram)
    # This will find repeats within the genome that should not be matched
    # against. Uses 11-mers.
    # Use -repMatch=620 (based on size -- for human we use 1024, and
    # lizard size is ~60.4% of human judging by gapless anoCar1 vs. hg18
    # genome sizes from featureBits.

    # hg18 / anoCar1 non-gap bases
    #	2881515245 / 1741478929 = 1.654636
    # anoCar1 / hg18 non-gap bases
    #	1741478929 / 2881515245 = 0.604362

    #	thus 1024 * 0.604362 ~= 620
    
    ssh kkstore05
    blat /cluster/data/anoCar1/anoCar1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/data/anoCar1/11.ooc -repMatch=620
    # Wrote 32070 overused 11-mers to /cluster/data/anoCar1/11.ooc

    cp -p /cluster/data/anoCar1/11.ooc /san/sanvol1/scratch/anoCar1
    cp -p /cluster/data/anoCar1/jkStuff/liftAll.lft \
	/san/sanvol1/scratch/anoCar1

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-02-20 - Hiram)
    # Make a liftAll.lft that specifies 5M chunks for genbank:
    #	only a few of the largest scaffolds will be broken up, most of them not
    ssh kkstore05
    cd /cluster/data/anoCar1
    simplePartition.pl anoCar1.2bit 5000000 /tmp/anoCar1
    find /tmp/anoCar1 -type f | grep lft | xargs cat > jkStuff/liftAll.lft
    rm -r /tmp/anoCar1
    cp -p jkStuff/liftAll.lft /san/sanvol1/scratch/anoCar1

    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add anoCar1 just after xenTro2
    # anoCar1
    anoCar1.serverGenome = /cluster/data/anoCar1/anoCar1.2bit
    anoCar1.clusterGenome = /san/sanvol1/scratch/anoCar1/anoCar1.2bit
    anoCar1.ooc = /san/sanvol1/scratch/anoCar1/11.ooc
    anoCar1.lift = /san/sanvol1/scratch/anoCar1/liftAll.lft
    anoCar1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
    anoCar1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
    anoCar1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
    anoCar1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
    anoCar1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
    anoCar1.refseq.mrna.native.load = yes
    anoCar1.genbank.est.native.load = no
    anoCar1.refseq.mrna.xeno.load = yes
    anoCar1.genbank.mrna.xeno.load = yes
    anoCar1.downloadDir = anoCar1
    anoCar1.perChromTables = no


    cvs ci -m "Added anoCar1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.
    #
    cvs ci -m "Added Anolis carolinensis (lizard)." src/lib/gbGenome.c
    make install-server

    cd /cluster/data/genbank
    screen

    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.  
    nice -n +19 bin/gbAlignStep -initial anoCar1 &
    # logFile: var/build/logs/2007.02.19-22:10:25.anoCar1.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1
    #	real    8m9.108s

    # enable daily alignment and update of hgwdev (DONE - 2007-02-20 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add anoCar1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added anoCar1." etc/align.dbs etc/hgwdev.dbs
    make etc-update

###   (2007-05-16 markd)
    # modify genbank to not load native RefSeq, since there are none.
    # remove empty files and rerun gbDbLoadStep
    anoCar1.refseq.mrna.native.load = no
    nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1
    

##########################################################################
## BLASTZ FROG xenTro2 (WORKING - 2007-02-20 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
    cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20

    cat << '_EOF_' > DEF
# Lizard vs frog
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ1_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ1_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# TARGET: Frog xenTro2 - single chunk big enough for the largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/xenTro2/xenTro2.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=8000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > do.log 2>&1 &
    ## real    1522m46.550s
    ## this broke down during chaining because this file:
# -rw-rw-r--  1 5982855 Feb 21 14:39 ../../pslParts/part039.lst.psl.g
    ## has a bogus character control-V in the middle on a number
    ## manually running through that one chain step with that line removed
    ##	got the chaining completed, then continuing:
    time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-continue=chainMerge \
	-blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > chainMerge.log 2>&1 &
    #	real    119m12.634s

    ssh hgwdev
    cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20]
    time nice -n +19 featureBits anoCar1 chainXenTro2Link \
	> fb.anoCar1.chainXenTro2Link.txt 2>&1
    #	real    11m33.086s
    #	83873500 bases of 1741478929 (4.816%) in intersection

    ssh kkstore04
    mkdir /cluster/data/xenTro2/bed/blastz.anoCar1.swap
    cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap

    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap > swap.log 2>&1 &


############################################################################
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("anoCar1", "blat11", "17780", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("anoCar1", "blat11", "17781", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
## BLASTZ mm8/Mouse swap (DONE - 2007-02-20 - Hiram)
    ##	the original blastz to mm8 measured
    time nice -n +19 featureBits mm8 chainAnoCar1Link \
	> fb.mm8.chainAnoCar1Link.txt 2>&1
    #	real    1m37.380s
    #	106743952 bases of 1042591351 (10.238%) in intersection

    ssh kkstore04
    mkdir /cluster/data/anoCar1/bed/blastz.mm8.swap
    cd /cluster/data/anoCar1/bed/blastz.mm8.swap

    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk \
	-swap > swap.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/anoCar1/bed/blastz.mm8.swap
    time nice -n +19 featureBits anoCar1 chainMm8Link \
	> fb.anoCar1.chainMm8Link.txt 2>&1
    #	real    2m1.527s
    #	82784787 bases of 1741478929 (4.754%) in intersection

    #	% Coverage of Lizard by:    (chainMinScore,chainLinearGap,type masking)
    #	6.456 - Human hg18	(5000,loose,windowMasker)
    #	6.263 - Chicken galGal3	(5000,loose,windowMasker)
    #	4.816 - Frog xenTro2	(5000,loose,windowMasker)
    #	4.754 - Mouse mm8	(5000,loose,windowMasker)
    #	3.127 - Stickleback gasAcu1	(5000,loose,windowMasker)

    #	% coverage of Chicken by:
    #	10.238 - Lizard anoCar1	(5000,loose,windowMasker)
    #	8.795 - Human hg18	(5000,loose,rmsk)
    #	6.745 - Mouse mm8	(5000,loose,rmsk)
    #	5.330 - Frog xenTro2	(5000,loose,rmsk)
    #	3.144 - Stickleback gasAcu1	(2000,loose,windowMasker)

    #	% coverage of Frog by:
    #	6.217 - Lizard anoCar1	(5000,loose,windowMasker)
    #	5.634 - Human hg18	(5000,loose,rmsk)
    #	5.358 - Mouse mm8	(5000,loose,rmsk)
    #	4.776 - Chicken galGal3	(5000,loose,rmsk)
    #	x.xxx - Stickleback gasAcu1	(not yet done)

    #	% coverage of Human by:
    #	34.514 - Mouse mm8	(3000,medium,rmsk)
    #	4.774 - Lizard anoCar1	(5000,loose,windowMasker)
    #	3.589 - Chicken galGal3	(5000,loose,rmsk)
    #	2.623 - Frog xenTro2	(5000,loose,rmsk)
    #	1.923 - Stickleback gasAcu1	(2000,loose,rmsk)

##########################################################################
## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/anoCar1/bed/RepeatMasker
    cd /cluster/data/anoCar1/bed/RepeatMasker
    time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \
	-buildDir=/cluster/data/anoCar1/bed/RepeatMasker anoCar1 > do.log 2>&1 &

############################################################################
##  DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram)
    ssh hgwdev
    cd /cluster/data/anoCar1
    ln -s bed/RepeatMasker/anoCar1.fa.out .

    ~/kent/src/hg/utils/automation/makeDownloads.pl anoCar1 \
	> makeDownloads.out 2>&1
    #	Doesn't work due to missing Repeat masker outputs
    #	Create WindowMasker separate files by chrom, for downloads
    ssh kkstore05
    cd /cluster/data/anoCar1/goldenPath/bigZips
    ln -s ../../bed/WindowMasker.2007-02-16/windowmasker.sdust.bed.gz \
	./anoCar1.WMSdust.bed.gz

    # get GenBank native mRNAs
    ssh hgwdev
    cd /cluster/data/genbank
    ./bin/x86_64/gbGetSeqs -db=anoCar1 -native \
	GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/mrna.fa
    # get GenBank xeno mRNAs
    ./bin/x86_64/gbGetSeqs -db=anoCar1 -xeno \
	GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/xenoMrna.fa
    ssh kkstore05
    cd /cluster/data/anoCar1/goldenPath/bigZips
    gzip *.fa
    md5sum *.gz > md5sum.txt
    #	Edit the README.txt file to be correct
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/anoCar1/bigZips
    ln -s /cluster/data/anoCar1/goldenPath/bigZips/* .

############################################################################
## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="scaffold_72:3056494-3141055"
	where name="anoCar1";' hgcentraltest


############################################################################
# SWAP ORNANA1 CHAIN/NET (DONE 5/2/07 angie)
    ssh kkstore05
    mkdir /cluster/data/anoCar1/bed/blastz.ornAna1.swap
    cd /cluster/data/anoCar1/bed/blastz.ornAna1.swap
    doBlastzChainNet.pl -swap \
      /cluster/data/ornAna1/bed/blastz.anoCar1/DEF >& do.log &
    tail -f do.log
    ln -s blastz.ornAna1.swap /cluster/data/anoCar1/bed/blastz.ornAna1


############################################################################
# SWAP Mouse Mm9 chain/net (DONE - 2007-09-21 - hiram)
    ssh kkstore04
    screen # control this sequence with screen
    #	the original
    cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
    cat  fb.mm9.chainAnoCar1Link.txt
    #	89239796 bases of 2620346127 (3.406%) in intersection

    #	and for the swap
    mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap
    cd /cluster/data/anoCar1/bed/blastz.mm9.swap
    time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \
	-swap -qRepeats=windowmaskerSdust \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    #	real    29m12.291s
    cat fb.anoCar1.chainMm9Link.txt
    #	85923556 bases of 1741478929 (4.934%) in intersection

#########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# construct liftOver to anoCar2 (DONE - 2011-02-22 - Hiram) 
    mkdir /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20
    cd /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-debug -dbHost=hgwdev -workhorse=hgwdev anoCar1 anoCar2 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev anoCar1 anoCar2 > do.log 2>&1
    #	hgwdev broken down, reboot
    #	continuing manually to complete the net step
    cd /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20/run.chain
    time ./doNet.csh > net.log 2>&1
    cd /hive/data/genomes/anoCar1/bed/blat.anoCar2.2011-04-20
    time doSameSpeciesLiftOver.pl -continue=load  -buildDir=`pwd` \
	-bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev anoCar1 anoCar2 > load.log 2>&1
    #	real    2m40.093s

    # verify this file exists:
    #	/gbdb/anoCar1/liftOver/anoCar1ToAnoCar2.over.chain.gz
    # and try out the conversion on genome-test from anoCar1 to anoCar2 

############################################################################
