# for emacs: -*- mode: sh; -*-


# This file describes browser build for the Lancelet
# genome, Branchiostoma floridae, March 2006, v.1.0 from JGI
#
#       "$Id: braFlo1.txt,v 1.19 2009/11/25 21:48:38 hiram Exp $"
#

##########################################################################
### Fetch sequence       (DONE - 2007-03-26 - Hiram)
    ssh kkstore05
    mkdir /cluster/store12/braFlo1
    ln -s /cluster/store12/braFlo1 /cluster/data/braFlo1
    mkdir /cluster/data/braFlo1/downloads
    cd /cluster/data/braFlo1/downloads
    wget --timestamping \
'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/Branchiostoma.fasta.gz'
    wget --timestamping \
'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/proteins.Brafl1.fasta.gz'
    wget --timestamping \
'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/transcripts.Brafl1.fasta.gz'
    gunzip Branchiostoma.fasta.gz
    scaffoldFaToAgp Branchiostoma.fasta

##########################################################################
## Set up initial database (DONE - 2007-03-26 - Hiram)
    cd /cluster/data/braFlo1
    cat << '_EOF_' > braFlo1.config.ra
# Config parameters for makeGenomeDb.pl:
db braFlo1
clade deuterostome
genomeCladePriority 20
scientificName Branchiostoma floridae
commonName Lancelet
assemblyDate Mar. 2006
assemblyLabel JGI v.1.0 (March 2006)
orderKey 799
mitoAcc 5881414
fastaFiles /cluster/data/braFlo1/downloads/Branchiostoma.fasta
agpFiles /cluster/data/braFlo1/downloads/Branchiostoma.agp
# qualFiles none
dbDbSpeciesDir lancelet
'_EOF_'
    time nice -n +19 makeGenomeDb.pl braFlo1.config.ra > makeGenomeDb.out 2>&1

    hgsql -e 'INSERT INTO chrM_gold
(bin, chrom, chromStart, chromEnd, ix, type, frag, fragStart, fragEnd, strand)
VALUES (585, "chrM", 0, 15083, 1, "F", "NC_000834.1", 0, 15083, "+");' braFlo1

    ## done with this sequence
    cd downloads
    gzip Branchiostoma.fasta

##########################################################################
## RepeatMasker (DONE - 2007-03-26 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/braFlo1/bed/RepeatMasker
    cd /cluster/data/braFlo1/bed/RepeatMasker
    time nice -n +10 doRepeatMasker.pl braFlo1 \
	-buildDir=/cluster/data/braFlo1/bed/RepeatMasker > do.log 2>&1 &
    # about 11 minutes
    cd /cluster/data/braFlo1
    twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin
    #	926386587 bases (95184565 N's 831202022 real
    #	806238402 upper 24963620 lower) in 2 sequences in 1 files
    #	twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin
    #	%2.69 masked total, %3.00 masked real

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Hiram)
    ssh kolossus
    mkdir /cluster/data/braFlo1/bed/simpleRepeat
    cd /cluster/data/braFlo1/bed/simpleRepeat
    for C in chrM chrUn
do
    mkdir /scratch/tmp/braFlo1.trf
    twoBitToFa -seq="${C}" ../../braFlo1.unmasked.2bit \
	/scratch/tmp/braFlo1.trf/${C}.fa
    time nice -n 19 trfBig -trf=/cluster/bin/i386/trf \
	/scratch/tmp/braFlo1.trf/${C}.fa /dev/null \
	-bedAt=${C}.bed -tempDir=/scratch/tmp/braFlo1.trf > ${C}.log 2>&1
    rm -fr /scratch/tmp/braFlo1.trf
    echo ${C} done
done
    #	real    187m42.772s
    #	no repeats found on chrM
    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' chrUn.bed > trfMask.bed

  # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed braFlo1 simpleRepeat \
      /cluster/data/braFlo1/bed/simpleRepeat/chrUn.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

################################################
## WINDOWMASKER (DONE - 2007-03-26 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/braFlo1/bed/WindowMasker
    cd /cluster/data/braFlo1/bed/WindowMasker
    time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl braFlo1 \
      -buildDir=/cluster/data/braFlo1/bed/WindowMasker \
	-workhorse=kolossus > wmRun.log 2>&1 &
    #	real    82m55.387s

    # Masking statistics
    twoBitToFa braFlo1.wmsk.sdust.2bit stdout | faSize stdin
    #	926386587 bases (95184565 N's 831202022 real
    #	611909619 upper 219292403 lower) in 2 sequences in 1 files
    #	%23.67 masked total, %26.38 masked real

    ssh hgwdev
    time nice -n +19 \
	hgLoadBed -strict braFlo1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 4263263 elements of size 3

    #	eliminate the gaps from the masking (WM bug)
    featureBits braFlo1 -not gap -bed=notGap.bed
    #	831202022 bases of 831202022 (100.000%) in intersection
 
    time nice -n +19 featureBits braFlo1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	219292403 bases of 831202022 (26.383%) in intersection

    #	reload track to get it clean
    hgLoadBed braFlo1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 4255796 elements of size 4
    featureBits braFlo1 windowmaskerSdust
    #	219292403 bases of 831202022 (26.383%) in intersection

##############################################################################
## combine trf mask with windowmasker (DONE - 2008-05-23 - Hiram)
    ssh kkstore05
    cd /cluster/data/braFlo1
    zcat bed/WindowMasker/cleanWMask.bed.gz \
	| twoBitMask braFlo1.unmasked.2bit stdin \
	    -type=.bed braFlo1.cleanWMSdust.2bit
    cat bed/simpleRepeat/trfMask.bed \
      | twoBitMask -add -type=.bed braFlo1.cleanWMSdust.2bit stdin braFlo1.2bit
    #	can safely ignore the warning about BED file >= 13 fields
    #	check it
    twoBitToFa braFlo1.2bit stdout | faSize stdin
    #	926386587 bases (95184565 N's 831202022 real 611655540 upper
    #	219546482 lower) in 2 sequences in 1 files
    #	%23.70 masked total, %26.41 masked real

    #	create gbdb symlink
    ssh hgwdev
    ln -s /cluster/data/braFlo1/braFlo1.2bit /gbdb/braFlo1

#########################################################################
## Create masked scaffolds-only 2bit file (DONE - 2007-03-26 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/braFlo1/scaffolds
    cp -p /cluster/data/oryLat1/jkStuff/lft2BitToFa.pl \
	/cluster/data/braFlo1/jkStuff
    cp -p /cluster/data/oryLat1/jkStuff/agpToLift.pl \
	/cluster/data/braFlo1/jkStuff
    cd /cluster/data/braFlo1/scaffolds
    ln -s ../Un/chrUn.agp .
    ../jkStiff/agpToLift.pl chrUn.agp > braFlo1.lift
    ../jkStuff/lft2BitToFa.pl ../braFlo1.2bit braFlo1.lift \
	> chrUn.scaffolds.fa
    twoBitToFa ../braFlo1.2bit chrM.fa
    faToTwoBit chrUn.scaffolds.fa chrM.fa braFlo1UnScaffolds.2bit
    twoBitInfo braFlo1UnScaffolds.2bit stdout | sort -k2nr \
	> braFlo1UnScaffolds.sizes
    cp -p braFlo1UnScaffolds.sizes braFlo1UnScaffolds.2bit braFlo1.lift \
	 /san/sanvol1/scratch/braFlo1

#########################################################################
## Reload rmsk to take care of empty chrM (DONE - 2007-03-26 - Hiram)
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/RepeatMasker
    hgLoadOut -table=rmsk -nosplit braFlo1 braFlo1.fa.out
    hgsql -e "drop table chrUn_rmsk;" braFlo1
    #	before:
    featureBits braFlo1 rmsk
    #	24973684 bases of 923355587 (2.705%) in intersection
    #	after:
    featureBits braFlo1 rmsk
    #	24973684 bases of 923355587 (2.705%) in intersection
 
#########################################################################
## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram)
    # measurement on hg18
    ssh kkstore02
    cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
    cat fb.hg18.chainBraFlo1Link.txt
    #	26455595 bases of 2881515245 (0.918%) in intersection

    # and now the swap
    mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
    cd /cluster/data/braFlo1/bed/blastz.hg18.swap
    time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
	/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -verbose=2 \
	-swap > swap.log 2>&1 &
    #	real    83m46.258s
    cat fb.braFlo1.chainHg18Link.txt
    #	30912893 bases of 923355587 (3.348%) in intersection

    #	create reciprocal best chains/nets for 5-way maf alignments
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/blastz.hg18.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 hg18 \
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doRecipBest.pl \
	braFlo1 hg18 > rbest.log 2>&1 &
    #	real    34m14.843s
    #	real    32m11.695s
    #	This did not work right.  Something went haywire somewhere:
# Error: braFlo1 rbest net coverage 14190116 != hg18 14270477

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram)
    # Use -repMatch=328 (based on size -- for human we use 1024, and 
    # lancelet size is ~32% of human judging by gapless braFlo1 vs. hg18 
    # genome sizes from featureBits.
    ssh kolossus
    cd /cluster/data/braFlo1
    blat braFlo1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/11.ooc -repMatch=328
    #	Wrote 9536 overused 11-mers to jkStuff/11.ooc
    cp -p jkStuff/11.ooc /san/sanvol1/scratch/braFlo1

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-03-26,27 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add braFlo1 just after gasAcu1

# braFlo1 (Branchiostoma floridae - Lancelet)
braFlo1.serverGenome = /cluster/data/braFlo1/braFlo1.2bit
braFlo1.clusterGenome = /san/sanvol1/scratch/braFlo1/braFlo1.2bit
braFlo1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
braFlo1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
braFlo1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
braFlo1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
braFlo1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
braFlo1.ooc = /san/sanvol1/scratch/braFlo1/11.ooc
braFlo1.lift = /cluster/data/braFlo1/jkStuff/liftAll.lft
braFlo1.refseq.mrna.native.load = no
braFlo1.refseq.mrna.xeno.load = yes
braFlo1.genbank.mrna.xeno.load = yes
braFlo1.genbank.est.native.load = yes
braFlo1.downloadDir = braFlo1
braFlo1.perChromTables = no

    cvs ci -m "Added braFlo1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.
    cvs ci -m "Added Branchiostoma floridae (lancelet)." src/lib/gbGenome.c
    make install-server

    ssh kkstore05
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial braFlo1 &
    #	logFile: var/build/logs/2007.03.26-21:58:34.braFlo1.initalign.log
    #	real    230m8.123s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad braFlo1
    #	logFile: var/dbload/hgwdev/logs/2007.03.27-07:52:23.dbload.log
    #	real    6m57.557s
    ## re-load to get the est track after adding that to the .conf file
    #	var/dbload/hgwdev/logs/2007.03.29-17:09:35.dbload.log
    #	real    19m2.168s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add braFlo1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added braFlo1 - Branchiostoma floridae (lancelet)" \
	etc/align.dbs etc/hgwdev.dbs
    make etc-update

############################################################################
## Adding a photograph (DONE - 2007-03-28 - Hiram)
## From Hector Escriva: hector.escriva@obs-banyuls.fr
# Dr. Clawson
# All the european teams working with amphioxus have changed to B.
# lanceolatum species and none of us is still working with B. floridae. We
# are able here to induce spawnings everyday (on B. lanceolatum) and this
# is impossible with the american species which limits very much the
# amount of experiments that one can perform. So I have pictures of B.
# lanceolatum, which is morphologically the same as floridae (even me I
# cannot distinghish them). So I send you here a picture of a B.
# lanceolatum juvenile in which we clearly see the main morphological
# aspects of this animal. You should however write the correct name of the
# species in order to be serious.
# I would also ask you to write in "Photo courtesy of", the name of my
# technicien who did the picture and myself. Michael Fuentes and Hector Escriva.
# Let me know if you agree with this picture.
# Sincerely Hector Escriva
    mkdir /cluster/data/braFlo1/photograph
    cd /cluster/data/braFlo1/photograph
    convert -sharpen 0 -normalize -geometry 350x200 \
	PostmetamorphicLarvae.jpg Branchiostoma_lanceolatum.jpg

############################################################################
#  BLATSERVERS ENTRY (DONE - 2007-04-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("braFlo1", "blat11", "17784", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("braFlo1", "blat11", "17785", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrUn:619013791-619054719"
	where name="braFlo1";' hgcentraltest

############################################################################
#  BLASTZ petMar1 Lamprey (DONE - 2008-04-11 - Hiram)
    ssh kkstore05
    screen	# use a screen to control this multi-day job
    mkdir /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
    cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11

    cat << '_EOF_' > DEF
# Lancelet vs Lamprey

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Lancelet braFlo1 - largest chunk big enough for largest scaffold
#       Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ1_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit
SEQ1_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes
SEQ1_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift
SEQ1_CHUNK=10000000
SEQ1_LIMIT=50
SEQ1_LAP=10000

# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# Completed: 78590 of 78590 jobs
# CPU time in finished jobs:   10056466s  167607.76m  2793.46h  116.39d  0.319 y
# IO & Wait Time:               6633099s  110551.65m  1842.53h   76.77d  0.210 y
# Average job time:                 212s       3.54m     0.06h    0.00d
# Longest finished job:            4572s      76.20m     1.27h    0.05d
# Submission to last job:        107694s    1794.90m    29.91h    1.25d
    #	continuing after various interruptions and finished the batch manually
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    #	real    12m43.792s
    cat fb.braFlo1.chainPetMar1Link.txt
    #	27418217 bases of 923355587 (2.969%) in intersection

    #	create reciprocal best chains/nets for 5-way maf alignments
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 petMar1 \
	> rbest.log 2>&1 &
    #	real    30m11.990s

    #	and the swap
    mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap
    cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    #	real    194m53.514s
    cat fb.petMar1.chainBraFlo1Link.txt
    #	27373264 bases of 831696438 (3.291%) in intersection

##########################################################################
## BLASTZ Chicken GalGal3 swap (DONE - 2008-04-16 - Hiram)
    #	the original
    ssh kkstore03
    cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
    cat fb.galGal3.chainBraFlo1Link.txt
    #	19795687 bases of 1042591351 (1.899%) in intersection

    #	and the swap
    mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap
    cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 &
    #	real    4m45.054s
    cat fb.braFlo1.chainGalGal3Link.txt
    #	30287175 bases of 923355587 (3.280%) in intersection

    #	create reciprocal best chains/nets for 5-way maf alignments
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 galGal3 \
	> rbest.log 2>&1 &
    #	real    24m30.834s

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-21)

# (auto) establish native host of /cluster/data/<assembly>
    df /cluster/data/braFlo1
    ssh kkstore05
    # bash  if not using bash shell already

    mkdir /cluster/data/braFlo1/blastDb
    cd /cluster/data/braFlo1
    awk '{if ($2 > 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > 1meg.lst
    if test -s 1meg.lst; then
	twoBitToFa -seqList=1meg.lst  scaffolds/braFlo1UnScaffolds.2bit temp.fa
	faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
	rm temp.fa 
    fi
    rm 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > less1meg.lst
    if test -s less1meg.lst; then
	twoBitToFa -seqList=less1meg.lst  scaffolds/braFlo1UnScaffolds.2bit temp.fa
	faSplit about temp.fa 1000000 blastDb/y 
	rm temp.fa 
    fi
    rm less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 1063

    mkdir -p /san/sanvol1/scratch/braFlo1/blastDb
    cd /cluster/data/braFlo1/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/braFlo1/blastDb
    done

    mkdir -p /cluster/data/braFlo1/bed/tblastn.hg18KG
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/braFlo1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst

# 1063 query.lst

   # we want around 100,000 jobs per gig (0.0001 jobs per base)
   numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/braFlo1/chrom.sizes`

   # we want around numJobs jobs
   numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
   numQueries=`wc query.lst | awk '{print $1}'`
   numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `

   mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa
   split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/braFlo1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	if test -s /cluster/data/braFlo1/blastDb.lft
	then
	    liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/braFlo1/blastDb.lft carry $f.2
	else
	    mv $f.2 $f.3
	fi
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then                  
            mv $3.tmp $3     
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0               
    fi                      
fi                         
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit 
    
    ssh pk
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 93544 of 93544 jobs
# CPU time in finished jobs:    8499504s  141658.40m  2360.97h   98.37d  0.270 y
# IO & Wait Time:                669830s   11163.84m   186.06h    7.75d  0.021 y
# Average job time:                  98s       1.63m     0.03h    0.00d
# Longest finished job:             376s       6.27m     0.10h    0.00d
# Submission to last job:         44890s     748.17m    12.47h    0.52d

    ssh kkstore05
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh memk
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para try, check, push, check etc.

# Completed: 88 of 88 jobs
# CPU time in finished jobs:        906s      15.09m     0.25h    0.01d  0.000 y
# IO & Wait Time:                 19435s     323.92m     5.40h    0.22d  0.001 y
# Average job time:                 231s       3.85m     0.06h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             322s       5.37m     0.09h    0.00d
# Submission to last job:           812s      13.53m     0.23h    0.01d

    ssh kkstore05
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/braFlo1/bed/tblastn.hg18KG/preblastHg18KG.psl
    cd ..
    pslCheck preblastHg18KG.psl
    liftUp -nosort -type=".psl" -nohead blastHg18KG.psl /cluster/data/braFlo1/scaffolds/braFlo1.lift carry preblastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/tblastn.hg18KG
    hgLoadPsl braFlo1 blastHg18KG.psl

    # check coverage
    featureBits braFlo1 blastHg18KG 
# 12239766 bases of 923355587 (1.326%) in intersection

    featureBits braFlo1 xenoRefGene:cds blastHg18KG  -enrichment
# xenoRefGene:cds 1.206%, blastHg18KG 1.326%, both 0.621%, cover 51.51%, enrich 38.86x

    ssh kkstore05
    rm -rf /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
#end tblastn

## BLASTZ Mouse mm9 swap (DONE - 2008-04-17 - Hiram)
    #	the original
    ssh kkstore06
    cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
    cat fb.mm9.chainBraFlo1Link.txt
    #	26725980 bases of 2620346127 (1.020%) in intersection

    #	That is OK, now for the swap:
    mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap
    cd /cluster/data/braFlo1/bed/blastz.mm9.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 &
    #	real    12m23.402s
    cat  fb.braFlo1.chainMm9Link.txt
    #	31517169 bases of 923355587 (3.413%) in intersection

    #	create reciprocal best chains/nets for 5-way maf alignments
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/blastz.mm9.swap
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 mm9 \
	> rbest.log 2>&1 &
    #	real    35m15.178s

#############################################################################
## 5-Way Multiz (DONE - 2008-04-17 - Hiram)
##
    ssh hgwdev
    mkdir /cluster/data/braFlo1/bed/multiz5way
    cd /cluster/data/braFlo1/bed/multiz5way
    #	take the 6-way tree from mm9 and eliminate genomes not in
    #	this alignment.  Manually add in petMar1 and braFlo1
    #	arbitrarily somewhere outside of the fishes
    #	rearrange to get braFlo1 on the top of the graph
    #	paste this tree into the on-line phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to create the image for the tree diagram

    #	select the 5 organisms from the 6-way on Lamprey petMar1
    /cluster/bin/phast/tree_doctor \
	--prune-all-but \
Human_hg18,Mouse_mm9,Chicken_galGal3,Lamprey_petMar1,Lanclet_braFlo1 \
	/cluster/data/petMar1/bed/multiz6way/petMar1.6-way.nh \
	> 5-way.fullNames.nh

    #	looks something like this:
    ((Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824,
      (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000,
	Lanclet_braFlo1:2.000000);

    #	rearrange to get Lanclet at the top, this leaves us with:
    cat << '_EOF_' > braFlo1.5-way.nh
    (Lanclet_braFlo1:2.0,(Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824,
      (Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000);
'_EOF_'
    #	<< happy emacs

    #	create a species list from that file:
    sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' braFlo1.5-way.nh \
        | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
        | sed -e "s/.*_//; s/:.*//" | sort > species.list
    #	verify that has 5 db names in it
    # create a stripped down nh file for use in autoMZ run
    echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' braFlo1.5-way.nh \
	| sed -e "s/  / /g"` > tree.5.nh
    #	that looks like, as a single line:
# (braFlo1 (petMar1 (galGal3 (mm9 hg18))))

    # verify all blastz's exists
    cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/braFlo1/bed/multiz5way
foreach db (`grep -v braFlo1 species.list`)
    set bdir = /cluster/data/braFlo1/bed/blastz.$db
    if (-e $bdir/mafRBestNet/chrUn.maf.gz) then
	echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/chrUn.maf.gz) then
	echo "$db mafSynNet"
    else if (-e $bdir/mafNet/chrUn.maf.gz) then
	echo "$db mafNet"
    else
	echo "$db mafs not found"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./listMafs.csh
    #	see what it says, these should all be mafRBestNet
    ./listMafs.csh
# galGal3 mafRBestNet
# hg18 mafRBestNet
# mm9 mafRBestNet
# petMar1 mafRBestNet

    /cluster/bin/phast/all_dists braFlo1.5-way.nh > 5way.distances.txt
    grep -i braflo 5way.distances.txt | sort -k3,3n
Lanclet_braFlo1 Chicken_galGal3 3.614778
Lanclet_braFlo1 Human_hg18      3.723612
Lanclet_braFlo1 Mouse_mm9       3.922529
Lanclet_braFlo1 Lamprey_petMar1 4.000000

    #	use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.  Zebrafish ends up before
    #	tetraodon and fugu on the browser despite its distance.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                       chainBraFlo1Link   chain   linearGap
#    distance                     on BraFlo1    on other   minScore
#  1  3.614778 Chicken_galGal3    (%  3.280)   (%  1.899)   5000     loose
#  2  3.723612 Human_hg18         (%  3.348)   (%  0.918)   2000     loose
#  3  3.922529 Mouse_mm9          (%  3.413)   (%  1.020)   5000     loose
#  4  4.000000 Lamprey_petMar1    (%  2.969)   (%  3.291)   5000     loose

    # copy net mafs to cluster-friendly storage, splitting chroms
    #	 XXX - no need to split, there is only chrM and chrUn
    mkdir mafLinks
    cd mafLinks
    for D in galGal3 petMar1 mm9 hg18
do
    mkdir ${D}
    ln -s /cluster/data/braFlo1/bed/blastz.${D}/mafRBestNet/*.maf.gz ./${D}
done

    #	copy to kluster friendly san for multiz run
    ssh kkstore05
    mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc
    cd /cluster/data/braFlo1/bed/multiz5way/mafLinks
    rsync -a --progress -L ./ /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc

    # ready for the multiz run
    ssh memk
    mkdir /cluster/data/braFlo1/bed/multiz5way/cons
    cd /cluster/data/braFlo1/bed/multiz5way/cons
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = braFlo1
set c = $1
set result = $2
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way/mafSrc
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.5.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf
popd
rm -f $result
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/braFlo1/bed/multiz5way/cons/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs

    echo "chrM.maf" > mafList
    echo "chrUn.maf" >> mafList
    gensub2 mafList single template jobList

    para create jobList
    para try ... check ... push ... etc
# Completed: 2 of 2 jobs
# CPU time in finished jobs:        375s       6.25m     0.10h    0.00d  0.000 y
# IO & Wait Time:                    17s       0.28m     0.00h    0.00d  0.000 y
# Average job time:                 196s       3.27m     0.05h    0.00d
# Longest finished job:             385s       6.42m     0.11h    0.00d
# Submission to last job:           385s       6.42m     0.11h    0.00d


    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/braFlo1/multiz5way/maf
    ln -s /cluster/data/braFlo1/bed/multiz5way/cons/maf/*.maf \
                /gbdb/braFlo1/multiz5way/maf
    # this generates an immense multiz5way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/braFlo1/multiz5way/maf braFlo1 multiz5way
    #	Loaded 220560 mafs in 2 files from /gbdb/braFlo1/multiz5way/maf
    #	real    0m4.151s

    # load summary table
    time nice -n +19 cat /gbdb/braFlo1/multiz5way/maf/*.maf \
	| hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz5waySummary stdin
    #	Created 87678 summary blocks from 366073 components and 220521 mafs from stdin
    #	real    0m4.911s

    # Gap Annotation
    # prepare bed files with gap info
    ssh kkstore05
    mkdir /cluster/data/braFlo1/bed/multiz5way/anno
    cd /cluster/data/braFlo1/bed/multiz5way/anno
    mkdir maf run

    #	these actually already all exist from previous multiple alignments
    #	if there are done you will see an ls of them
    #	or else the twoBitInfo command will be echoed, run it if you want
    for DB in `cat ../species.list`
do
    CDIR="/cluster/data/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    cd run
    rm -f nBeds sizes
    for DB in `grep -v braFlo1 ../../species.list`
do
    echo "${DB} "
    ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    ssh memk
    cd /cluster/data/braFlo1/bed/multiz5way/anno/run
    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/braFlo1/bed/multiz5way/cons
set c = $1
cat $dir/maf/${c}.maf | nice \
    mafAddIRows -nBeds=nBeds stdin /cluster/data/braFlo1/braFlo1.2bit $2
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh


    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 /cluster/data/braFlo1/chrom.sizes > chrom.list
    gensub2 chrom.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 2 of 2 jobs
# CPU time in finished jobs:       7240s     120.66m     2.01h    0.08d  0.000 y
# IO & Wait Time:                     9s       0.16m     0.00h    0.00d  0.000 y
# Average job time:                3625s      60.41m     1.01h    0.04d
# Longest finished job:            7241s     120.68m     2.01h    0.08d
# Submission to last job:          7241s     120.68m     2.01h    0.08d

    #	load the results
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way/anno
    mkdir -p /gbdb/braFlo1/multiz5way/anno
    ln -s `pwd`/maf/*.maf /gbdb/braFlo1/multiz5way/anno
    #	by loading this into the table multiz5way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/braFlo1/multiz5way/anno \
                braFlo1 multiz5way
    #	Loaded 311293 mafs in 2 files from /gbdb/braFlo1/multiz5way/anno
    #	real    0m7.505s

    #	normally filter this for chrom size > 1,000,000 and only load
    #	those chroms.  But this is a scaffold assembly, load everything.
    time nice -n +19 cat /gbdb/braFlo1/multiz5way/anno/*.maf \
	| hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz5waySummary stdin
    #	Created 87678 summary blocks from 366073 components and 311253 mafs from stdin
    #	real    0m6.998s

    #	by loading this into the table multiz5waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz5way*.tab files in this /scratch/tmp directory
    rm multiz5way*.tab
    #	And, you can remove the previously loaded non-annotated maf file link:
    rm /gbdb/braFlo1/multiz5way/maf/*.maf
    rmdir /gbdb/braFlo1/multiz5way/maf

###########################################################################
## Annotate 5-way multiple alignment with gene annotations
##		(DONE - 2008-05-01 - Hiram)
    # Gene frames
    ## following pattern used in Lamprey petMar1
    #	hg18 is in a bit of transition at the moment, so use ensGene
    #	mm9 can use knownGene
    #	xenoMrna for braFlo1, petMar1
    #	ensGene v49 for galGal3

    ssh hgwdev
    mkdir /cluster/data/braFlo1/bed/multiz5way/frames
    cd /cluster/data/braFlo1/bed/multiz5way/frames
    mkdir genes
    # knownGene
    for DB in mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # ensGene
    for DB in hg18 galGal3
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    #	use xenoMrna for petMar1, braFlo1
    for DB in petMar1 braFlo1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
	   from xenoMrna,gbCdnaInfo,cds \
	   where (xenoMrna.qName = gbCdnaInfo.acc) and \
	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done

    ls -og genes
# -rw-rw-r--  1 1040639 May  1 09:53 braFlo1.gp.gz
# -rw-rw-r--  1 1557911 May  1 09:50 galGal3.gp.gz
# -rw-rw-r--  1 2151412 May  1 09:50 hg18.gp.gz
# -rw-rw-r--  1 1965274 May  1 09:50 mm9.gp.gz
# -rw-rw-r--  1  596023 May  1 09:51 petMar1.gp.gz


    ssh kkstore05
    cd /cluster/data/braFlo1/bed/multiz5way/frames
    #	hg18 is in a bit of transition at the moment, so use ensGene
    #	mm9 can use knownGene
    #	xenoMrna for braFlo1, braFlo1
    #	ensGene v49 for galGal3, oryLat1
    #	anything to annotate is in a pair, e.g.: braFlo1 genes/braFlo1.gp.gz
    time (cat  ../anno/maf/*.maf | nice -n +19 genePredToMafFrames braFlo1 stdin stdout braFlo1 genes/braFlo1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz petMar1 genes/petMar1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
    # see what it looks like in terms of number of annotations per DB:
    zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
    #	 44216 petMar1
    #	 71268 braFlo1
    #	107089 galGal3
    #	107857 hg18
    #	111452 mm9

    #	load the resulting file
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way/frames
    time nice -n +19 hgLoadMafFrames braFlo1 multiz5wayFrames \
	multiz5way.mafFrames.gz
    #	real    0m7.902s

    #	enable the trackDb entries:
# frames multiz5wayFrames
# irows on
#############################################################################
# phastCons 5-way (DONE - 2008-05-01 - Hiram)

    # split 5way mafs into 10M chunks and generate sufficient statistics 
    # files for # phastCons
    ssh memk
    mkdir /cluster/data/braFlo1/bed/multiz5way/msa.split
    cd /cluster/data/braFlo1/bed/multiz5way/msa.split
    mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/braFlo1/bed/multiz5way/anno/maf
set WINDOWS = /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss
pushd $WINDOWS
set c = $1
rm -f $c.out
twoBitToFa -seq=$c /scratch/data/braFlo1/braFlo1.2bit /scratch/tmp/braFlo1.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
    -M /scratch/tmp/braFlo1.$c.fa \
    -o SS -r $c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/braFlo1.$c.fa
popd
date > $c.out
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	create list of maf files:
    (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
# Completed: 2 of 2 jobs
# CPU time in finished jobs:        337s       5.61m     0.09h    0.00d  0.000 y
# IO & Wait Time:                    40s       0.67m     0.01h    0.00d  0.000 y
# Average job time:                 189s       3.14m     0.05h    0.00d
# Longest finished job:             375s       6.25m     0.10h    0.00d
# Submission to last job:           375s       6.25m     0.10h    0.00d

    # take the cons and noncons trees from the Lamprey 6-way

    #	Estimates are not easy to make, probably more correctly,
    #	take the 6-way .mod file, and re-use it here.
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way
    cp -p /cluster/data/petMar1/bed/multiz6way/cons/all/all.mod \
	./petMar1.6way.mod


    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    ssh memk
    mkdir -p /cluster/data/braFlo1/bed/multiz5way/cons/run.cons
    cd /cluster/data/braFlo1/bed/multiz5way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all gliers placentals
    #	Well, that's what it was when used in the Mm9 30-way,
    #	in this instance, there is only the directory "all"

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
set f = $1
set c = $1:r
set len = $2
set cov = $3
set rho = $4
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/braFlo1/bed/multiz5way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/braFlo1/multiz5way/cons
if (-s $cons/$grp/$grp.non-inf) then
  cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
  cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
  cp -p $cons/$grp/$grp.mod $tmp
  cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp $san/$grp/bed
sleep 4
touch $san/$grp/pp $san/$grp/bed
rm -f $san/$grp/pp/$f.pp
rm -f $san/$grp/bed/$f.bed
mv $tmp/$f.pp $san/$grp/pp
mv $tmp/$f.bed $san/$grp/bed
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Create parasol batch and run it
    pushd /san/sanvol1/scratch/braFlo1/multiz5way/cons
    find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
	> /cluster/data/braFlo1/bed/multiz5way/cons/ss.list
    popd

    # run for all species
    cd ..
    mkdir -p all run.cons/all
    cd all
    /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
    ../../petMar1.6way.mod \
--prune-all-but=braFlo1,hg18,galGal3,mm9,petMar1 \
	> all.mod

    cd ../run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 94 of 94 jobs
# CPU time in finished jobs:       1945s      32.42m     0.54h    0.02d  0.000 y
# IO & Wait Time:                   810s      13.49m     0.22h    0.01d  0.000 y
# Average job time:                  29s       0.49m     0.01h    0.00d
# Longest finished job:              34s       0.57m     0.01h    0.00d
# Submission to last job:           201s       3.35m     0.06h    0.00d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
    find ./bed -type f -name "chr*.bed" | xargs cat \
	| sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    cp -p mostConserved.bed /cluster/data/braFlo1/bed/multiz5way/cons/all

    # load into database
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way/cons/all
    time nice -n +19 hgLoadBed braFlo1 phastConsElements5way mostConserved.bed
    #	Loaded 122683 elements of size 5
    #	real    0m2.299s

    # Try for 5% overall cov, and 70% CDS cov 
    #	We don't have any gene tracks to compare CDS coverage
    #	--rho .31 --expected-length 45 --target-coverage .3
    featureBits braFlo1 phastConsElements5way
    #	35581251 bases of 923355587 (3.853%) in intersection

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
    mkdir phastCons5wayScores

    cat pp/chrM*.pp | gzip > phastCons5wayScores/chrM.data.gz
    #	this is a bit of a cheat on the sorting order here, but it works.
    ls pp/chrUn*.pp | sort -t. -k2n | xargs cat \
	| gzip > phastCons5wayScores/chrUn.data.gz

    #	copy the phastCons5wayScores to:
    mkdir -p \
/cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
    cp -p phastCons5wayScores/chr*.data.gz \
/cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
    #	for hgdownload downloads

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
    zcat phastCons5wayScores/chr*.data.gz \
	| wigEncode stdin phastCons5way.wig phastCons5way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    time nice -n +19 cp -p *.wi? /cluster/data/braFlo1/bed/multiz5way/cons/all
    #	real    0m1.216s

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way/cons/all
    ln -s `pwd`/phastCons5way.wib /gbdb/braFlo1/multiz5way/phastCons5way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/braFlo1/multiz5way braFlo1 \
	phastCons5way phastCons5way.wig
    #	real    0m2.166s
    # remove garbage
    rm wiggle.tab

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way/cons/all
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=braFlo1 phastCons5way > histogram.data 2>&1
    #	real    5m0.608s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Lancelet BraFlo1 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &
    #	it is a bit lopsided toward the highly conserved end, but that
    #	is because there is so little bits being used

    #	These trackDb entries turn on the wiggle phastCons data track:
    #	type wigMaf 0.0 1.0
    #	maxHeightPixels 100:40:11
    #	wiggle phastCons5way
    #	spanList 1
    #	autoScale Off
    #	windowingFunction mean
    #	pairwiseHeight 12
    #	yLineOnOff Off

#############################################################################
#  Downloads (DONE - 2008-05-01 - Hiram)
    #	Let's see if the downloads will work
    ssh hgwdev
    cd /cluster/data/braFlo1
    #	expecting to find repeat masker .out file here:
    ln -s bed/RepeatMasker/braFlo1.fa.out .
    cd bed/simpleRepeat
    mkdir trfMaskChrom
    splitFileByColumn trfMask.bed trfMaskChrom
    touch trfMaskChrom/chrM.bed
    cd /cluster/data/braFlo1
    time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
	-workhorse=hgwdev braFlo1 > jkStuff/downloads.log 2>&1
    #	real    11m53.475s

    #	failed making upstream sequences:
    #	featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
    #	setpriority: Permission denied.
    #	the 'nice' from my bash shell causes trouble inside the csh
    #	script which uses nice.  Finish off the install step manually
    #	with the mgcGenes upstreams ...

#############################################################################
#  PushQ entries (DONE - 2008-05-01 - Hiram)
    ssh hgwdev
    /cluster/data/braFlo1
    /cluster/bin/scripts/makePushQSql.pl braFlo1 > jkStuff/pushQ.sql
    #	output warnings:
# braFlo1 does not have seq
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
#	gbWarn
#	multiz5wayFrames
#	phastCons5way
    scp -p jkStuff/pushQ.sql hgwbeta:/tmp/braFlo1.sql
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < braFlo1.sql

#############################################################################
#  Create 5-way downloads (DONE - 2008-05-01 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
    cd /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
    #	if these haven't been copied here from before:
    cp -p \
/san/sanvol1/scratch/braFlo1/multiz5way/cons/all/phastCons5wayScores/* .
    ln -s ../../cons/all/all.mod ./5way.mod
    cp /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/README.txt .
    # edit that README.txt to be correct for this 5-way alignment
    cd ..
    mkdir multiz5way
    cd multiz5way
    cp -p /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way/README.txt .
    # edit that README.txt to be correct for this 5-way alignment
    ssh kkstore05
    cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
    ln -s ../../braFlo1.5-way.nh ./5way.nh

    cp -p ../../anno/maf/chr*.maf .
    gzip chr*.maf

    ssh hgwdev
    cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
    #	creating upstream files from xenoRefGene, bash script:
    cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=braFlo1
GENE=xenoRefGene
NWAY=multiz5way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                stdin stdout \
                -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x ./mkUpstream.sh
    time nice -n +19 ./mkUpstream.sh
    #	real    3m1.228s
# -rw-rw-r--  1   126900 May  1 16:51 upstream1000.maf.gz
# -rw-rw-r--  1   224320 May  1 16:52 upstream2000.maf.gz
# -rw-rw-r--  1   501687 May  1 16:53 upstream5000.maf.gz

    #	check the names in these upstream files to ensure sanity:
    zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
	| sort | uniq -c | sort -rn | less
    #	should be a list of the other 4 species with a high count,
    #	then xenoRefGene names, e.g.:
    #	364 petMar1
    #	364 mm9
    #	364 hg18
    #	364 galGal3
    #	  7 NM_069006
    #	  7 NM_054045
    #	  7 NM_021059

    ssh kkstore05
    cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
    md5sum *.maf.gz > md5sum.txt
    cd ../phastCons5way
    md5sum *.data.gz *.mod > md5sum.txt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way
    mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way
    cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way
    rm /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way/mkUpstream.sh 
    cd ../phastCons5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way
    #	if your ln -s `pwd`/* made extra links to files you don't want there,
    #	check the goldenPath locations and remove those extra links

#############################################################################
#  add strings of N's to the gap track (DONE - 2008-05-09 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/braFlo1/bed/gap
    cd /cluster/data/braFlo1/bed/gap
    time nice -n +19 findMotif -strand=+ -verbose=4 -motif=gattaca \
	../../braFlo1.2bit > braFlo1.gaps.txt 2>&1
    #	real    0m16.659s
    grep "^#GAP chrUn" braFlo1.gaps.txt | sed -e "s/#GAP //" > chrUn.gap.bed
    #	what are their sizes like:
    ave -col=5 chrUn.gap.bed
Q1 50.000000
median 60.000000
Q3 704.000000
average 1004.236543
min 1.000000
max 144292.000000
count 94786
total 95187565.000000
standard deviation 4497.943141
    #	don't let this overlap with the gap track already
    ssh hgwdev
    cd /cluster/data/braFlo1/bed/gap
    #	already existing gap track
    featureBits braFlo1 gap
    #	3031000 bases of 923355587 (0.328%) in intersection
    #	prove that it is a complete subset of this new set
    featureBits braFlo1 gap chrUn.gap.bed
    #	3031000 bases of 923355587 (0.328%) in intersection
    #	let's make sure none of the regular gaps run-on into these new
    #	found gaps
    hgsql -e "select chrom,chromStart,chromEnd from chrUn_gap;" braFlo1 \
	| sort -k1,1 -k2,2n > gap.bed
    awk '{printf "%s\t%s\t%s\n", $1, $2, $3}' chrUn.gap.bed \
	| sort -k1,1 -k2,2n > new.bed
    #	after working with that for a while, indeed there is a problem
    #	found in the existing gap track.  There are "contig" gaps in the
    #	the track which claim to be 1000 bases, but the DNA sequence
    #	has 1001 N's in that location.  Let's see if that is because the
    #	scaffold itself ends in a single N.  Yes, after some investigation,
    #	it is found there are scaffolds which either begin with Ns, or
    #	or end with N's, and it looks like they are single N's.
    #	there are 30 gap locations where this happens.
    #	So, let's make up a proper exclusive or of the new gaps:
    #	intersect the new gaps with everything that is not in the current
    #	gap track.  Get the regions that are not in the gap track:
    featureBits -not -countGaps braFlo1 gap -chrom=chrUn -bed=notGap.bed
    #	then, find only those bits in the new gaps that intersect that:
    featureBits -countGaps braFlo1 notGap.bed new.bed -bed=newGaps.bed
    #	and profile their sizes:
    awk '{print $3-$2}' newGaps.bed | ave stdin
Q1 50.000000
median 51.000000
Q3 591.000000
average 1004.037315
min 1.000000
max 144292.000000
count 91785
total 92155565.000000
standard deviation 4570.916383
    #	make the new gaps into a properly formatted gap table
    #	currently, the last index number in the gap track is:o
    hgsql -e "select max(ix) from chrUn_gap;" braFlo1
    #	6062

    awk '
BEGIN{ ix=6063 }
{
printf "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", $1, $2, $3, ix, $3-$2
++ix
}' newGaps.bed > newGaps.gap.table
    #	findMotif has made an error on the last one, there is an extra one
    #	at the end that is not legitimate, take it off.
    #	let's get the SQL definition:
    mkdir dump
    cd dump
    hgsqldump -c --tab=. braFlo1 chrUn_gap
    cd ..
    #	and, make the bin column 
    hgLoadBed -noLoad braFlo1 xyt newGaps.gap.table
    #	count before:
    hgsql -e "select count(*) from chrUn_gap;" braFlo1
    #	3031
    #	then, add this business to the existing table:
    hgLoadSqlTab -append braFlo1 chrUn_gap dump/chrUn_gap.sql bed.tab
    #	and after
    hgsql -e "select count(*) from chrUn_gap;" braFlo1
    #	94815
    #	that is the sum of old and new:
    wc -l gap.bed bed.tab
    #	 3031 gap.bed
    #	91784 bed.tab
    #	94815 total

#############################################################################
# GENBANK reload due to problems found by gbSanity (2008-05-09 markd)
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep3 -drop -initialLoad braFlo1&

#############################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
braFlo1.upstreamGeneTbl = xenoRefGene
braFlo1.upstreamMaf = multiz5way /hive/data/genomes/braFlo1/bed/multiz5way/species.list
