# for emacs: -*- mode: sh; -*-

# Cavia porcellus -- Broad Institute Release 3.0 (Feb 12 2008)

# Template from bosTau4.txt

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-04-02 - Tim and Kate)
    ssh kkstore05
    mkdir /cluster/store12/cavPor3
    ln -s /cluster/store12/cavPor3 /cluster/data
    mkdir /cluster/data/cavPor3/broad
    cd /cluster/data/cavPor3/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly.quals.gz

    # Not helpful, so deliberately NOT downloaded.  The whole command is kept
    # commented out: leaving "wget --timestamping \" active with only a
    # commented URL after it runs wget with no URL at all, which just fails.
    #wget --timestamping \
    #ftp://ftp.broad.mit.edu/pub/assemblies/mammals/guineaPig/cavPor3/assembly_supers.qual.gz

qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin cavPor3.qual.qac

########## From broad inst. cavPor3/BasicStats.out
# --------------------------------------------------------------------------------
# Sat Feb 02 03:52:11 2008 run (pid=21060), using Tue Jan 22 11:07:31 EST 2008 make
# BasicStats PRE=/wga/dev/WGAdata DATA=projects/Guineapig RUN=run/work           \
#            SUBDIR=post5 QUAL_STATS=True OUTFILE=BasicStats.out
# --------------------------------------------------------------------------------
# Supercontigs having < 3 reads or < 1kb sequence are ignored.
# 8 gaps <= -1000; 0 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below zero: 0.0222%
# 667 gaps > 10kb, 60 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
# 93.8% of reads were used in the assembly (95.55% of bases, 96.3% of Q20 bases)
# 0.00512% of reads were used multiply in the assembly
# 61603 contigs, having N50 length 80583
# total contig length: 2663352932, spanning 2722377657 bases (with 2.17% in gaps)
# 3143 supercontigs, having N50 length 27408292 (not including gaps)
# 2.48% of assembly in supers of size < 200000 (67399955 bases)
# Assembly base coverage: 6.79X.  Assembly Q20 coverage:  5.99X.
# 100% of bases have q >= 1
# 99.57% of bases have q >= 20
# 99.02% of bases have q >= 30
# 98.4% of bases have q >= 40
# 97.72% of bases have q >= 50

   # Count distinct scaffold names: take column 1 of the AGP, collapse each run
   # of identical adjacent names to one line, and count the lines.
   # (assumes all lines for one scaffold are adjacent in the AGP -- TODO confirm)
   cut -f 1 assembly.agp | uniq -c | wc -l
   # Number of scaffolds: 3143
   # Histogram of AGP lines per scaffold: count lines per name, keep only the
   # count field (first 8 characters of the "uniq -c" output), count how often
   # each line-count occurs, and show the three most frequent.
   cut -f 1 assembly.agp | uniq -c | sort -b -n -r | cut -c 1-8 | uniq -c | sort -n -r -b | head -3
   # Number of scaffolds with single contig: 2176 thus 967 multi-contig scaffolds




#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh kkstore05
    cd /cluster/data/cavPor3
cat << _EOF_ >cavPor3.config.ra
# Config parameters for makeGenomeDb.pl:
db cavPor3
clade mammal
genomeCladePriority 35
scientificName Cavia porcellus
commonName Guinea Pig
assemblyDate Feb. 2008
assemblyLabel Broad Institute cavPor3
orderKey 99
#mitoAcc AJ222767
mitoAcc 5679797
fastaFiles /cluster/data/cavPor3/broad/assembly.bases.gz
agpFiles /cluster/data/cavPor3/broad/assembly.agp
qualFiles /cluster/data/cavPor3/broad/cavPor3.qual.qac
dbDbSpeciesDir guineaPig
_EOF_

# use 'screen'; make sure you are on kkstore05
    makeGenomeDb.pl -verbose=2 cavPor3.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl-d' detaches from the screen session and returns to the previous shell
cut -f 2 chrom.sizes | ave stdin
# Q1 7169.500000
# median 13298.000000
# Q3 55788.500000
# average 866164.007952
# min 3002.000000
# max 88675666.000000
# count 3144
# total 2723219641.000000
# standard deviation 5316243.688364

	# NOTES -- STUFF THAT YOU WILL HAVE TO DO --
	#
	#
	# Template trackDb.ra and .html's have been created, but they all need editing!
	#
	# cd /cluster/data/cavPor3/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/guineaPig/cavPor3
	#
	# Search for '***' notes in each file and make corrections (sometimes the
	# files used for a previous assembly might make a better template):
	#  description.html /cluster/data/cavPor3/html/{trackDb.ra,gap.html,gold.html}
	#
	# Then cd ../.. (to trackDb/) and
	# - edit makefile to add cavPor3 to DBS.
	# - (if necessary) cvs add guineaPig
	# - cvs add guineaPig/cavPor3
	# - cvs add guineaPig/cavPor3/*.{ra,html}
	# - cvs ci -m "Added cavPor3 to DBS." makefile
	# - cvs ci -m "Initial descriptions for cavPor3." guineaPig/cavPor3
	# - (if necessary) cvs ci guineaPig
        cvs ci -m "Initial description page for browser gateway of Cavia porcellus (guinea pig)" description.html
        cvs ci -m "Initial description page of gap location track for Cavia porcellus (guinea pig)" gap.html
        cvs ci -m "Initial description page of assembly track for Cavia porcellus (guinea pig)" gold.html
	# - Run make update DBS=cavPor3 and make alpha when done.
	# - (optional) Clean up /cluster/data/cavPor3/TemporaryTrackDbCheckout
	# - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
		



#########################################################################
# REPEATMASKER (DONE - 2008-04-05 - Tim)
    ssh kkstore06
    screen # use a screen to manage this job
    mkdir /cluster/data/cavPor3/bed/repeatMasker
    cd /cluster/data/cavPor3/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/cavPor3/bed/repeatMasker \
        cavPor3 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh pk
    para time
    # CPU time in finished jobs:   26236391s  437273.18m  7287.89h  303.66d  0.832 y
    # IO & Wait Time:                548808s    9146.80m   152.45h    6.35d  0.017 y
    # Average job time:                4735s      78.91m     1.32h    0.05d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            5871s      97.85m     1.63h    0.07d
    # Submission to last job:        130217s    2170.28m    36.17h    1.51d


    time nice -n +19 featureBits cavPor3 rmsk > fb.cavPor3.rmsk.txt 2>&1 &
    # 732765485 bases of 2663369733 (27.513%) in intersection

    # RepeatMasker and lib version from do.log:
    #    RepeatMasker version development-$Id: cavPor3.txt,v 1.22 2010/04/21 19:22:27 hiram Exp $
    #    Jan 11 2008 (open-3-1-9) version of RepeatMasker
    #    CC   RELEASE 20071204;

    # Compare coverage to previous assembly:
    # skip this since cavPor 2 was never added to browser
    #featureBits cavPor2 rmsk

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-04-04 - Tim)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/cavPor3/bed/simpleRepeat
    cd /cluster/data/cavPor3/bed/simpleRepeat
    #
    doSimpleRepeat.pl -buildDir=/cluster/data/cavPor3/bed/simpleRepeat \
	cavPor3 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
    # Completed: 73 of 73 jobs
    # CPU time in finished jobs:      13220s     220.33m     3.67h    0.15d  0.000 y
    # IO & Wait Time:                   679s      11.32m     0.19h    0.01d  0.000 y
    # Average job time:                 190s       3.17m     0.05h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             397s       6.62m     0.11h    0.00d
    # Submission to last job:          1320s      22.00m     0.37h    0.02d

    featureBits cavPor3 simpleRepeat
    # 33641656 bases of 2663369733 (1.263%) in intersection
    #	after RM run is done, add this mask:
    cd /cluster/data/cavPor3
    twoBitMask cavPor3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed cavPor3.2bit

    twoBitToFa cavPor3.2bit stdout | faSize stdin
    #  2723219641 bases (59849908 N's 2663369733 real 1930234123 upper 733135610 lower) in 3144 sequences in 1 files
    #  Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298
    #  N count: mean 19036.2 sd 100029.3
    #  U count: mean 613942.2 sd 3764766.9
    #  L count: mean 233185.6 sd 1462738.3
    #  %26.92 masked total, %27.53 masked real
    # >>> NOTE: 2723219641 bases matches chrom.sizes above but not featureBits, which is 2663369733 (the non-N "real" base count)

    twoBitToFa cavPor3.rmsk.2bit stdout | faSize stdin
    # 2723219641 bases (59849908 N's 2663369733 real 1931053208 upper 732316525 lower) in 3144 sequences in 1 files
    # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298
    # N count: mean 19036.2 sd 100029.3
    # U count: mean 614202.7 sd 3766351.7
    # L count: mean 232925.1 sd 1461161.8
    # %26.89 masked total, %27.50 masked real

    #########################################################################
    # Link to it from /gbdb:  (DONE - 2008-04-10 - Tim)
    ln -s /cluster/data/cavPor3/cavPor3.2bit /gbdb/cavPor3/cavPor3.2bit

#########################################################################
# Create OOC file for genbank runs (DONE - 2008-04-07 - Tim)
# use same repMatch value as bosTau2
    ssh kkstore05
    cd /cluster/data/cavPor3
    blat cavPor3.2bit /dev/null /dev/null -tileSize=11 \
    	-makeOoc=cavPor3.11.1005.ooc -repMatch=1005
    # Wrote 26905 overused 11-mers to cavPor3.11.1005.ooc
    # -rw-rw-r--  1 tdreszer protein    107628 Apr  7 13:22 cavPor3.11.1005.ooc
    blat cavPor3.2bit /dev/null /dev/null -tileSize=11 \
	-makeOoc=cavPor3.11.ooc -repMatch=1024
    # Wrote 25815 overused 11-mers to cavPor3.11.ooc
    # -rw-rw-r--  1 tdreszer protein    103268 Apr  7 13:17 cavPor3.11.ooc

    ssh kkr1u00
    mkdir /iscratch/i/cavPor3
    cd /iscratch/i/cavPor3
    cp -p /cluster/data/cavPor3/cavPor3.2bit .
    # Copy the contents of this /iscratch directory to the same path on each
    # of the hosts kkr2u00 through kkr8u00.
    for nodeNum in 2 3 4 5 6 7 8; do
        rsync -a --progress ./ "kkr${nodeNum}u00:/iscratch/i/cavPor3/"
    done

#########################################################################
# Run WindowMasker because RepeatMasker coverage is low (DONE - 2008-04-08 - Tim)
    screen
    mkdir /cluster/data/cavPor3/bed/windowMasker
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/windowMasker
    nice doWindowMasker.pl -workhorse=kolossus \
        -buildDir=/cluster/data/cavPor3/bed/windowMasker cavPor3 > wmRun.log 2>&1 &

    #	load this initial data to get ready to clean it
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/windowMasker
    hgLoadBed cavPor3 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 14554107 elements of size 3

    #	eliminate the gaps from the masking
    featureBits cavPor3 -not gap -bed=notGap.bed
    # 2663369733 bases of 2663369733 (100.000%) in intersection
    featureBits cavPor3 windowmaskerSdust
    # 990884347 bases of 2663369733 (37.204%) in intersection

    time nice -n +19 featureBits cavPor3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    # 931034439 bases of 2663369733 (34.957%) in intersection

    #	reload track to get it clean
    hgLoadBed cavPor3 windowmaskerSdust cleanWMask.bed.gz
    # Loaded 14547726 elements of size 4
    featureBits cavPor3 windowmaskerSdust
    # 931034439 bases of 2663369733 (34.957%) in intersection

    #	mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../cavPor3.unmasked.2bit stdin \
	    -type=.bed cavPor3.cleanWMSdust.2bit
    twoBitToFa cavPor3.cleanWMSdust.2bit stdout | faSize stdin \
        > cavPor3.cleanWMSdust.faSize.txt
    cat cavPor3.cleanWMSdust.faSize.txt
    # 2723219641 bases (59849908 N's 2663369733 real 1732335294 upper 931034439 lower) in 3144 sequences in 1 files
    # Total size: mean 866164.0 sd 5317089.3 min 3002 (scaffold_3142) max 88675666 (scaffold_0) median 13298
    # N count: mean 19036.2 sd 100029.3
    # U count: mean 550997.2 sd 3547541.7
    # L count: mean 296130.5 sd 1686844.5
    # %34.19 masked total, %34.96 masked real

#########################################################################
#  Starting Genbank (Done - 2008-04-10 - Tim)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    # edit etc/genbank.conf to add the following entry:

# cavPor3 (C. porcellus) 3144 scaffolds 967 are multi-contig and 2176 are single contig
cavPor3.serverGenome = /cluster/data/cavPor3/cavPor3.2bit
cavPor3.clusterGenome = /iscratch/i/cavPor3/cavPor3.2bit
cavPor3.ooc = /cluster/data/cavPor3/cavPor3.11.ooc
cavPor3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
cavPor3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
cavPor3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
cavPor3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
cavPor3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
cavPor3.lift = no
cavPor3.downloadDir = cavPor3
cavPor3.perChromTables = no
cavPor3.refseq.mrna.native.load = no
cavPor3.refseq.mrna.xeno.load = yes
cavPor3.genbank.mrna.xeno.load = yes
cavPor3.genbank.est.native.load = yes

    cvs ci -m "Added cavPor3." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update
    cvs ci -m "Turned off native and on xeno." etc/genbank.conf

    # Edit src/lib/gbGenome.c to add new species.
    # static char *cavPorNames[] = {"Cavia porcellus", NULL};
    # static struct dbToSpecies dbToSpeciesMap[] = { ...>>> {"cavPor", cavPorNames},

    cvs ci -m "Added guinea pig." src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.
    time nice -n +19 bin/gbAlignStep -initial cavPor3 &
    #	logFile: var/build/logs/2008.03.10-14:14:43.cavPor3.initalign.log
    #	real    567m7.431s
    # > For comparison:  logFile: var/build/logs/2008.04.08-14:37:23.bosTau4.initalign.log
    # >                  real    45m6.595s
    # Batch failed after 4 tries on /cluster/genbank/genbank/bin/gbBlat genbank.164.0/cavPor3/full/psl/est.eb.native.1/scaffold_41/scaffold_41.job genbank.164.0/cavPor3/full/psl/est.eb.native.1/scaffold_41/scaffold_41.psl
    # command failed: ssh -x kk cd /cluster/data/genbank/build/work/initial.cavPor3/align\; para make -maxPush=200000 align.jobs </dev/null at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 238. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 238.
    # command failed: gbAlignRun -workdir=/cluster/data/genbank/build/work/initial.cavPor3/align  at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 238. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 238.
    #[1]+  Exit 2                  time nice -n +19 bin/gbAlignStep -initial cavPor3
    ## Determined the machine kkr4u58 was not answering ssh requests and all failed jobs were
    ## on that machine:
    parasol machine remove kkr4u58.kilokluster.ucsc.edu
    para push -retries=6
    ### After manual restart of the para jobsList:
    #Checking finished jobs
    #Completed: 21128 of 21128 jobs
    #CPU time in finished jobs:   15970173s  266169.54m  4436.16h  184.84d  0.506 y
    #IO & Wait Time:                685589s   11426.49m   190.44h    7.94d  0.022 y
    #Average job time:                 788s      13.14m     0.22h    0.01d
    #Longest running job:                0s       0.00m     0.00h    0.00d
    #Longest finished job:            1581s      26.35m     0.44h    0.02d
    #Submission to last job:        135502s    2258.37m    37.64h    1.57d
    #Estimated complete:                 0s       0.00m     0.00h    0.00d
    ####### Once parasol jobs are finished (handholding) run:
    time nice -n +19 bin/gbAlignStep -initial -continue=finish cavPor3 &
    # logFile: var/build/logs/2008.04.10-09:11:21.cavPor3.initalign.log
    # real    97m14.229s
    # gbAlignInstall: complete: real=2.02
    # genbank 2008.04.10-10:48:36 cavPor3.initalign: finish

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad cavPor3 &
    #	logFile: var/dbload/hgwdev/logs/2008.04.10-11:25:04.dbload.log
    #	real    16m25.416s

    # enable daily alignment and update of hgwdev (DONE - 2008-03-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add cavPor3 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added cavPor3." etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################
#  REstarting Genbank (beg: 2008-05-22   Done: 2008-05-23 - Tim)

    ### QA found empty refLink table which led MarkD to determine that in genbank.conf cavPor3 used default params
    ### which were not appropriate.  The last 4 lines listed below were added, and genbank started again
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    # edit etc/genbank.conf to add the following entry:

# cavPor3 (C. porcellus) 3144 scaffolds 967 are multi-contig and 2176 are single contig
cavPor3.serverGenome = /cluster/data/cavPor3/cavPor3.2bit
cavPor3.clusterGenome = /iscratch/i/cavPor3/cavPor3.2bit
cavPor3.ooc = /cluster/data/cavPor3/cavPor3.11.ooc
cavPor3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
cavPor3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
cavPor3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
cavPor3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
cavPor3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
cavPor3.lift = no
cavPor3.downloadDir = cavPor3
cavPor3.perChromTables = no
cavPor3.refseq.mrna.native.load = no
cavPor3.refseq.mrna.xeno.load = yes
cavPor3.genbank.mrna.xeno.load = yes
cavPor3.genbank.est.native.load = yes
    ### The last 4 lines listed above were added, and genbank started again

    # update /cluster/data/genbank/:
    make etc-update
    cvs ci -m "Turned off native and on xeno." etc/genbank.conf

    # Edit src/lib/gbGenome.c to add new species.
    # static char *cavPorNames[] = {"Cavia porcellus", NULL};
    # static struct dbToSpecies dbToSpeciesMap[] = { ...>>> {"cavPor", cavPorNames},

    #cvs ci -m "Added guinea pig." src/lib/gbGenome.c
    make install-server

    ### Mark Diekhans recommends:
    # You are not really realigning, but aligning additional.  If you
    # just kick off an initial alignment, it *should* align the
    # missing partitions of the data.  However, sometimes weird things
    # happen when a new, full release is made, and a new refseq just
    # came out.  So kick off the alignment and let me know when it's
    # done and I can take a look.
    #
    # Also, to just reload everything, include the -drop flag on your
    # command line.  That is usually easiest; why waste your time
    # figuring out weird cases when the computer can just do the work.

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.
    time nice -n +19 bin/gbAlignStep -initial cavPor3 &
    # logFile: var/build/logs/2008.05.22-15:44:34.cavPor3.initalign.log
    # real    182m21.821s
    # gbAlignInstall: complete: real=2.35
    # genbank 2008.05.22-18:46:56 cavPor3.initalign: finish

    # Mark Diekhans: Looks good, load it.

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad cavPor3 &
    #   logFile: var/dbload/hgwdev/logs/2008.05.23-11:39:52.dbload.log
    #   real    17m53.123s

############################################################################
#	DONE - 2008-04-10 - Tim
#	Reset default position to Math1=ATOH1=NM_005172=scaffold_15:19047435-19048046
    hgsql -e \
'update dbDb set defaultPos="scaffold_15:19047434-19048046" where name="cavPor3";' \
	hgcentraltest
    #	And there was a mistake in the description date entry
    hgsql -e \
'update dbDb set description="Feb. 2008" where name="cavPor3";' \
	hgcentraltest

#########################################################################
## genscan run (DONE - 2008-04-10 - Tim)
##	create hard masked sequence
    ssh kkstore05
    cd /cluster/data/cavPor3
    mkdir hardMasked
    # Write one hard-masked FASTA file per sequence named in column 1 of
    # chrom.sizes: extract the sequence from cavPor3.2bit, pipe it through
    # "maskOutFa ... hard", and list the result so progress is visible.
    # Names are word-split on purpose (one sequence name per word); all
    # expansions of ${C} are quoted so a name can never be re-split or globbed.
    for C in $(cut -f1 chrom.sizes); do
        echo "hardMasked/${C}.hard.fa"
        twoBitToFa -seq="${C}" cavPor3.2bit stdout \
            | maskOutFa stdin hard "hardMasked/${C}.hard.fa"
        ls -ld "hardMasked/${C}.hard.fa"
    done

    #	And, make sure there aren't any sequences in this lot that have
    #	become all N's with no sequence left in them.  This drives genscan nuts
    echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt

    #	the lowest three are:
    # Print "<name> <column 2 minus column 7>" for every sequence row of
    # faCount.hard.txt (header and total rows excluded), sorted descending,
    # and show the three smallest values.
    # (presumably column 2 is length and column 7 is the N count, so the value
    #  is the number of non-N bases -- confirm against faCount output header)
    egrep -v "^#|^total" faCount.hard.txt \
        | awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
    # scaffold_2949 425
    # scaffold_3101 357
    # scaffold_3130 0
    # scaffold_3130 has 0 left, so remove its file before the genscan run.
    # (directory name corrected: it was misspelled "harMasked", which would
    #  not have matched the hardMasked/ directory created above)
    rm hardMasked/scaffold_3130.hard.fa
    ##### Saving this junk from bosTau4.txt because it will be useful next time
    ##  #	There are a whole bunch of these, and many with just a few bases.
    ##  #	Actually, before removing these for genscan, run the cpgIsland
    ##  #	business first since it can work on them all.
    ##  #	So, remove any with less than 100 bases of sequence
    ##  egrep -v "^#|^total" faCount.hard.txt | awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' | xargs rm

    #	now get them over to a kluster location
    mkdir /san/sanvol1/scratch/cavPor3/hardChunks
    cd /san/sanvol1/scratch/cavPor3/hardChunks
    #	creating 4,000,000 sized chunks, the chroms stay together as
    #	single pieces.  The contigs get grouped together into 4,000,000
    #	sized fasta files.  You don't want to break these things up
    #	because genscan will be doing its own internal 2.4 million
    #	window on these pieces, and the gene names are going to be
    #	constructed from the sequence name in these fasta files.
    echo /cluster/data/cavPor3/hardMasked/*.hard.fa | xargs cat \
	| faSplit about stdin ls c_

    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/genscan
    cd /cluster/data/cavPor3/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh memk
    cd /cluster/data/cavPor3/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_*.fa > genome.list

    # Create run-time script to operate gsBig in a cluster safe manner
    cat << \_EOF_ > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
_EOF_
    # << happy emacs
    chmod +x runGsBig
    cat << \_EOF_ > template
#LOOP
runGsBig /san/sanvol1/scratch/cavPor3/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
_EOF_
    # << happy emacs

    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...

    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] para check
    ##172 jobs in batch
    ##13 jobs (including everybody's) in Parasol queue.
    ##Checking finished jobs
    ##crashed: 7
    ##ranOk: 165
    ##total jobs in batch: 172
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] para status | grep -v done
    ##172 jobs in batch
    ##13 jobs (including everybody's) in Parasol queue.
    ##Checking finished jobs
    ###state  tries   real    cpu     host    jobid   cmd
    ##crash   1       60.00   55.30   mkr0u0  825047  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_19.fa gtf/c_19.gtf pep/c_19.pep subopt/c_19.bed
    ##crash   1       215.00  213.48  mkr0u6  825117  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_152.fa gtf/c_152.gtf pep/c_152.pep subopt/c_152.bed
    ##crash   1       97.00   96.01   mkr0u0  825153  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_111.fa gtf/c_111.gtf pep/c_111.pep subopt/c_111.bed
    ##crash   1       149.00  146.78  mkr0u0  825158  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_107.fa gtf/c_107.gtf pep/c_107.pep subopt/c_107.bed
    ##crash   1       638.00  636.81  mkr0u3  825178  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_71.fa gtf/c_71.gtf pep/c_71.pep subopt/c_71.bed
    ##crash   1       904.00  901.75  mkr0u6  825185  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_32.fa gtf/c_32.gtf pep/c_32.pep subopt/c_32.bed
    ##crash   1       1238.00 1232.05 mkr0u6  825196  runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_01.fa gtf/c_01.gtf pep/c_01.pep subopt/c_01.bed
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] para problems | grep Insuff
    ##Insufficient memory error: results may be unreliable.
    ##Insufficient memory error: results may be unreliable.
    ##Insufficient memory error: results may be unreliable.
    ##Insufficient memory error: results may be unreliable.
    ##Insufficient memory error: results may be unreliable.
    ##Insufficient memory error: results may be unreliable.
    ##Insufficient memory error: results may be unreliable.
    ####### However, these are not simply the largest files:
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_19.fa
    ##-rw-rw-r--  1 tdreszer protein 4408036 Apr  8 15:39 /san/sanvol1/scratch/cavPor3/hardChunks/c_19.fa
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_152.fa
    ##-rw-rw-r--  1 tdreszer protein 8304826 Apr  8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_152.fa
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_111.fa
    ##-rw-rw-r--  1 tdreszer protein 18155094 Apr  8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_111.fa
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_107.fa
    ##-rw-rw-r--  1 tdreszer protein 20080719 Apr  8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_107.fa
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_71.fa
    ##-rw-rw-r--  1 tdreszer protein 36860211 Apr  8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_71.fa
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_32.fa
    ##-rw-rw-r--  1 tdreszer protein 49549701 Apr  8 15:40 /san/sanvol1/scratch/cavPor3/hardChunks/c_32.fa
    ##[tdreszer@memk /cluster/data/cavPor3/bed/genscan] ll /san/sanvol1/scratch/cavPor3/hardChunks/c_01.fa
    ##-rw-rw-r--  1 tdreszer protein 82754438 Apr  8 15:39 /san/sanvol1/scratch/cavPor3/hardChunks/c_01.fa
    # Okay, try splitting the failed c_*.fa files into individual sequences (rt = retry)
    ssh memk
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    faSplit sequence c_19.fa 1000 c_rt19_
    ## Wrong one!   faSplit sequence c_151.fa 1000 c_rt151_
    faSplit sequence c_111.fa 1000 c_rt111_
    faSplit sequence c_107.fa 1000 c_rt107_
    faSplit sequence c_71.fa 1000 c_rt71_
    faSplit sequence c_32.fa 1000 c_rt32_
    faSplit sequence c_01.fa 1000 c_rt01_
    popd # cd /cluster/data/cavPor3/bed/genscan
    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt*.fa > genome_rt.list

    gensub2 genome_rt.list single template jobList_rt
    para create jobList_rt
    para try, check, push, check, ...

    # Checking finished jobs
    # crashed: 6
    # ranOk: 71
    # total jobs in batch: 77
    ## missed one:
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    faSplit sequence c_152.fa 1000 c_rt152_
    popd # cd /cluster/data/cavPor3/bed/genscan
    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt152*.fa > genome_rt2.list

    gensub2 genome_rt2.list single template jobList_rt2
    para create jobList_rt2
    para try, check, push, check, ...
    # Checking finished jobs
    # crashed: 1
    # ranOk: 10
    # total jobs in batch: 11

    ######## After breaking failed chunks by sequence, the 7 culprit scaffolds are known:
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt19_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt151_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt111_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt107_0010.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt71_0025.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt32_0005.fa
    # faCount /san/sanvol1/scratch/cavPor3/hardChunks/c_rt01_0000.fa
    #seq                len       A        C        G        T        N      cpg
    #scaffold_115     4065651   736071   696850   700707   740890  1191133  60230
    #scaffold_81      7649460  1502224  1297711  1285486  1482474  2081565  87359
    #scaffold_45     15715546  2926604  2362405  2351615  2933678  5141244 126807
    #scaffold_41     16993173  3764581  2421095  2432646  3805601  4569250 103547
    #scaffold_21     33939536  7356221  4629107  4649855  7388452  9915901 190985
    #scaffold_13     48363610 10226185  7754898  7730208 10253556 12398763 432844
    #scaffold_1      81131790 17393288 12255795 12237937 17366862 21877908 561145

    ### Punting.  Hiram suggests that we just wait till the alignments fill us in.

    # At this point, all scaffolds have been successfully genscanned EXCEPT the 7 listed above

    # Concatenate the per-chunk genscan results into single files, sorted by
    # sequence name and then numerically by start coordinate (GTF field 4,
    # bed field 2).  The GTF key is written -k4,4n: the form -k4.4n would
    # start the key at the 4th character of field 4 rather than at the field.
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/genscan
    sort -k1,1 -k4,4n gtf/c_*.gtf > genscan.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/genscan
    ldHgGene cavPor3 -gtf genscan genscan.gtf
    # Read 64818 transcripts in 416957 lines in 1 files
    # 64818 groups 2179 seqs 1 sources 1 feature types
    # 64818 gene predictions

    hgPepPred cavPor3 generic genscanPep genscan.pep
    hgLoadBed cavPor3 genscanSubopt genscanSubopt.bed
    #	Loaded 537376 elements of size 6

    #	let's check the numbers
    time nice -n +19 featureBits cavPor3 genscan
    #   76489707 bases of 2663369733 (2.872%) in intersection

#########################################################################
## genscan RErun (BEG: 2008-05-22  DONE: 2008-05-22 - Tim)

### ### QA found missing genscan data, so retrying the unlucky 7 by splitting on gaps
    ssh memk
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    faSplit gap c_rt01_0000.fa 1000000 c_rt_01_0000_gap_
    faSplit gap c_rt32_0005.fa 1000000 c_rt_32_0005_gap_
    faSplit gap c_rt71_0025.fa 1000000 c_rt_71_0025_gap_
    faSplit gap c_rt107_0010.fa 1000000 c_rt_107_0010_gap_
    faSplit gap c_rt111_0010.fa 1000000 c_rt_111_0010_gap_
    faSplit gap c_rt151_0010.fa 1000000 c_rt_151_0010_gap_
    faSplit gap c_rt19_0010.fa 1000000 c_rt_19_0010_gap_
    popd # cd /cluster/data/cavPor3/bed/genscan
    ls -1Sr /san/sanvol1/scratch/cavPor3/hardChunks/c_rt*_gap_*.fa > genome_gap.list

    gensub2 genome_gap.list single template jobList_gap
    para create jobList_gap
    para try, check, push, check, ...
    # 219 jobs in batch
    # 52852 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # .......
    # ranOk: 219
    # total jobs in batch: 219

    # Concatenate the per-chunk genscan results (now including the gap-split
    # retries) into single *_full files, sorted by sequence name and then
    # numerically by start coordinate (GTF field 4, bed field 2).  The GTF key
    # is written -k4,4n: -k4.4n would start at the 4th character of field 4.
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/genscan
    sort -k1,1 -k4,4n gtf/c_*.gtf > genscan_full.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt_full.bed
    cat pep/c_*.pep > genscan_full.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/genscan
    ldHgGene cavPor3 -gtf genscan genscan_full.gtf
    # Read 70478 transcripts in 454271 lines in 1 files
    # 70478 groups 2398 seqs 1 sources 1 feature types
    # 70478 gene predictions

    hgPepPred cavPor3 generic genscanPep genscan_full.pep
    hgLoadBed cavPor3 genscanSubopt genscanSubopt_full.bed
    #   Loaded 605836 elements of size 6

    #   let's check the numbers
    time nice -n +19 featureBits cavPor3 genscan
    #   76489707 bases of 2663369733 (2.872%) in intersection

### ### ### Total defeat.  faSplit on gaps results in new sequence names that do not match the known scaffold_nnnn sequences.

    # Clean up mess:
    cd /cluster/data/cavPor3/bed/genscan
    pushd /san/sanvol1/scratch/cavPor3/hardChunks
    rm c_rt_*_gap_*.fa
    popd # cd /cluster/data/cavPor3/bed/genscan
    rm gtf/c_rt_*_gap_*.gtf
    rm pep/c_rt_*_gap_*.pep
    rm subopt/c_rt_*_gap_*.bed

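    # Try the seven culprits one more time.
    # NOTE(review): the chunk that originally crashed was c_152 (re-split as
    #	c_rt152_*), yet this list names c_rt151_0010.fa, which appears to come
    #	from the "Wrong one!" split of c_151 and finishes on its first try
    #	below.  The final load gains only 6 seqs (2179 -> 2185), so scaffold_81
    #	may still be missing from genscan -- confirm, and if so rerun the
    #	crashed c_rt152_* file.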
cat << \_EOF_ > genome_oneMoreTime.list
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt19_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt151_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt111_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt107_0010.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt71_0025.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt32_0005.fa
/san/sanvol1/scratch/cavPor3/hardChunks/c_rt01_0000.fa
_EOF_

    ssh memk
    cd /cluster/data/cavPor3/bed/genscan
    gensub2 genome_oneMoreTime.list single template jobList_oneMoreTime
    para create jobList_oneMoreTime
    para push, check, ...
    #  para status
    # 7 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # #state  tries   real    cpu     host    jobid   cmd
    # crash   1       58.00   54.08   mkr0u5  1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed
    # done    1       228.00  224.16  mkr0u6  1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed
    # crash   1       57.00   52.47   mkr0u3  1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed
    # crash   1       106.00  99.41   mkr0u7  1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed
    # crash   1       617.00  602.73  mkr0u5  1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed
    # crash   1       922.00  896.75  mkr0u3  1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed
    # crash   1       1307.00 1258.05 mkr0u7  1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed

    # One down, 6 to go.  Try lowering window size on gsBig

    # Keep the 2.4 million window version as runGsBig.24 and write a new
    # runGsBig that is identical except for -window=1200000.
    #	usage: runGsBig <srcDir> <inFile>
    mv runGsBig runGsBig.24
    cat << \_EOF_ > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
# With -e a failed gsBig aborts before the final rm below, leaving this
# directory behind on the node; clear it so a retry does not die in mkdir.
rm -fr /scratch/tmp/$fileRoot
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=1200000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
_EOF_
    # << happy emacs
    chmod +x runGsBig

    para push, check, ...
    #  para status
    # 7 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # #state  tries   real    cpu     host    jobid   cmd
    # done    2       58.00   54.08   mkr0u5  1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed
    # done    1       228.00  224.16  mkr0u6  1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed
    # crash   2       57.00   52.47   mkr0u3  1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed
    # done    2       106.00  99.41   mkr0u7  1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed
    # done    2       617.00  602.73  mkr0u5  1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed
    # crash   2       922.00  896.75  mkr0u3  1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed
    # done    2       1307.00 1258.05 mkr0u7  1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed

    # All but two!

    para push, check, ...
    #  para status
    # 7 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # #state  tries   real    cpu     host    jobid   cmd
    # done    2       58.00   54.08   mkr0u5  1182093 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt19_0010.fa gtf/c_rt19_0010.gtf pep/c_rt19_0010.pep subopt/c_rt19_0010.bed
    # done    1       228.00  224.16  mkr0u6  1182094 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt151_0010.fa gtf/c_rt151_0010.gtf pep/c_rt151_0010.pep subopt/c_rt151_0010.bed
    # done    3       57.00   52.47   mkr0u3  1182095 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt111_0010.fa gtf/c_rt111_0010.gtf pep/c_rt111_0010.pep subopt/c_rt111_0010.bed
    # done    2       106.00  99.41   mkr0u7  1182096 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt107_0010.fa gtf/c_rt107_0010.gtf pep/c_rt107_0010.pep subopt/c_rt107_0010.bed
    # done    2       617.00  602.73  mkr0u5  1182097 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt71_0025.fa gtf/c_rt71_0025.gtf pep/c_rt71_0025.pep subopt/c_rt71_0025.bed
    # done    3       922.00  896.75  mkr0u3  1182098 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt32_0005.fa gtf/c_rt32_0005.gtf pep/c_rt32_0005.pep subopt/c_rt32_0005.bed
    # done    2       1307.00 1258.05 mkr0u7  1182099 runGsBig /san/sanvol1/scratch/cavPor3/hardChunks c_rt01_0000.fa gtf/c_rt01_0000.gtf pep/c_rt01_0000.pep subopt/c_rt01_0000.bed
    ### All completed!

    # Concatenate all per-chunk genscan results, including the re-run
    # culprits, into single *_full files, sorted by sequence name and then
    # numerically by start coordinate (GTF field 4, bed field 2).  The GTF key
    # is written -k4,4n: -k4.4n would start at the 4th character of field 4.
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/genscan
    sort -k1,1 -k4,4n gtf/c_*.gtf > genscan_full.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt_full.bed
    cat pep/c_*.pep > genscan_full.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/genscan
    ldHgGene cavPor3 -gtf genscan genscan_full.gtf
    # Read 70161 transcripts in 453022 lines in 1 files
    # 70161 groups 2185 seqs 1 sources 1 feature types
    # 70161 gene predictions

    hgPepPred cavPor3 generic genscanPep genscan_full.pep
    hgLoadBed cavPor3 genscanSubopt genscanSubopt_full.bed
    #   Loaded 603786 elements of size 6

    #   let's check the numbers
    time nice -n +19 featureBits cavPor3 genscan
    #   82960328 bases of 2663369733 (3.115%) in intersection


#########################################################################
# CPGISLANDS (DONE - 2008-04-10 - Tim)
    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/cpgIsland
    cd /cluster/data/cavPor3/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	There was a problem in here, in both cpg.c and cpg_lh.c:
    #   cpg_lh.c:74: warning: conflicting types for built-in function 'malloc'
    #	warning: conflicting types for built-in function 'malloc'
    #	commented out line 74 ONLY to get this to build
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .

    # There may be warnings about "bad character" for IUPAC ambiguous
    # characters like R, S, etc.  Ignore the warnings.
    mkdir results
    # Run cpglh.exe over every hard-masked fasta; an input X.hard.fa yields
    # results/X.cpg.  The glob is iterated directly (no echo | sed splitting)
    # and the expansions are quoted so unusual file names stay intact.  The
    # whole loop runs in the background with its messages going to cpglh.out.
    for F in ../../hardMasked/*.hard.fa
    do
	FA=${F##*/}
	C=${FA%.hard.fa}
	echo "./cpglh.exe ${FA} > results/${C}.cpg"
	nice -n +19 ./cpglh.exe "${F}" > "results/${C}.cpg"
    done > cpglh.out 2>&1 &
    #	about 5 minutes

    # Write filter.awk, which reformats each cpglh output line as one
    # tab-separated bed+ line.
    cat << \_EOF_ > filter.awk
# The start ($2) is lowered by one; len is the end ($3) minus that
# lowered start.  Fields 5 and 6 are joined by a blank into one column.
{
    start = $2 - 1;
    len = $3 - start;
    printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
	   $1, start, $3, $5, $6, len,
	   $6, len*$7*0.01, 100.0*2*$6/len, $7, $9);
}
_EOF_
    #	<< happy emacs
    catDir results | awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/cavPor3/bed/cpgIsland
    hgLoadBed cavPor3 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Loaded 36373 elements of size 10
    featureBits cavPor3 cpgIslandExt
    #  21746514 bases of 2663369733 (0.817%) in intersection

#########################################################################
# READ before BLASTZ: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
#########################################################################
# BLASTZ/CHAIN/NET Hg18-cavPor3 (START - 2008-04-10 - Tim; DONE 2008-04-15)
    ssh kkstore02 # store11->kkstore02-10
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/hg18/bed/blastzCavPor3.2008-04-10
    cd /cluster/data/hg18/bed/
    ln -s blastzCavPor3.2008-04-10 blastz.cavPor3
    cd blastzCavPor3.2008-04-10

    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy

    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -syntenicNet > do.log 2>&1
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    # real    1320m35.523s
    # HOWEVER, doBlastzChainNet failed because the run.blastz/para.results were corrupt.
    cd /cluster/data/hg18/bed/blastzCavPor3.2008-04-10/run.blastz
    para recover jobList jobListRecovered
    # wc -l jobListRecovered -> 9
    para create jobListRecovered
    para push
    # completed all jobs.  Now:
    ssh kkstore02
    screen -r -d
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	   -continue=cat -syntenicNet > do.log 2>&1 &
    #real    210m38.768s # low because of the restart that had to occur
    tail do.log
    # *** All done!
    # *** Make sure that goldenPath/hg18/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.

    cat fb.hg18.chainCavPor3Link.txt
    #   1267036494 bases of 2881515245 (43.971%) in intersection

    ######### Change locations in DEF due to Hiram's new methods
    cd /cluster/data/hg18/bed/blastzCavPor3.2008-04-10
    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_


    mkdir /cluster/data/cavPor3/bed/blastz.hg18.swap
    ssh kkstore05 # where cavPor3 is located

    screen
    cd /cluster/data/cavPor3/bed/blastz.hg18.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/hg18/bed/blastzCavPor3.2008-04-10/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -syntenicNet > do.log 2>&1
    # real    216m13.808s
    # chainSplit chain cavPor3.hg18.all.chain.gz
    # sh: pipe error: Too many open files
    # Can't open chain/scaffold_621.chain to append: Too many open files

    #	broken down during netSynteny.csh due to too many open files on
    #	a chainSplit
    #	However, there is no need to split when we have scaffolds.
    #   Kate fixes doBlastzChainNet.pl and retry:

    cd /cluster/data/cavPor3/bed/blastz.hg18.swap
    time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	/cluster/data/hg18/bed/blastzCavPor3.2008-04-10/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -continue=syntenicNet -syntenicNet > do.log 2>&1 &
    # real     29m49.617s
    # *** All done!
    # *** Make sure that (/usr/local/apache/htdocs/goldenPath/)cavPor3/vsHg18/README.txt is accurate.
    # *** Add {chain,net}Hg18 tracks to trackDb.ra if necessary.

    cat fb.cavPor3.chainHg18Link.txt
    # 1281925834 bases of 2663369733 (48.132%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET mm9-cavPor3 (START - 2008-04-10 - DONE 2008-04-14 - Tim)
    ssh kkstore06 # store4->kkstore04-10
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/mm9/bed/blastzCavPor3.2008-04-10
    cd /cluster/data/mm9/bed/
    ln -s blastzCavPor3.2008-04-10 blastz.cavPor3
    cd blastzCavPor3.2008-04-10

    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy

    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -syntenicNet > do.log 2>&1
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    # real    1764m55.155s
    tail do.log
    #  *** All done!
    # *** Make sure that goldenPath/mm9/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.
    # blastz: ranOk: 52984
    cat fb.mm9.chainCavPor3Link.txt
    #    757283793 bases of 2620346127 (28.900%) in intersection

    cd /cluster/data/mm9/bed/blastzCavPor3.2008-04-10
    ######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.mm9.swap
    cd /cluster/data/cavPor3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -syntenicNet > do.log 2>&1 &
    # real   166m53.671s
    # Exit 25   time nice -n +19 doBlastzChainNet.pl /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF -bigC
    # Can't open chain/scaffold_795.chain to append: Too many open files
    # gzip: stdout: Broken pipe
    # Command failed:
    # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastz.mm9.swap/axtChain/netSynteny.csh

    #	broken down during netSynteny.csh due to too many open files on
    #	a chainSplit
    #	However, there is no need to split when we have scaffolds.
    #   Kate fixes doBlastzChainNet.pl and retry:

    cd /cluster/data/cavPor3/bed/blastz.mm9.swap
    time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	/cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -continue=syntenicNet -syntenicNet > syn.log 2>&1 &
    # real    24m37.561s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsMm9/README.txt is accurate.
    # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary.
    cat fb.cavPor3.chainMm9Link.txt
    # 781173609 bases of 2663369733 (29.330%) in intersection


#########################################################################
# BLASTZ/CHAIN/NET galGal3-cavPor3 (START - 2008-04-10; DONE: 2008-04-15 - Tim)
    ssh kkstore03 # store6->kkstore03-10
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
    cd /cluster/data/galGal3/bed/
    ln -s blastzCavPor3.2008-04-10 blastz.cavPor3
    cd blastzCavPor3.2008-04-10

    cat << \_EOF_ > DEF
BLASTZ_M=50
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/cluster/bluearc/scratch/data/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy

    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	    -syntenicNet > do.log 2>&1
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    # real    1507m5.132s
    tail do.log
    # *** All done!
    # *** Make sure that goldenPath/galGal3/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.

    cat fb.galGal3.chainCavPor3Link.txt
    #   106239838 bases of 1042591351 (10.190%) in intersection

    cd /cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
    ######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.galGal3.swap

    cd /cluster/data/cavPor3/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzCavPor3.2008-04-10/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	    -swap > do.log 2>&1
    # real    23m19.970s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsGalGal3/README.txt is accurate.
    # *** Add {chain,net}GalGal3 tracks to trackDb.ra if necessary.

    cat fb.cavPor3.chainGalGal3Link.txt
    # 144795360 bases of 2663369733 (5.437%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET cavPor3-oryCun1 (RESTART - 2008-04-11 - DONE - 2008-04-22 Tim)
    ssh kkstore05 # store12->kkstore05
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    cd /cluster/data/cavPor3/bed/
    ln -s blastzOryCun1.2008-04-11 blastz.oryCun1
    cd blastzOryCun1.2008-04-11

    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: GuineaPig cavPor3
SEQ1_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ1_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rabbit oryCun1
SEQ2_DIR=/cluster/bluearc/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/oryCun1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy

    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	   -stop=partition > do.log 2>&1 &
    ## used stop, then `wc -l run.blastz/cavPor3.lst` * `wc -l run.blastz/oryCun1.lst`
    ## to find out size based upon DEF SEQn_CHUNK and SEQn_LIMIT then:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	   -continue=blastz > do.log 2>&1 &
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    #real    2818m19.825s
    tail do.log
    # Loading 22976113 chains into cavPor3.chainOryCun1
    # Can't start query:
    # load data local infile 'link.tab' into table chainOryCun1Link
    # mySQL error 1114: The table 'chainOryCun1Link' is full
    # Command failed:
    # ssh -x hgwdev nice /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain/loadUp.csh

    # mysql> select count(*) from chainOryCun1Link; > 119,306,828   mm9-oryCun1 has ~40m spread out among ~20 chr tbls
    # featureBits cavPor3 chainOryCun1Link
    # 439282179 bases of 2663369733 (16.493%) in intersection
    # Options:
    # 1) combine small scaffolds into a scaffoldUn
    #    Have to restart all blastzs  Awkward since not clear dividing line between scaffold/superscaffold
    # 2) Do not have the raw chains available at all.  Just have reciprocal best
    # 3) Go back to the idea of greater masking.

    ## [tdreszer@hgwdev /cluster/data/cavPor3] textHistogram -col=2 -binSize=40000000 chrom.sizes
    ##   0 ************************************************************ 3126
    ## 40000000  16
    ## 80000000  2
    ## [tdreszer@hgwdev /cluster/data/cavPor3] textHistogram -col=2 -binSize=10000000 chrom.sizes
    ##  0 ************************************************************ 3079
    ## 10000000 * 26
    ## 20000000  14
    ## 30000000  7
    ## 40000000  5
    ## 50000000  4
    ## 60000000  5
    ## 70000000  2
    ## 80000000  2
    ##################### Hiram recommends loading all 210 mil anyway with special table create to override row limit

    ###################################################################
    #	this failed during the load because the chainLink table became
    #	too large.  These were loaded manually with an sql statement to
    #	start the table definition, then a load data local infile using
    #	the .tab files left over from the failed load.  Note the extra
    #	definitions on the chainOryCun1Link table
    time nice -n +19 hgsql -e \
      "DROP TABLE chainOryCun1Link;" cavPor3 &

    CREATE TABLE chainOryCun1Link (
      bin smallint(5) unsigned NOT NULL default 0,
      tName varchar(255) NOT NULL default '',
      tStart int(10) unsigned NOT NULL default 0,
      tEnd int(10) unsigned NOT NULL default 0,
      qStart int(10) unsigned NOT NULL default 0,
      chainId int(10) unsigned NOT NULL default 0,
      KEY tName (tName(16),bin),
      KEY chainId (chainId)
    ) ENGINE=MyISAM max_rows=220000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;

    time nice -n +19 hgsql -e \
      "load data local infile \"link.tab\" into table chainOryCun1Link;" cavPor3
    #	this one took a number of hours
    # real    272m44.943s

    #	finish the nets and load
    # Add gap/repeat stats to the net file using database tables:
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain
    netClass -verbose=0   -noAr noClass.net cavPor3 oryCun1 cavPor3.oryCun1.net

    # Load nets:
    time nice -n +19 netFilter -minGap=10 cavPor3.oryCun1.net \
      | hgLoadNet -verbose=0 cavPor3 netOryCun1 stdin > netFilter.out 2>&1 &
    # real    6m27.050s

    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    time nice -n +19 featureBits cavPor3 chainOryCun1Link >&fb.cavPor3.chainOryCun1Link.txt &
    #real    61m59.509s

    cat fb.cavPor3.chainOryCun1Link.txt
    #   752079320 bases of 2663369733 (28.238%) in intersection

    ###################################################################
    ### doReciprocalbest.pl  look in mm9.txt with oryCun1
    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
	> rbest.log 2>&1 &
    # failed due to experiments dir being in the way
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/axtChain
    mv experiments old.experiments

    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
	> rbestResume.log 2>&1 &
    # hung for some unknown reason.  Starting again
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
	>> rbestAgain.log 2>&1 &
    # real    37m33.221s
    # HgStepManager: executing step 'download' Fri Apr 18 10:33:31 2008.
    # download: output of previous step recipBest, /usr/local/apache/htdocs/goldenPath/cavPor3/vsOryCun1 , is required but does not appear to exist.
    # If it actually does exist, then this error is probably due to network/filesystem delays -- wait a minute and restart with -continue download.
    # If it really doesn't exist, either fix things manually or try -continue recipBest
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
	-continue=recipBest >> rbestAgain.log 2>&1 &

    # Okay, the part that I am missing is that I didn't finish doBlastz with downloads!
    ssh kkstore05
    screen
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	   -continue=download > doResumeAtDownload.log 2>&1 &
    # real    3m21.480s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsOryCun1/README.txt is accurate.
    # *** Add {chain,net}OryCun1 tracks to trackDb.ra if necessary.

    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl cavPor3 oryCun1 \
	-continue=download >> rbestAgain.log 2>&1 &
    # real    0m0.418s
    # *** All done!
    # *** Steps were performed in /cluster/data/cavPor3/bed/blastz.oryCun1

    #	real    611m17.901s
    cat fb.cavPor3.chainOryCun1Link.txt
    #   752079320 bases of 2663369733 (28.238%) in intersection

    ######### Change locations in DEF due to Hiram's new methods
    cd /cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: GuineaPig cavPor3
SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ1_LEN=/scratch/data/cavPor3/chrom.sizes
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rabbit oryCun1
SEQ2_DIR=/scratch/data/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/data/oryCun1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11
TMPDIR=/scratch/tmp
_EOF_
    mkdir /cluster/data/oryCun1/bed/blastz.cavPor3.swap
    cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap
    screen
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap > do.log 2>&1
    # <ctl-A><ctl-d>
    # real    492m11.337s
    # load data local infile 'link.tab' into table chainCavPor3Link
    # mySQL error 1114: The table 'chainCavPor3Link' is full
    # Command failed:
    # ssh -x hgwdev nice /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain/loadUp.csh
    # wc -l link.tab  210745564

    ###################################################################
    # Manual load again
    ssh hgwdev
    cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain
    hgsql oryCun1
    DROP TABLE chainCavPor3Link;
    CREATE TABLE chainCavPor3Link (
      bin smallint(5) unsigned NOT NULL default 0,
      tName varchar(255) NOT NULL default '',
      tStart int(10) unsigned NOT NULL default 0,
      tEnd int(10) unsigned NOT NULL default 0,
      qStart int(10) unsigned NOT NULL default 0,
      chainId int(10) unsigned NOT NULL default 0,
      KEY tName (tName(16),bin),
      KEY chainId (chainId)
    ) ENGINE=MyISAM max_rows=212000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;
    # exit

    screen
    time nice -n +19 hgsql -e \
      "load data local infile \"link.tab\" into table chainCavPor3Link;" oryCun1
    # real    360m19.577s
    # mysql> select count(*) from chainCavPor3Link;  | 210745564 |
    #	finish the nets and load
    # Still on hgwdev because of sql
    screen
    time nice -n +19 /cluster/data/oryCun1/bed/blastz.cavPor3.swap/axtChain/loadUpResume.csh >> doFinishLoad.log 2>&1
    # real    991m52.532s
    # cat fb.oryCun1.chainCavPor3Link.txt
    # 729628282 bases of 2076044328 (35.145%) in intersection

    # Now continue -swap at download
    ssh kkstore04 # oryCun1 -> store09 -> kkstore04
    cd /cluster/data/oryCun1/bed/blastz.cavPor3.swap
    screen
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/cavPor3/bed/blastzOryCun1.2008-04-11/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -continue=download >> doResumeAtDownload.log 2>&1
    # real    2m2.757s
    # *** All done!
    # *** Make sure that goldenPath/oryCun1/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.

#########################################################################
# BLASTZ/CHAIN/NET rn4-cavPor3 (STARTED - 2008-04-14; DONE 04-15-2008 - Tim)
    ssh kkstore06  # rat on store3->kkstore06
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/rn4/bed/blastzCavPor3.2008-04-14
    cd /cluster/data/rn4/bed/
    ln -s blastzCavPor3.2008-04-14 blastz.cavPor3
    cd blastzCavPor3.2008-04-14

    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Rat Rn4
SEQ1_DIR=/cluster/bluearc/scratch/data/rn4/nib
SEQ1_LEN=/cluster/bluearc/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/rn4/bed/blastzCavPor3.2008-04-14
TMPDIR=/scratch/tmp
_EOF_
    # << this line keeps emacs coloring happy

    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -syntenicNet > do.log 2>&1 &
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    #	real    611m17.901s
    cat fb.rn4.chainCavPor3Link.txt
    #   716379861 bases of 2571531505 (27.858%) in intersection

    ######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/rn4/bed/blastzCavPor3.2008-04-14
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.rn4.swap
    cd /cluster/data/cavPor3/bed/blastz.rn4.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/rn4/bed/blastzCavPor3.2008-04-14/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -syntenicNet > do.log 2>&1
    # real    103m34.523s

    #	broken down during netSynteny.csh due to too many open files on
    #	a chainSplit
    #	However, there is no need to split when we have scaffolds.
    #   Kate fixes doBlastzChainNet.pl and retry:

    cd /cluster/data/cavPor3/bed/blastz.rn4.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/rn4/bed/blastzCavPor3.2008-04-14/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -continue=syntenicNet -syntenicNet > doSyn.log 2>&1 &
    # real    14m49.396s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsRn4/README.txt is accurate.
    # *** Add {chain,net}Rn4 tracks to trackDb.ra if necessary.
    cat fb.cavPor3.chainRn4Link.txt
    #   735147548 bases of 2663369733 (27.602%) in intersection

###########################################################################
# BLASTZ/CHAIN/NET monDom4-cavPor3 (START - 2008-04-11 DONE 2008-04-16 - Tim)
    ssh kkstore04 # monDom on store9 -> kkstore04
    screen # use screen to control this job
    mkdir /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11
    cd /cluster/data/monDom4/bed/
    ln -s blastzCavPor3.2008-04-11 blastz.cavPor3
    cd blastzCavPor3.2008-04-11

    cat << \_EOF_ > DEF
# opossum vs guineaPigs

# Use "mammal-fish" params even though this is mammal-mammal...
# pretty distant, hopefully not too many shared undermasked repeats.
#BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/cluster/bluearc/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/cavPor3/chrom.sizes
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11

TMPDIR=/scratch/tmp
_EOF_
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -verbose=2 -stop=partition \
	-chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 &
    ## used stop, then `wc -l run.blastz/cavPor3.lst` * `wc -l run.blastz/monDom4.lst`
    ## to find out size based upon DEF SEQn_CHUNK and SEQn_LIMIT then:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -verbose=2 -continue=blastz \
	-chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 &
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    # real    1715m0.868s
    # tail do.log
    # updated job database on disk
    # Batch failed after 4 tries on /cluster/bin/scripts/blastz-run-ucsc -outFormat psl /san/sanvol1/scratch/monDom4/nib/chr5.nib:chr5:20000000-30010000 qParts/part042.lst ../DEF ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl
    # Command failed:
    # ssh -x pk nice /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/run.blastz/doClusterRun.csh

    para-eta time
    # Completed: 65513 of 65514 jobs
    # Crashed: 1 jobs
    # CPU time in finished jobs:    9807564s  163459.40m  2724.32h  113.51d  0.311 y
    # IO & Wait Time:                373385s    6223.08m   103.72h    4.32d  0.012 y
    # Average job time:                 155s       2.59m     0.04h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            2413s      40.22m     0.67h    0.03d
    # Submission to last job:        103236s    1720.60m    28.68h    1.19d

    #### One for automation ???????????:
    para problems
    ## start time: Mon Apr 14 14:01:19 2008
    ## return: 9
    ## ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl does not exist
    ## stderr:
    ## ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl already exists

    ## ls ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl
    ## ls: ../Psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl: No such file or directory
    ## ls ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl
    ## ../psl/chr5.nib:chr5:20000000-30010000/chr5.nib:chr5:20000000-30010000_part042.lst.psl
    ########### Case sensitive path name!!! "Psl" vs "psl"

    ssh kkstore04
    screen # use screen to control this job
    cd /cluster/data/monDom4/bed/blastzCavPor3.2008-04-11
   ######### Change locations in DEF due to Hiram's new methods
   cat << \_EOF_ > DEF
# opossum vs guineaPigs

# Use "mammal-fish" params even though this is mammal-mammal...
# pretty distant, hopefully not too many shared undermasked repeats.
#BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
#SEQ1_DIR=/scratch/data/monDom4/monDom.2bit
SEQ1_LEN=/scratch/data/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11

TMPDIR=/scratch/tmp
_EOF_

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -verbose=2 -continue=cat \
	-chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 &
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    # real    232m24.348s
    # Done time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 -verbose=2 -continue=cat -

    cat fb.monDom4.chainCavPor3Link.txt
    #   334067222 bases of 3501643220 (9.540%) in intersection

    mkdir /cluster/data/cavPor3/bed/blastz.monDom4.swap

    cd /cluster/data/cavPor3/bed/blastz.monDom4.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/DEF \
	-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
	-swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1
    # real    238m31.843s
    # Loading 18041915 chains into cavPor3.chainMonDom4
    # Can't start query:
    # load data local infile 'link.tab' into table chainMonDom4Link
    # mySQL error 1114: The table 'chainMonDom4Link' is full
    # select count(*) from chainMonDom4Link; => 119,311,098
    # wc -l link.tab                         => 202,324,406 link.tab

    ###################################################################
    #	this failed during the load because the chainLink table became
    #	too large.  These were loaded manually with an sql statement to
    #	start the table definition, then a load data local infile using
    #	the .tab files left over from the failed load.  Note the extra
    #	definitions on the chainMonDom4Link table
    time nice -n +19 hgsql -e \
      "DROP TABLE chainMonDom4Link;" cavPor3 &

    CREATE TABLE chainMonDom4Link (
      bin smallint(5) unsigned NOT NULL default 0,
      tName varchar(255) NOT NULL default '',
      tStart int(10) unsigned NOT NULL default 0,
      tEnd int(10) unsigned NOT NULL default 0,
      qStart int(10) unsigned NOT NULL default 0,
      chainId int(10) unsigned NOT NULL default 0,
      KEY tName (tName(16),bin),
      KEY chainId (chainId)
    ) ENGINE=MyISAM max_rows=210000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;

    time nice -n +19 hgsql -e \
      "load data local infile \"link.tab\" into table chainMonDom4Link;" cavPor3
    #	this one took a number of hours
    # real    272m44.943s

    #	finish the nets and load
    # Add gap/repeat stats to the net file using database tables:
    cd /cluster/data/cavPor3/bed/blastz.monDom4.swap/axtChain
    # copied loadUp.csh to loadUpResume.csh and commented out lines already done
    time nice -n +19 ./loadUpResume.csh > loadUpResume.out 2>&1
    #real    30m47.809s

    #	real    106m17.385s
    cat fb.cavPor3.chainMonDom4Link.txt
    # 382771802 bases of 2663369733 (14.372%) in intersection

    ssh kkstore05
    screen
    cd /cluster/data/cavPor3/bed/blastz.monDom4.swap

    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/monDom4/bed/blastzCavPor3.2008-04-11/DEF \
	-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
	-swap -chainLinearGap=loose -bigClusterHub=pk -continue=download > doDowload.log 2>&1
    # real    0m36.604s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsMonDom4/README.txt is accurate.
    # *** Add {chain,net}MonDom4 tracks to trackDb.ra if necessary.

############################################################################
# FINAL BLASTZS STEPS:
# Be sure to add chainCavPor3.html,netCavPor3.html to makeDb/trackDb and update makeDb/trackDb/trackDb.ra
# Any orgs with non-standard priority update {org}/priority.ra
# Any orgs with different BLASTZ_Q=HoxD55.q will need chain{DB}.html and trackDb.ra at assembly level

#Modified (possibly merged) files:
#  trackDb/trackDb.ra

#Files/directories not checked in to CVS that look like source:
#  doc/cavPor3.txt
#  trackDb/chainOryCun1.html
#  trackDb/netOryCun1.html

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-04-17 - Tim)
#   After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
    VALUES ("cavPor3", "blat7", "17780", "1", "0"); \
    INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
    VALUES ("cavPor3", "blat7", "17781", "0", "1");' \
        hgcentraltest
    #   test it with some sequence
    # Can't open /gbdb/cavPor3/choHof1.2bit to read: No such file or directory
    ## Doh!  I entered in the wrong blat server the first time!

#############################################################################
#  Downloads (DONE - 2008-04-23 - Tim)
#   Let's see if the downloads will work
    # Build the goldenPath download files for cavPor3 with makeDownloads.pl,
    # logging to jkStuff/downloads.log.  The relative paths used below
    # (bed/..., jkStuff/...) require the assembly directory to be the
    # working directory, hence the explicit cd.
    ssh hgwdev
    cd /cluster/data/cavPor3
#   expecting to find repeat masker .out file here:
    ln -s bed/repeatMasker/cavPor3.fa.out .
    time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
    -workhorse=hgwdev cavPor3 > jkStuff/downloads.log 2>&1
    # failed due to link to RepeatMasker instead of repeatMasker.  Fixed and restarted.
    # *** All done!
    # *** Please take a look at the downloads for cavPor3 using a web browser.
    # *** Edit each README.txt to resolve any notes marked with "***":
    #     /cluster/data/cavPor3/goldenPath/database/README.txt
    #     /cluster/data/cavPor3/goldenPath/bigZips/README.txt
    #     (The htdocs/goldenPath/cavPor3/*/README.txt "files" are just links to those.)
    # *** If you have to make any edits that would always apply to future
    #     assemblies from the same sequencing center, please edit them into
    #     ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help).

    # the downloads are located at: http://hgwdev.soe.ucsc.edu/goldenPath/cavPor3/
    mv goldenPath old.goldenPath
    mv jkStuff/downloads.log jkStuff/old.downloads.log
    time nice -n +19 /cluster/home/tdreszer/kent/src/hg/utils/automation/makeDownloads.pl \
    -workhorse=hgwdev cavPor3 >> jkStuff/downloads.log 2>&1
    # real    24m23.427s
    # *** Please take a look at the downloads for cavPor3 using a web browser.
    # *** The downloads directory is: /usr/local/apache/htdocs/goldenPath/cavPor3.
    # *** Edit each README.txt to resolve any notes marked with "***":
    #      /cluster/data/cavPor3/goldenPath/database/README.txt
    #      /cluster/data/cavPor3/goldenPath/bigZips/README.txt
    #      (The htdocs/goldenPath/cavPor3/*/README.txt "files" are just links to those.)
    #  *** If you have to make any edits that would always apply to future
    #      assemblies from the same sequencing center, please edit them into
    #      ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help).


#############################################################################
#  PushQ entries (DONE - 2008-04-24 - Tim)
    # Generate the push-queue SQL for cavPor3.  The output is redirected to
    # the relative path jkStuff/pushQ.sql, so the assembly directory must be
    # the working directory (the later hgwbeta step reads the file from
    # /cluster/data/cavPor3/jkStuff).
    ssh hgwdev
    cd /cluster/data/cavPor3
    /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql
    # output warnings:
    # hgwdev does not have /usr/local/apache/htdocs/goldenPath/cavPor3/liftOver/cavPor3ToCavPor*
    ### is it worth it? NO: cavPor2 was never released.
    # cavPor3 does not have seq
    # cavPor3 does not have extFile
    ### Not needed because output is from multiz, visiGene, affyProbes, cloneend, stsMarker, nibbImageProbes
    ### Kate suggests updating makePushQsql to require gbSeq and gbExtFile instead, but it already does.
    #
    #  *** All done!
    #  *** Please edit the output to ensure correctness before using.
    #  *** 1. Resolve any warnings output by this script.
    #  *** 2. Remove any entries which should not be pushed.
    ### All accounted for but: genscanSubopt which is not in pushQ.sql but is in

    #  *** 3. Add tables associated with the main track table (e.g. *Pep tables
    #         for gene prediction tracks).
    #  *** 4. Add files associated with tracks.  First, look at the results
    #         of this query:
    #           hgsql cavPor3 -e 'select distinct(path) from extFile'
    ### Doesn't exist
    #         Then, look at file(s) named in each of the following wiggle tables:
    #           hgsql cavPor3 -e 'select distinct(file) from gc5Base'
    ### | /gbdb/cavPor3/wib/gc5Base.wib |
    #           hgsql cavPor3 -e 'select distinct(file) from quality'
    ### | /gbdb/cavPor3/wib/qual.wib |
    #         Files go in the second field after tables (it's tables, cgis, files).
    #  *** 5. This script currently does not recognize composite tracks.  If cavPor3
    #         has any composite tracks, you should manually merge the separate
    #         per-table entries into one entry.
    ### Doesn't apply to cavPor3 at this time.
    #  *** 6. Just before executing the sql, note the ID of the most recent entry
    #         in the Main Push Queue.  If the ID (first column) of the last
    #         INSERT statement is not 1 greater than the most recent entry's,
    #         make it so to avoid an ID clash with an existing entry.
    ### Updated to 4283
    #  *** When everything looks complete and correct, use hgsql on the qapushq
    #      machine (currently hgwbeta) to execute the sql file.  (Make sure that
    #      qapushq does not already have a table named cavPor3.)  Then use the Push
    #      Queue web interface to check the contents of all entries.
    ssh hgwbeta
    cd /cluster/data/cavPor3/jkStuff
    hgsql qapushq < pushQ.sql
    ### All is there
    #  *** If you haven't already, please add cavPor3 to makeDb/schema/all.joiner !
    #      It should be in both $gbd and $chainDest.
    ### Already there
    #  *** When cavPor3 is on the RR (congrats!), please doBlastz -swap if you haven't
    #      already.

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-09)
    ssh kkstore05
    # bash  if not using bash shell already

    mkdir /cluster/data/cavPor3/blastDb
    cd /cluster/data/cavPor3
    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  cavPor3.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  cavPor3.unmasked.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y

    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 3401

    mkdir -p /san/sanvol1/scratch/cavPor3/blastDb
    cd /cluster/data/cavPor3/blastDb
    for i in nhr nin nsq;
    do
	echo $i
	cp *.$i /san/sanvol1/scratch/cavPor3/blastDb
    done

    mkdir -p /cluster/data/cavPor3/bed/tblastn.hg18KG
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/cavPor3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst

# 3401 query.lst

   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)

# 36727/(350000/3401) = 356.881506

   mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa
   split -l 357 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/cavPor3/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   # blastSome <blastDb> <queryFa> <outPsl>
   #   Per-job script used by the blastGsub template: $1 is passed to
   #   blastall as the database (-d), $2 as the query file (-i), and $3 is
   #   the final .psl output path.
   #   Steps:
   #     1. Run tblastn (BLOSUM80, -F no) with successively smaller -e
   #        values, stopping at the first run that blastall reports as
   #        successful; its output is kept as /tmp/<base $3>.<base $2>.1
   #     2. Convert to PSL with blastToPsl, then liftUp twice: first with
   #        /cluster/data/cavPor3/blastDb.lft (carry), then with -pslQ and
   #        the hg18KG protein.lft (warn), writing $3.tmp
   #     3. If pslCheck -prot accepts $3.tmp, rename it to $3 and remove
   #        the /tmp intermediates.
   #   Exit status: 0 once blastToPsl has succeeded, 1 otherwise (after
   #   removing intermediates).
   #   NOTE(review): when pslCheck fails the script still exits 0 and
   #   leaves $f.1-$f.3 and $3.tmp behind; presumably the
   #   {check out exists ...} clause in blastGsub is what flags such
   #   jobs -- confirm.  $f.4 is removed but never created here.
   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/cavPor3/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit

    ssh pk
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 350303 of 350303 jobs
# CPU time in finished jobs:   25825855s  430430.91m  7173.85h  298.91d  0.819 y
# IO & Wait Time:               2396715s   39945.25m   665.75h   27.74d  0.076 y
# Average job time:                  81s       1.34m     0.02h    0.00d
# Longest finished job:             339s       5.65m     0.09h    0.00d
# Submission to last job:         80595s    1343.25m    22.39h    0.93d

    ssh kkstore05
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    # chainOne <dir>
    #   Per-job script for the chaining cluster run.  In a subshell it
    #   changes into the given directory, concatenates every q.*.psl file
    #   there and pipes the result to simpleChain (-prot -outPsl
    #   -maxGap=150000), which writes c.<basename of dir>.psl into the
    #   bluearc blastOut directory.
    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    # para try, check, push, check etc.

# Completed: 99 of 103 jobs
# Crashed: 4 jobs
# CPU time in finished jobs:      15940s     265.67m     4.43h    0.18d  0.001 y
# IO & Wait Time:                397633s    6627.21m   110.45h    4.60d  0.013 y
# Average job time:                4178s      69.63m     1.16h    0.05d
# Longest finished job:            5502s      91.70m     1.53h    0.06d
# Submission to last job:          7032s     117.20m     1.95h    0.08d

# ran 4 crashed jobs on memk.
# Completed: 4 of 4 jobs
# CPU time in finished jobs:       1673s      27.88m     0.46h    0.02d  0.000 y
# IO & Wait Time:                   529s       8.81m     0.15h    0.01d  0.000 y
# Average job time:                 550s       9.17m     0.15h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             712s      11.87m     0.20h    0.01d
# Submission to last job:          1299s      21.65m     0.36h    0.02d

    ssh kkstore05
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut
    # Filter the chained alignments for each kg?? chunk, producing three
    # files per chunk from c.<chunk>.psl:
    #   c60.<chunk>.psl  rows where (col13 - col12) / col11 > 0.6
    #   u.<chunk>.psl    c60 rows sorted numerically descending, then
    #                    passed through pslUniq
    #   m60.<chunk>.psl  c60 rows where col1 / col11 > 0.60
    # NOTE(review): presumably these are the PSL fields matches (1),
    # qSize (11), qStart (12) and qEnd (13), i.e. query-coverage filters --
    # confirm against the PSL format description.
    for i in kg??
    do
       # keep rows passing the (col13 - col12)/col11 threshold
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       # sort in reverse numeric order before pslUniq
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       # additionally keep c60 rows passing the col1/col11 threshold
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       # progress indicator
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/cavPor3/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    hgLoadPsl cavPor3 blastHg18KG.psl

    # check coverage
    featureBits cavPor3 blastHg18KG
# 35569442 bases of 2663369733 (1.336%) in intersection

    featureBits cavPor3 all_mrna blastHg18KG  -enrichment
# all_mrna 0.023%, blastHg18KG 1.336%, both 0.015%, cover 66.85%, enrich 50.05x

    ssh kkstore05
    rm -rf /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut
#end tblastn
############################### Make editor happy: _EOF_

#########################################################################
## 7-Way Multiz (STARTED - 2008-04-25 - Tim)
##

#  From Jim: A minimal set would be human/mouse/rabbit/chicken.
#  Add rat and possum if you feel like turning the crank a little more.

    #	the all.chain.gz files were split up via kluster jobs on memk
    #	in order to get mafSynNet files.  Example above in ornAna1 blastz
    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/multiz7way
    cd /cluster/data/cavPor3/bed/multiz7way
    #	take the 30-way tree from mm9 and eliminate genomes not in
    #	this alignment
    #	rearrange to get cavPor3 on the top of the graph
    #	paste this tree into the on-line phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to create the image for the tree diagram

    #	select the 7 organisms from the 30-way recently done on mouse mm9
    #   keepers are named as in the mm9 30-way tree, which still carries
    #   cavPor2; sed renames it to cavPor3 in the output
    keep=Human_hg18,Mouse_mm9,Rat_rn4,GuineaPig_cavPor2,Rabbit_oryCun1,Opossum_monDom4,Chicken_galGal3
    /cluster/bin/phast/tree_doctor --prune-all-but ${keep} \
        /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
        | sed -e 's/cavPor2/cavPor3/g' \
        > 7-way.fullNames.nh

    #	looks something like this:
    # ((((((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,GuineaPig_cavPor3:0.202990):0.034350,Rabbit_oryCun1:0.208548):0.014587
    # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);

    #	rearrange to get guineaPig at the top:
    # this leaves us with:
    #   the whole tree must stay on a single line
    printf '%s\n' \
'(((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);' \
        > cavPor3.7-way.nh
    #	<< happy emacs

    # verify that all the blastz mafs exist
    cat << \_EOF_ > listMafs.csh
#!/bin/csh -fe
# For every species except cavPor3 in species.list, report which kind of
# pairwise maf is available.  Preference: mafRBestNet, then mafSynNet,
# then mafNet.
cd /cluster/data/cavPor3/bed/multiz7way
foreach db (`grep -v cavPor3 species.list`)
    set bdir = /cluster/data/cavPor3/bed/blastz.$db
    set found = "mafs not found"
    # checked from least to most preferred, so the last hit wins
    if (-e $bdir/mafNet/cavPor3.$db.net.maf.gz) set found = mafNet
    if (-e $bdir/mafSynNet/cavPor3.$db.syn.maf.gz) set found = mafSynNet
    if (-e $bdir/mafRBestNet/cavPor3.$db.rbest.maf.gz) set found = mafRBestNet
    echo "$db $found"
end
_EOF_
    # << happy emacs
    chmod +x ./listMafs.csh
    #	see what it says:
    ./listMafs.csh
    # galGal3 mafNet
    # hg18 mafNet
    # mm9 mafNet
    # monDom4 mafNet
    # oryCun1 mafRBestNet
    # rn4 mafNet

    /cluster/bin/phast/all_dists cavPor3.7-way.nh > 7way.distances.txt
    #    ERROR: Can't parse distance in tree ("0.014587
    #").
    #[hgwdev:tdreszer multiz7way> m cavPor3.7-way.nh
    # (((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587
    # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);
    # Problem is that the tree must not contain a newline ('\n'), so re-echo
    # it on a single line and retry
    m 7way.distances.txt
    # GuineaPig_cavPor3       Mouse_mm9       0.479871
    # GuineaPig_cavPor3       Rat_rn4 0.487980
    # GuineaPig_cavPor3       Rabbit_oryCun1  0.445888
    # GuineaPig_cavPor3       Human_hg18      0.378828
    # GuineaPig_cavPor3       Opossum_monDom4 0.835961
    # GuineaPig_cavPor3       Chicken_galGal3 1.211508
    # Mouse_mm9       Rat_rn4 0.160657
    # Mouse_mm9       Rabbit_oryCun1  0.519779
    # Mouse_mm9       Human_hg18      0.452719
    # Mouse_mm9       Opossum_monDom4 0.909852
    # Mouse_mm9       Chicken_galGal3 1.285399
    # Rat_rn4 Rabbit_oryCun1  0.527888
    # Rat_rn4 Human_hg18      0.460828
    # Rat_rn4 Opossum_monDom4 0.917961
    # Rat_rn4 Chicken_galGal3 1.293508
    # Rabbit_oryCun1  Human_hg18      0.350036
    # Rabbit_oryCun1  Opossum_monDom4 0.807169
    # Rabbit_oryCun1  Chicken_galGal3 1.182716
    # Human_hg18      Opossum_monDom4 0.710935
    # Human_hg18      Chicken_galGal3 1.086482
    # Opossum_monDom4 Chicken_galGal3 1.016989
    # (total) -       2.228942

    grep -i cavPor 7way.distances.txt | sort -k3,3n
    # GuineaPig_cavPor3       Human_hg18      0.378828
    # GuineaPig_cavPor3       Rabbit_oryCun1  0.445888
    # GuineaPig_cavPor3       Mouse_mm9       0.479871
    # GuineaPig_cavPor3       Rat_rn4         0.487980
    # GuineaPig_cavPor3       Opossum_monDom4 0.835961
    # GuineaPig_cavPor3       Chicken_galGal3 1.211508

    # Note that guineaPig is closer to human than to other rodents.
    # This may be reasonable, since the speed of evolution (length of tree limbs)
    # in rodents is greater than for primates.  For instance mm9 is closer to hg18
    # than it is to either cavPor2 or oryCun1

    #	use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
    #                         featureBits chainLink measures
    #                                    chainCavPor3Link   chain   linearGap
    # distance                     on cavPor3    on other   minScore
    # 0.479871 Mouse_mm9           (28.900%)     (29.330%)  3000     medium
    # 0.487980 Rat_rn4             (27.858%)     (27.602%)  3000     medium
    # 0.445888 Rabbit_oryCun1      (35.145%)     (28.238%)  3000     medium
    # 0.378828 Human_hg18          (43.971%)     (48.132%)  3000     medium
    # 0.835961 Opossum_monDom4      (9.540%)     (14.372%)  5000     loose
    # 1.211508 Chicken_galGal3     (10.190%)      (5.437%)  5000     loose
    ### ### ### ### Be sure to use calJac as an example because of scaffolds
    ### ### ### ### will require making maf files by scaffold name

    #	create a coherent set of all the mafs involved in this run
    mkdir mafLinks
    cd mafLinks
    #   species using their plain net mafs
    for db in hg18 mm9 rn4 monDom4 galGal3
    do
        ln -s ../../blastz.${db}/mafNet ./${db}
    done
    #   rabbit uses its reciprocal-best net mafs
    ln -s ../../blastz.oryCun1/mafRBestNet ./oryCun1

    #	check data size:
    du -hscL *
    # 100M    galGal3
    # 930M    hg18
    # 577M    mm9
    # 268M    monDom4
    # 495M    oryCun1
    # 543M    rn4
    # 2.9G    total

    #   need to split these things up by Contig number for efficient kluster run
    ssh kkstore06
    mkdir -p /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf
    cd /scratch/tmp
    #   the 16801 is the chrM size (verify against cavPor3 chrom.sizes; this
    #   comment came from the petMar doc and used to say 16201)
    echo "chrM 0 16801" > chrM.bed

    #   For each non-reference species: unpack its pairwise maf(s), split
    #   them by target contig into a two-level hashed directory tree, pull
    #   out the chrM alignments, and copy the result to the san.
    #   mafLinks/<db> is a symlink to that species' maf directory, so the
    #   maf files are found under mafLinks/${D}/.
    for D in $(grep -v cavPor3 /cluster/data/cavPor3/bed/multiz7way/species.list)
    do
        echo "${D}"
        zcat \
/cluster/data/cavPor3/bed/multiz7way/mafLinks/${D}/cavPor3.${D}.*.maf.gz \
            > "${D}.maf"
        mkdir "/scratch/tmp/${D}"
        #   stop here rather than split, rsync and rm in the wrong directory
        cd "/scratch/tmp/${D}" || break
        mafSplit -verbose=2 /dev/null -byTarget -useHashedName=10 Contig \
            "../${D}.maf" -outDirDepth=2
        mafsInRegion ../chrM.bed 0/0/chrM.maf "../${D}.maf"
        rsync -a --progress ./ \
            "/san/sanvol1/scratch/cavPor3/multiz7way/contigMaf/${D}"
        cd /scratch/tmp || break
        rm -fr "${D}" "${D}.maf"
    done

    #   create a run-time list of contigs to operate on, not all contigs
    #   exist in all alignments, but we want all contig names used in any
    #   alignment:
    ssh kkstore05 # cavPor3 -> store12 -> kkstore05
    cd /san/sanvol1/scratch/cavPor3/multiz7way/contigMaf
    #   List every contig maf path (relative to its species directory) that
    #   occurs for any species.  Each species is visited in a subshell so a
    #   failed cd neither lists the wrong directory nor moves the loop out
    #   of contigMaf.
    for D in *
    do
        ( cd "${D}" && find . -type f )
    done | sort -u > /tmp/7-way.contig.list
    wc -l /tmp/7-way.contig.list
    mkdir /cluster/data/cavPor3/bed/multiz7way/splitRun
    cp -p /tmp/7-way.contig.list \
    /cluster/data/cavPor3/bed/multiz7way/splitRun
    #   296 /tmp/7-way.contig.list

    # ready for the multiz run
    ssh pk
    cd /cluster/data/cavPor3/bed/multiz7way/splitRun
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    #   autoMultiz.csh runs multiz for one contig; one parasol job per contig.
    #   Arguments (supplied by the gensub2 template):
    #     $1  sub-directory of the contig below contigMaf/<species>/
    #     $2  contig name (root of the maf file name)
    #     $3  path of the result maf to write
    #   The script works in a private /scratch/tmp/$db/multiz.$c directory.
    #   For every species in species.list other than $db it stages one
    #   pairwise maf (gzipped or plain); when neither exists it writes a
    #   header-only maf instead.  It then runs autoMZ from the run/penn
    #   directory, copies the result to $3 and removes the scratch directory.
    #   NOTE: it copies ../../tree.7.nh and ../../species.list relative to
    #   the run directory, so both files must exist in multiz7way before the
    #   batch is started.
    #   set the db and pairs directories here
    cat > autoMultiz.csh << \_EOF_
#!/bin/csh -ef
set db = cavPor3
set subdir = $1
set c = $2
set result = $3
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz7way/contigMaf
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.7.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
    set in = $pairs/$s/$subdir/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.7.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
_EOF_
# << happy emacs
    chmod +x autoMultiz.csh

    #   gensub2 template: one autoMultiz.csh job per line of the contig list.
    #   For an input such as 0/0/Contig00700.maf, $(dir1) is the directory
    #   part and $(root1) the file name without its extension.
    #   NOTE(review): "{check out line+ FILE}" is understood to make parasol
    #   require FILE to exist with at least one line -- confirm in para docs.
    cat  << \_EOF_ > template
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz7way/splitRun/maf/$(dir1)/$(root1).maf}
#ENDLOOP
_EOF_
    # << emacs

    #   strip the leading "./" that find put on each path, then expand the
    #   template into jobList
    sed -e "s/^\.\///" ../7-way.contig.list \
    | gensub2 stdin single template jobList

    para create jobList
    para try ... check ...
    # Checking finished jobs
    # Completed: 10 of 296 jobs
    # CPU time in finished jobs:       4355s      72.59m     1.21h    0.05d  0.000 y
    # IO & Wait Time:                   104s       1.73m     0.03h    0.00d  0.000 y
    # Average job time:                 446s       7.43m     0.12h    0.01d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            1672s      27.87m     0.46h    0.02d
    # Submission to last job:          1681s      28.02m     0.47h    0.02d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d
    # [pk:tdreszer run> pc
    # 296 jobs in batch
    # 92505 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # unsubmitted jobs: 286
    # ranOk: 10
    # total jobs in batch: 296
    para push ... check ... etc
    # Completed: 296 of 296 jobs
    # CPU time in finished jobs:      53659s     894.32m    14.91h    0.62d  0.002 y
    # IO & Wait Time:                  1675s      27.92m     0.47h    0.02d  0.000 y
    # Average job time:                 187s       3.12m     0.05h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            1969s      32.82m     0.55h    0.02d
    # Submission to last job:          4023s      67.05m     1.12h    0.05d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d
    # [pk:tdreszer run> pc
    # 296 jobs in batch
    # 84971 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # ranOk: 296
    # total jobs in batch: 296

    # put the split maf results back together into a single maf file
    #   eliminate duplicate comments
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/multiz7way
    mkdir togetherMaf
    #   1) header: the "##maf version" lines of one representative contig
    grep "^##maf version" splitRun/maf/0/0/Contig00700.maf \
    | sort -u > togetherMaf/cavPor3.7way.maf
    #   the header lines look like this:
    ##maf version=1 scoring=autoMZ.v1
    ##maf version=1 scoring=maf_project.v12
    ##maf version=1 scoring=multiz

    #   2) every other comment line from all contigs, with the temporary
    #      _MZ_ path fragments and __N suffixes stripped, de-duplicated.
    #      -depth comes before -type because GNU find warns when a global
    #      option follows a test; NUL-delimited names avoid word splitting.
    find ./splitRun/maf -depth -type f -print0 \
    | xargs -0 grep -h "^#" \
    | egrep -v "maf version=1|eof maf" \
    | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" \
    | sort -u >> togetherMaf/cavPor3.7way.maf

    #   3) the alignment blocks themselves, in find order
    find ./splitRun/maf -depth -type f -print0 \
    | xargs -0 grep -v -h "^#" >> togetherMaf/cavPor3.7way.maf

    #   4) trailer
    grep "^##eof maf" splitRun/maf/0/0/Contig00700.maf \
    | sort -u >> togetherMaf/cavPor3.7way.maf

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/cavPor3/multiz7way/maf
    ln -s /cluster/data/cavPor3/bed/multiz7way/togetherMaf/*.maf \
                /gbdb/cavPor3/multiz7way/maf/multiz7way.maf
    # this generates an immense multiz7way.tab file in the directory
    #   where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
    -pathPrefix=/gbdb/cavPor3/multiz7way/maf cavPor3 multiz7way
    # Advisory lock created
    # Indexing and tabulating /gbdb/cavPor3/multiz7way/maf/multiz7way.maf
    # Loading multiz7way into database
    # Loaded 6939217 mafs in 1 files from /gbdb/cavPor3/multiz7way/maf
    # Advisory lock has been released
    #
    # real    7m7.421s
    # user    1m50.625s
    # sys     0m34.191s

### ### ###################################################################
### ### #### Abandoning 7-Way because I am adding Squirrel 2008-05-27 #####
### ### ###################################################################


#########################################################################
## 8-Way Multiz (STARTED - 2008-05-27 - Tim)
## 8-Way Multiz (RESTARTED - 2008-07-02 - Tim Done: 7-11-2008)
##

    #   the all.chain.gz files were split up via kluster jobs on memk
    #   in order to get mafSynNet files.  Example above in ornAna1 blastz
    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/multiz8way
    cd /cluster/data/cavPor3/bed/multiz8way
    #   take the 30-way tree from mm9 and eliminate genomes not in
    #   this alignment
    #   rearrange to get cavPor3 on the top of the graph
    #   paste this tree into the on-line phyloGif tool:
    #   http://genome.ucsc.edu/cgi-bin/phyloGif
    #   to create the image for the tree diagram

    ### Note: the ground squirrel (Spermophilus tridecemlineatus) is not in the mm9 30-way.
    ### Add it at the same distance from the other rodents as rabbit.

    #   select the 7 organisms from the 30-way recently done on mouse mm9
    #   keepers are named as in the mm9 30-way tree, which still carries
    #   cavPor2; sed renames it to cavPor3 in the output
    keep=Human_hg18,Mouse_mm9,Rat_rn4,GuineaPig_cavPor2,Rabbit_oryCun1,Opossum_monDom4,Chicken_galGal3
    /cluster/bin/phast/tree_doctor --prune-all-but ${keep} \
        /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
        | sed -e 's/cavPor2/cavPor3/g' \
        > 7-way.fullNames.nh

    #   looks something like this:
    # ((((((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,GuineaPig_cavPor3:0.202990):0.034350,Rabbit_oryCun1:0.208548):0.014587
    # ,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);

    #   rearrange to get guineaPig at the top:
    # this leaves us with:
    #   the whole tree must stay on a single line
    printf '%s\n' \
'(((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);' \
        > cavPor3.7-way.nh
    #   << happy emacs

    # Add squirrel at same distance to guineaPig and Mouse as rabbits:
   # Alternative tree: ((((((GuineaPig_cavPor3:0.202990,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607):0.017175,Squirrel_speTri0:0.208548):0.017175,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);
    #   the whole tree must stay on a single line
    printf '%s\n' \
'(((((GuineaPig_cavPor3:0.202990,((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.18602,Squirrel_speTri0:0.208548):0.014587):0.034350,Rabbit_oryCun1:0.208548):0.014587,Human_hg18:0.126901):0.263313,Opossum_monDom4:0.320721):0.207444,Chicken_galGal3:0.488824);' \
        > cavPor3.8-way.nh
    #   << happy emacs

    cd /cluster/data/cavPor3/bed/multiz8way
    cp ../multiz7way/species.list .
    echo speTri0 >> species.list

    # verify that all the blastz mafs exist
    cat << \_EOF_ > listMafs.csh
#!/bin/csh -fe
# For every species except cavPor3 in species.list, report which kind of
# pairwise maf is available.  Preference: mafRBestNet, then mafSynNet,
# then mafNet.
cd /cluster/data/cavPor3/bed/multiz8way
foreach db (`grep -v cavPor3 species.list`)
    set bdir = /cluster/data/cavPor3/bed/blastz.$db
    set found = "mafs not found"
    # checked from least to most preferred, so the last hit wins
    if (-e $bdir/mafNet/cavPor3.$db.net.maf.gz) set found = mafNet
    if (-e $bdir/mafSynNet/cavPor3.$db.syn.maf.gz) set found = mafSynNet
    if (-e $bdir/mafRBestNet/cavPor3.$db.rbest.maf.gz) set found = mafRBestNet
    echo "$db $found"
end
_EOF_
    # << happy emacs
    chmod +x ./listMafs.csh
    #   see what it says:
    ./listMafs.csh
    # galGal3 mafNet
    # hg18 mafNet
    # mm9 mafNet
    # monDom4 mafNet
    # oryCun1 mafRBestNet
    # rn4 mafNet
    # speTri0 mafs not found

    ### After some back and forth on my own, Kate finally got the speTri0 Mafs together:
    # galGal3 mafNet
    # hg18 mafNet
    # mm9 mafNet
    # monDom4 mafNet
    # oryCun1 mafRBestNet
    # rn4 mafNet
    # speTri0 mafRBestNet

    /cluster/bin/phast/all_dists cavPor3.8-way.nh > 8way.distances.txt
    # GuineaPig_cavPor3       Mouse_mm9       0.479871
    # GuineaPig_cavPor3       Rat_rn4 0.487980
    # GuineaPig_cavPor3       Squirrel_speTri0        0.426125
    # GuineaPig_cavPor3       Rabbit_oryCun1  0.445888
    # GuineaPig_cavPor3       Human_hg18      0.378828
    # GuineaPig_cavPor3       Opossum_monDom4 0.835961
    # GuineaPig_cavPor3       Chicken_galGal3 1.211508
    # Mouse_mm9       Rat_rn4 0.160657
    # Mouse_mm9       Squirrel_speTri0        0.470842
    # Mouse_mm9       Rabbit_oryCun1  0.519779
    # Mouse_mm9       Human_hg18      0.452719
    # Mouse_mm9       Opossum_monDom4 0.909852
    # Mouse_mm9       Chicken_galGal3 1.285399
    # Rat_rn4 Squirrel_speTri0        0.478951
    # Rat_rn4 Rabbit_oryCun1  0.527888
    # Rat_rn4 Human_hg18      0.460828
    # Rat_rn4 Opossum_monDom4 0.917961
    # Rat_rn4 Chicken_galGal3 1.293508
    # Squirrel_speTri0        Rabbit_oryCun1  0.466033
    # Squirrel_speTri0        Human_hg18      0.398973
    # Squirrel_speTri0        Opossum_monDom4 0.856106
    # Squirrel_speTri0        Chicken_galGal3 1.231653
    # Rabbit_oryCun1  Human_hg18      0.350036
    # Rabbit_oryCun1  Opossum_monDom4 0.807169
    # Rabbit_oryCun1  Chicken_galGal3 1.182716
    # Human_hg18      Opossum_monDom4 0.710935
    # Human_hg18      Chicken_galGal3 1.086482
    # Opossum_monDom4 Chicken_galGal3 1.016989
    # (total) -       2.437490

    grep -i cavPor 8way.distances.txt | sort -k3,3n
    # GuineaPig_cavPor3       Human_hg18      0.378828
    # GuineaPig_cavPor3       Squirrel_speTri0        0.426125
    # GuineaPig_cavPor3       Rabbit_oryCun1  0.445888
    # GuineaPig_cavPor3       Mouse_mm9       0.479871
    # GuineaPig_cavPor3       Rat_rn4 0.487980
    # GuineaPig_cavPor3       Opossum_monDom4 0.835961
    # GuineaPig_cavPor3       Chicken_galGal3 1.211508

    # Note that guineaPig is closer to human than to other rodents.
    # This may be reasonable, since the speed of evolution (length of tree limbs)
    # in rodents is greater than for primates.  For instance mm9 is closer to hg18
    # than it is to either cavPor2 or oryCun1

    ## ??? time nice -n +19 featureBits cavPor3 axtChain/cavPor3.speTri0.rbest.chain.psl >& fb.cavPor3.speTri0.rbest.chain.psl.txt &


    #   use the calculated
    #   distances in the table below to order the organisms and check
    #   the button order on the browser.
    #   And if you can fill in the table below entirely, you have
    #   succeeded in finishing all the alignments required.
    #
    #                         featureBits chainLink measures
    #                                    chainCavPor3Link   chain   linearGap
    # distance                     on cavPor3    on other   minScore
    # 0.479871 Mouse_mm9           (28.900%)     (29.330%)  3000     medium
    # 0.487980 Rat_rn4             (27.858%)     (27.602%)  3000     medium
    # 0.426125 Squirrel_speTri0                             3000     medium
    # 0.445888 Rabbit_oryCun1      (35.145%)     (28.238%)  3000     medium
    # 0.378828 Human_hg18          (43.971%)     (48.132%)  3000     medium
    # 0.835961 Opossum_monDom4      (9.540%)     (14.372%)  5000     loose
    # 1.211508 Chicken_galGal3     (10.190%)      (5.437%)  5000     loose
    ### ### ### ### Be sure to use calJac as an example because of scaffolds
    ### ### ### ### will require making maf files by scaffold name

    #   create a coherent set of all the mafs involved in this run
    mkdir mafLinks
    cd mafLinks
    #   species using their plain net mafs
    for db in hg18 mm9 rn4 monDom4 galGal3
    do
        ln -s ../../blastz.${db}/mafNet ./${db}
    done
    #   rabbit and squirrel use their reciprocal-best net mafs
    for db in oryCun1 speTri0
    do
        ln -s ../../blastz.${db}/mafRBestNet ./${db}
    done

    #   check data size:
    du -hscL *
    # 100M    galGal3
    # 930M    hg18
    # 577M    mm9
    # 268M    monDom4
    # 495M    oryCun1
    # 543M    rn4
    # 680M    speTri0
    # 3.6G    total

    #   need to split these things up by Contig number for efficient kluster run
    ssh kkstore05
    mkdir -p /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf
    cd /iscratch/tmp
    #   the 16801 is the chrM size (verify against cavPor3 chrom.sizes; this
    #   comment came from the petMar doc and used to say 16201)
    echo "chrM 0 16801" > chrM.bed

    #   For each non-reference species: unpack its pairwise maf(s), split
    #   them by target contig into a two-level hashed directory tree, pull
    #   out the chrM alignments, and copy the result to the san.
    for D in $(grep -v cavPor3 /cluster/data/cavPor3/bed/multiz8way/species.list)
    do
        echo "${D}"
        zcat \
/cluster/data/cavPor3/bed/multiz8way/mafLinks/${D}/cavPor3.${D}.*.maf.gz \
            > "${D}.maf"
        mkdir "/iscratch/tmp/${D}"
        #   stop here rather than split, rsync and rm in the wrong directory
        cd "/iscratch/tmp/${D}" || break
        mafSplit -verbose=2 /dev/null -byTarget -useHashedName=10 Contig \
            "../${D}.maf" -outDirDepth=2
        mafsInRegion ../chrM.bed 0/0/chrM.maf "../${D}.maf"
        rsync -a --progress ./ \
            "/san/sanvol1/scratch/cavPor3/multiz8way/contigMaf/${D}"
        cd /iscratch/tmp || break
        rm -fr "${D}" "${D}.maf"
    done

    #   create a run-time list of contigs to operate on, not all contigs
    #   exist in all alignments, but we want all contig names used in any
    #   alignment:
    ssh kkstore05 # cavPor3 -> store12 -> kkstore05
    cd /san/sanvol1/scratch/cavPor3/multiz8way/contigMaf
    #   List every contig maf path (relative to its species directory) that
    #   occurs for any species.  Each species is visited in a subshell so a
    #   failed cd neither lists the wrong directory nor moves the loop out
    #   of contigMaf.
    for D in *
    do
        ( cd "${D}" && find . -type f )
    done | sort -u > /tmp/8-way.contig.list
    wc -l /tmp/8-way.contig.list
    mkdir /cluster/data/cavPor3/bed/multiz8way/splitRun
    cp -p /tmp/8-way.contig.list \
    /cluster/data/cavPor3/bed/multiz8way/splitRun
    # wc -l 8-way.contig.list
    # 296 8-way.contig.list


    # ready for the multiz run
    ssh pk
    cd /cluster/data/cavPor3/bed/multiz8way/splitRun
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    #   autoMultiz.csh runs multiz for one contig; one parasol job per contig.
    #   Arguments (supplied by the gensub2 template):
    #     $1  sub-directory of the contig below contigMaf/<species>/
    #     $2  contig name (root of the maf file name)
    #     $3  path of the result maf to write
    #   The script works in a private /scratch/tmp/$db/multiz.$c directory.
    #   For every species in species.list other than $db it stages one
    #   pairwise maf (gzipped or plain); when neither exists it writes a
    #   header-only maf instead.  It then runs autoMZ from the run/penn
    #   directory, copies the result to $3 and removes the scratch directory.
    #   NOTE: it copies ../../tree.8.nh and ../../species.list relative to
    #   the run directory, so both files must exist in multiz8way before the
    #   batch is started.
    #   set the db and pairs directories here
    cat > autoMultiz.csh << \_EOF_
#!/bin/csh -ef
set db = cavPor3
set subdir = $1
set c = $2
set result = $3
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz8way/contigMaf
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.8.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
    set in = $pairs/$s/$subdir/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
_EOF_
# << happy emacs
    chmod +x autoMultiz.csh

    #   gensub2 template: one autoMultiz.csh job per line of the contig list.
    #   For an input such as 0/0/Contig00700.maf, $(dir1) is the directory
    #   part and $(root1) the file name without its extension.
    #   NOTE(review): "{check out line+ FILE}" is understood to make parasol
    #   require FILE to exist with at least one line -- confirm in para docs.
    cat  << \_EOF_ > template
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz8way/splitRun/maf/$(dir1)/$(root1).maf}
#ENDLOOP
_EOF_
    # << emacs

    #   strip the leading "./" that find put on each path, then expand the
    #   template into jobList
    sed -e "s/^\.\///" ../8-way.contig.list \
    | gensub2 stdin single template jobList

    para create jobList
    para try ... check ...

    ## first 20 failed because of tree.8.nh:

    ### Hand edited tree.8.nh from cavPor3.8-way.nh to:
    # (((((cavPor3 ((mm9 rn4) speTri0)) oryCun1) hg18) monDom4) galGal3)

    # Next 10 succeeded
    # 296 jobs in batch
    # 15118 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # Completed: 10 of 296 jobs
    # Crashed: 20 jobs
    # CPU time in finished jobs:       5742s      95.69m     1.59h    0.07d  0.000 y
    # IO & Wait Time:                    76s       1.27m     0.02h    0.00d  0.000 y
    # Average job time:                 582s       9.70m     0.16h    0.01d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            1482s      24.70m     0.41h    0.02d
    # Submission to last job:          2712s      45.20m     0.75h    0.03d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d

    para push; para check
    # 296 jobs in batch
    # 13908 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # running: 25
    # ranOk: 271
    # total jobs in batch: 296
    # [pk:tdreszer run> pc
    # 296 jobs in batch
    # 10574 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # .
    # ranOk: 296
    # total jobs in batch: 296
    para -eta time
    # 296 jobs in batch
    # 10542 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # Completed: 296 of 296 jobs
    # CPU time in finished jobs:      81235s    1353.92m    22.57h    0.94d  0.003 y
    # IO & Wait Time:                 11960s     199.33m     3.32h    0.14d  0.000 y
    # Average job time:                 315s       5.25m     0.09h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            3262s      54.37m     0.91h    0.04d
    # Submission to last job:         59090s     984.83m    16.41h    0.68d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d

    # put the split maf results back together into a single maf file
    #   eliminate duplicate comments
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/multiz8way
    mkdir togetherMaf
    #   1) header: the "##maf version" lines of one representative contig
    grep "^##maf version" splitRun/maf/0/0/Contig00700.maf \
    | sort -u > togetherMaf/cavPor3.8way.maf
    #   the header lines look like this:
    ##maf version=1 scoring=autoMZ.v1
    ##maf version=1 scoring=maf_project.v12
    ##maf version=1 scoring=multiz

    #   2) every other comment line from all contigs, with the temporary
    #      _MZ_ path fragments and __N suffixes stripped, de-duplicated.
    #      -depth comes before -type because GNU find warns when a global
    #      option follows a test; NUL-delimited names avoid word splitting.
    find ./splitRun/maf -depth -type f -print0 \
    | xargs -0 grep -h "^#" \
    | egrep -v "maf version=1|eof maf" \
    | sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g" \
    | sort -u >> togetherMaf/cavPor3.8way.maf

    #   3) the alignment blocks themselves, in find order
    find ./splitRun/maf -depth -type f -print0 \
    | xargs -0 grep -v -h "^#" >> togetherMaf/cavPor3.8way.maf

    #   4) trailer
    grep "^##eof maf" splitRun/maf/0/0/Contig00700.maf \
    | sort -u >> togetherMaf/cavPor3.8way.maf

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/cavPor3/multiz8way/maf
    ln -s /cluster/data/cavPor3/bed/multiz8way/togetherMaf/*.maf \
                /gbdb/cavPor3/multiz8way/maf/multiz8way.maf
    # this generates an immense multiz8way.tab file in the directory
    #   where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
    -pathPrefix=/gbdb/cavPor3/multiz8way/maf cavPor3 multiz8way &
    # Advisory lock created
    # Indexing and tabulating /gbdb/cavPor3/multiz8way/maf/multiz8way.maf
    # Loading multiz8way into database
    # Loaded 8976211 mafs in 1 files from /gbdb/cavPor3/multiz8way/maf
    # Advisory lock has been released
    # real    4m26.559s
    # user    2m35.837s
    # sys     0m47.660s

    # load summary table
    #   Stream the maf into hgLoadMafSummary to build multiz8waySummary.
    #   nice is applied to both ends of the pipe: a nice in front of cat
    #   alone lowers only cat's priority, not the loader's.
    time nice -n +19 cat /gbdb/cavPor3/multiz8way/maf/*.maf \
        | nice -n +19 hgLoadMafSummary cavPor3 -minSize=30000 -mergeGap=1500 \
            -maxSize=200000  multiz8waySummary stdin
    # Indexing and tabulating stdin
    # Created 1650940 summary blocks from 26384188 components and 8747391 mafs from stdin
    # Loading into cavPor3 table multiz8waySummary...
    # Loading completeAdvisory lock has been released
    # real    5m31.133s
    # user    4m55.729s
    # sys     0m21.743s

    # Gap Annotation
    # prepare bed files with gap info
    ssh kkstore05
    mkdir /cluster/data/cavPor3/bed/multiz8way/anno
    cd /cluster/data/cavPor3/bed/multiz8way/anno
    mkdir maf run

    #	these actually already all exist from previous multiple alignments
    #	remove the echo in front of the twoBitInfo to actually make it work
    #   Dry run: list each species' existing N.bed, or print (not run) the
    #   twoBitInfo command that would create the missing one.
    for DB in $(cat ../species.list)
    do
        nBed="/cluster/data/${DB}/${DB}.N.bed"
        if [ -f ${nBed} ]; then
            ls -og ${nBed}
        else
            echo "creating ${DB}.N.bed"
            echo twoBitInfo -nBed /cluster/data/${DB}/${DB}.2bit ${nBed}
        fi
    done
    # creating cavPor3.N.bed
    # twoBitInfo -nBed /cluster/data/cavPor3/cavPor3.2bit /cluster/data/cavPor3/cavPor3.N.bed
    # -rw-rw-r--  1 2164385 Jul 18  2006 /cluster/data/galGal3/galGal3.N.bed
    # -rw-rw-r--  1 232970 Feb  6  2006 /cluster/data/hg18/hg18.N.bed
    # -rw-rw-r--  1 27838 Oct 15  2007 /cluster/data/mm9/mm9.N.bed
    # -rw-rw-r--  1 1788138 Feb 28  2006 /cluster/data/monDom4/monDom4.N.bed
    # -rw-rw-r--  1 13782261 Nov 13  2005 /cluster/data/oryCun1/oryCun1.N.bed
    # -rw-rw-r--  1 20910683 Feb 28  2006 /cluster/data/rn4/rn4.N.bed
    # creating speTri0.N.bed
    # twoBitInfo -nBed /cluster/data/speTri0/speTri0.2bit /cluster/data/speTri0/speTri0.N.bed

    cd run
    #   Start from empty list files, then for every non-reference species
    #   link its N.bed and chrom.sizes here as <db>.bed and <db>.len and
    #   record those names in nBeds and sizes.
    rm -f nBeds sizes
    for DB in $(grep -v cavPor3 ../../species.list)
    do
        echo "${DB} "
        srcDir=/cluster/data/${DB}
        ln -s  ${srcDir}/${DB}.N.bed ${DB}.bed
        ln -s  ${srcDir}/chrom.sizes ${DB}.len
        echo ${DB}.bed  >> nBeds
        echo ${DB}.len  >> sizes
    done
    # galGal3
    # hg18
    # mm9
    # monDom4
    # oryCun1
    # rn4
    # speTri0

    ssh memk
    #	temporarily copy the cavPor3.8way.maf file onto the memk
    #	nodes /scratch/data/cavPor3/maf/ directory
    #   push the combined maf to each of the eight memk nodes, mkr0u0..7
    for node in mkr0u{0..7}
    do
        ssh ${node} rsync -a --progress \
            /cluster/data/cavPor3/bed/multiz8way/togetherMaf/cavPor3.8way.maf \
            /scratch/data/cavPor3/maf/
    done

    mkdir /cluster/data/cavPor3/bed/multiz8way/anno/splitMaf
    #	need to split up the single maf file into individual
    #	per-scaffold maf files to run annotation on
    cd /cluster/data/cavPor3/bed/multiz8way/anno/splitMaf
    #	create bed files to list approximately 394 scaffolds in
    #	a single list, approximately 8 lists
    cat << \_EOF_ > mkBedLists.pl
#!/usr/bin/env perl
#
# mkBedLists.pl - split the cavPor3 chrom.sizes listing into bed files
# file_0.bed, file_1.bed, ... in the current directory.
#
# Each input line "<name> <size>" becomes the bed line
# "<name>\t0\t<size>\t<name>".  A new output file is started on every
# 394th input line, and that line is also echoed to stdout as a progress
# marker.  Every output file is closed with its status checked, including
# the last one, so a failed write is reported instead of silently lost.

use strict;
use warnings;

my $sizes = "/cluster/data/cavPor3/chrom.sizes";
my $perList = 394;
my $bedCount = 0;
my $i = 0;

my $bedFile = sprintf("file_%d.bed", $bedCount);

open (my $bf, ">", $bedFile) or die "can not write to $bedFile $!";

open (my $fh, "<", $sizes) or die "can not read $sizes $!";
while (my $line = <$fh>) {
    chomp $line;
    if ( (($i + 1) % $perList) == 0 )  {
        # boundary line: report it, then roll over to the next bed file
        printf "%s\n", $line;
        close ($bf) or die "error closing $bedFile $!";
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open ($bf, ">", $bedFile) or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf {$bf} "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close ($fh);
close ($bf) or die "error closing $bedFile $!";
_EOF_
    # << happy emacs
    chmod +x mkBedLists.pl
    ./mkBedLists.pl
    # -rw-rw-r--  1 tdreszer protein 13805 Jul 10 16:23 file_0.bed
    # -rw-rw-r--  1 tdreszer protein 13602 Jul 10 16:23 file_1.bed
    # -rw-rw-r--  1 tdreszer protein 13756 Jul 10 16:23 file_2.bed
    # -rw-rw-r--  1 tdreszer protein 14166 Jul 10 16:23 file_3.bed
    # -rw-rw-r--  1 tdreszer protein 14090 Jul 10 16:23 file_4.bed
    # -rw-rw-r--  1 tdreszer protein 13790 Jul 10 16:23 file_5.bed
    # -rw-rw-r--  1 tdreszer protein 13790 Jul 10 16:23 file_6.bed
    # -rw-rw-r--  1 tdreszer protein 13545 Jul 10 16:23 file_7.bed

    #   now, run a mafsInRegion on each one of those lists.
    #   runOne takes one argument, the bed list name without ".bed" (for
    #   example file_0), which is also used as the result directory name.
    #   It runs mafsInRegion with -outDir in /scratch/tmp/cavPor3/<name>
    #   against the maf copy under /scratch/data/cavPor3/maf, rsyncs the
    #   output back to ./<name>/ and removes the scratch directory.
    cat << \_EOF_ > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/cavPor3/bed/multiz8way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/cavPor3/$resultDir
pushd /scratch/tmp/cavPor3/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
        /scratch/data/cavPor3/maf/cavPor3.8way.maf
popd
rsync -q -a /scratch/tmp/cavPor3/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/cavPor3/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/cavPor3
_EOF_
    # << happy emacs
    chmod +x runOne

    #   gensub2 template: one runOne job per bed list; $(root1) is the list
    #   file name without its extension (file_0.bed -> file_0)
    cat << \_EOF_ > template
#LOOP
./runOne $(root1)
#ENDLOOP
_EOF_
    # << happy emacs

    #   the bed lists written by mkBedLists.pl drive the batch
    ls file*.bed > runList
    gensub2 runList single template jobList
    para create jobList
    para try ... check ... push ... etc
    # 8 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # Completed: 8 of 8 jobs
    # CPU time in finished jobs:       1810s      30.16m     0.50h    0.02d  0.000 y
    # IO & Wait Time:                  1218s      20.30m     0.34h    0.01d  0.000 y
    # Average job time:                 379s       6.31m     0.11h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             614s      10.23m     0.17h    0.01d
    # Submission to last job:           614s      10.23m     0.17h    0.01d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d

    cd /cluster/data/cavPor3/bed/multiz8way/anno/run

    # doAnno.csh: cluster job script that annotates one split maf file
    # with mafAddIRows.
    #   $1 = input maf file
    #   $2 = sub-directory name created under ../maf
    #   $3 = result file handed to mafAddIRows
    # NOTE(review): nBeds is assumed to be a file in this run directory
    # that names the per-species N-bed files (cf. the speTri0.bed failure
    # recorded in the job log that follows) -- confirm.
    cat << \_EOF_ > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/cavPor3/cavPor3.2bit $result
_EOF_
    # << happy emacs
    chmod +x doAnno.csh

    cat << \_EOF_ > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
_EOF_
    # << happy emacs

    find ../splitMaf -type f -name "*.maf" > maf.list
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
    # 1849 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # unsubmitted jobs: 1839
    # crashed: 10
    # total jobs in batch: 1849
    # Couldn't open speTri0.bed , No such file or directory
    # lrwxrwxrwx  1 tdreszer protein     35 Jul 10 14:13 speTri0.bed -> /cluster/data/speTri0/speTri0.N.bed
    ### /cluster/data/speTri0/speTri0.N.bed Does not exist...
    twoBitInfo -nBed /cluster/data/speTri0/speTri0.2bit /cluster/data/speTri0/speTri0.N.bed &
    para try; para push...
    # 1849 jobs in batch
    # 0 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # Completed: 1849 of 1849 jobs
    # CPU time in finished jobs:       6313s     105.22m     1.75h    0.07d  0.000 y
    # IO & Wait Time:                  5708s      95.13m     1.59h    0.07d  0.000 y
    # Average job time:                   7s       0.11m     0.00h    0.00d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:             171s       2.85m     0.05h    0.00d
    # Submission to last job:           651s      10.85m     0.18h    0.01d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d

    #	put the results back together into a single file
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/multiz8way/anno
    # Start the combined file with the single "##maf version" header line
    # taken from one of the annotated per-scaffold files.
    grep "^##maf version" maf/file_0//scaffold_0.maf \
	| sort -u > cavPor3.anno.8way.maf
    ##maf version=1 scoring=autoMZ.v1
    # Append the alignment blocks (everything except "#" lines) of every
    # annotated maf.  -depth is given before the tests so GNU find does
    # not warn about option ordering, and "-exec ... {} +" batches the
    # files instead of running one grep per file through a read loop that
    # would mangle unusual path names.
    find ./maf -depth -type f -name "*.maf" -exec grep -v -h "^#" {} + \
	>> cavPor3.anno.8way.maf
    echo "##eof maf" >> cavPor3.anno.8way.maf
    # -rw-rw-r--  1 tdreszer protein 15080110322 Jul 11 14:14 cavPor3.anno.8way.maf

    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz8way/anno
    mkdir -p /gbdb/cavPor3/multiz8way/anno
    ln -s `pwd`/cavPor3.anno.8way.maf \
                /gbdb/cavPor3/multiz8way/anno/multiz8way.maf
    #	by loading this into the table multiz8way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz8way/anno \
                cavPor3 multiz8way
    # Loaded 10486949 mafs in 1 files from /gbdb/cavPor3/multiz8way/anno
    # Advisory lock has been released
    # real    6m19.376s

    #	normally filter this for chrom size > 1,000,000 and only load
    #	those chroms.  But this is a scaffold assembly, load everything:
    time nice -n +19 hgLoadMafSummary cavPor3 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000  multiz8waySummary \
	    /gbdb/cavPor3/multiz8way/anno/multiz8way.maf
    # Created 1650940 summary blocks from 26384188 components
    # and 10209337 mafs from /gbdb/cavPor3/multiz8way/anno/multiz8way.maf
    # real    6m29.261s

    #	by loading this into the table multiz8waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz8way*.tab files in this /scratch/tmp directory
    rm multiz8way*.tab
    #	And, you can remove the previously loaded non-annotated maf file link:
    rm /gbdb/cavPor3/multiz8way/maf/multiz8way.maf
    rmdir /gbdb/cavPor3/multiz8way/maf

# ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far
# ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far
# ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far
# ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far
# ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far
# ### >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> got this far

##################
# Effort to move to hive
##################

ssh hgwdev
# Relocate the assembly directory from the archived store12 copy to its
# permanent hive location.
mv /hive/archive/store12/cavPor3 /hive/data/genomes/cavPor3
# Re-point /cluster/data/cavPor3 at the new location.  -n stops ln from
# following an existing /cluster/data/cavPor3 symlink; without it, a link
# that still resolves to a directory gets the new link created inside
# that directory instead of being replaced.
ln -sfn /hive/data/genomes/cavPor3 /cluster/data/cavPor3
# List any gbdb / download symlinks that still point into /cluster/store
# and therefore need to be re-made.
find /gbdb/cavPor3 /usr/local/apache/htdocs/goldenPath/cavPor3 -type l -ls | grep /cluster/store


################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ######################
################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ######################
################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ######################
################## EVERYTHING THAT FOLLOWS IS TEMPLATE FOR WHAT NEEDS TO BE DONE ######################


###########################################################################
## Annotate 8-way multiple alignment with gene annotations
##		(START - 2008-07-14 - Tim)
    # Gene frames
    ## given previous survey done for 9-way alignment on Marmoset
    ## and the appearance of new ensGene tables on everything
    #	use knownGene for hg18, mm9
    #	use ensGene for canFam2, ornAna1
    #	and refGene for cavPor3

    ssh hgwdev
    mkdir /cluster/data/cavPor3/bed/multiz8way/frames
    cd /cluster/data/cavPor3/bed/multiz8way/frames
    mkdir genes
    # Dump one genePred table from each listed database, reduce it with
    # genePredSingleCover and leave the result in genes/<db>.gp.gz.
    #   $1    = gene table to query
    #   $2... = databases to pull that table from
    # The compressed file is built in /scratch/tmp and then moved into
    # place.  DB is deliberately left as a plain (global) variable.
    # NOTE(review): the database lists used here differ from the plan in
    # the section header (rn4 and oryCun1 appear, canFam2 does not) --
    # confirm before running.
    mkGenePreds() {
        local geneTable=$1
        shift
        for DB in "$@"
        do
            hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ${geneTable}" ${DB} \
              | genePredSingleCover stdin stdout | gzip -2c \
                > /scratch/tmp/${DB}.tmp.gz
            mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
            echo "${DB} done"
        done
    }
    mkGenePreds knownGene hg18 mm9 rn4
    mkGenePreds ensGene oryCun1 ornAna1
    mkGenePreds refGene cavPor3

    ls -og genes
# -rw-rw-r--  1  861210 Mar 18 15:40 cavPor3.gp.gz
# -rw-rw-r--  1 1865308 Mar 18 15:40 canFam2.gp.gz
# -rw-rw-r--  1 2008806 Mar 18 15:39 hg18.gp.gz
# -rw-rw-r--  1 1965274 Mar 18 15:39 mm9.gp.gz
# -rw-rw-r--  1 1347532 Mar 18 15:40 ornAna1.gp.gz

    ssh kkstore06
    cd /cluster/data/cavPor3/bed/multiz5way/frames
    #	anything to annotate is in a pair, e.g.: cavPor3 genes/cavPor3.gp.gz
    time (cat  ../anno/cavPor3.anno.5way.maf | nice -n +19 genePredToMafFrames cavPor3 stdin stdout cavPor3 genes/cavPor3.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
    # see what it looks like in terms of number of annotations per DB:
    zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
#  77526 cavPor3
# 110768 ornAna1
# 221524 mm9
# 230010 hg18
# 243396 canFam2

    #	load the resulting file
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz5way/frames
    time nice -n +19 hgLoadMafFrames cavPor3 multiz5wayFrames \
	multiz5way.mafFrames.gz
    #	real    0m21.968s

    #	enable the trackDb entries:
# frames multiz5wayFrames
# irows on
#############################################################################
# phastCons 5-way (DONE - 2008-03-19 - Hiram)

    # split 5way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
    ssh memk
    mkdir /cluster/data/cavPor3/bed/multiz5way/msa.split
    cd /cluster/data/cavPor3/bed/multiz5way/msa.split
    mkdir -p /san/sanvol1/scratch/cavPor3/multiz5way/cons/ss

    # doSplit.csh: cluster job script, one run per annotated maf file.
    #   $1 = result directory (the maf file's directory in maf.list)
    #   $2 = sequence name (the maf file's root name)
    # It extracts the sequence from the 2bit as a temporary fasta, writes
    # a temporary copy of the maf with over-long dotted sequence names
    # truncated, and runs msa_split inside the san "ss" directory to
    # produce SS-format windows named $1/$2.  After popd it creates $1 in
    # the run directory and writes $1/$2.out there as the completion
    # marker checked by the template.
    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/cavPor3/bed/multiz5way/anno/maf
set WINDOWS = /san/sanvol1/scratch/cavPor3/multiz5way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /scratch/data/cavPor3/cavPor3.2bit /scratch/tmp/cavPor3.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set TMP = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \
    -M /scratch/tmp/cavPor3.$c.fa \
    -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/cavPor3.$c.fa
rm -f $TMP
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	create list of maf files:
    (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs:       1710s      28.50m     0.47h    0.02d  0.000 y
# IO & Wait Time:                  6951s     115.85m     1.93h    0.08d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:             128s       2.13m     0.04h    0.00d
# Submission to last job:          1048s      17.47m     0.29h    0.01d

    # take the cons and noncons trees from the mouse 30-way

    #	Estimates are not easy to make, probably more correctly,
    #	take the 30-way .mod file, and re-use it here.
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz5way
    cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    ssh memk
    mkdir -p /cluster/data/cavPor3/bed/multiz5way/cons/run.cons
    cd /cluster/data/cavPor3/bed/multiz5way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all gliers placentals
    #	Well, that's what it was when used in the Mm9 30-way,
    #	in this instance, there is only the directory "all"

    # doPhast.csh: cluster job script that runs phastCons on one SS window.
    # The here-document keeps the tcsh-style quoted terminator used in
    # this part of the file.
    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
# usage: doPhast.csh subDir ssRoot expectedLength targetCoverage rho
#   subDir  sub-directory of the san ss directory holding the window
#   ssRoot  window file name without the .ss suffix
# The group name (grp) is the last component of the current working
# directory and selects which .mod and optional .non-inf file are used.
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/cavPor3/bed/multiz5way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/cavPor3/multiz5way/cons
# stage the inputs on local scratch with a single copy per case
if (-s $cons/$grp/$grp.non-inf) then
  cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
  cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
# move the scores (.pp) and conserved elements (.bed) back to the san,
# replacing results from any earlier run, then drop the scratch directory
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    # Create parasol batch and run it
    pushd /san/sanvol1/scratch/cavPor3/multiz5way/cons
    find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
	> /cluster/data/cavPor3/bed/multiz5way/cons/ss.list
    popd

    # run for all species
    cd ..
    mkdir -p all run.cons/all
    cd all
    /cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \
--prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \
	| sed -e "s/bosTau3/cavPor3/" > all.mod
    cd ../run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/cavPor3/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 2569 of 2569 jobs
# CPU time in finished jobs:       8636s     143.93m     2.40h    0.10d  0.000 y
# IO & Wait Time:                 17371s     289.52m     4.83h    0.20d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest finished job:              44s       0.73m     0.01h    0.00d
# Submission to last job:          1008s      16.80m     0.28h    0.01d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all
    # Merge the per-window conserved-element files into one sorted BED and
    # convert the lod scores with lodToBedScore.  cavPor3 is a scaffold
    # assembly, so the window files are not named chr*; pick up every
    # .bed file under ./bed (the merged output is written outside ./bed).
    find ./bed -type f -name "*.bed" | xargs cat \
	| sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 3 minutes
    cp -p mostConserved.bed /cluster/data/cavPor3/bed/multiz5way/cons/all

    # load into database
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz5way/cons/all
    time nice -n +19 hgLoadBed cavPor3 phastConsElements5way mostConserved.bed
    #	Loaded 1005876 elements of size 5

    # Try for 5% overall cov, and 70% CDS cov
    #	We don't have any gene tracks to compare CDS coverage
    #	--rho .31 --expected-length 45 --target-coverage .3
    featureBits cavPor3 phastConsElements5way
    #	132010504 bases of 2731830700 (4.832%) in intersection

    # Create merged posterior probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order
    #  for wigEncode
    cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all
    mkdir -p phastCons5wayScores

# Concatenate the .pp score files of each pp/file_N directory, in
# position order, into phastCons5wayScores/file_N.data.gz.
# The starting directory is captured once, before the loop.
TOP=`pwd`
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
    F=${D/pp\/}
    out=${TOP}/phastCons5wayScores/${F}.data.gz
    echo "${D} > ${F}.data.gz"
    # never run the find below from the wrong directory: it would sweep up
    # the .pp files of every other directory into this output
    if ! cd "${D}"; then
        echo "cannot cd to ${D}, skipping" 1>&2
        continue
    fi
    find . -name "*.pp" -type f \
	| sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \
	| sort -t '.' -k1,1 -k3.3n \
	| sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \
	| gzip > "${out}"
    cd "${TOP}"
done

    #	copy those files to the downloads area:
# /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way/phastConsScores
    #	for hgdownload downloads

    # Create merged posterior probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    cd /san/sanvol1/scratch/cavPor3/multiz5way/cons/all
    ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
	| wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    time nice -n +19 cp -p *.wi? /cluster/data/cavPor3/bed/multiz5way/cons/all
    #	real    0m40.875s

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz5way/cons/all
    ln -s `pwd`/phastCons5way.wib /gbdb/cavPor3/multiz5way/phastCons5way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 \
	phastCons5way phastCons5way.wig
    #	real    1m5.667s
    # remove garbage
    rm wiggle.tab

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz5way/cons/all
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=cavPor3 phastCons5way > histogram.data 2>&1
    #	real    3m37.316s

    #	create plot of histogram:

    # Plot relative frequency (impulses, left axis) and cumulative relative
    # frequency (line, right axis) from histogram.data into histo.png.
    # The title names this assembly (guinea pig) rather than the cow
    # assembly the template was copied from.
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Guinea pig CavPor3 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    #	These trackDb entries turn on the wiggle phastCons data track:
    #	type wigMaf 0.0 1.0
    #	maxHeightPixels 100:40:11
    #	wiggle phastCons5way
    #	spanList 1
    #	autoScale Off
    #	windowingFunction mean
    #	pairwiseHeight 12
    #	yLineOnOff Off

#############################################################################
#  Downloads (DONE - 2008-01-11 - Hiram)
    #	Let's see if the downloads will work
    ssh hgwdev
    # change into the assembly directory explicitly; a bare directory name
    # only works in shells that have an implicit-cd feature enabled
    cd /cluster/data/cavPor3
    #	expecting to find repeat masker .out file here:
    ln -s bed/RepeatMasker/cavPor3.fa.out .
    time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
	-workhorse=hgwdev cavPor3 > jkStuff/downloads.log 2>&1
    #	real    24m3.210s
    #	failed making upstream sequences:
    #	featureBits cavPor3 mgcGenes:upstream:1000 -fa=stdout
    #	setpriority: Permission denied.
    #	the 'nice' from my bash shell causes trouble inside the csh
    #	script which uses nice.  Finish off the install step manually
    #	with the mgcGenes upstreams ...

#############################################################################
#  PushQ entries (DONE - 2008-01-11 - Hiram)
    ssh hgwdev
    # change into the assembly directory explicitly (jkStuff/ is relative
    # to it); a bare directory name is not a portable way to cd
    cd /cluster/data/cavPor3
    /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql
    #	output warnings:
# cavPor3 does not have seq
# cavPor3 does not have gbMiscDiff
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
#	genscanPep

#############################################################################
#  create download files (DONE - 2008-03-19 - Hiram)
    ssh hgwdev
    cd /cluster/data/cavPor3
    ln -s /cluster/data/cavPor3/bed/repeatMasker/cavPor3.fa.out .
    makeDownloads.pl cavPor3 > makeDownloads.log 2>&1

    #	*EDIT* the README files and ensure they are correct

#############################################################################
#  PushQ entries (DONE - 2008-03-19 - Hiram)
    ssh hgwdev
    # change into the assembly directory explicitly (jkStuff/ is relative
    # to it); a bare directory name is not a portable way to cd
    cd /cluster/data/cavPor3
    /cluster/bin/scripts/makePushQSql.pl cavPor3 > jkStuff/pushQ.sql
    #	output warnings:
# hgwdev does not have /usr/local/apache/htdocs/goldenPath/cavPor3/liftOver/cavPor3ToBosTau*
# cavPor3 does not have seq

# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
#	genscanPep

    #	looks like there should be a bosTau3 to cavPor3 liftOver run

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28)
    ssh kkstore06
    # bash  if not using bash shell already

    mkdir /cluster/data/cavPor3/blastDb
    cd /cluster/data/cavPor3
    grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst
    for i in `cat chr.lst`;
    do twoBitToFa  cavPor3.unmasked.2bit -seq=$i stdout;
    done > temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft

    grep chrUn chrom.sizes | awk '{print $1}' > chr.lst
    for i in `cat chr.lst`;
    do twoBitToFa  cavPor3.unmasked.2bit -seq=$i stdout;
    done > temp.fa
    faSplit sequence temp.fa 150 blastDb/y
    rm temp.fa chr.lst

    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
#   3440

    mkdir -p /san/sanvol1/scratch/cavPor3/blastDb
    cd /cluster/data/cavPor3/blastDb
    for i in nhr nin nsq;
    do
	echo $i
	cp *.$i /san/sanvol1/scratch/cavPor3/blastDb
    done

    mkdir -p /cluster/data/cavPor3/bed/tblastn.hg18KG
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/cavPor3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 3440 query.lst

   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)

# 36727/(350000/3440) = 360.973943

   mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa
   split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/cavPor3/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   # blastSome: cluster job script for one (blast database, query) pair.
   #   $1 = blast database passed to blastall -d
   #   $2 = protein fasta passed to blastall -i
   #   $3 = output psl file
   # blastall is retried with successively smaller e-values until one run
   # succeeds.  The result is converted with blastToPsl, lifted on the
   # target side with blastDb.lft and on the query side with protein.lft,
   # and moved to $3 only when pslCheck accepts it.  Intermediate files
   # live in /tmp under a name built from the basenames of $3 and $2.
   # NOTE(review): when pslCheck fails the script still exits 0 and leaves
   # $3.tmp and the /tmp intermediates behind; the job is then presumably
   # caught by the template's "check out exists" test -- confirm that this
   # is intended.
   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/cavPor3/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit

    ssh pk
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 350880 of 350880 jobs
# CPU time in finished jobs:   27082816s  451380.27m  7523.00h  313.46d  0.859 y
# IO & Wait Time:               2334990s   38916.50m   648.61h   27.03d  0.074 y
# Average job time:                  84s       1.40m     0.02h    0.00d
# Longest finished job:             578s       9.63m     0.16h    0.01d
# Submission to last job:         96125s    1602.08m    26.70h    1.11d


    ssh kkstore06
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    # chainOne <dir>: chain all q.*.psl files of one blastOut directory
    # into blastOut/c.<dir name>.psl with simpleChain.  "&&" makes the
    # job fail when the cd fails instead of chaining whatever q.*.psl
    # files happen to be in the current directory.
    cat << '_EOF_' > chainOne
(cd $1 && cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.

# Completed: 99 of 102 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:     113248s    1887.47m    31.46h    1.31d  0.004 y
# IO & Wait Time:                 86043s    1434.04m    23.90h    1.00d  0.003 y
# Average job time:                2013s      33.55m     0.56h    0.02d
# Longest finished job:            6139s     102.32m     1.71h    0.07d
# Submission to last job:         10416s     173.60m     2.89h    0.12d

# ran three crashed jobs on kolossus

    ssh kkstore06
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/cavPor3/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/tblastn.hg18KG
    hgLoadPsl cavPor3 blastHg18KG.psl

    # check coverage
    featureBits cavPor3 blastHg18KG
# 40254923 bases of 2731830700 (1.474%) in intersection

    featureBits cavPor3 refGene:cds blastHg18KG  -enrichment
# refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x

    ssh kkstore06
    rm -rf /cluster/data/cavPor3/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/cavPor3/bed/tblastn.hg18KG/blastOut
#end tblastn

##########################################################################
#  Create 5-way downloads (DONE - 2008-03-28 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way
    cd /cluster/data/cavPor3/bed/multiz5way/downloads/phastCons5way
    cp -p \
/san/sanvol1/scratch/cavPor3/multiz5way/cons/all/phastCons5wayScores/* .
    ln -s ../../cons/all/all.mod ./5way.mod
    cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
    # edit that README.txt to be correct for this 5-way alignment
    cd ..
    mkdir multiz5way
    cd multiz5way
    cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
    # edit that README.txt to be correct for this 5-way alignment
    ssh kkstore06
    cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way
    ln -s ../../cavPor3.5-way.nh 5way.nh

    time gzip -c ../../anno/cavPor3.anno.5way.maf > cavPor3.5way.maf.gz
    #	real    34m59.295s

    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way
    #	creating upstream files from refGene, bash script:
    # mkUpstream.sh: build upstream1000/2000/5000.maf.gz for the download
    # directory.  For each size, featureBits emits the refGene upstream
    # regions as BED, the perl filter replaces the "_up..." suffix of the
    # name with a tab and a 0 column, the list is sorted by sequence and
    # start, and mafFrags pulls the corresponding multiz5way alignments
    # for the species named in species.list.
    cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=cavPor3
GENE=refGene
NWAY=multiz5way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                stdin stdout \
                -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x ./mkUpstream.sh
    time nice -n +19 ./mkUpstream.sh
-rw-rw-r--  1    9883443 Mar 28 13:02 upstream1000.maf.gz
-rw-rw-r--  1   17938570 Mar 28 13:06 upstream2000.maf.gz
-rw-rw-r--  1   40384656 Mar 28 13:10 upstream5000.maf.gz
    #
    #	check the names in these upstream files to ensure sanity:
    zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
	| sort | uniq -c | sort -rn | less
    #	should be a list of the other 4 species with a high count,
    #	then refGene names, e.g.:
    #	8806 ornAna1
    #	8806 mm9
    #	8806 hg18
    #	8806 canFam2
    #      7 NM_001077006
    #      3 NM_001113231
    #      3 NM_001105381
    #      3 NM_001102527
    #      3 NM_001102322
    #	...

    ssh kkstore06
    cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way
    md5sum *.maf.gz > md5sum.txt
    cd ../phastCons5way
    md5sum *.data.gz *.mod > md5sum.txt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/multiz5way
    mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/phastCons5way
    cd /cluster/data/cavPor3/bed/multiz5way/downloads/multiz5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/cavPor3/multiz5way
    cd ../phastCons5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/cavPor3/phastCons5way
    #	if your ln -s `pwd`/* made extra links to files you don't want there,
    #	check the goldenPath locations and remove those extra links

#############################################################################
# BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate)

    ssh kkstore05
    cd /cluster/data/cavPor3/bed
    mkdir blastzSpeTri0.2008-05-16
    cd blastzSpeTri0.2008-05-16

    # Write the DEF parameter file that is handed to doBlastzChainNet.pl
    # below.  SEQ1 (target) is guinea pig cavPor3, SEQ2 (query) is ground
    # squirrel speTri0; the comments inside DEF are labelled to match the
    # SEQ1_*/SEQ2_* paths.
    cat << '_EOF_' > DEF
# Guinea pig vs. Ground squirrel

BLASTZ_M=50

# TARGET: Guinea pig cavPor3
SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ1_LEN=/cluster/data/cavPor3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Ground squirrel speTri0
SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit
SEQ2_LEN=/cluster/data/speTri0/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
      -chainMinScore=3000 -chainLinearGap=medium >& do.log &

    # got here
    ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastzSpeTri0

    # failures... have to rescue manually
    # make axt's.  Note -- too many scaffolds to do in a single pass
    # (exhausts mem) so we split
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain
    # NOTE(review): two netSplit invocations are recorded; presumably the
    # first (no -lump) is the attempt that did not work and the -lump=100
    # run is the one whose output was used -- confirm.
    netSplit noClass.net noClass
    ~kate/bin/x86_64/netSplit noClass.net noClass -lump=100
    # can't just split to chains (exhausts open files)
    chainSplit chain cavPor3.speTri0.all.chain.gz -lump=100
    cd noClass
    mkdir ../../axtNetChunks
    # netToAxt.csh: for every lump <p>.net in noClass/, combine it with the
    # matching ../chain/<p>.chain and write ../../axtNetChunks/<p>.axt
cat > netToAxt.csh << 'EOF'
    foreach i (*.net)
        set p = $i:r
        echo $i
	netToAxt $p.net ../chain/$p.chain /cluster/data/cavPor3/cavPor3.2bit /cluster/data/speTri0/speTri0.2bit ../../axtNetChunks/$i:r.axt
    end
'EOF'
    csh netToAxt.csh >&! netToAxt.log &

    # create unfiltered mafNet to get better squirrel coverage (2008-11-3 kate)
    cd ../../axtNetChunks
    # merge the per-lump axt files into one sorted axt
    cat *.axt | axtSort stdin stdout > ../axtChain/cavPor3.speTri0.net.axt
    mkdir ../mafNet
    axtToMaf -tPrefix=cavPor3. -qPrefix=speTri0. ../axtChain/cavPor3.speTri0.net.axt \
        /cluster/data/cavPor3/chrom.sizes /cluster/data/speTri0/chrom.sizes \
            ../mafNet/cavPor3.speTri0.net.maf
    # axtToMaf wrote a regular file, so compress that file in place;
    # this leaves ../mafNet/cavPor3.speTri0.net.maf.gz
    # (gzip has no "stdin" keyword -- it would look for a file of that name)
    gzip ../mafNet/cavPor3.speTri0.net.maf

    # low cov genome, so use reciprocal best for multiple alignment
    # need to generate liftover chains, which failed in automation
    #
    # over.csh: for every lump <p>.net in the current directory, run
    # netChainSubset against ../chain/<p>.chain, pass the result through
    # chainStitchId into <p>.net.over.chain, then concatenate all
    # *.over.chain files into cavPor3.speTri0.over.chain.gz.
    # NOTE(review): the loop globs *.net and reads ../chain/, so this
    # presumably has to be run from axtChain/noClass -- confirm.
cat > over.csh << 'EOF'
    foreach i (*.net)
        set p = $i:r
        echo $i
	netChainSubset -verbose=0 $p.net ../chain/$p.chain stdout | \
            chainStitchId stdin $i.over.chain
    end
    cat *.over.chain | gzip -c > cavPor3.speTri0.over.chain.gz
'EOF'
    csh over.csh >&! over.log &

    ssh hgwdev
    cd /cluster/data/cavPor3/bed/blastz.speTri0
    mkdir /usr/local/apache/htdocs/goldenPath/cavPor3/vsSpeTri0
    /cluster/bin/scripts/doRecipBest.pl cavPor3 speTri0 >&! rbest.log &

    # TODO:
    # load chains in database and check coverage
    # NOTE: exhausts mem on hgwdev -- need to try this on kolossus ?
    cd axtChain
    cat > loadChains.csh << 'EOF'
hgLoadChain -tIndex cavPor3 chainSpeTri0 cavPor3.speTri0.all.chain.gz
featureBits cavPor3 chainSpeTri0Link > fb.cavPor3.chainSpeTri0Link.txt
cat fb.cavPor3.chainSpeTri0Link.txt
'EOF'
    csh loadChains.csh >&! loadChains.log &
############################################################################
# SWAP SpeTri0  Blastz (2008-11-01 kate)

mkdir -p /cluster/data/speTri0/bed
cd /cluster/data/speTri0/bed
mkdir blastz.cavPor3.swap
cd blastz.cavPor3.swap

cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain/chain
mkdir ../chain.swap
cat > swap.csh << 'EOF'
foreach c (*.chain)
    echo $c
    chainSwap $c ../chain.swap/$c
end
'EOF'
csh swap.csh >&! swap.log &

doBlastzChainNet.pl -swap -chainMinScore=3000 -chainLinearGap=medium \
    /cluster/data/cavPor3/bed/blastz.speTri0/DEF >& swap.log  &

############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-04-03 markd)

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    cd /cluster/data/cavPor3/bed/nscan/
    wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/cavPor3.gtf
    wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/cavPor3.prot.fa
    wget -nv http://mblab.wustl.edu/predictions/Guinea_pig/cavPor3/readme.html
    bzip2 cavPor3.*
    chmod a-w *

    # load track
    gtfToGenePred -genePredExt cavPor3.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt cavPor3 nscanGene stdin
    hgPepPred cavPor3 generic nscanPep  cavPor3.prot.fa.bz2
    rm *.tab

    # update trackDb; need a cavPor3-specific page to describe informants
    guineaPig/cavPor3/nscanGene.html   (copy from readme.html)
    guineaPig/cavPor3/trackDb.ra
    # set search regex to
    termRegex scaffold_[0-9]+\.[0-9]+\.[0-9]+
############################################################################

# ### Found this in Kate's unchecked in cavPor3.txt.  I need it to continue the blastz to get the maf for multiz !!
# ### #############################################################################
# ### # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate)
# ###
# ###     ssh kkstore05
# ###     cd /cluster/data/cavPor3/bed
# ###     mkdir blastzSpeTri0.2008-05-16
# ###     cd blastzSpeTri0.2008-05-16
# ###
# ###     cat << '_EOF_' > DEF
# ### # Mouse vs. Ground squirrel
# ###
# ### BLASTZ_M=50
# ###
# ### # TARGET: Mouse MM9
# ### SEQ1_DIR=/scratch/data/cavPor3/cavPor3.2bit
# ### SEQ1_LEN=/cluster/data/cavPor3/chrom.sizes
# ### SEQ1_CHUNK=10000000
# ### SEQ1_LAP=10000
# ###
# ### # QUERY: Ground squirrel speTri0
# ### SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit
# ### SEQ2_LEN=/cluster/data/speTri0/chrom.sizes
# ### SEQ2_CHUNK=30000000
# ### SEQ2_LIMIT=500
# ### SEQ2_LAP=0
# ###
# ### BASE=/cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16
# ### TMPDIR=/scratch/tmp
# ### '_EOF_'
# ###     # << happy emacs
# ###
# ###     doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
# ###       -chainMinScore=3000 -chainLinearGap=medium >& do.log &
# ###
# ###     # got here
# ###     ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastzSpeTri0
ln -s blastzSpeTri0.2008-05-16 /cluster/data/cavPor3/bed/blastz.speTri0

    # ### 1 crashed job in axtChain:
    # ssh mkr0u7
zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz \
 | axtChain -psl -verbose=0  -minScore=3000 -linearGap=medium stdin \
     /scratch/data/cavPor3/cavPor3.2bit \
     /scratch/data/speTri0/speTri0.2bit \
     tmp.chain &
    # invalid unsigned number: "2"
    ### divide and conquer
     ll ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz
     # -rw-rw-r--  1 kate protein 25560954 May 18 08:12 ../../pslParts/cavPor3.2bit:scaffold_7:0-10010000.psl.gz
     # -rw-rw-r--  1 kate protein 28831900 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:10000000-20010000.psl.gz
     # -rw-rw-r--  1 kate protein 25466248 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:20000000-30010000.psl.gz
     # -rw-rw-r--  1 kate protein 26433148 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:30000000-40010000.psl.gz
     # -rw-rw-r--  1 kate protein 38048425 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:40000000-50010000.psl.gz
     # -rw-rw-r--  1 kate protein 24145256 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:50000000-60010000.psl.gz
     # -rw-rw-r--  1 kate protein  7480033 May 18 08:13 ../../pslParts/cavPor3.2bit:scaffold_7:60000000-61004451.psl.gz
zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:0-*.psl.gz \
 | axtChain -psl -verbose=0  -minScore=3000 -linearGap=medium stdin \
     /scratch/data/cavPor3/cavPor3.2bit \
     /scratch/data/speTri0/speTri0.2bit \
     tmp.chain &
     # Error reading 4 bytes: Success
zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:40*.psl.gz \
 | axtChain -psl -verbose=0  -minScore=3000 -linearGap=medium stdin \
     /scratch/data/cavPor3/cavPor3.2bit \
     /scratch/data/speTri0/speTri0.2bit \
     tmp.chain &
     # invalid unsigned number: "2"
     mv ../../pslParts/cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz ../../pslParts/cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz.bad
zcat ../../pslParts/cavPor3.2bit\:scaffold_7\:*.psl.gz \
 | axtChain -psl -verbose=0  -minScore=3000 -linearGap=medium stdin \
     /scratch/data/cavPor3/cavPor3.2bit \
     /scratch/data/speTri0/speTri0.2bit \
     tmp.chain &
     #Error reading 4 bytes: Success

     ssh pk; para shove -retries=6
    # 233 jobs in batch
    # 503 jobs (including everybody's) in Parasol queue.
    # Checking finished jobs
    # Completed: 233 of 233 jobs
    # CPU time in finished jobs:     116206s    1936.77m    32.28h    1.34d  0.004 y
    # IO & Wait Time:                  3377s      56.28m     0.94h    0.04d  0.000 y
    # Average job time:                 513s       8.55m     0.14h    0.01d
    # Longest running job:                0s       0.00m     0.00h    0.00d
    # Longest finished job:            8500s     141.67m     2.36h    0.10d
    # Submission to last job:        967976s   16132.93m   268.88h   11.20d
    # Estimated complete:                 0s       0.00m     0.00h    0.00d

    ### ############## Now need to restart the blastz:
    tail do.log
    # Command failed:
    # ssh -x kki nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/run/doChainRun.csh

    ssh kkstore05 # store12->kkstore05
    cd /cluster/data/cavPor3/bed/blastz.speTri0
    screen

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1
    # real    220m10.699s
    # tail do.log
    # writing /dev/null
    # memory usage 2419843072, utime 5416 s/100, stime 389
    # netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz stdout
    # chainStitchId stdin stdout
    # gzip -c
    # Killed
    #
    # gzip: stdout: Broken pipe
    # Command failed:
    # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/netChains.csh

    cp netChain.csh netChain.restart.csh
    # edit to restart in correct place
    ./netChain.restart.csh

    ### After some searching I was able to figure out that the strange char ^Y = octal "\031"
    # Now I can find the original problem file and try to rebuild it:
    grep -n -P "\031" cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl
    # 4253:62 40      0       0       1       1       1       6       +       scaffold_6393.1-308827  308827  149765  149868  scaffold_'      61004451   49604348        49604456        3       2,38,35,       149765,149794,149833,   49604348,49604383,49604421,

    # For comparison:
    tail -55 cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl | head -5
    # 91      37      0       0       1       8       3       16      +       scaffold_6393.1-308827  308827  244021  244157  scaffold_7      61004451   48951564        48951708        5       18,25,14,15,56, 244021,244039,244072,244086,244101,     48951564,48951583,48951608,48951629,48951652,
    # 62      32      0       0       0       0       0       0       +       scaffold_6393.1-08827  308827  54194   54288   scaffold_7      61004451   49114670        49114764        1       94,     54194,  49114670,
    # 62      40      0       0       1       1       1       6       +       scaffold_6393.1-308827  308827  149765  149868  scaffold_'      61004451   49604348        49604456        3       2,38,35,       149765,149794,149833,   49604348,49604383,49604421,
    # 127     63      0       0       1       18      2       5       +       scaffold_6393.1-308827  308827  159033  159241  scaffold_7      61004451   49613715        49613910        4       59,84,20,27,    159033,159092,159176,159214,    49613715,49613775,49613863,49613883,
    # 29      8       0       0       0       0       0       0       +       scaffold_6393.1-308827  308827  160121  160158  scaffold_7      61004451   49620825        49620862        1       37,     160121, 49620825,
    ### ### ### Yikes!  the prior line has a different strange char: ^S = octal "\023"

    ## Run on same machine:
    ssh kkr10u47
    /cluster/bin/scripts/blastz-run-ucsc -outFormat psl ../psl/cavPor3.2bit:scaffold_7:40000000-50010000 qParts/part007.lst ../DEF ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.new.psl
    ll ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007*
    # -rw-rw-r--  1 tdreszer protein 785511 Jun  3 13:11 ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.new.psl
    # -rw-rw-r--  1 kate     protein 785511 May 16 19:40 ../psl/cavPor3.2bit:scaffold_7:40000000-50010000/cavPor3.2bit:scaffold_7:40000000-50010000_part007.lst.psl
    grep -n -P "\031" ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl
    tail -55 ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl | head -5
    # 91      37      0       0       1       8       3       16      +       scaffold_6393.1-308827  308827  244021  244157  scaffold_7      61004451        48951564        48951708       518,25,14,15,56, 244021,244039,244072,244086,244101,     48951564,48951583,48951608,48951629,48951652,
    # 62      32      0       0       0       0       0       0       +       scaffold_6393.1-308827  308827  54194   54288   scaffold_7      61004451        49114670        49114764       194,     54194,  49114670,
    # 62      40      0       0       1       1       1       6       +       scaffold_6393.1-308827  308827  149765  149868  scaffold_7      61004451        49604348        49604456       329,38,35,       149765,149794,149833,   49604348,49604383,49604421,
    # 127     63      0       0       1       18      2       5       +       scaffold_6393.1-308827  308827  159033  159241  scaffold_7      61004451        49613715        49613910       459,84,20,27,    159033,159092,159176,159214,    49613715,49613775,49613863,49613883,
    # 29      8       0       0       0       0       0       0       +       scaffold_6393.1-308827  308827  160121  160158  scaffold_7      61004451        49620825        49620862       137,     160121, 49620825,
    mv ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl.old
    mv ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.new.psl ../psl/cavPor3.2bit\:scaffold_7\:40000000-50010000/cavPor3.2bit\:scaffold_7\:40000000-50010000_part007.lst.psl

    # Okay, we are NOW done with run.blastz so it is on to run.cat
    ssh kkstore05
    cd ../run.cat
    ./cat.csh cavPor3.2bit:scaffold_7:40000000-50010000 cavPor3.2bit:scaffold_7:40000000-50010000.psl.new.gz
    ll cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.*
    # -rw-rw-r--  1 tdreszer protein 38048414 Jun  3 13:28 cavPor3.2bit:scaffold_7:40000000-50010000.psl.new.gz
    # -rw-rw-r--  1 tdreszer protein 38048475 May 18 08:13 cavPor3.2bit:scaffold_7:40000000-50010000.psl.old.gz
    mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.new.gz cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz
    mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.old.gz cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz.old
    mv cavPor3.2bit\:scaffold_7\:40000000-50010000.psl.gz* ../pslParts/

    # Okay, we are now done with run.cat.  What can be done for axtChain?
    # run on same cluster machine
    ssh kkr4u00
    cd /cluster/data/cavPor3/bed/blastz.speTri0/axtChain/run
    ./chain.csh cavPor3.2bit:scaffold_7: chain/cavPor3.2bit:scaffold_7:.chain
    ll chain/cavPor3.2bit\:scaffold_7\:.chain
    # -rw-rw-r--  1 tdreszer protein 304948646 Jun  3 15:05 chain/cavPor3.2bit:scaffold_7:.chain

    # Okay, NOW we are ready to consider restarting doBlastzChainNet.pl
    # Though I assume that some of what was already done, will be in the way.
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1
    # postProcessChains: looks like this was run successfully already (cavPor3.speTri0.all.chain.gz exists).
    # Either run with -continue net or some later stage, or move aside/remove /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/cavPor3.speTri0.all.chain.gz and run again.
    mv axtChain/cavPor3.speTri0.all.chain.gz axtChain/cavPor3.speTri0.all.chain.gz.old
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium -continue chainMerge >> do.log 2>&1
    # real    54m1.762s
    tail do.log
    # HgStepManager: executing step 'net' Thu Jun  5 12:46:09 2008.
    # netChains: looks like we are not starting with a clean slate.  Please move aside or remove /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net and run again.
    mv /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/noClass.net.old

    screen
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium -continue net >> do.log 2>&1
    # real    204m20.325s

    tail do.log
    # writing /dev/null
    # memory usage 2427514880, utime 5453 s/100, stime 405
    # netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz stdout
    # chainStitchId stdin stdout
    # gzip -c
    #
    # gzip: stdout: Broken pipe
    # Killed
    # Command failed:
    # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain/netChains.csh

    ### Can't seem to get ahead!!!

    ## Doing each step in netChains.csh manually...

    cd /cluster/data/cavPor3/bed/blastzSpeTri0.2008-05-16/axtChain

    # Make nets ("noClass", i.e. without rmsk/class stats which are added later):
    # chainPreNet and chainNet each take the target (cavPor3) and query
    # (speTri0) chrom.sizes; chainNet's second output goes to /dev/null and
    # netSyntenic writes the final noClass.net.
    chainPreNet cavPor3.speTri0.all.chain.gz /cluster/data/cavPor3/chrom.sizes  /cluster/data/speTri0/chrom.sizes stdout \
     | chainNet stdin -minSpace=1 /cluster/data/cavPor3/chrom.sizes  /cluster/data/speTri0/chrom.sizes stdout /dev/null \
     | netSyntenic stdin noClass.net
    # Completed successfully

    # Make liftOver chains:
    netChainSubset -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz netChainSubset.out
    ll netChainSubset.out
    # -rw-rw-r--  1 tdreszer protein 73728 Jun  6 17:12 netChainSubset.out

    chainStitchId netChainSubset.out chainStitchId.out
    ll chainStitchId.out
    # -rw-rw-r--  1 tdreszer protein 0 Jun  9 09:13 chainStitchId.out
    # gzip -c chainStitchId.out > cavPor3.speTri0.over.chain.gz
    # Duh: gzip: stdout: Broken pipe

    # Okay, continuing with next step, ignoring the empty cavPor3.speTri0.over.chain.gz
    # Make axtNet for download: one .axt for all of cavPor3.
    mkdir ../axtNet
    netToAxt -verbose=0 noClass.net cavPor3.speTri0.all.chain.gz \
      /scratch/data/cavPor3/cavPor3.2bit /scratch/data/speTri0/speTri0.2bit stdout \
      | axtSort stdin stdout \
      | gzip -c > ../axtNet/cavPor3.speTri0.net.axt.gz

    ### Okay Kate rescued this by adjusting file sizes to handle memory

    ssh kkstore05
    cd /cluster/data/cavPor3/bed/blastz.speTri0
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium -continue net >> doAferChain.log 2>&1




    # NOTE: be sure to ls the data in above script on workhorse machine before starting script
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -syntenicNet > do.log 2>&1
    # <ctl-A><ctl-d>  ps -ef | grep blastzCavPor3
    # real    1764m55.155s
    tail do.log
    #  *** All done!
    # *** Make sure that goldenPath/mm9/vsCavPor3/README.txt is accurate.
    # *** Add {chain,net}CavPor3 tracks to trackDb.ra if necessary.
    # blastz: ranOk: 52984
    cat fb.mm9.chainCavPor3Link.txt
    #    757283793 bases of 2620346127 (28.900%) in intersection

    cd /cluster/data/mm9/bed/blastzCavPor3.2008-04-10
    ######### Change locations in DEF due to Hiram's new methods
    cat << \_EOF_ > DEF
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: GuineaPig cavPor3
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzCavPor3.2008-04-10
TMPDIR=/scratch/tmp
_EOF_

    mkdir /cluster/data/cavPor3/bed/blastz.mm9.swap
    cd /cluster/data/cavPor3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl \
    /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -syntenicNet > do.log 2>&1 &
    # real   166m53.671s
    # Exit 25   time nice -n +19 doBlastzChainNet.pl /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF -bigC
    # Can't open chain/scaffold_795.chain to append: Too many open files
    # gzip: stdout: Broken pipe
    # Command failed:
    # ssh -x kolossus nice /cluster/data/cavPor3/bed/blastz.mm9.swap/axtChain/netSynteny.csh

    #   broken down during netSynteny.csh due to too many open files on
    #   a chainSplit
    #   However, there is no need to split when we have scaffolds.
    #   Kate fixes doBlastzChainNet.pl and retry:

    cd /cluster/data/cavPor3/bed/blastz.mm9.swap
    time nice -n +19 ~kate/kent/src/hg/utils/automation/doBlastzChainNet.pl \
    /cluster/data/mm9/bed/blastzCavPor3.2008-04-10/DEF \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        -swap -continue=syntenicNet -syntenicNet > syn.log 2>&1 &
    # real    24m37.561s
    # *** All done!
    # *** Make sure that goldenPath/cavPor3/vsMm9/README.txt is accurate.
    # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary.
    cat fb.cavPor3.chainMm9Link.txt
    # 781173609 bases of 2663369733 (29.330%) in intersection

############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################
#  3way Rodent Multiz (special for Jurgens (Schmitz & co.) in Muenster
#  Previously, Robert Baertsch handled these.  I committed to these
#  many months ago...
# 2008-07-26 kate
# Redo with unfiltered net mafs to maximize squirrel sequence (2008-11-08 kate)

    ssh kkstore05
    cd /cluster/data/cavPor3/bed
    mkdir multiz3way
    cd multiz3way
    mkdir -p /san/sanvol1/scratch/cavPor3/multiz3way/
    cd /san/sanvol1/scratch/cavPor3/multiz3way

    # copy mafs to cluster-friendly disk
    # mm9 - high quality mammalian genome, so use syntenic net
    mafSplit /dev/null -byTarget mm9/ /cluster/data/cavPor3/bed/blastz.mm9/axtChain/cavPor3.mm9.synNet.maf.gz
    cd mm9
    # rename to reflect chrom name
    foreach f (*.maf)
        set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'`
        echo $c
        mv $f $c.maf
    end

    # speTri0 - low quality mammalian genome, so use reciprocal best net
    # mafSplit /dev/null -byTarget speTri0/ /cluster/data/cavPor3/bed/blastz.speTri0/mafRBestNet/cavPor3.speTri0.rbest.maf.gz
    # redo with full net, to get more sequence
    mafSplit /dev/null -byTarget speTri0/ /cluster/data/cavPor3/bed/blastz.speTri0/mafNet/cavPor3.speTri0.net.maf.gz
    cd speTri0
    # rename to reflect chrom name
    foreach f (*.maf)
        set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'`
        echo $c
        mv $f $c.maf
    end

    cd /san/sanvol1/scratch/cavPor3/multiz3way

    # get latest PSU utilities
    mkdir penn
    set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $p/{autoMZ,multiz,maf_project} penn

    # the autoMultiz cluster run
    ssh pk
    cd /cluster/data/cavPor3/bed/multiz3way

    # create species list and stripped down tree for autoMZ
    cat > tree.nh << 'EOF'
((cavPor3 mm9) speTri0)
'EOF'
    cat > species.lst << 'EOF'
cavPor3 mm9 speTri0
'EOF'
    mkdir run maf
    cd run

    # The heredoc below writes the per-scaffold cluster job script autoMultiz.
    # Usage:  autoMultiz <chrom> <outputMaf>
    #   $1 (c)   - sequence name; selects <species>/$c.maf under $pairs
    #   $2 (maf) - where the finished multiple alignment is copied
    # What the generated script does:
    #   - recreates a private work dir /scratch/tmp/$db/multiz.$c and copies
    #     ../tree.nh and ../species.lst into it
    #   - for every species in species.lst other than $db, stages the pairwise
    #     maf as $db.$s.sing.maf: from $c.maf.gz (zcat) if present, else from
    #     $c.maf (cp), else a header-only maf so a missing input is tolerated
    #   - puts $binDir first on the path and runs autoMZ with the tree,
    #     producing $c.maf in the work dir
    #   - copies $c.maf to $maf and removes the work dir
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = cavPor3
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz3way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz3way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /cluster/data/cavPor3/bed/multiz3way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    mkdir /cluster/data/cavPor3/bed/multiz3way/maf/
    awk '{print $1}' /cluster/data/cavPor3/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # ~3000 jobs
    para try
    para check
    # ~2 hours of run-time

    # remove empty alignment files and package up
    ssh kkstore05
    cd /cluster/data/cavPor3/bed/multiz3way/maf
    mkdir empty
# Write finish.csh: for every *.maf in the current directory, look for an
# 's cavPor3' line within the first 11 lines.  When grep exits with status 1
# (no such line) the file is reported as empty and moved into ./empty;
# otherwise it is reported as NOT empty and left in place.
# NOTE(review): no invocation of finish.csh in this directory is recorded
# here; the multiz3way.full run further down calls it as
# "csh ../../multiz3way/maf/finish.csh".
cat > finish.csh << 'EOF'
    foreach f (*.maf)
        head -11 $f | grep -q 's cavPor3'
        if ($status == 1) then
            echo "$f empty"
            mv $f empty
        else
            echo "   $f NOT empty"
        endif
    end
'EOF'
    # oops, forgot to mkdir, so 'empty' files were lost
    ls *.maf | wc -l
    # 1491
    tar cvfz rodent3way.tar.gz *.maf


    #########
    # Also do a multiz with non-filtered mouse mafs:
    # mm9 syntenic net mafs are sparse (cover only ~300 scaffolds), so check net
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/blastz.mm9/axtChain
    netFilter -minGap=10 cavPor3.mm9.syn.net.gz | hgLoadNet -verbose=0 cavPor3 synNetMm9 stdin
    # select distinct(tName) shows 302 rows

    # Second 3-way run, using the full (not syntenic-filtered) mm9 net maf:
    # split it by target sequence into mm9/ under a separate scratch dir.
    mkdir -p /san/sanvol1/scratch/cavPor3/multiz3way.full/
    cd /san/sanvol1/scratch/cavPor3/multiz3way.full
    # NOTE(review): the "blastz.mm9//" below looks like a dropped directory
    # component (the speTri0 split above reads from .../mafNet/) -- confirm
    # the actual location of cavPor3.mm9.net.maf.gz before re-running.
    mafSplit /dev/null -byTarget mm9/ /cluster/data/cavPor3/bed/blastz.mm9//cavPor3.mm9.net.maf.gz
    cd mm9
    # rename to reflect chrom name
    # (the name is field 2 of the cavPor3 line found in the first 3 lines of
    #  each split file, with the leading "cavPor3." removed)
    foreach f (*.maf)
        set c = `head -3 $f | grep cavPor3 | awk '{print $2}' | sed 's/cavPor3.//'`
        echo $c
        mv $f $c.maf
    end

    # copy and link various files from first run
    ln -s ../multiz3way/{speTri0,penn} .
    cp ../multiz3way/{species.lst,tree.nh} .

    mkdir run maf
    cp ../multiz3way/run/chrom.lst run
    sed 's/multiz3way/multiz3way.full/' ../multiz3way/run/autoMultiz > run/autoMultiz
    sed 's/multiz3way/multiz3way.full/' ../multiz3way/run/template > run/template
    # check it
    chmod +x run/autoMultiz

    mkdir -p /cluster/data/cavPor3/bed/multiz3way.full/maf

    ssh pk
    cd /san/sanvol1/scratch/cavPor3/multiz3way.full/run
    gensub2 chrom.lst single template jobList
    para create jobList
    para try
    para check
    # ~2 hours of run-time

    ssh kkstore05
    cd /cluster/data/cavPor3/bed/multiz3way.full/maf
    mkdir empty
    csh ../../multiz3way/maf/finish.csh >&! finish.log &

    # post to hgwdev downloads area
    ssh hgwdev
    cd /cluster/data/cavPor3/bed/multiz3way
    mv *.gz /usr/local/apache/htdocs/goldenPath/cavPor3/multizRodent3way/syntenic
    cd /cluster/data/cavPor3/bed/multiz3way.full
    mv *.gz /usr/local/apache/htdocs/goldenPath/cavPor3/multizRodent3way/full


################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
cavPor3.upstreamGeneTbl = xenoRefGene
cavPor3.upstreamMaf = multiz8way /hive/data/genomes/cavPor3/bed/multiz8way/species.list

############################################################################
#  cavPor3 - Guinea Pig - Ensembl Genes version 51  (DONE - 2008-12-02 - hiram)
    ssh hgwdev
    cd /hive/data/genomes/cavPor3
    cat << '_EOF_' > cavPor3.ensGene.ra
# required db variable
db cavPor3
# do we need to translate geneScaffold coordinates
# geneScaffolds yes
nameTranslation "s/^MT/chrM/;"
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 cavPor3.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/ensGene.51
    featureBits cavPor3 ensGene
    # 30699872 bases of 2663369733 (1.153%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/cavPor3/bed/ensGene.51

############################################################################

############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# lastz Rabbit oryCun2 swap (DONE - 2010-01-22 - Hiram)
    #	original alignment
    cd /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21
    cat fb.oryCun2.chainCavPor3Link.txt
    #	964546600 bases of 2604023284 (37.041%) in intersection

    #	and for the swap
    mkdir /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap
    cd /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21/DEF \
	-swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    186m1.649s
    cat fb.cavPor3.chainOryCun2Link.txt
    #	1003499831 bases of 2663369733 (37.678%) in intersection

############################################################################
# enable native RefSeq Genes track  (2010-06-15)

   # in etc/genbank.conf:
      cavPor3.refseq.mrna.native.load = yes

    ssh genbank
    cd /cluster/data/genbank
   ./bin/gbAlignStep -orgCat=native -initial -srcDb=refseq cavPor3
    ssh hgwdev
    cd /cluster/data/genbank
   ./bin/gbDbLoadStep -byPassGbLoaded -reload -srcDb=refseq cavPor3

############################################################################
# liftOver for hg19 (DONE - 2011-07-14 - Hiram)
    cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
    ln -s `pwd`/cavPor3.hg19.over.chain.gz \
	/gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz
    hgAddLiftOverChain -minMatch=0.1 -multiple \
	-path=/gbdb/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz \
	cavPor3 hg19
     ln -s `pwd`/cavPor3.hg19.over.chain.gz \
	/usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/liftOver/cavPor3ToHg19.over.chain.gz
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/liftOver
    md5sum cavPor3ToHg19.over.chain.gz >> md5sum.txt

############################################################################
# SWAP lastz mm10 (DONE - 2012-03-19 - Hiram)
    # original alignment to mm10
    cat /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/fb.mm10.chainCavPor3Link.txt
    #	754642254 bases of 2652783500 (28.447%) in intersection

    # and this swap
    mkdir /hive/data/genomes/cavPor3/bed/blastz.mm10.swap
    cd /hive/data/genomes/cavPor3/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    80m23.870s
    cat fb.cavPor3.chainMm10Link.txt
    #	775452752 bases of 2663369733 (29.115%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/cavPor3/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-16 - Hiram)
    mkdir /hive/data/genomes/cavPor3/bed/ucscToINSDC
    cd /hive/data/genomes/cavPor3/bed/ucscToINSDC

XXX - this one has problems, needs to be fixed

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# LASTZ Guinea pig/cavPor3 - Malayan flying lemur/galVar1
#      (DONE - 2016-04-15 - Hiram)
    mkdir /hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15
    cd /hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15

    printf '# Malayan flying lemur vs Guinea pig
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz

# TARGET: Guinea pig cavPor3
SEQ1_DIR=/hive/data/genomes/cavPor3/cavPor3.wmTrf.2bit
SEQ1_LEN=/hive/data/genomes/cavPor3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30


# QUERY: Malayan flying lemur galVar1
SEQ2_DIR=/hive/data/genomes/galVar1/galVar1.2bit
SEQ2_LEN=/hive/data/genomes/galVar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=800
SEQ2_LAP=0

BASE=/hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15
TMPDIR=/dev/shm
' > DEF
    # << happy emacs

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -syntenicNet -fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1
    #     (elapsed time was not recorded for this run)

    cat fb.cavPor3.chainGalVar1Link.txt
    # 1300265220 bases of 2663369733 (48.820%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` cavPor3 galVar1) > rbest.log 2>&1 &
    # real    161m43.865s

    # and for the swap:
    mkdir /hive/data/genomes/galVar1/bed/blastz.cavPor3.swap
    cd /hive/data/genomes/galVar1/bed/blastz.cavPor3.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/cavPor3/bed/lastzGalVar1.2016-04-15/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    355m25.890s

    cat fb.galVar1.chainCavPor3Link.txt
    # 1357961680 bases of 2802917674 (48.448%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` galVar1 cavPor3) > rbest.log 2>&1
    # real    231m8.085s

##############################################################################
# LASTZ Guinea pig/cavPor3 - Tree shrew/tupBel1
#      (DONE - 2016-04-15 - Hiram)
    mkdir /hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15
    cd /hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15

    printf '# Tree shrew vs Guinea pig
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz

# TARGET: Guinea pig cavPor3
SEQ1_DIR=/hive/data/genomes/cavPor3/cavPor3.wmTrf.2bit
SEQ1_LEN=/hive/data/genomes/cavPor3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Tree shrew tupBel1
SEQ2_DIR=/hive/data/genomes/tupBel1/tupBel1.2bit
SEQ2_LEN=/hive/data/genomes/tupBel1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=800
SEQ2_LAP=0

BASE=/hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15
TMPDIR=/dev/shm
' > DEF
    # << happy emacs

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -syntenicNet -fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1
    # real    597m13.526s

    cat fb.cavPor3.chainTupBel1Link.txt
    # 745589338 bases of 2663369733 (27.994%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` cavPor3 tupBel1) > rbest.log 2>&1 &
    # real    187m21.483s

    # and for the swap:
    mkdir /hive/data/genomes/tupBel1/bed/blastz.cavPor3.swap
    cd /hive/data/genomes/tupBel1/bed/blastz.cavPor3.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/cavPor3/bed/lastzTupBel1.2016-04-15/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    256m16.390s

    cat fb.tupBel1.chainCavPor3Link.txt
    # 747820632 bases of 2137225476 (34.990%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` tupBel1 cavPor3) > rbest.log 2>&1
    # real    262m6.256s

#############################################################################
## 5-Way Multiz (DONE - 2016-05-03 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way
    cd /hive/data/genomes/cavPor3/bed/multiz5way

    # from the 191-way in the source tree, select out the 5 used here:
    /cluster/bin/phast/tree_doctor \
        --prune-all-but hg38,cavPor3,galVar1,mm10,tupBel1 \
        /cluster/home/hiram/kent/src/hg/utils/phyloTrees/191way.nh \
          > cavPor3.5way.nh.0
    cat cavPor3.5way.nh.0
    # ((hg38:0.143908,tupBel1:0.191140):0.002000,
    #    (mm10:0.315424,cavPor3:0.175779):0.041059);

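    # using TreeGraph2 tree editor on the Mac, rearrange to get cavPor3
    # at the top, saving the result as cavPor3.5way.nh.  Note: galVar1 is
    # absent from the pruned tree above; in the edited tree below it appears
    # as a sister to tupBel1 (0.136203 + 0.054937 = the original 0.191140
    # tupBel1 branch), so it was presumably added by hand in the same edit: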

    #	what that looks like:
 ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.5way.nh | sed -e 's/^/# /;'
# ((cavPor3:0.175779,
#  mm10:0.315424):0.041059,
# (hg38:0.143908,
# (tupBel1:0.136203,
# galVar1:0.08):0.054937):0.0020);

    # extract species list from that .nh file
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        cavPor3.5way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
        | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt

    # construct db to name translation list:
    cat species.list.txt | while read DB
do
hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \
        | sed -e 's/-nosed/_nosed/; s/-eating/_eating/;' > db.to.name.txt

    # construct a common name .nh file:
    /cluster/bin/phast/tree_doctor --rename \
    "`cat db.to.name.txt`" cavPor3.5way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.5way.commonNames.nh
    cat cavPor3.5way.commonNames.nh | sed -e 's/^/# /;'
# ((Guinea_pig:0.175779,
#  Mouse:0.315424):0.041059,
# (Human:0.143908,
# (Tree_shrew:0.136203,
# Malayan_flying_lemur:0.08):0.054937):0.002);

#	Use this specification in the phyloGif tool:
#	http://genome.ucsc.edu/cgi-bin/phyloGif
#	to obtain a png image for src/hg/htdocs/images/phylo/cavPor3_5way.png

    ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.5way.nh > t.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > cavPor3.5way.scientificNames.nh
    rm -f t.nh
    cat cavPor3.5way.scientificNames.nh | sed -e 's/^/# /;'
# ((Cavia_porcellus:0.175779,
#  Mus_musculus:0.315424):0.041059,
# (Homo_sapiens:0.143908,
# (Tupaia_belangeri:0.136203,
# Galeopterus_variegatus:0.08):0.054937):0.002);

    /cluster/bin/phast/all_dists cavPor3.5way.nh | grep cavPor3 \
        | sed -e "s/cavPor3.//" | sort -k2n > 5way.distances.txt
    #	Use this output to create the table below
    cat 5way.distances.txt | sed -e 's/^/# /;'
# galVar1       0.353775
# hg38  	0.362746
# tupBel1       0.409978
# mm10  	0.491203

    # Write sizeStats.pl, then run it to print the distance/coverage table
    # shown below.  The perl source is the printf format string, so inside it
    # a literal percent sign is written as %%, a backslash as \\, and a
    # single quote as '"'"' (close quote, double-quoted quote, reopen quote).
    # For each "db distance" line of 5way.distances.txt the script:
    #   - reads field 5 of bed/lastz.<db>/fb.cavPor3.chain<Db>Link.txt
    #     (parentheses and the percent sign stripped; 0.0 if nothing is read)
    #   - does the same for the swap file under the other genome's
    #     bed/lastz.cavPor3 when that file is non-empty, else uses "N/A"
    #   - looks up the organism name in hgcentraltest.dbDb ("N/A" if empty)
    #   - prints one numbered, commented table row
    printf '#!/usr/bin/env perl

use strict;
use warnings;

open (FH, "<5way.distances.txt") or
        die "can not read 5way.distances.txt";

my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($D, $dist) = split('"'"'\\s+'"'"', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/cavPor3/bed/lastz.$D/fb.cavPor3." .
        $chain . "Link.txt";
    my $chainLinkMeasure =
        `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\\%%//;
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.cavPor3/fb.${D}.chainCavPor3Link.txt";
    my $swapMeasure = "N/A";
    if ( -s $swapFile ) {
	$swapMeasure =
	    `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
	chomp $swapMeasure;
	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
	$swapMeasure =~ s/\\%%//;
    }
    my $orgName=
    `hgsql -N -e '"'"'select organism from dbDb where name="$D";'"'"' hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    printf "# %%02d  %%.4f (%%%% %%06.3f) (%%%% %%06.3f) - %%s %%s\\n", $count, $dist,
        $chainLinkMeasure, $swapMeasure, $orgName, $D;
}
close (FH);
' > sizeStats.pl
    chmod +x ./sizeStats.pl
    ./sizeStats.pl

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure

#       featureBits chainLink measures
#               chainLink
#  N distance  on cavPor3  on other     other species
# 01  0.3538 (% 48.820) (% 48.448) - Malayan flying lemur galVar1
# 02  0.3627 (% 48.000) (% 42.371) - Human hg38
# 03  0.4100 (% 27.994) (% 34.990) - Tree shrew tupBel1
# 04  0.4912 (% 29.115) (% 28.447) - Mouse mm10

# None of this concern for distances matters in building the first step, the
# maf files.  The distances will be better calibrated later.

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	cavPor3.5way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh

    sed 's/[()]//g; s/,/ /g' tree.nh > species.list
    # cavPor3 mm10 hg38 tupBel1 galVar1

    # survey N50 for each
    for db in `cat species.list`
do
n50.pl /hive/data/genomes/$db/chrom.sizes
done
#       reading: /hive/data/genomes/cavPor3/chrom.sizes
#       contig count: 3144, total size: 2723219641, one half size: 1361609820
# cumulative    N50 count       contig  contig size
1356838683      27      scaffold_25     28222655
1361609820 one half size
1384780737      28      scaffold_27     27942054
#       reading: /hive/data/genomes/mm10/chrom.sizes
#       contig count: 66, total size: 2730871774, one half size: 1365435887
# cumulative    N50 count       contig  contig size
1312176979      8       chr7    145441459
1365435887 one half size
1442871972      9       chr10   130694993
#       reading: /hive/data/genomes/hg38/chrom.sizes
#       contig count: 455, total size: 3209286105, one half size: 1604643052
# cumulative    N50 count       contig  contig size
1547391171      8       chrX    156040895
1604643052 one half size
1692529807      9       chr8    145138636
#       reading: /hive/data/genomes/tupBel1/chrom.sizes
#       contig count: 150851, total size: 3660774957, one half size:
#       1830387478
# cumulative    N50 count       contig  contig size
1830345737      7910    scaffold_129972.1-127906        127906
1830387478 one half size
1830473625      7911    scaffold_147844.1-127888        127888
#       reading: /hive/data/genomes/galVar1/chrom.sizes
#       contig count: 179514, total size: 3187660572, one half size:
#       1593830286
# cumulative    N50 count       contig  contig size
1593691350      3422    NW_007730159v1  245222
1593830286 one half size
1593936539      3423    NW_007729331v1  245189

    #	bash shell syntax here ...
    cd /hive/data/genomes/cavPor3/bed/multiz5way
    export H=/hive/data/genomes/cavPor3/bed
    mkdir mafLinks
    # good assemblies can use syntenic net:
    #  hg38 mm10 cavPor3
    for G in hg38 mm10
    do
      mkdir mafLinks/$G
      echo ln -s ${H}/lastz.$G/axtChain/cavPor3.${G}.synNet.maf.gz ./mafLinks/$G
      ln -s ${H}/lastz.$G/axtChain/cavPor3.${G}.synNet.maf.gz ./mafLinks/$G
    done

    # other assemblies using recip best net:
    #  tupBel1 galVar1
    for G in tupBel1 galVar1
    do
      mkdir mafLinks/$G
      echo ln -s ${H}/lastz.$G/mafRBestNet/cavPor3.${G}.rbest.maf.gz ./mafLinks/$G
      ln -s ${H}/lastz.$G/mafRBestNet/cavPor3.${G}.rbest.maf.gz ./mafLinks/$G
    done

    # verify the symLinks are good:
    ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;'
#  549742600 Mar 19  2012 mafLinks/mm10/cavPor3.mm10.synNet.maf.gz
#  930458998 Apr 29  2015 mafLinks/hg38/cavPor3.hg38.synNet.maf.gz
#  914003647 Apr 18 11:56 mafLinks/galVar1/cavPor3.galVar1.rbest.maf.gz
#  535098561 Apr 18 12:25 mafLinks/tupBel1/cavPor3.tupBel1.rbest.maf.gz

    # split the maf files into a set of hashed named files
    # this hash named split keeps the same chr/contig names in the same
    # named hash file.
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/mafSplit
    cd /hive/data/genomes/cavPor3/bed/multiz5way/mafSplit
    time for D in `sed -e "s/cavPor3 //" ../species.list`
do
    echo "${D}"
    mkdir $D
    cd $D
    echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz"
    mafSplit -byTarget -useHashedName=8 /dev/null . \
	../../mafLinks/${D}/*.maf.gz
    cd ..
done
    # real    2m20.137s

    # construct a list of all possible maf file names.
    # they do not all exist in each of the species directories
    find . -type f | wc -l
    # 798
    find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list
    wc -l maf.list
    # 246 maf.list

    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/splitRun
    cd /hive/data/genomes/cavPor3/bed/multiz5way/splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    #	verify the db and pairs settings are correct
    # Write the per-file cluster job script, autoMultiz.csh (the csh source
    # is the single-quoted printf format, so nothing is expanded here).
    #   $1 = name of one hashed split maf file, used as $c
    #   $2 = path the finished multiple-alignment maf is copied to
    # In a private /dev/shm temp dir, for each species in species.list other
    # than cavPor3, the script stages one pairwise input: zcat of
    # $pairs/$s/$c.gz if that exists (replaced by a maf header line when the
    # result is empty), else a symlink to $pairs/$s/$c, else a header-only
    # file.  It then runs autoMZ from ./penn (also prepended to path) with
    # stdout discarded, replaces $2 with the new result, and removes the
    # temp dir.
    printf '#!/bin/csh -ef
set db = cavPor3
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/cavPor3/bed/multiz5way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
' > autoMultiz.csh

    chmod +x autoMultiz.csh

    printf '#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/cavPor3/bed/multiz5way/splitRun/maf/$(root1).maf}
#ENDLOOP
' > template

    ln -s ../../mafSplit/maf.list maf.list
    ssh ku
    cd /hive/data/genomes/cavPor3/bed/multiz5way/splitRun/run
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
# Completed: 246 of 246 jobs
# CPU time in finished jobs:      63385s    1056.41m    17.61h    0.73d  0.002 y
# IO & Wait Time:                   674s      11.24m     0.19h    0.01d  0.000 y
# Average job time:                 260s       4.34m     0.07h    0.00d
# Longest finished job:            2448s      40.80m     0.68h    0.03d
# Submission to last job:          2479s      41.32m     0.69h    0.03d

    # combine into one file  (the 1>&2 redirect sends the echo to stderr)
    cd /hive/data/genomes/cavPor3/bed/multiz5way
    head -1 splitRun/maf/020.maf > multiz5way.maf
    time for F in splitRun/maf/*.maf
do
    echo "${F}" 1>&2
    egrep -v "^#" ${F}
done >> multiz5way.maf
    # real    0m37.210s


    tail -1 splitRun/maf/020.maf >> multiz5way.maf
# -rw-rw-r-- 1 8260142703 May  3 15:40 multiz5way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz5way
    mkdir /gbdb/cavPor3/multiz5way
    ln -s `pwd`/multiz5way.maf /gbdb/cavPor3/multiz5way
    cd /dev/shm
    time hgLoadMaf cavPor3 multiz5way
# Loaded 8610698 mafs in 1 files from /gbdb/cavPor3/multiz5way
#  real    2m37.904s

    time hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 cavPor3 multiz5waySummary \
	/gbdb/cavPor3/multiz5way/multiz5way.maf
# Created 844437 summary blocks from 19057881 components and 8610698 mafs from /gbdb/cavPor3/multiz5way/multiz5way.maf
# real    3m1.052s

# -rw-rw-r-- 1 479126012 May  3 15:42 multiz5way.tab
# -rw-rw-r-- 1  42429421 May  3 15:47 multiz5waySummary.tab

    wc -l multiz5way*.tab
#   8610698 multiz5way.tab
#    844437 multiz5waySummary.tab

    rm multiz5way*.tab

##############################################################################
# GAP ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2016-05-03 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.  Need to split up the maf file into individual
    #   maf files
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/anno/mafSplit
    cd /hive/data/genomes/cavPor3/bed/multiz5way/anno/mafSplit

    time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \
        /dev/null . ../../multiz5way.maf
    #   real    2m41.134s

    find . -type f | wc -l
    #   1082

    # check for N.bed files everywhere:
    cd /hive/data/genomes/cavPor3/bed/multiz5way/anno
    for DB in `cat ../species.list`
do
    if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
        echo "MISS: ${DB}"
#         cd /hive/data/genomes/${DB}
#         twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
    else
        echo "  OK: ${DB}"
    fi
done

    cd /hive/data/genomes/cavPor3/bed/multiz5way/anno
    for DB in `cat ../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done
    # make sure they all are successful symLinks:
    ls -ogrtL

    screen -S gapAnno      # use a screen to control this longish job
    ssh ku
    cd /hive/data/genomes/cavPor3/bed/multiz5way/anno
    mkdir result
    find ./mafSplit -type d | sed -e 's#./mafSplit/##' | while read D
do
    echo mkdir -p result/${D}
    mkdir -p result/${D}
done
    printf '#LOOP
mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/cavPor3/cavPor3.2bit {check out exists+ result/$(path1)}
#ENDLOOP
' > template

    find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list
    gensub2 maf.list single template jobList
    # limit jobs on a node with the ram=32g requirement because they go fast
    para -ram=32g create jobList
    para try ... check ... push ...
# Completed: 1082 of 1082 jobs
# CPU time in finished jobs:       1795s      29.92m     0.50h    0.02d  0.000 y
# IO & Wait Time:                  2812s      46.87m     0.78h    0.03d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              42s       0.70m     0.01h    0.00d
# Submission to last job:           124s       2.07m     0.03h    0.00d

    # verify all result files have some content, look for 0 size files:
    find ./result -type f -size 0
    # should see none
    # or in this manner:
    find ./result -type f | xargs ls -og | sort -k3nr | tail

    # combine into one file  (the 1>&2 redirect sends the echo to stderr)
    head -q -n 1 result/1/1/scaffold_326.maf > cavPor3.5way.maf
    time find ./result -type f | while read F
do
    echo "${F}" 1>&2
    grep -h -v "^#" ${F}
done >> cavPor3.5way.maf
    # real    29m25.870s

    #	these maf files do not have the end marker, this does nothing:
    #	tail -q -n 1 result/1/1/scaffold_326.maf >> cavPor3.5way.maf
    # How about an official end marker:
    echo "##eof maf" >> cavPor3.5way.maf
    ls -og
# -rw-rw-r--  1 11434713982 May  4 10:01 cavPor3.5way.maf

    du -hsc cavPor3.5way.maf
    # 11G     cavPor3.5way.maf

    # construct symlinks to get the individual maf files into gbdb:
    rm /gbdb/cavPor3/multiz5way/multiz5way.maf   # remove previous results
    ln -s `pwd`/cavPor3.5way.maf /gbdb/cavPor3/multiz5way/multiz5way.maf

    # Load into database
    cd /dev/shm
    time hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 multiz5way
# Loaded 9842095 mafs in 1 files from /gbdb/cavPor3/multiz5way
# real    3m21.378s

    time hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 cavPor3 multiz5waySummary \
        /gbdb/cavPor3/multiz5way/multiz5way.maf
# Created 844437 summary blocks from 19057881 components and 9842095 mafs from /gbdb/cavPor3/multiz5way/multiz5way.maf
# real    3m41.604s

    # -rw-rw-r-- 1 549110793 May  4 10:05 multiz5way.tab
    # -rw-rw-r-- 1  44118295 May  4 10:11 multiz5waySummary.tab

    rm multiz5way*.tab

######################################################################
# MULTIZ5WAY MAF FRAMES (DONE - 2016-05-05 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/frames
    cd /hive/data/genomes/cavPor3/bed/multiz5way/frames
#   survey all the genomes to find out what kinds of gene tracks they have
    # Write and run showGenes.csh (the csh source is the single-quoted printf
    # format; '"'"' embeds a literal single quote for the SQL strings).
    # For each database named in ../species.list it prints one "# db: " line
    # with:
    #   - the row count of each of ensGene, refGene, mgcGenes, knownGene,
    #     ncbiRefSeq and xenoRefGene that shows up in "show tables"
    #   - "Mrnas: N", the number of gbCdnaInfo rows whose organism id matches
    #     the database's scientificName from hgcentraltest.dbDb, or
    #     "Mrnas: 0" when no organism id is found
    printf '#!/bin/csh -fe
foreach db (`cat ../species.list`)
    printf "# ${db}: "
    set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
           $table == "mgcGenes" || $table == "knownGene" || \
           $table == "ncbiRefSeq" || $table == "xenoRefGene" ) then
           set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
            "select scientificName from dbDb where name='"'"'$db'"'"'"`
    set orgId = `hgsql $db -N -e \
            "select id from organism where name='"'"'$orgName'"'"'"`
    if ($orgId == "") then
        echo "Mrnas: 0"
    else
        set count = `hgsql $db -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
        echo "Mrnas: ${count}"
    endif
end
' > showGenes.csh
    # << happy emacs
    chmod +x ./showGenes.csh
    time ./showGenes.csh
# cavPor3: ensGene: 26129, refGene: 488, xenoRefGene: 296142, Mrnas: 21235
# mm10: ensGene: 103734, knownGene: 63759, mgcGenes: 26768, ncbiRefSeq:
#     116846, refGene: 35838, xenoRefGene: 172783, Mrnas: 5250157
# hg38: ensGene: 208239, knownGene: 197782, mgcGenes: 35390, ncbiRefSeq:
#     161913, refGene: 63332, xenoRefGene: 181602, Mrnas: 11351569
# tupBel1: ensGene: 34727, xenoRefGene: 688682, Mrnas: 2509
# galVar1: ncbiRefSeq: 41547, xenoRefGene: 473670, Mrnas: 0

# real    2m47.018s

    # from that summary, use these gene sets:
    # knownGene - hg38 mm10
    # ensGene - tupBel1 cavPor3
    # ncbiRefSeq - galVar1

    mkdir genes
    #   1. knownGene: hg38 mm10
    for DB in hg38 mm10
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > genes/${DB}.gp.gz
    printf "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
    # checked: 21375 failed: 0
    # checked: 21100 failed: 0

    #   2. ensGene: tupBel1 cavPor3
    for DB in tupBel1 cavPor3
do
hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    printf "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# tupBel1: checked: 29256 failed: 0
# cavPor3: checked: 18631 failed: 0

    #   3. ncbiRefSeq for galVar1
    for DB in galVar1
do
hgsql -N -e "select * from ncbiRefSeq" ${DB} | cut -f2- \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    printf "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# galVar1: checked: 23389 failed: 0

    # verify counts for genes are reasonable:
    # one summary line per single-cover genePred: the number of distinct
    # names found in column 1 of the file
    for gpFile in genes/*.gz; do
        printf "# $gpFile: "
        zcat "$gpFile" | cut -f1 | sort | uniq | wc -l
    done
# genes/cavPor3.gp.gz: 18631
# genes/galVar1.gp.gz: 23054
# genes/hg38.gp.gz: 21375
# genes/mm10.gp.gz: 21100
# genes/tupBel1.gp.gz: 15407

    time (cat ../anno/cavPor3.5way.maf \
	| genePredToMafFrames cavPor3 stdin stdout \
          `cat ../species.list.txt | xargs echo \
            |sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \
		| gzip > multiz5wayFrames.bed.gz)
    # real    2m38.685s

    # verify there are frames on everything, should be 5 species:
    zcat multiz5wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
      | sed -e 's/^/# /;'
#  185620 cavPor3
#  246289 galVar1
#  245505 hg38
#  234762 mm10
#  201249 tupBel1

    #   load the resulting file
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz5way/frames
    time hgLoadMafFrames cavPor3 multiz5wayFrames multiz5wayFrames.bed.gz
    #   real    0m12.730s

    time featureBits -countGaps cavPor3 multiz5wayFrames
    # 36929844 bases of 2723219641 (1.356%) in intersection
    # real    0m10.136s

    #   enable the trackDb entries:
# frames multiz5wayFrames
# irows on
    #   appears to work OK

#########################################################################
# Phylogenetic tree from 5-way (DONE - 2016-05-06 - Hiram)
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/4d
    cd /hive/data/genomes/cavPor3/bed/multiz5way/4d

    # using the ensGene
    hgsql -N -e "select * from ensGene;" cavPor3 \
      | cut -f2- | genePredSingleCover stdin stdout > /dev/shm/cavPor3.tmp.gp
    mv /dev/shm/cavPor3.tmp.gp cavPor3.ensGene.gp
    genePredCheck -db=cavPor3 cavPor3.ensGene.gp
    # checked: 18631 failed: 0

    genePredSingleCover cavPor3.ensGene.gp stdout \
       | sort > cavPor3.ensGeneNR.gp
    genePredCheck -db=cavPor3 cavPor3.ensGeneNR.gp
    # checked: 18631 failed: 0

    # the annotated maf is (long listing without owner and group columns):
    ls -og ../anno/cavPor3.5way.maf
# -rw-rw-r-- 1 11434713982 May  4 10:01 ../anno/cavPor3.5way.maf

    mkdir annoSplit
    cd annoSplit
    time mafSplit -verbose=2 -outDirDepth=2 -byTarget -useFullSequenceName \
	/dev/null . ../../anno/cavPor3.5way.maf
    # real    3m39.018s


    find . -type f | wc -l
    #   1082
    ssh ku
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/4d/run
    cd /hive/data/genomes/cavPor3/bed/multiz5way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fex
# usage: 4d.csh <chrom> <maf> <outDir> <outFile>
#   <chrom>   - target sequence name, matched against column 2 of the gene file
#   <maf>     - input maf, path relative to $r/4d
#   <outDir>  - output directory, path relative to $r/4d
#   <outFile> - result file, path relative to $r/4d/run
# Runs msa_view --4d on one split maf using the genes on that sequence,
# then writes the tuple-size 1 result to <outFile>.
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set GP = cavPor3.ensGeneNR.gp

# must be the same multiz5way tree the annoSplit/ and mfa/ directories live in
set r = "/hive/data/genomes/cavPor3/bed/multiz5way"
set c = $1
set infile = $r/4d/$2
set outDir = $r/4d/$3
set outfile = $r/4d/run/$4
/bin/mkdir -p $outDir
cd /dev/shm
# select the genes on this sequence and prefix the sequence name with the
# reference db name
/bin/awk -v C=$c '$2 == C {print}' $r/4d/$GP | sed -e "s/\t$c\t/\tcavPor3.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
echo $NL
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
else
    # no genes on this sequence: leave a 1-byte placeholder result
    echo "" > $outfile
endif
/bin/rm -f /dev/shm/$c.gp /dev/shm/$c.ss
_EOF_
    # << happy emacs

    chmod +x 4d.csh

    find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list
    wc -l maf.list
# 933 maf.list


    printf '#LOOP
4d.csh $(root1) annoSplit/$(dir1)/$(file1) mfa/$(dir1) {check out line+ ../mfa/$(dir1)/$(root1).mfa}
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check
    para time
# Completed: 917 of 933 jobs
# Crashed: 16 jobs
# CPU time in finished jobs:        622s      10.37m     0.17h    0.01d  0.000 y
# IO & Wait Time:                  2571s      42.84m     0.71h    0.03d  0.000 y
# Average job time:                   3s       0.06m     0.00h    0.00d
# Longest finished job:              23s       0.38m     0.01h    0.00d
# Submission to last job:            41s       0.68m     0.01h    0.00d

    # the crashed jobs appear to be single genes on a single scaffold and they
    # do not make a .ss output
    # Not all results have contents, that is OK

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz5way/4d
    # remove the broken empty files, size 0 and size 1 byte.
    # NOTE: the default unit for find -size is 512-byte blocks, rounded up,
    # so a plain '-size 1' matches every file from 1 to 512 bytes, not just
    # the 1-byte files.  The 'c' suffix counts bytes: '-size -2c' selects
    # only files smaller than 2 bytes.
    # The list of removed files is kept in empty.list for the record:
    find ./mfa -type f -size -2c > empty.list
    xargs rm -f < empty.list
    # see what is left:
    ls -ogrt mfa/*/*/*.mfa | sort -k3nr | wc
    #     298    2086   17178

    # want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" mfa/*/*/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    0m1.411s

    # check they are all in there:
    grep "^>" 4d.all.mfa | wc -l
    # 6
    grep "^>" 4d.all.mfa | sed -e 's/^/# /;'
# >cavPor3
# >mm10
# >hg38
# >tupChi1
# >tupBel1
# >galVar1

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../cavPor3.5way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh
    # tree_commas.nh looks like:
    # ((cavPor3,mm10),(hg38,((tupChi1,tupBel1),galVar1)))

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #  real    0m1.991s

    mv phyloFit.mod all.mod

    grep TREE all.mod
# TREE:
# ((cavPor3:0.216074,mm10:0.289999):0.0256454,(hg38:0.119014,
# ((tupChi1:0.00562878,tupBel1:0.00977264):0.178737,
#  galVar1:0.104398):0.00894324):0.0256454);

    # compare these calculated lengths to the tree extracted from 183way:
    grep TREE all.mod | sed -e 's/TREE: //' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep cavPor3 \
          | sed -e "s/cavPor3.//;"  | sort > new.dists
    /cluster/bin/phast/all_dists ../cavPor3.5way.nh | grep cavPor3 \
        | sed -e "s/cavPor3.//;" | sort > old.dists
     # printing out the 'new', the 'old' the 'difference' and percent difference
    join new.dists old.dists | awk '{
  printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \
      | sort -k3n
#       galVar1 0.383228        0.353775        0.029453        8.325348
#       hg38    0.390633        0.362746        0.027887        7.687748
#       tupBel1 0.464419        0.409978        0.054441        13.279005
#       mm10    0.509725        0.491203        0.018522        3.770742

     # compare this tree with the one generated from the galVar1 5-way
    grep TREE all.mod | sed -e 's/TREE: //' \
     | /cluster/bin/phast/all_dists /dev/stdin | grep galVar1 \
        | sed -e "s/galVar1.//" | sort > cavPor3.galVar1.dists
    # XXX - waiting for galVar1 to finish - Thu May  5 16:22:43 PDT 2016
    grep TREE /hive/data/genomes/galVar1/bed/multiz5way/4d/all.mod \
       | sed -e 's/TREE: //' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep galVar1 \
          | sed -e "s/galVar1.//;"  | sort > galVar1.galVar1.dists
    # appears to be systematically longer branch lengths with this cavPor3
    # reference, all by a similar percentage:
    # table headings:
#       species  cavPor3        galVar1       cavPor3-galVar1    percent
#                dist est.     dist est.         difference    difference
    join cavPor3.galVar1.dists galVar1.galVar1.dists | awk '{
  printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \
      | sort -k3n
#       hg38    0.231963        0.227001        0.004962        2.185893
#       tupBel1 0.287901        0.280636        0.007265        2.588763
#       cavPor3 0.383228        0.370042        0.013186        3.563379
#       mm10    0.455383        0.442507        0.012876        2.909784

#########################################################################
# phastCons 5-way (DONE - 2016-05-06 - Hiram)
    # split 5way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
    ssh ku
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS
    mkdir result done

    # mkSS.csh <dir> <chrom> <doneFile>
    #   Splits one annotated maf (anno/result/<dir>/<chrom>.maf) into SS
    #   files under cons/SS/result/<dir>/<chrom>/ with msa_split, using
    #   10,000,000 base windows (-w 10000000,0).
    #   The job exits immediately when <doneFile> or <doneFile>.running is
    #   already non-empty.  msa_split is skipped when every line of the maf
    #   is a '#' comment line (WC == NL).  A date stamp is appended to
    #   <doneFile> on completion, which is what the parasol template checks.
    #   The doubled backslash in the printf format emits a single backslash,
    #   a line continuation, into the generated script.
    printf '#!/bin/csh -ef
set d = $1
set c = $2
set doneDir = done/$d
set MAF = /hive/data/genomes/cavPor3/bed/multiz5way/anno/result/$d/$c.maf
set WINDOWS = /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS/result/$d/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $3 ) then
    exit 0
endif
if ( -s $3.running ) then
    exit 0
endif

/bin/mkdir -p $doneDir
/bin/date >> $3.running

/bin/rm -fr $WINDOWS
/bin/mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \\
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
/bin/date >> $3
/bin/rm -f $3.running
' > mkSS.csh
    chmod +x mkSS.csh

    printf '#LOOP
mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)}
#ENDLOOP
' > template

    find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list
    wc -l maf.list
# 1082 maf.list

    ssh ku
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/SS

    gensub2 maf.list single template jobList
    # beware overwhelming the cluster with these quick high I/O jobs
    para create jobList
    para try ... check ... etc
    para -maxJob=64 push
# Completed: 1082 of 1082 jobs
# CPU time in finished jobs:        964s      16.06m     0.27h    0.01d  0.000 y
# IO & Wait Time:                  2888s      48.14m     0.80h    0.03d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:              33s       0.55m     0.01h    0.00d
# Submission to last job:            89s       1.48m     0.02h    0.00d

    find ./result -type f | wc -l
    #	 800

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/cons/run.cons
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/run.cons

    #	This is setup for multiple runs based on subsets, but only running
    #   the 'all' subset here.
    #   It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  Running:
    #	all and vertebrates

    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set d = $2
set f = $3
set len = $4
set cov = $5
set rho = $6
set grp = $cwd:t
set cons = /hive/data/genomes/cavPor3/bed/multiz5way/cons
set tmp = $cons/tmp/${d}_${c}
mkdir -p $tmp
set ssSrc = $cons/SS/result
set useGrp = "$grp.mod"
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/$d/$f $tmp
else
  ln -s $ssSrc/$d/$f $tmp
  ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f $useGrp \\
    --rho $rho --expected-length $len --target-coverage $cov --quiet \\
    --not-informative `cat $grp.non-inf` \\
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
else
  $PHASTBIN/phastCons $f $useGrp \\
    --rho $rho --expected-length $len --target-coverage $cov --quiet \\
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
endif
popd > /dev/null
mkdir -p pp/$d bed/$d
sleep 4
touch pp/$d bed/$d
rm -f pp/$d/$c.pp
rm -f bed/$d/$c.bed
mv $tmp/$c.pp pp/$d
mv $tmp/$c.bed bed/$d
rm -fr $tmp
rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h
' > doPhast.csh

    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    printf '#LOOP
../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp}
#ENDLOOP
' > template

    find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list
    wc -l ss.list
    #	800 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=32g create jobList
    para try ... check ...
    para push
# Completed: 800 of 800 jobs
# CPU time in finished jobs:       4961s      82.68m     1.38h    0.06d  0.000 y
# IO & Wait Time:                  5314s      88.57m     1.48h    0.06d  0.000 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest finished job:              32s       0.53m     0.01h    0.00d
# Submission to last job:           105s       1.75m     0.03h    0.00d

    # create Most Conserved track
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/all
    time cut -f1 ../../../../chrom.sizes | while read C
do
    ls -d bed/?/?/${C} 2> /dev/null | while read D
    do
        echo ${D}/${C}*.bed 1>&2
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
    # real    1m8.280s

    # capture the lodToBedScore output as mostConserved.bed, the file that
    # is loaded into the database below
    time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \
         > mostConserved.bed
    # real    0m13.098s

    # -rw-rw-r--  1 40630220 May  6 10:26 mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/all
    time hgLoadBed cavPor3 phastConsElements5way mostConserved.bed
    # Read 1019383 elements of size 5 from mostConserved.bed
    # real    0m10.697s

    # on human we often try for 5% overall cov, and 70% CDS cov
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    time featureBits cavPor3 -enrichment ensGene:cds phastConsElements5way
    # ensGene:cds 1.119%, phastConsElements5way 4.394%, both 0.818%,
    #   cover 73.11%, enrich 16.64x
    #  real    0m14.013s

    # Create merged posterior probability file and wiggle track data files
    cd /hive/data/genomes/cavPor3/bed/multiz5way/cons/all
    mkdir downloads

    # the third sed fixes the chrom names, removing the partition extensions
    time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \
        | gzip -c > downloads/phastCons5way.wigFix.gz)
    #  real    20m14.674s

# -rw-rw-r-- 1 1844583276 May  6 10:49 phastCons5way.wigFix.gz

    # check integrity of data with wigToBigWig
    time (zcat downloads/phastCons5way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \
	    phastCons5way.bw) > bigWig.log 2>&1
    tail bigWig.log
    # pid=54646: VmPeak:    16771468 kB
    # real    25m26.987s

    bigWigInfo phastCons5way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 2,832,765,911
# primaryIndexSize: 75,227,328
# zoomLevels: 10
# chromCount: 625
# basesCovered: 1,531,042,459
# mean: 0.139598
# min: 0.000000
# max: 1.000000
# std: 0.262845

    #	encode those files into wiggle data
    time (zcat downloads/phastCons5way.wigFix.gz \
	| wigEncode stdin phastCons5way.wig phastCons5way.wib)
    # Converted stdin, upper limit 1.00, lower limit 0.00
    #  real    8m33.871s

    du -hsc *.wi?
    # 1.5G    phastCons5way.wib
    # 244M    phastCons5way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons5way.wib /gbdb/cavPor3/multiz5way/phastCons5way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way \
	cavPor3 phastCons5way phastCons5way.wig
    #   real    0m29.220s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh cavPor3 phastCons5way
# db.table              min max mean       count sumData
# cavPor3.phastCons5way   0 1 0.139598 1531042459 2.1373e+08
#     stdDev  viewLimits
#    0.262845  viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram -db=cavPor3 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons5way > histogram.data 2>&1
    #	real    1m45.784s

    #	create plot of histogram:

    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Guinea pig cavPor3 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \\
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    display histo.png &

#########################################################################
# phyloP for 5-way (DONE - 2016-05-09 - Hiram)
    # run phyloP with score=LRT
    ssh ku
    mkdir /cluster/data/cavPor3/bed/multiz5way/consPhyloP
    cd /cluster/data/cavPor3/bed/multiz5way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.569
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../4d/all.mod 0.569 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.215500 0.284500 0.284500 0.215500 

    # doPhyloP.csh <ssPath> <outFile>
    #   <ssPath>  - SS file path below cons/SS/result, without the .ss suffix
    #   <outFile> - destination wigFix file, relative to the run directory
    #   Runs phyloP (--method LRT --mode CONACC --wig-scores) on one SS file
    #   using the model <grp>.mod from run.phyloP, where <grp> is the last
    #   component of the directory the job is started in.  The work is done
    #   in a scratch directory under consPhyloP/tmp/<grp>/<ssPath>, and the
    #   result is moved to <outFile> before the scratch tree is cleaned up.
    #   NOTE: the script runs under csh -e, so a failure in one of the
    #   trailing rmdir commands marks the job as crashed even though the
    #   wigFix result is already in place; --ignore-fail-on-non-empty does
    #   not cover a directory that has already disappeared (presumably
    #   removed by another job sharing the same parent directory).
    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set d = $f:h
set file1 = $f:t
set out = $2
set cName = $f:t:r
set grp = $cwd:t
set cons = /hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "/hive/data/genomes/cavPor3/bed/multiz5way/cons/SS/result/$f"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \\
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$file1.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
' > doPhyloP.csh

    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../../cons/SS/result -type f | grep ".ss$" \
	| sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list
    # make sure the list looks good
    wc -l ss.list
    #  800 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    printf '#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
' > template

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP/all
    cd /hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    #	beware overwhelming the cluster with these fast running high I/O jobs
    para create jobList
    para try ... check ... push ... etc ...
    para -maxJob=53 push
    para time > run.time
# Completed: 798 of 800 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:       1334s      22.23m     0.37h    0.02d  0.000 y
# IO & Wait Time:                  5599s      93.32m     1.56h    0.06d  0.000 y
# Average job time:                   9s       0.14m     0.00h    0.00d
# Longest finished job:              15s       0.25m     0.00h    0.00d
# Submission to last job:            88s       1.47m     0.02h    0.00d

    # the two failed jobs actually finished, their error was at the end
    # of the script:
# /bin/rmdir: failed to remove `/hive/data/genomes/cavPor3/bed/multiz5way/consPhyloP/tmp/all/7/4/scaffold_13': No such file or directory

    mkdir downloads

    time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phyloP5way.wigFix.gz)
    #   real    19m35.007s

    # check integrity of data with wigToBigWig
    time (zcat downloads/phyloP5way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \
	phyloP5way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=24836: VmPeak:    16771468 kB
    # real    25m9.670s

    bigWigInfo phyloP5way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 2,473,672,935
# primaryIndexSize: 75,227,328
# zoomLevels: 10
# chromCount: 625
# basesCovered: 1,531,042,459
# mean: 0.092304
# min: -2.562000
# max: 0.863000
# std: 0.610474

    #	encode those files into wiggle data
    time (zcat downloads/phyloP5way.wigFix.gz \
	| wigEncode stdin phyloP5way.wig phyloP5way.wib)
    # Converted stdin, upper limit 0.86, lower limit -2.56
    # real    8m56.205s

    du -hsc *.wi?
    # 1.5G    phyloP5way.wib
    # 249M    phyloP5way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP5way.wib /gbdb/cavPor3/multiz5way/phyloP5way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz5way cavPor3 \
	phyloP5way phyloP5way.wig
    # real    0m30.072s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh cavPor3 phyloP5way
# db.table                    min max mean count sumData
# cavPor3.phyloP5way      -2.562 0.863 0.0923044 1531042459 1.41322e+08
#       stdDev viewLimits
#     0.610474 viewLimits=-2.562:0.863

    #	that range is: 0.863+2.562 = 3.425 for hBinSize=0.003425

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.003425 -hBinCount=1000 -hMinVal=-2.562 -verbose=2 \
	    -db=cavPor3 phyloP5way > histogram.data 2>&1
    # XXX - running - Fri May  6 21:25:03 PDT 2016
    # real    1m49.237s

    # find the Y range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \
      | sed -e 's/^/# /;'
# Q1 0.000034
# median 0.000279
# Q3 0.001310
# average 0.001110
# min 0.000000
# max 0.017827
# count 901
# total 0.999993
# standard deviation 0.002086

    # find the X range for the 2:5 graph
    grep "^[0-9]" histogram.data | ave -col=2 stdin \
      | sed -e 's/^/# /;'
# Q1 -1.755415
# median -0.986500
# Q3 -0.210737
# average -0.945154
# min -2.562000
# max 0.801350
# count 901
# total -851.583895
# standard deviation 0.953370

    #	create plot of histogram:
    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Guinea pig cavPor3 Histogram phyloP5way track"
set xlabel " phyloP5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set xtics
set xrange [-2.6:0.9]
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    display histo.png &
    # appears to have an odd hole in the data near X=0 ?

#############################################################################
# construct download files for 5-way (DONE - 2016-05-09 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/downloads
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads
    mkdir multiz5way phastCons5way phyloP5way
    cd multiz5way
    time cp -p ../../anno/cavPor3.5way.maf .
    #   real    2m22.776s
    # -rw-rw-r-- 1 84345995664 Apr 14 09:27 cavPor3.5way.maf

    du -hsc *
    #  79G     cavPor3.5way.maf

    time gzip *.maf
    #   real    87m37.865s
    # -rw-rw-r-- 1 10893506431 Apr 14 09:27 cavPor3.5way.maf.gz

    grep TREE ../../4d/all.mod | awk '{print $NF}' \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.5way.nh
    ~/kent/src/hg/utils/phyloTrees/commonNames.sh cavPor3.5way.nh \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.5way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh cavPor3.5way.nh \
	| $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
	    > cavPor3.5way.scientificNames.nh
    time md5sum *.nh *.maf.gz > md5sum.txt
    #   real    0m35.144s

    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way

    du -hsc *.maf.gz ../../anno/cavPor3.5way.maf
    #  11G     cavPor3.5way.maf.gz
    #  79G     ../../anno/cavPor3.5way.maf

    # obtain the README.txt from cavPor3/multiz17way and update for this
    #   situation

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phastCons5way

    ln -s ../../cons/all/downloads/phastCons5way.wigFix.gz \
        ./cavPor3.phastCons5way.wigFix.gz
    ln -s ../../cons/all/phastCons5way.bw ./cavPor3.phastCons5way.bw
    ln -s ../../cons/all/all.mod ./cavPor3.phastCons5way.mod
    time md5sum *.gz *.mod *.bw > md5sum.txt
    #   real    0m20.354s

    # obtain the README.txt from hg38/phastCons5way and update for this
    #   situation
    ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
      /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phyloP5way

    ln -s ../../consPhyloP/all/downloads/phyloP5way.wigFix.gz \
        ./cavPor3.phyloP5way.wigFix.gz
    ln -s ../../consPhyloP/run.phyloP/all.mod cavPor3.phyloP5way.mod
    ln -s ../../consPhyloP/all/phyloP5way.bw cavPor3.phyloP5way.bw

    time md5sum *.mod *.bw *.gz > md5sum.txt
    #   real    0m29.662s

    # obtain the README.txt from cavPor3/phyloP17way and update for this
    #   situation
    ln -s `pwd`/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way

#############################################################################
# hgPal downloads (DONE - 2016-05-09 - Hiram)
#   FASTA from 5-way for ensGene (the only gene table used below)

    ssh hgwdev
    screen -S cavPor3HgPal
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/pal
    cd /hive/data/genomes/cavPor3/bed/multiz5way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # this for loop takes about 2.5 hours on this large count contig assembly
    export mz=multiz5way
    export gp=ensGene
    export db=cavPor3
    export I=0
    export D=0
    mkdir exonAA exonNuc
    printf '#!/bin/sh\n' > $gp.jobs
    # Emit two background mafGene commands per sequence (nucleotide and
    # amino acid exon alignments), smallest sequence first.  I counts the
    # sequences since the last barrier: once it passes 16 a "date" and a
    # "wait" line are emitted.  D counts all sequences and selects the
    # 3-digit output sub-directory, which changes every 1000 sequences.
    time for C in $(sort -nk2 ../../../chrom.sizes | cut -f1); do
        I=$((I + 1))
        D=$((D + 1))
        dNum=$(printf "%03d" $((D / 1000)))
        mkdir -p exonNuc/${dNum} exonAA/${dNum} > /dev/null
        printf '%s\n' \
            "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" \
            "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ $I -gt 16 ]; then
            printf 'date\nwait\n'
            I=0
        fi
    done >> $gp.jobs
    # real    1m25.411s

    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs
    chmod +x $gp.jobs

    time (./$gp.jobs) > $gp.jobs.log 2>&1
    # real    7m28.075s


    export mz=multiz5way
    export gp=ensGene
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonAA.fa.gz
    # real    0m14.448s

    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonNuc.fa.gz
    # real    0m42.669s

  # -rw-rw-r--  1 30006064 May  9 10:12 ensGene.multiz5way.exonAA.fa.gz
  # -rw-rw-r--  1 47566626 May  9 10:13 ensGene.multiz5way.exonNuc.fa.gz

    export mz=multiz5way
    export gp=ensGene
    export db=cavPor3
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 5-way (DONE - 2016-05-09 - Hiram)
    mkdir /hive/users/hiram/bigWays/cavPor3.5way
    cd /hive/users/hiram/bigWays
    echo "cavPor3" > cavPor3.5way/ordered.list
    awk '{print $1}' /hive/data/genomes/cavPor3/bed/multiz5way/5way.distances.txt \
       >> cavPor3.5way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh cavPor3.5way/ordered.list
    # dbDb.sh constructs cavPor3.5way/CavPor3_5-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    ./dbDb.sh cavPor3 5way
    # sizeStats.pl constructs cavPor3.5way/CavPor3_5-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl cavPor3 5way

    # defCheck.pl constructs CavPor3_5-way_conservation_lastz_parameters.html
    ./defCheck.pl cavPor3 5way

    # this constructs the html pages in cavPor3.5way/:
# -rw-rw-r-- 1 3287 May  9 10:24 CavPor3_5-way_conservation_alignment.html
# -rw-rw-r-- 1 4805 May  9 10:25 CavPor3_5-way_Genome_size_statistics.html
# -rw-rw-r-- 1 3293 May  9 10:25 CavPor3_5-way_conservation_lastz_parameters.html
    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
#  CavPor3_5-way_conservation_alignment
#  CavPor3_5-way_Genome_size_statistics
#  CavPor3_5-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

############################################################################
## create upstream ensGene maf files (DONE - 2016-05-09 - Hiram)
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way
    # write the mkUpstream.sh script: for each upstream size it takes the
    # regions upstream of the ensGene genes from featureBits, runs them
    # through mafFrags against the multiz5way alignment and gzips the result
    # to upstream<size>.ensGene.maf.gz.  The heredoc delimiter is quoted so
    # that ${S}, ${geneTbl} and $MACHTYPE are expanded when the script runs,
    # not when it is written.
    cat << '_EOF_' > mkUpstream.sh
#!/bin/sh
export geneTbl="ensGene"
for S in 1000 2000 5000
do
    printf "making upstream${S}.maf\n" 1>&2
    featureBits cavPor3 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags cavPor3 multiz5way \
                stdin stdout \
                -orgs=/hive/data/genomes/cavPor3/bed/multiz5way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    printf "done upstream${S}.${geneTbl}.maf.gz\n" 1>&2
done
_EOF_
    # << happy emacs
    chmod +x mkUpstream.sh

    time (./mkUpstream.sh ) > upstream.log 2>&1

    #   real    0m33.868s

    md5sum upstream*.gz >> md5sum.txt

    # script to help create document information:
# Print one aligned row (genome name, scientific name, description) for
# each assembly listed in ../species.list, read from the dbDb table in
# hgcentraltest.  The organism column used to be fetched here as well but
# was never printed, so that query has been dropped.

# dbDbField column db - print a single dbDb column for assembly db
dbDbField() {
  hgsql -N -e "select $1 from dbDb where name=\"$2\";" hgcentraltest
}

for db in $(cat ../species.list)
do
  genome=$(dbDbField genome "${db}")
  sciName=$(dbDbField scientificName "${db}")
  descr=$(dbDbField description "${db}")
  printf "%-18s %-24s %-20s\n" "$genome" "$sciName" "$descr"
done

# Guinea pig         Cavia porcellus          Feb. 2008 (Broad/cavPor3)
# Mouse              Mus musculus             Dec. 2011 (GRCm38/mm10)
# Human              Homo sapiens             Dec. 2013 (GRCh38/hg38)
# Tree shrew         Tupaia belangeri         Dec. 2006 (Broad/tupBel1)
# Malayan flying lemur Galeopterus variegatus   Jun. 2014 (G_variegatus-3.0.2/galVar1)

    # obtain the README.txt from ce11/multiz26way and update for this
    #   situation
    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way
    # every link target must be an absolute path: a bare "README.txt" target
    # would be resolved relative to the download directory, leaving a
    # README.txt symlink there that points at itself
    ln -s `pwd`/upstream*.gz `pwd`/README.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way

#############################################################################
# construct download files for 5-way (DONE - 2016-05-09 - Hiram)
    # mkdir -p throughout: the upstream maf section of this document already
    # creates the two multiz5way directories (with mkdir -p), so a plain
    # mkdir reports "File exists" for them here
    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way
    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way
    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/downloads
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads
    mkdir -p multiz5way phastCons5way phyloP5way
    cd multiz5way
    time cp -p ../../anno/cavPor3.5way.maf .
    #   real    0m18.941s

    # -rw-rw-r-- 1 11434713982 May  4 10:01 cavPor3.5way.maf

    du -hsc *
    #  11G     cavPor3.5way.maf

    # gzip runs in the background while the small tree files are built
    time gzip *.maf &
    #   real    36m27.668s

    # -rw-rw-r-- 1 2787306473 May  4 10:01 cavPor3.5way.maf.gz

    # tree files: the TREE line of the 4d model, then the same tree with
    # common names and with scientific names
    grep TREE ../../4d/all.mod | awk '{print $NF}' \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.5way.nh
    ~/kent/src/hg/utils/phyloTrees/commonNames.sh cavPor3.5way.nh \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.5way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh cavPor3.5way.nh \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.5way.scientificNames.nh

    # the background gzip must be finished before the .maf.gz is
    # checksummed and linked, otherwise a partial file is measured
    wait
    time md5sum *.nh *.maf.gz >> md5sum.txt
    #   real    0m35.144s

    ln -s `pwd`/*.maf.gz `pwd`/*.txt `pwd`/*.nh \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way

    du -hsc *.maf.gz ../../anno/cavPor3.5way.maf
    #  11G     cavPor3.5way.maf.gz
    #  79G     ../../anno/cavPor3.5way.maf

    # obtain the README.txt from ce11/multiz26way and update for this
    #   situation

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phastCons5way

    ln -s ../../cons/all/downloads/phastCons5way.wigFix.gz \
        ./cavPor3.phastCons5way.wigFix.gz
    ln -s ../../cons/all/phastCons5way.bw ./cavPor3.phastCons5way.bw
    ln -s ../../cons/all/all.mod ./cavPor3.phastCons5way.mod

    md5sum *.mod *.gz *.bw README.txt > md5sum.txt
    # about 15 to 20 seconds

    # obtain the README.txt from ce11/phastCons26way and update for this
    #   situation
    ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
      /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons5way

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/phyloP5way

    ln -s ../../consPhyloP/all/downloads/phyloP5way.wigFix.gz \
        ./cavPor3.phyloP5way.wigFix.gz
    ln -s ../../consPhyloP/run.phyloP/all.mod cavPor3.phyloP5way.mod
    ln -s ../../consPhyloP/all/phyloP5way.bw cavPor3.phyloP5way.bw

    md5sum *.mod *.gz *.bw README.txt > md5sum.txt
    # about 15 to 20 seconds

    # obtain the README.txt from ce11/phyloP26way and update for this
    #   situation
    ln -s `pwd`/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP5way

#############################################################################
# hgPal downloads (DONE - 2016-05-09 - Hiram)
#   FASTA from 5-way for ensGene

    ssh hgwdev
    screen -S cavPor3HgPal
    mkdir /hive/data/genomes/cavPor3/bed/multiz5way/pal
    cd /hive/data/genomes/cavPor3/bed/multiz5way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

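    # this for loop only writes the job list; the timings recorded below are
    # 1m25s to write the list and 7m28s to run the jobs, not the
    # "about 2.5 hours" originally noted here for this large count contig
    # assembly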
    # Write $gp.jobs: a shell script with two background mafGene commands
    # (exonNuc and exonAA FASTA) for every sequence named in chrom.sizes,
    # and a "date" / "wait" pair after every 17 sequences so that only
    # that many pairs of jobs are started at once.
    export mz=multiz5way
    export gp=ensGene
    export db=cavPor3
    export I=0      # sequences written since the last "wait"
    export D=0      # sequences written in total; selects the sub-directory
    mkdir exonAA exonNuc
    printf '#!/bin/sh\n' > "${gp}.jobs"
    time for C in $(sort -nk2 ../../../chrom.sizes | cut -f1)
    do
        I=$((I + 1))
        D=$((D + 1))
        # one zero-padded sub-directory per 1000 sequences: 000, 001, ...
        dNum=$(printf '%03d' $((D / 1000)))
        mkdir -p "exonNuc/${dNum}" "exonAA/${dNum}" > /dev/null
        printf '%s\n' \
          "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" \
          "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ "$I" -gt 16 ]; then
            printf 'date\nwait\n'
            I=0
        fi
    done >> "${gp}.jobs"
    # real    1m25.411s

    # final barrier for the last, possibly partial, batch
    printf 'date\nwait\n' >> "${gp}.jobs"
    chmod +x "${gp}.jobs"

    time (./$gp.jobs) > $gp.jobs.log 2>&1
    # real    7m28.075s


    export mz=multiz5way
    export gp=ensGene
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonAA.fa.gz
    # real    0m14.448s

    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonNuc.fa.gz
    # real    0m42.669s

  # -rw-rw-r--  1 30006064 May  9 10:12 ensGene.multiz5way.exonAA.fa.gz
  # -rw-rw-r--  1 47566626 May  9 10:13 ensGene.multiz5way.exonNuc.fa.gz

    export mz=multiz5way
    export gp=ensGene
    export db=cavPor3
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 5-way (DONE - 2016-05-09 - Hiram)
    mkdir /hive/users/hiram/bigWays/cavPor3.5way
    cd /hive/users/hiram/bigWays
    echo "cavPor3" > cavPor3.5way/ordered.list
    awk '{print $1}' /hive/data/genomes/cavPor3/bed/multiz5way/5way.distances.txt \
       >> cavPor3.5way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh cavPor3.5way/ordered.list
    # dbDb.sh constructs cavPor3.5way/CavPor3_5-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    ./dbDb.sh cavPor3 5way
    # sizeStats.pl constructs cavPor3.5way/CavPor3_5-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl cavPor3 5way

    # defCheck.pl constructs CavPor3_5-way_conservation_lastz_parameters.html
    ./defCheck.pl cavPor3 5way

    # this constructs the html pages in cavPor3.5way/:
# -rw-rw-r-- 1 3287 May  9 10:24 CavPor3_5-way_conservation_alignment.html
# -rw-rw-r-- 1 4805 May  9 10:25 CavPor3_5-way_Genome_size_statistics.html
# -rw-rw-r-- 1 3293 May  9 10:25 CavPor3_5-way_conservation_lastz_parameters.html
    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
#  CavPor3_5-way_conservation_alignment
#  CavPor3_5-way_Genome_size_statistics
#  CavPor3_5-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

############################################################################
## create upstream ensGene maf files (DONE - 2016-05-09 - Hiram)
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way
    cd /hive/data/genomes/cavPor3/bed/multiz5way/downloads/multiz5way
    # bash script
#!/bin/sh
export geneTbl="ensGene"
for S in 1000 2000 5000
do
    printf "making upstream${S}.maf\n" 1>&2
    featureBits cavPor3 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags cavPor3 multiz5way \
                stdin stdout \
                -orgs=/hive/data/genomes/cavPor3/bed/multiz5way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    printf "done upstream${S}.${geneTbl}.maf.gz\n" 1>&2
done

    time (./mkUpstream.sh ) > upstream.log 2>&1

    #   real    0m33.868s

    md5sum upstream*.gz >> md5sum.txt

    # script to help create document information:
for db in `cat ../species.list`
do
  orgName=`hgsql -N -e "select organism from dbDb where name=\"${db}\";" hgcentraltest`
  sciName=`hgsql -N -e "select scientificName from dbDb where name=\"${db}\";" hgcentraltest`
  genome=`hgsql -N -e "select genome from dbDb where name=\"${db}\";" hgcentraltest`
  descr=`hgsql -N -e "select description from dbDb where name=\"${db}\";" hgcentraltest`
  printf "%-18s %-24s %-20s\n" "$genome" "$sciName" "$descr"
done

# Guinea pig         Cavia porcellus          Feb. 2008 (Broad/cavPor3)
# Mouse              Mus musculus             Dec. 2011 (GRCm38/mm10)
# Human              Homo sapiens             Dec. 2013 (GRCh38/hg38)
# Tree shrew         Tupaia belangeri         Dec. 2006 (Broad/tupBel1)
# Malayan flying lemur Galeopterus variegatus   Jun. 2014 (G_variegatus-3.0.2/galVar1)

    # obtain the README.txt from ce11/multiz26way and update for this
    #   situation
    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way
    # every link target must be an absolute path: a bare "README.txt" target
    # would be resolved relative to the download directory, leaving a
    # README.txt symlink there that points at itself
    ln -s `pwd`/upstream*.gz `pwd`/README.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz5way

##############################################################################
# LASTZ Guinea pig/cavPor3 - Chinese tree shrew/tupChi1
#      (DONE - 2017-11-17 - Hiram)
    mkdir /hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17
    cd /hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17

    printf '# Guinea pig vs. Chinese tree shrew
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz

# TARGET: Guinea pig cavPor3
SEQ1_DIR=/hive/data/genomes/cavPor3/cavPor3.wmTrf.2bit
SEQ1_LEN=/hive/data/genomes/cavPor3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30

# QUERY: Chinese tree shrew
SEQ2_DIR=/hive/data/genomes/tupChi1/tupChi1.2bit
SEQ2_LEN=/hive/data/genomes/tupChi1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17
TMPDIR=/dev/shm
' > DEF
    # << happy emacs

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -stop=partition -syntenicNet -fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1
    # real    201m20.581s

    cat fb.cavPor3.chainTupChi1Link.txt
    # 1105585848 bases of 2663369733 (41.511%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` \
	cavPor3 tupChi1) > rbest.log 2>&1 &
    # real    357m30.385s
    cat fb.cavPor3.chainRBestTupChi1Link.txt
    # 965471669 bases of 2663369733 (36.250%) in intersection

    # and for the swap:
    mkdir /hive/data/genomes/tupChi1/bed/blastz.cavPor3.swap
    cd /hive/data/genomes/tupChi1/bed/blastz.cavPor3.swap

    time ($HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/cavPor3/bed/lastzTupChi1.2017-11-17/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    120m19.543s

    cat fb.tupChi1.chainCavPor3Link.txt
    # 1115700960 bases of 2706389135 (41.225%) in intersection

    cat fb.tupChi1.chainSynCavPor3Link.txt
    # 936793671 bases of 2706389135 (34.614%) in intersection

    time (~/kent/src/hg/utils/automation/doRecipBest.pl \
	-buildDir=`pwd` tupChi1 cavPor3) > rbest.log 2>&1
    # real    245m56.588s

#############################################################################
## 6-Way Multiz (DONE - 2017-12-16 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way
    cd /hive/data/genomes/cavPor3/bed/multiz6way

    # from the 218-way in the source tree, select out the 6 used here:
    /cluster/bin/phast/tree_doctor \
        --prune-all-but hg38,galVar1,cavPor3,mm10,tupBel1,tupChi1 \
        /cluster/home/hiram/kent/src/hg/utils/phyloTrees/218way.nh \
          > cavPor3.6way.nh.0
    cat cavPor3.6way.nh.0
    # ((hg38:0.143908,((tupChi1:0.070000,tupBel1:0.086203):0.050000,
    #   galVar1:0.080000):0.054937):0.002000,(mm10:0.315424,
    #   cavPor3:0.175779):0.041059);

    # using TreeGraph2 tree editor on the Mac, rearrange to get cavPor3
    # at the top:

    #	what that looks like:
 ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.6way.nh | sed -e 's/^/# /;'
# ((cavPor3:0.175779,
#  mm10:0.315424):0.041059,
# (hg38:0.143908,
# ((tupChi1:0.07,
#  tupBel1:0.086203):0.05,
# galVar1:0.08):0.054937):0.002);

    # extract species list from that .nh file
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        cavPor3.6way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
        | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt

    # construct db to name translation list:
    # One "db->Organism_name;" entry per assembly in species.list.txt, the
    # list handed to tree_doctor --rename below.  In each name,organism row
    # the first tab becomes "->", blanks and dots become underscores, a ";"
    # is appended, and the hyphens of "-nosed" / "-eating" become
    # underscores.
    while read DB
    do
        hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
    done < species.list.txt \
      | sed -e 's/\t/->/' -e 's/ /_/g' -e 's/$/;/' -e 's/\./_/g' \
            -e 's/-nosed/_nosed/' -e 's/-eating/_eating/' \
      > db.to.name.txt

    # construct a common name .nh file:
    /cluster/bin/phast/tree_doctor --rename \
    "`cat db.to.name.txt`" cavPor3.6way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.6way.commonNames.nh
    cat cavPor3.6way.commonNames.nh | sed -e 's/^/# /;'
# ((Guinea_pig:0.175779,
#  Mouse:0.315424):0.041059,
# (Human:0.143908,
# ((Chinese_tree_shrew:0.07,
#  Tree_shrew:0.086203):0.05,
# Malayan_flying_lemur:0.08):0.054937):0.002);

#	Use this specification in the phyloGif tool:
#	http://genome.ucsc.edu/cgi-bin/phyloGif
#	to obtain a png image for src/hg/htdocs/images/phylo/cavPor3_6way.png

    ~/kent/src/hg/utils/phyloTrees/asciiTree.pl cavPor3.6way.nh > t.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > cavPor3.6way.scientificNames.nh
    cat cavPor3.6way.scientificNames.nh | sed -e 's/^/# /;'
# ((Cavia_porcellus:0.175779,
#  Mus_musculus:0.315424):0.041059,
# (Homo_sapiens:0.143908,
# ((Tupaia_chinensis:0.07,
#  Tupaia_belangeri:0.086203):0.05,
# Galeopterus_variegatus:0.08):0.054937):0.002);

    /cluster/bin/phast/all_dists cavPor3.6way.nh | grep cavPor3 \
        | sed -e "s/cavPor3.//" | sort -k2n > 6way.distances.txt
    #	Use this output to create the table below
    cat 6way.distances.txt | sed -e 's/^/# /;'
# galVar1 0.353775
# hg38    0.362746
# tupChi1 0.393775
# tupBel1 0.409978
# mm10    0.491203

    printf '#!/usr/bin/env perl

use strict;
use warnings;

open (FH, "<6way.distances.txt") or
        die "can not read 6way.distances.txt";

my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($D, $dist) = split('"'"'\\s+'"'"', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/cavPor3/bed/lastz.$D/fb.cavPor3." .
        $chain . "Link.txt";
    my $chainLinkMeasure =
        `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\\%%//;
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.cavPor3/fb.${D}.chainCavPor3Link.txt";
    my $swapMeasure = "N/A";
    if ( -s $swapFile ) {
	$swapMeasure =
	    `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
	chomp $swapMeasure;
	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
	$swapMeasure =~ s/\\%%//;
    }
    my $orgName=
    `hgsql -N -e "select organism from dbDb where name='"'\$D'"';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    printf "# %%02d  %%.4f (%%%% %%05.3f) (%%%% %%05.3f) - %%s %%s\\n", $count, $dist,
        $chainLinkMeasure, $swapMeasure, $orgName, $D;
}
close (FH);
' > sizeStats.pl
    chmod +x ./sizeStats.pl
    ./sizeStats.pl

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure

#       featureBits chainLink measures
#               chainLink
#  N distance  on cavPor3  on other     other species
# 01  0.3538 (% 48.820) (% 48.448) - Malayan flying lemur galVar1
# 02  0.3627 (% 48.000) (% 42.371) - Human hg38
# 03  0.3938 (% 41.511) (% 41.225) - Chinese tree shrew tupChi1
# 04  0.4100 (% 27.994) (% 34.990) - Tree shrew tupBel1
# 05  0.4912 (% 29.115) (% 28.447) - Mouse mm10

# None of this concern for distances matters in building the first step, the
# maf files.  The distances will be better calibrated later.

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	cavPor3.6way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh

    sed 's/[()]//g; s/,/ /g' tree.nh > species.list
    # cavPor3 mm10 hg38 tupChi1 tupBel1 galVar1

    # survey N50 for each
    for db in `cat species.list`
do
n50.pl /hive/data/genomes/$db/chrom.sizes
done
#       reading: /hive/data/genomes/cavPor3/chrom.sizes
#       contig count: 3144, total size: 2723219641, one half size: 1361609820
# cumulative    N50 count       contig  contig size
# 1356838683      27      scaffold_25     28222655
# 1361609820 one half size
# 1384780737      28      scaffold_27     27942054
#       reading: /hive/data/genomes/mm10/chrom.sizes
#       contig count: 66, total size: 2730871774, one half size: 1365435887
# cumulative    N50 count       contig  contig size
# 1312176979      8       chr7    145441459
# 1365435887 one half size
# 1442871972      9       chr10   130694993
#       reading: /hive/data/genomes/hg38/chrom.sizes
#       contig count: 455, total size: 3209286105, one half size: 1604643052
# cumulative    N50 count       contig  contig size
# 1547391171      8       chrX    156040895
# 1604643052 one half size
# 1692529807      9       chr8    145138636
#       reading: /hive/data/genomes/tupChi1/chrom.sizes
#       contig count: 50750, total size: 2846580235, one half size: 1423290117
# cumulative    N50 count       contig  contig size
# 1419920836      231     KB321095        3691413
# 1423290117 one half size
# 1423590960      232     KB321106        3670124
#       reading: /hive/data/genomes/tupBel1/chrom.sizes
#       contig count: 150851, total size: 3660774957, one half size: 1830387478
# cumulative    N50 count       contig  contig size
# 1830345737      7910    scaffold_129972.1-127906        127906
# 1830387478 one half size
# 1830473625      7911    scaffold_147844.1-127888        127888
#       reading: /hive/data/genomes/galVar1/chrom.sizes
#       contig count: 179514, total size: 3187660572, one half size: 1593830286
# cumulative    N50 count       contig  contig size
# 1593691350      3422    NW_007730159v1  245222
# 1593830286 one half size
# 1593936539      3423    NW_007729331v1  245189

    #	bash shell syntax here ...
    cd /hive/data/genomes/cavPor3/bed/multiz6way
    export H=/hive/data/genomes/cavPor3/bed
    mkdir mafLinks
    # Link one pairwise maf per query assembly into mafLinks/<db>/, echoing
    # each ln command before it is run:
    #   syntenic net maf   (axtChain/*.synNet.maf.gz):   hg38 mm10 galVar1 tupChi1
    #   recip best net maf (mafRBestNet/*.rbest.maf.gz): tupBel1
    for G in hg38 mm10 galVar1 tupChi1 tupBel1; do
      case "$G" in
        tupBel1) maf="mafRBestNet/cavPor3.${G}.rbest.maf.gz" ;;
        *)       maf="axtChain/cavPor3.${G}.synNet.maf.gz" ;;
      esac
      mkdir "mafLinks/$G"
      echo ln -s "${H}/lastz.$G/${maf}" "./mafLinks/$G"
      ln -s "${H}/lastz.$G/${maf}" "./mafLinks/$G"
    done

    # verify the symLinks are good:
    ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;'
#  549742600 Mar 19  2012 mafLinks/mm10/cavPor3.mm10.synNet.maf.gz
#  930458998 Apr 29  2015 mafLinks/hg38/cavPor3.hg38.synNet.maf.gz
#  881164370 Apr 15  2016 mafLinks/galVar1/cavPor3.galVar1.synNet.maf.gz
#  535098561 Apr 18  2016 mafLinks/tupBel1/cavPor3.tupBel1.rbest.maf.gz
#  741020498 Nov 17 16:35 mafLinks/tupChi1/cavPor3.tupChi1.synNet.maf.gz

    # split the maf files into a set of hashed named files
    # this hash named split keeps the same chr/contig names in the same
    # named hash file.
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/mafSplit
    cd /hive/data/genomes/cavPor3/bed/multiz6way/mafSplit

    # splitByTarget db - create directory db and run mafSplit inside it on
    # ../../mafLinks/db/*.maf.gz (a path relative to the new directory).
    # The work is done in a subshell, so the caller's working directory is
    # never changed; returns non-zero, without running mafSplit, when db/
    # cannot be created or entered.  Previously a failed mkdir/cd let
    # mafSplit write into the parent directory, and the unconditional
    # "cd .." then left every later species one directory level too high.
    splitByTarget() {
        (
            mkdir "$1" || exit 1
            cd "$1" || exit 1
            mafSplit -byTarget -useHashedName=8 /dev/null . \
                ../../mafLinks/"$1"/*.maf.gz
        )
    }

    time for D in `sed -e "s/cavPor3 //" ../species.list`
do
    echo "${D}"
    echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz"
    splitByTarget "${D}" || echo "ERROR: mafSplit failed for ${D}" 1>&2
done
    # real    2m53.479s

    # construct a list of all possible maf file names.
    # they do not all exist in each of the species directories
    find . -type f | wc -l
    # 873
    find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list
    wc -l maf.list
    # 239 maf.list

    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/splitRun
    cd /hive/data/genomes/cavPor3/bed/multiz6way/splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    #	verify the db and pairs settings are correct
    printf '#!/bin/csh -ef
set db = cavPor3
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/cavPor3/bed/multiz6way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
' > autoMultiz.csh

    chmod +x autoMultiz.csh

    printf '#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/cavPor3/bed/multiz6way/splitRun/maf/$(root1).maf}
#ENDLOOP
' > template

    ln -s ../../mafSplit/maf.list maf.list
    ssh ku
    cd /hive/data/genomes/cavPor3/bed/multiz6way/splitRun/run
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
# Completed: 239 of 239 jobs
# CPU time in finished jobs:      88973s    1482.88m    24.71h    1.03d  0.003 y
# IO & Wait Time:                   713s      11.88m     0.20h    0.01d  0.000 y
# Average job time:                 375s       6.25m     0.10h    0.00d
# Longest finished job:            3383s      56.38m     0.94h    0.04d
# Submission to last job:          5424s      90.40m     1.51h    0.06d

    # combine into one file  (the 1>&2 redirect sends the echo to stderr)
    cd /hive/data/genomes/cavPor3/bed/multiz6way
    head -1 splitRun/maf/020.maf > multiz6way.maf
    time for F in splitRun/maf/*.maf
do
    echo "${F}" 1>&2
    egrep -v "^#" ${F}
done >> multiz6way.maf
    # real    0m37.947s


    tail -1 splitRun/maf/020.maf >> multiz6way.maf
# -rw-rw-r-- 1 9866798945 Dec 16 22:59 multiz6way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz6way
    mkdir /gbdb/cavPor3/multiz6way
    ln -s `pwd`/multiz6way.maf /gbdb/cavPor3/multiz6way
    cd /dev/shm
    time hgLoadMaf cavPor3 multiz6way
# Loaded 9177087 mafs in 1 files from /gbdb/cavPor3/multiz6way
# real    2m51.288s

    time hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 cavPor3 multiz6waySummary \
	/gbdb/cavPor3/multiz6way/multiz6way.maf
# Created 1021373 summary blocks from 25418473 components and 9177087 mafs from /gbdb/cavPor3/multiz6way/multiz6way.maf
# real    3m27.813s

# -rw-rw-r--   1  511006120 Dec 16 23:01 multiz6way.tab
# -rw-rw-r--   1   51570723 Dec 16 23:06 multiz6waySummary.tab

    wc -l multiz6way*.tab
#   9177087 multiz6way.tab
#   1021373 multiz6waySummary.tab

    rm multiz6way*.tab

##############################################################################
# GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2017-12-16 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.  Need to split up the maf file into individual
    #   maf files
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz6way/anno/mafSplit
    cd /hive/data/genomes/cavPor3/bed/multiz6way/anno/mafSplit

    time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \
        /dev/null . ../../multiz6way.maf
    #   real    2m49.707s

    find . -type f | wc -l
    #   933

    # check for N.bed files everywhere:
    cd /hive/data/genomes/cavPor3/bed/multiz6way/anno
    for DB in `cat ../species.list`
do
    if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
        echo "MISS: ${DB}"
#         cd /hive/data/genomes/${DB}
#         twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
    else
        echo "  OK: ${DB}"
    fi
done

    cd /hive/data/genomes/cavPor3/bed/multiz6way/anno
    for DB in `cat ../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done
    # make sure they all are successful symLinks:
    ls -ogrtL

    screen -S gapAnno      # use a screen to control this longish job
    ssh ku
    cd /hive/data/genomes/cavPor3/bed/multiz6way/anno
    mkdir result
    find ./mafSplit -type d | sed -e 's#./mafSplit/##' | while read D
do
    echo mkdir -p result/${D}
    mkdir -p result/${D}
done
    printf '#LOOP
mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/cavPor3/cavPor3.2bit {check out exists+ result/$(path1)}
#ENDLOOP
' > template

    find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list
    gensub2 maf.list single template jobList
    # limit jobs on a node with the ram=32g requirement because they go fast
    para -ram=32g create jobList
    para try ... check ... push ...
# Completed: 933 of 933 jobs
# CPU time in finished jobs:       1328s      22.14m     0.37h    0.02d  0.000 y
# IO & Wait Time:                  2690s      44.83m     0.75h    0.03d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              32s       0.53m     0.01h    0.00d
# Submission to last job:            91s       1.52m     0.03h    0.00d

    # verify all result files have some content, look for 0 size files:
    find ./result -type f -size 0
    # should see none
    # or in this manner:
    find ./result -type f | xargs ls -og | sort -k3nr | tail

    # combine into one file  (the 1>&2 redirect sends the echo to stderr)
    head -q -n 1 result/0/0/scaffold_1476.maf > cavPor3.6way.maf
    time find ./result -type f | while read F
do
    echo "${F}" 1>&2
    grep -h -v "^#" ${F}
done >> cavPor3.6way.maf
    # real    1m7.444s

    #	these maf files do not have the end marker, this does nothing:
    #	tail -q -n 1 result/0/0/scaffold_1476.maf >> cavPor3.6way.maf
    # How about an official end marker:
    echo "##eof maf" >> cavPor3.6way.maf
    ls -og
# -rw-rw-r--  1 13600527773 Dec 16 23:23 cavPor3.6way.maf

    du -hsc cavPor3.6way.maf ../*.maf
    #  13G     cavPor3.6way.maf
    #   9.2G    ../multiz6way.maf

    # construct symlinks to get the individual maf files into gbdb:
    rm /gbdb/cavPor3/multiz6way/multiz6way.maf   # remove previous results
    ln -s `pwd`/cavPor3.6way.maf /gbdb/cavPor3/multiz6way/multiz6way.maf

    # Load into database
    cd /dev/shm
    time hgLoadMaf -pathPrefix=/gbdb/cavPor3/multiz6way cavPor3 multiz6way
    #  Loaded 10188349 mafs in 1 files from /gbdb/cavPor3/multiz6way
    #   real    3m38.228s

    time hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 cavPor3 multiz6waySummary \
        /gbdb/cavPor3/multiz6way/multiz6way.maf
    # Created 1021373 summary blocks from 25418473 components and 10188349 mafs from /gbdb/cavPor3/multiz6way/multiz6way.maf
    #  real    4m14.633s

    # -rw-rw-r--   1  570229240 Dec 16 23:27 multiz6way.tab
    # -rw-rw-r--   1   53613469 Dec 16 23:33 multiz6waySummary.tab

    rm multiz6way*.tab

######################################################################
# MULTIZ6WAY MAF FRAMES (DONE - 2017-12-17 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/frames
    cd /hive/data/genomes/cavPor3/bed/multiz6way/frames
#   survey all the genomes to find out what kinds of gene tracks they have
    printf '#!/bin/csh -fe
foreach db (`cat ../species.list`)
    printf "# ${db}: "
    set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
           $table == "ncbiRefSeq" || $table == "mgcGenes" || \
           $table == "knownGene" || $table == "xenoRefGene" ) then
           set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
            "select scientificName from dbDb where name='"'"'$db'"'"'"`
    set orgId = `hgsql hgFixed -N -e \
            "select id from organism where name='"'"'$orgName'"'"'"`
    if ($orgId == "") then
        echo "Mrnas: 0"
    else
        set count = `hgsql hgFixed -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
        echo "Mrnas: ${count}"
    endif
end
' > showGenes.csh

    chmod +x ./showGenes.csh
    time ./showGenes.csh
# cavPor3: ensGene: 34846, refGene: 488, xenoRefGene: 316945, Mrnas: 21241
# mm10: ensGene: 103734, knownGene: 63759, mgcGenes: 27612, ncbiRefSeq: 106520, refGene: 39240, xenoRefGene: 183459, Mrnas: 5371140
# hg38: ensGene: 208239, knownGene: 196838, mgcGenes: 35312, ncbiRefSeq: 159322, refGene: 74453, xenoRefGene: 187376, Mrnas: 11508577
# tupChi1: refGene: 206, xenoRefGene: 353563, Mrnas: 50709
# tupBel1: ensGene: 34727, xenoRefGene: 751689, Mrnas: 2543
# galVar1: ncbiRefSeq: 41547, xenoRefGene: 516902, Mrnas: 0

    # from that summary, use these gene sets:
    # knownGene - hg38 mm10
    # ensGene - cavPor3 tupBel1
    # none - tupChi1 galVar1
 
    mkdir genes
    #   1. knownGene: hg38 mm10
    for DB in hg38 mm10
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > genes/${DB}.gp.gz
    printf "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# hg38: checked: 21554 failed: 0
# mm10: checked: 21100 failed: 0

    #   2. ensGene: cavPor3 tupBel1
    for DB in cavPor3 tupBel1
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    printf "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# cavPor3: checked: 18034 failed: 0
# tupBel1: checked: 15407 failed: 0

    # sanity check: count the distinct gene names in each single-cover
    # gene set; the number should agree with the genePredCheck totals
    for gpFile in genes/*.gz; do
        printf '# %s: ' "$gpFile"
        zcat "$gpFile" | cut -f1 | sort -u | wc -l
    done
# genes/cavPor3.gp.gz: 18034
# genes/hg38.gp.gz: 21554
# genes/mm10.gp.gz: 21100
# genes/tupBel1.gp.gz: 15407

    time (cat ../anno/cavPor3.6way.maf \
	| genePredToMafFrames cavPor3 stdin stdout \
          `sed -e 's/tupChi1//; s/galVar1//;' ../species.list.txt | xargs echo \
            | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \
		| gzip > multiz6wayFrames.bed.gz)
    # real    2m42.982s

    # verify there are frames on every species that has a gene set; expect
    # 4 of the 6 species (tupChi1 and galVar1 have no gene set, see above):
    zcat multiz6wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
       | sed -e 's/^/# /;'
#  178463 cavPor3
#  246612 hg38
#  234751 mm10
#  201885 tupBel1

    #   load the resulting file
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz6way/frames
    time hgLoadMafFrames cavPor3 multiz6wayFrames multiz6wayFrames.bed.gz
    #   real    0m9.457s

    time featureBits -countGaps cavPor3 multiz6wayFrames
    # 35046322 bases of 2723219641 (1.287%) in intersection
    # real    0m6.884s

    #   enable the trackDb entries:
# frames multiz6wayFrames
# irows on
    #   appears to work OK

#########################################################################
# Phylogenetic tree from 6-way (DONE - 2017-12-17 - Hiram)
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/4d
    cd /hive/data/genomes/cavPor3/bed/multiz6way/4d

    # using the ensGene
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" cavPor3 \
      | genePredSingleCover stdin stdout > cavPor3.ensGeneNR.gp
    genePredCheck -db=cavPor3 cavPor3.ensGeneNR.gp
    # checked: 18034 failed: 0

    # the annotated maf is (long listing without owner/group columns):
    ls -og ../anno/cavPor3.6way.maf
# -rw-rw-r-- 1 13063660432 May  4 10:33 ../anno/cavPor3.6way.maf

    mkdir annoSplit
    cd annoSplit
    time mafSplit -verbose=2 -outDirDepth=2 -byTarget -useFullSequenceName \
	/dev/null . ../../anno/cavPor3.6way.maf
    # real    4m19.176s

    find . -type f | wc -l
    #   933
    ssh ku
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/4d/run
    cd /hive/data/genomes/cavPor3/bed/multiz6way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set GP = cavPor3.ensGeneNR.gp
set r = "/hive/data/genomes/cavPor3/bed/multiz6way"
set c = $1:r
set infile = $r/4d/annoSplit/$2
set outDir = $r/4d/mfa/$3:h
set outfile = $r/4d/mfa/$3
/bin/mkdir -p $outDir
cd /dev/shm
/bin/awk -v C=$c '"'"'$2 == C {print}'"'"' $r/4d/$GP | sed -e "s/\\t$c\\t/\\tcavPor3.$c\\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '"'"'{print $1}'"'"'`
echo $NL
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
else
    echo "" > $outfile
endif
/bin/rm -f /dev/shm/$c.gp /dev/shm/$c.ss
' > 4d.csh

    chmod +x 4d.csh

    find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list
    wc -l maf.list
# 76237 maf.list

    printf '#LOOP
4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(dir2)$(root1).mfa}
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check
    para time
# Completed: 75895 of 76237 jobs
# Crashed: 342 jobs
# CPU time in finished jobs:       2831s      47.19m     0.79h    0.03d  0.000 y
# IO & Wait Time:                192714s    3211.90m    53.53h    2.23d  0.006 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest finished job:               6s       0.10m     0.00h    0.00d
# Submission to last job:          1616s      26.92m     0.46h    0.02d

    # Not all results have contents, or finish successfully, that is OK
    # it is because not all contigs have genes, only gene sequences are measured

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz6way/4d
    # remove the broken empty files, size 0 and size 1 byte (4d.csh writes
    # a lone newline when a contig has no genes).
    # NOTE: a bare 'find -size 1' counts 512-byte blocks, rounding up, so it
    # matches every file of 1 to 512 bytes and can delete valid small
    # results; the 'c' suffix makes the test count bytes.  '-size -2c'
    # selects files of fewer than 2 bytes, i.e. exactly the 0 and 1 byte files.
    find ./mfa -type f -size -2c -print > empty.list
    xargs rm -f < empty.list
    # see what is left:
    ls -ogrt mfa/*/*/*.mfa | sort -k3nr | wc
    #  8027   66189  473683

    # want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" mfa/*/*/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    1m10.731s

    # check they are all in there:
    grep "^>" 4d.all.mfa | wc -l
    # 6
    grep "^>" 4d.all.mfa | sed -e 's/^/# /;'
# >cavPor3
# >tupBel1
# >hg38
# >mm10
# >galVar1

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../cavPor3.6way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh
    # tree_commas.nh looks like:
    # (((cavPor3,tupBel1),hg38),(mm10,galVar1))

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #  real    0m1.209s

    mv phyloFit.mod all.mod

    grep TREE all.mod
# TREE:
# (((cavPor3:0.101018,tupBel1:0.179618):0.00922862,hg38:0.116764):0.0263261,
#    (mm10:0.281608,galVar1:0.209143):0.0263261);

    # compare these calculated lengths to the tree extracted from 218way:
    grep TREE all.mod | sed -e 's/TREE: //' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep cavPor3 \
          | sed -e "s/cavPor3.//;"  | sort > new.dists
    /cluster/bin/phast/all_dists ../cavPor3.6way.nh | grep cavPor3 \
        | sed -e "s/cavPor3.//;" | sort > old.dists
     # printing out the 'new', the 'old' the 'difference' and percent difference
    join new.dists old.dists | awk '{
  printf "#\t%s\t%8.5f\t%8.5f\t%8.5f\t%8.5f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \
      | sort -k3n
#       galVar1  0.38071         0.35378         0.02693         7.61247
#       hg38     0.38638         0.36275         0.02363         6.51503
#       tupChi1  0.46067         0.39377         0.06690        16.98914
#       tupBel1  0.46482         0.40998         0.05484        13.37633
#       mm10     0.50607         0.49120         0.01487         3.02726

XXX - ready to continue - Sun Dec 17 21:00:07 PST 2017

#########################################################################
# phastCons 6-way (TBD - 2016-06-06 - Hiram)
    # split 6way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
    ssh ku
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS
    mkdir result done

    printf '#!/bin/csh -ef
set d = $1
set c = $2
set doneDir = done/$d
set MAF = /hive/data/genomes/cavPor3/bed/multiz6way/anno/result/$d/$c.maf
set WINDOWS = /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS/result/$d/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $3 ) then
    exit 0
endif
if ( -s $3.running ) then
    exit 0
endif

/bin/mkdir -p $doneDir
/bin/date >> $3.running

/bin/rm -fr $WINDOWS
/bin/mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \\
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
/bin/date >> $3
/bin/rm -f $3.running
' > mkSS.csh

    chmod +x mkSS.csh

    printf '#LOOP
mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)}
#ENDLOOP
' > template

    find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list
    wc -l maf.list
# 76237 maf.list

    ssh ku
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/SS

    gensub2 maf.list single template jobList
    # beware overwhelming the cluster with these quick high I/O jobs
    para create jobList
    para try ... check ... etc
    para -maxJob=64 push
# Completed: 76237 of 76237 jobs
# CPU time in finished jobs:       3491s      58.19m     0.97h    0.04d  0.000 y
# IO & Wait Time:                321266s    5354.43m    89.24h    3.72d  0.010 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:          1583s      26.38m     0.44h    0.02d


    find ./result -type f | wc -l
    # 24863

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/cavPor3/bed/multiz6way/cons/run.cons
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/run.cons

    #	This is setup for multiple runs based on subsets, but only running
    #   the 'all' subset here.
    #   It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  Running:
    #	all and vertebrates

    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set d = $2
set f = $3
set len = $4
set cov = $5
set rho = $6
set grp = $cwd:t
set cons = /hive/data/genomes/cavPor3/bed/multiz6way/cons
set tmp = $cons/tmp/${d}_${c}
mkdir -p $tmp
set ssSrc = $cons/SS/result
set useGrp = "$grp.mod"
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/$d/$f $tmp
else
  ln -s $ssSrc/$d/$f $tmp
  ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \\
    --not-informative `cat $grp.non-inf` \\
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
else
  $PHASTBIN/phastCons $f $useGrp \\
    --rho $rho --expected-length $len --target-coverage $cov --quiet \\
    --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
endif
popd > /dev/null
mkdir -p pp/$d bed/$d
sleep 4
touch pp/$d bed/$d
rm -f pp/$d/$c.pp
rm -f bed/$d/$c.bed
mv $tmp/$c.pp pp/$d
mv $tmp/$c.bed bed/$d
rm -fr $tmp
rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h
' > doPhast.csh

    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    printf '#LOOP
../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp}
#ENDLOOP
' > template

    find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list
    wc -l ss.list
    #	24863 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=32g create jobList
    para try ... check ...
    para push
# Completed: 24816 of 24863 jobs
# Crashed: 38 jobs
# CPU time in finished jobs:       6246s     104.09m     1.73h    0.07d  0.000 y
# IO & Wait Time:                222621s    3710.36m    61.84h    2.58d  0.007 y
# Average job time:                   9s       0.16m     0.00h    0.00d
# Longest finished job:              18s       0.30m     0.01h    0.00d
# Submission to last job:           936s      15.60m     0.26h    0.01d

   # the 38 crash jobs were actually finished, they failed the last rmdir:
# rmdir: failed to remove `/hive/data/genomes/cavPor3/bed/multiz6way/cons/tmp/7/6': No such file or directory

    # create Most Conserved track
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/all
    time cut -f1 ../../../../chrom.sizes | while read C
do
    ls -d bed/?/?/${C} 2> /dev/null | while read D
    do
        echo ${D}/${C}*.bed 1>&2
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $6, $6;}'
done > tmpMostConserved.bed
    # real    19m26.846s

    time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \
         > mostConserved.bed
    # real    0m6.667s

    # -rw-rw-r--  1 36626033 May  6 14:55 mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/all
    time hgLoadBed cavPor3 phastConsElements6way mostConserved.bed
    # Read 932866 elements of size 6 from mostConserved.bed
    # real    0m9.898s

    # on human we often try for 6% overall cov, and 70% CDS cov
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    time featureBits cavPor3 -enrichment ensGene:cds phastConsElements6way
# ensGene:cds 1.217%, phastConsElements6way 3.976%, both 0.819%,
#   cover 67.27%, enrich 16.92x
    # real    2m33.330s

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/cavPor3/bed/multiz6way/cons/all
    mkdir downloads

    # the third sed fixes the chrom names, removing the partition extensions
    time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \
        | gzip -c > downloads/phastCons6way.wigFix.gz)
    #   real    27m47.146s
# -rw-rw-r-- 1 2207346680 May  6 16:33 phastCons6way.wigFix.gz

    # check integrity of data with wigToBigWig
    time (zcat downloads/phastCons6way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \
	    phastCons6way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=37326: VmPeak:    20943944 kB
    # real    30m30.283s

    bigWigInfo phastCons6way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 3,364,371,979
# primaryIndexSize: 81,969,008
# zoomLevels: 10
# chromCount: 24863
# basesCovered: 1,910,088,693
# mean: 0.117236
# min: 0.000000
# max: 1.000000
# std: 0.237776

    #	encode those files into wiggle data
    time (zcat downloads/phastCons6way.wigFix.gz \
	| wigEncode stdin phastCons6way.wig phastCons6way.wib)
    # Converted stdin, upper limit 1.00, lower limit 0.00
    #  real    10m31.797s

    du -hsc *.wi?
    #  1.8G    phastCons6way.wib
    #  276M    phastCons6way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons6way.wib /gbdb/cavPor3/multiz6way/phastCons6way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz6way \
	cavPor3 phastCons6way phastCons6way.wig
    #   real    0m30.803s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh cavPor3 phastCons6way
# db.table               min max mean       count sumData
# cavPor3.phastCons6way   0 1 0.117236 1910088693 2.23929e+08
#     stdDev  viewLimits
#    0.237776 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram -db=cavPor3 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons6way > histogram.data 2>&1
    #	real    4m6.489s

    #	create plot of histogram:

    # gnuplot script: relative frequency (impulses, left axis) and cumulative
    # relative frequency (line, right axis) from columns 6 and 7 of
    # histogram.data, plotted against the bin value in column 2.
    # The title names the organism of this assembly (cavPor3 == Guinea pig).
    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Guinea pig cavPor3 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:6 title " RelFreq" with impulses, \\
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    display histo.png &

#########################################################################
# phyloP for 6-way (TBD - 2016-06-09,11 - Hiram)
    # run phyloP with score=LRT
    ssh ku
    mkdir /cluster/data/cavPor3/bed/multiz6way/consPhyloP
    cd /cluster/data/cavPor3/bed/multiz6way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.662
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../4d/all.mod 0.662 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.219000 0.281000 0.281000 0.219000 

    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set d = $f:h
set file1 = $f:t
set out = $2
set cName = $f:t:r
set grp = $cwd:t
set cons = /hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "/hive/data/genomes/cavPor3/bed/multiz6way/cons/SS/result/$f"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \\
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$file1.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
' > doPhyloP.csh

    chmod +x doPhyloP.csh

    # Create list of chunks: every .ss file under cons/SS/result, recorded
    # relative to that directory and without the .ss suffix.
    # The dots are escaped so that only a literal ".ss" suffix is matched
    # and stripped (an unescaped "." matches any character).
    find ../../cons/SS/result -type f | grep '\.ss$' \
	| sed -e 's/\.ss$//; s#^\.\./\.\./cons/SS/result/##' > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	24863 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    printf '#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
' > template

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP/all
    cd /hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    #	beware overwhelming the cluster with these fast running high I/O jobs
    para create jobList
    para try ... check ... push ... etc ...
    para -maxJob=63 push
    para time > run.time
# Completed: 24862 of 24863 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       7617s     126.29m     2.09h    0.09d  0.000 y
# IO & Wait Time:                166287s    2771.46m    46.19h    1.92d  0.006 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest finished job:              11s       0.18m     0.00h    0.00d
# Submission to last job:          1799s      29.98m     0.60h    0.02d

    # the one failed job was just the last rmdir command:
# /bin/rmdir: failed to remove `/hive/data/genomes/cavPor3/bed/multiz6way/consPhyloP/tmp/all/6/2

    mkdir downloads

    time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
	| gzip -c > downloads/phyloP6way.wigFix.gz)
    #   real    30m20.672s

    # check integrity of data with wigToBigWig
    time (zcat downloads/phyloP6way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/cavPor3/chrom.sizes \
	phyloP6way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=19896: VmPeak:    20943916 kB
    # real    216m36.688s

    bigWigInfo phyloP6way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 3,067,676,886
# primaryIndexSize: 81,969,008
# zoomLevels: 10
# chromCount: 24863
# basesCovered: 1,910,088,693
# mean: 0.061208
# min: -2.699000
# max: 0.833000
# std: 0.611669

    #	encode those files into wiggle data
    time (zcat downloads/phyloP6way.wigFix.gz \
	| wigEncode stdin phyloP6way.wig phyloP6way.wib)
    # Converted stdin, upper limit 0.83, lower limit -2.70
    #    real    9m4.643s

    du -hsc *.wi?
    # 1.8G    phyloP6way.wib
    # 279M    phyloP6way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP6way.wib /gbdb/cavPor3/multiz6way/phyloP6way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/cavPor3/multiz6way cavPor3 \
	phyloP6way phyloP6way.wig
    # real    0m30.869s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh cavPor3 phyloP6way
# db.table      min max mean count sumData
# cavPor3.phyloP6way  -2.699 0.833 0.0612084 1910088693 1.16914e+08
#       stdDev viewLimits
#     0.611669 viewLimits=-2.699:0.833

    #	that range is: 0.833+2.699 = 3.532 for hBinSize=0.003532

    #  Create histogram to get an overview of all the data
    #  hMinVal is the table minimum reported by wigTableStats.sh and
    #  hBinSize is range/hBinCount, so the 1000 bins span min..max
    time hgWiggle -doHistogram \
	-hBinSize=0.003532 -hBinCount=1000 -hMinVal=-2.699 -verbose=2 \
	    -db=cavPor3 phyloP6way > histogram.data 2>&1
    # real    4m20.444s

    # find the Y range for the 2:6 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=6 stdin \
      | sed -e 's/^/# /;'
# Q1 0.000087
# median 0.000381
# Q3 0.001361
# average 0.001112
# min 0.000000
# max 0.032088
# count 899
# total 0.999990
# standard deviation 0.002166

    # find the X range for the 2:6 graph
    grep "^[0-9]" histogram.data | ave -col=2 stdin \
      | sed -e 's/^/# /;'
# Q1 -1.773070
# median -1.004300
# Q3 -0.228672
# average -0.962349
# min -2.669000
# max 0.831816
# count 899
# total -866.161612
# standard deviation 0.964068

    #	create plot of histogram:
    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Guinea pig cavPor3 Histogram phyloP6way track"
set xlabel " phyloP6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set xtics
set xrange [-2.6:0.86]
set yrange [0:0.033]

plot "histogram.data" using 2:6 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    display histo.png &
    # appears to have an odd hole in the data near X=0 ?

#############################################################################
# hgPal downloads (TBD - 2016-06-09,11 - Hiram)
#   FASTA from 6-way for ensGene

    ssh hgwdev
    screen -S cavPor3HgPal
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cd /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # Generate $gp.jobs: a /bin/sh script with one pair of mafGene commands
    # (nucleotide and amino-acid exon FASTA) per sequence in chrom.sizes,
    # each run in the background, with a "wait" barrier after every 17
    # sequences.  Outputs go in exonNuc/NNN and exonAA/NNN where NNN is
    # int(sequence count / 1000), to keep directory sizes manageable.
    # (bash syntax) this for loop is slow on this large count contig
    # assembly, see the timing below
    export mz=multiz6way
    export gp=ensGene
    export db=cavPor3
    I=0
    D=0
    mkdir exonAA exonNuc
    printf '#!/bin/sh\n' > $gp.jobs

    time for C in $(sort -nk2 ../../../chrom.sizes | cut -f1)
    do
        # I counts sequences since the last barrier, D counts all sequences
        I=$((I + 1))
        D=$((D + 1))
        dNum=$(printf '%03d' $((D / 1000)))
        mkdir -p exonNuc/${dNum} exonAA/${dNum} > /dev/null
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ $I -gt 16 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done >> $gp.jobs
    # real    116m16.333s


    # final barrier for the last, partial group of background jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    chmod +x $gp.jobs

    time (./$gp.jobs) > $gp.jobs.log 2>&1 &
    # real    14m50.760s

    export mz=multiz6way
    export gp=ensGene
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonAA.fa.gz
    #  real    4m20.026s

    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    4m46.761s

  #  -rw-rw-r--   1 36201970 May 11 11:26 ensGene.multiz6way.exonAA.fa.gz
  #  -rw-rw-r--   1 69404213 May 11 11:30 ensGene.multiz6way.exonNuc.fa.gz

    # publish the combined exon FASTA files: checksum them, then symlink
    # the files and the checksum list into the download alignments directory
    export mz=multiz6way
    export gp=ensGene
    export db=cavPor3
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    # the per-sequence intermediate files are no longer needed
    rm -rf exonAA exonNuc

#############################################################################
# construct download files for 6-way (TBD - 2016-06-11 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz6way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons6way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP6way
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/downloads
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads
    mkdir multiz6way phastCons6way phyloP6way
    cd multiz6way
    time cp -p ../../anno/cavPor3.6way.maf .
    #   real    0m21.617s

    # -rw-rw-r-- 1 13600527773 Dec 16 23:23 cavPor3.6way.maf

    du -hsc *
    #  13G     cavPor3.6way.maf

    time gzip *.maf
    #   real    37m41.433s

    # -rw-rw-r-- 1 3122744783 Dec 16 23:23 cavPor3.6way.maf.gz

    du -hsc *.maf.gz ../../anno/*.maf
    #  3.0G    cavPor3.6way.maf.gz
    #  13G     ../../anno/cavPor3.6way.maf

    # extract the fitted tree from the 4d model and write it, plus versions
    # relabeled with common and scientific names, as indented .nh files
    grep TREE ../../4d/all.mod | awk '{print $NF}' \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.6way.nh
    ~/kent/src/hg/utils/phyloTrees/commonNames.sh cavPor3.6way.nh \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > cavPor3.6way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh cavPor3.6way.nh \
	| ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
	    > cavPor3.6way.scientificNames.nh
    # checksums for everything published from this directory so far
    time md5sum *.nh *.maf.gz > md5sum.txt
    #   real    0m36.144s

    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz6way

    du -hsc *.maf.gz ../../anno/cavPor3.6way.maf
    #  3.0G     cavPor3.6way.maf.gz
    #  13G     ../../anno/cavPor3.6way.maf

    # obtain the README.txt from galVar1/multiz6way and update for this
    #   situation

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads/phastCons6way

    # collect the phastCons products under their download names, checksum
    # them, then expose the whole directory on the download server
    ln -s ../../cons/all/downloads/phastCons6way.wigFix.gz \
        ./cavPor3.phastCons6way.wigFix.gz
    ln -s ../../cons/all/phastCons6way.bw ./cavPor3.phastCons6way.bw
    ln -s ../../cons/all/all.mod ./cavPor3.phastCons6way.mod
    time md5sum *.gz *.mod *.bw > md5sum.txt
    #   real    0m20.364s

    # obtain the README.txt from galVar1/phastCons6way and update for this
    #   situation
    ln -s `pwd`/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phastCons6way

    #####################################################################
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads/phyloP6way

    # collect the phyloP products under their download names, checksum
    # them, then expose the whole directory on the download server
    ln -s ../../consPhyloP/all/downloads/phyloP6way.wigFix.gz \
        ./cavPor3.phyloP6way.wigFix.gz
    ln -s ../../consPhyloP/run.phyloP/all.mod cavPor3.phyloP6way.mod
    ln -s ../../consPhyloP/all/phyloP6way.bw cavPor3.phyloP6way.bw

    time md5sum *.mod *.bw *.gz > md5sum.txt
    #   real    0m29.662s

    # obtain the README.txt from galVar1/phyloP6way and update for this
    #   situation
    ln -s `pwd`/* \
      /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/phyloP6way

    ###########################################################################
    ## create upstream ensGene maf files
    cd /hive/data/genomes/cavPor3/bed/multiz6way/downloads/multiz6way
    # bash script
    # For each upstream size: take the regions upstream of the ${geneTbl}
    # genes from featureBits, rewrite the "_up..." name suffix into a
    # separate 0 column, sort by position, and extract the matching
    # multiz6way alignments with mafFrags in species.list order.
#!/bin/sh
export geneTbl="ensGene"
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits cavPor3 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags cavPor3 multiz6way \
                stdin stdout \
                -orgs=/hive/data/genomes/cavPor3/bed/multiz6way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    #   real    12m47.636s

    # some other symlinks were already made above
    # obtain the README.txt from galVar1/multiz6way and update for this
    #   situation

    # regenerate the checksum list from scratch: *.maf.gz already matches the
    # upstream*.maf.gz files, and appending to the existing list would
    # duplicate the *.nh and *.maf.gz entries recorded earlier
    md5sum *.nh *.maf.gz README.txt > md5sum.txt

    ln -s `pwd`/upstream*.gz `pwd`/README.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/cavPor3/multiz6way

#############################################################################
# hgPal downloads (TBD - 2016-06-11 - Hiram)
#   FASTA from 6-way for xenoRefGene (the only gene track processed below)

    ssh hgwdev
    screen -S cavPor3HgPal
    mkdir /hive/data/genomes/cavPor3/bed/multiz6way/pal
    cd /hive/data/genomes/cavPor3/bed/multiz6way/pal
    # one species per line, in the order given by species.list
    tr '[ ]' '[\n]' < ../species.list > order.list

    # Write a job list with two mafGene commands per chrom name from
    # chrom.sizes (nucleotide and amino-acid exon fasta), each run in the
    # background.  I counts jobs since the last "wait" so that a
    # "date; wait" barrier is emitted after every 17 chrom names; D counts
    # all chrom names and picks a numbered output subdirectory per 1000.
    # Running the finished list takes hours on this large contig count
    # assembly (timing recorded below).
    export mz=multiz6way
    export gp=xenoRefGene
    export db=cavPor3
    export I=0
    export D=0
    mkdir exonAA exonNuc
    {
        for C in $(sort -nk2 ../../../chrom.sizes | cut -f1)
        do
            I=$((I + 1))
            D=$((D + 1))
            dNum=$(printf "%03d" $((D / 1000)))
            mkdir -p exonNuc/${dNum} exonAA/${dNum} > /dev/null
            echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
            echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
            if [ $I -gt 16 ]; then
                echo "date"
                echo "wait"
                I=0
            fi
        done
        # final barrier for the last, possibly partial, batch
        echo "date"
        echo "wait"
    } > $gp.jobs

    time sh -x ./$gp.jobs > $gp.jobs.log 2>&1 &
    # real    176m60.376s


    # merge the per-chrom fasta files of each type into a single gzipped
    # download file, timing each merge separately
    export mz=multiz6way
    export gp=xenoRefGene
    for T in exonAA exonNuc
    do
        time find ./$T -type f | grep $T.fa.gz | xargs zcat \
         | gzip -c > $gp.$mz.$T.fa.gz
    done
    # exonAA:  real    10m29.600s
    # exonNuc: real    16m9.974s

  # -rw-rw-r--   1 611281644 Apr 16 20:37 xenoRefGene.multiz6way.exonAA.fa.gz
  # -rw-rw-r--   1 966671426 Apr 16 21:06 xenoRefGene.multiz6way.exonNuc.fa.gz

    # publish the merged fasta files and their checksums in the
    # alignments/ directory of the download server
    export mz=multiz6way
    export gp=xenoRefGene
    export db=cavPor3
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p "$pd"
    # the tool is md5sum (there is no md6sum command); result in md5sum.txt
    md5sum *.fa.gz > md5sum.txt
    # the download names drop the $mz component of the local file names
    ln -s "$(pwd)/$gp.$mz.exonAA.fa.gz" "$pd/$gp.exonAA.fa.gz"
    ln -s "$(pwd)/$gp.$mz.exonNuc.fa.gz" "$pd/$gp.exonNuc.fa.gz"
    ln -s "$(pwd)/md5sum.txt" "$pd/"

    # the per-chrom intermediate files are no longer needed
    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 6-way (DONE - 2017-12-18 - Hiram)
    mkdir /hive/users/hiram/bigWays/cavPor3.6way
    cd /hive/users/hiram/bigWays
    # ordered.list: cavPor3 first, then the first column of the
    # 6way.distances.txt file
    {
        echo "cavPor3"
        awk '{print $1}' \
            /hive/data/genomes/cavPor3/bed/multiz6way/6way.distances.txt
    } > cavPor3.6way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh cavPor3.6way/ordered.list

    # dbDb.sh constructs cavPor3.6way/CavPor3_6-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    ./dbDb.sh cavPor3 6way

    # sizeStats.pl constructs cavPor3.6way/CavPor3_6-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl cavPor3 6way

    # defCheck.pl constructs CavPor3_6-way_conservation_lastz_parameters.html
    ./defCheck.pl cavPor3 6way

    # this constructs the html pages in cavPor3.6way/:
# -rw-rw-r--  3818 Dec 18 13:48 CavPor3_6-way_conservation_alignment.html
# -rw-rw-r--  5480 Dec 18 13:48 CavPor3_6-way_Genome_size_statistics.html
# -rw-rw-r--  3595 Dec 18 13:48 CavPor3_6-way_conservation_lastz_parameters.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
#  CavPor3_6-way_conservation_alignment
#  CavPor3_6-way_Genome_size_statistics
#  CavPor3_6-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

############################################################################
