# for emacs: -*- mode: sh; -*-

#	The UCSC build oryLat1 had a broken chrUn sequence
#	It had been put together incorrectly here.
#	This oryLat2 build is for the same sequence as was in oryLat1
#	but re-worked to fix our chrUn so we can make it correspond
#	to the Ensembl sequence.

#	http://www.ncbi.nlm.nih.gov/nuccore/144600648

# This file describes browser build for the Medaka
# genome, Oryzias latipes, October 2005, MEDAKA1 from
#	Institute of Genetics (NIG) and the University of Tokyo
#	NIG: http://dolphin.lab.nig.ac.jp/medaka/
#	UTGB: http://medaka.utgenome.org/o
#	Data release policy: http://medaka.utgenome.org/#policy
#	Ensembl: http://www.ensembl.org/Oryzias_latipes/index.html
#
#	"$Id: oryLat2.txt,v 1.17 2009/11/25 21:48:41 hiram Exp $"
#
###########################################################################

    ssh kkstore04
    mkdir /cluster/store9/oryLat2
    ln -s /cluster/store9/oryLat2 /cluster/data/oryLat2
    mkdir /cluster/data/oryLat2/download

#	The following note was received from Bronwen Aken at Ensembl
#	describing the AGP files they used in their assembly:

#    The medaka assembly that we've used consists of:
#    - contigs as the basic unit of assembly
#    - scaffolds that are made up of contigs
#    - ultracontigs that are made up of scaffolds but can't be placed on a
#    chromosome
#    - chromsomes that are made up of scaffolds
#
#    I have attached 4 gzipped AGP files and they correspond to the above
#    four levels of assembly. The first three files were given to us by
#    University of Tokyo. The assembly shown in these files, called
#    HdrR_200510, was not submitted to a public repository. The next
#    assembly, version 1.0, is the one with modified gap sizes. Version 1.0
#    is the assembly used by UCSC and it has been submitted to Genbank/DDBJ/EMBL
#
#
#    The files are:
#    (i) 200510.chr.agp.txt.gz
#    This file specifies where scaffolds are found on a chromosome and how
#    big the gap is between each scaffold
#
#    (ii) 200510.ultracontig.agp.txt.gz
#    This file specifies where scaffolds are found on an ultracontig and how
#    big the gap is between each scaffold
#
#    (iii) 200510.scaffold.agp.txt.gz
#    This file specifies where contigs are found on a scaffold and how big
#    the gap is between each contig
#
#    (iv) contig_fake_scaffolds.agp.txt.gz
#    There were 92 contigs that did not occur in any of the above agp files.
#    We made our own agp file for them, to put them onto their own scaffolds.
#    This was done because Ensembl defines 'toplevel' sequence from which we
#    do our analyses. (In the toplevel sequence for a genome, every contig
#    must occur exactly once.)
#
#    Hope that helps,
#    Bronwen
#
#	These four agp files were extracted from email and dropped into:
#	/cluster/data/oryLat2/download

	wget --timestamping \
"ftp://ftp.ensembl.org/pub/release-50/fasta/oryzias_latipes/dna/Oryzias_latipes.MEDAKA1.50.dna.toplevel.fa.gz" \
-O ensembl.MEDAKA1.50.dna.toplevel.fa.gz

	wget --timestamping \
"ftp://ftp.ensembl.org/pub/release-50/fasta/oryzias_latipes/dna/Oryzias_latipes.MEDAKA1.50.dna.nonchromosomal.fa.gz" \
-O ensembl.MEDAKA1.50.dna.nonchromosomal.fa.gz

for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
do
	wget --timestamping \
"ftp://ftp.ensembl.org/pub/release-50/fasta/oryzias_latipes/dna/Oryzias_latipes.MEDAKA1.50.dna.chromosome.$C.fa.gz" \
-O ensembl.MEDAKA1.50.dna.chromosome.$C.fa.gz
	echo  ensembl.MEDAKA1.50.dna.chromosome.$C.fa.gz
done

#  Taking a look at those AGP files from Bronwen and these fasta files.
#	Determined that all of the DNA is actually in the single file
#	ensembl.MEDAKA1.50.dna.toplevel.fa.gz
#	And a combination of bits from the four AGP files gets a complete
#	AGP file to match this fasta file,
#	all of: 200510.chr.agp.txt.gz
#	Then, 79 sequences from: 200510.ultracontig.agp.txt.gz
#	Then, 7001 sequences from: 200510.scaffold.agp.txt.gz
#	and the entire contents of: contig_fake_scaffolds.agp.txt.gz
#	The specifications from 200510.chr.agp.txt.gz had gaps at the
#	ends of most chromosomes but there were no N's at that location
#	within the fasta file, so, edit the resulting conglomerate AGP
#	file to remove those gaps at the ends.  Can now pass the
#	test for checkAgpAndFa.  It would be tough to create a chrUn
#	sequence, so, let's try it without

##############################################################################
    ssh hgwdev
    cd /cluster/data/oryLat2
    #	the config.ra script pretty much specifies everything
    cat << '_EOF_' >  oryLat2.ra
db oryLat2
scientificName Oryzias latipes
assemblyDate Oct. 2005
assemblyLabel MEDAKA1
#       Just above oryLat1
orderKey 474
#       NC_004387
mitoAcc 25057156
fastaFiles /cluster/data/oryLat2/ucsc/ucsc.source.fa
dbDbSpeciesDir medaka
agpFiles /cluster/data/oryLat2/ucsc/chromUltraScaff.agp
commonName Medaka
clade Vertebrate
genomeCladePriority 105
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl oryLat2.ra -stop=agp -workhorse=hgwdev
    makeGenomeDb.pl oryLat2.ra -continue=db -workhorse=hgwdev
    #	That worked just fine.

###############################################################################
#  Quick check to see if the Ensembl genes will now map properly to
#	this sequence:
#  oryLat2 - Medaka - Ensembl Genes version 50  (DONE - 2008-08-12 - hiram)
    ssh kkstore04
    cd /cluster/data/oryLat2
    cat << '_EOF_' > oryLat2.ensGene.ra
# required db variable
db oryLat2
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9][0-9]*\)/chr\1/; s/^MT/chrM/"
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=50 oryLat2.ensGene.ra
    ssh hgwdev
    cd /cluster/data/oryLat2/bed/ensGene.50
    featureBits oryLat2 ensGene
    # 32418913 bases of 700386597 (4.629%) in intersection
    #	Seems to work just fine, no lifing required, all genes worked

#########################################################################
# REPEATMASKER (DONE - 2008-08-12 - Hiram)
    ssh kkstore04
    screen	#	use a screen to manage this long running job
    mkdir /cluster/data/oryLat2/bed/repeatMasker
    cd /cluster/data/oryLat2/bed/repeatMasker
    # Run -debug to create the dir structure and preview the scripts:
    time nice -n +19 doRepeatMasker.pl oryLat2 -workhorse=kolossus \
	-buildDir=/cluster/data/oryLat2/bed/repeatMasker \
	-bigClusterHub=pk -smallClusterHub=memk > do.log 2>&1
    #	RM version from the log:
    #	RepeatMasker version development-$Id: oryLat2.txt,v 1.17 2009/11/25 21:48:41 hiram Exp $
    #	CC   RELEASE 20080801
    #   about 40 minutes run time

    # Calculate coverage to compare with other masking efforts
    featureBits oryLat2 rmsk
    #	23161223 bases of 700386597 (3.307%) in intersection

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2008-08-12 - Hiram)
    ssh kolossus
    screen	#	use a screen to manage this long running job
    mkdir /cluster/data/oryLat2/bed/simpleRepeat
    cd /cluster/data/oryLat2/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl \
	-buildDir=/cluster/data/oryLat2/bed/simpleRepeat \
	oryLat2 -smallClusterHub=memk -workhorse=kolossus > do.log 2>&1 &
    #	real    19m50.401s

    cat fb.simpleRepeat
    #	13229585 bases of 700386597 (1.889%) in intersectio

#########################################################################
# windowMasker (DONE 2008-08-12,14 - Hiram)
    ssh kolossus
    screen	#	use a screen to manage this long running job
    mkdir /cluster/data/oryLat2/bed/windowMasker
    cd /cluster/data/oryLat2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl \
	-buildDir=/cluster/data/oryLat2/bed/windowMasker \
	-workhorse=kolossus oryLat2 > do.log 2>&1 &
    #	real    99m18.161s

    twoBitToFa oryLat2.wmsk.sdust.2bit stdout | faSize stdin
    #	869000216 bases (168613619 N's 700386597 real 468915765 upper
    #	231470832 lower) in 7189 sequences in 1 files
    #	%26.64 masked total, %33.05 masked real

    #	load this initial data to get ready to clean it
    ssh hgwdev
    cd /cluster/data/oryLat2/bed/windowMasker
    hgLoadBed oryLat2 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 4784449 elements of size 3

    #	eliminate the gaps from the masking
    featureBits oryLat2 -not gap -bed=notGap.bed
    #	700386597 bases of 700386597 (100.000%) in intersection
    featureBits oryLat2 windowmaskerSdust
    #	400084451 bases of 700386597 (57.123%) in intersection

    time nice -n +19 featureBits oryLat2 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	real    6m31.017s
    #	231470832 bases of 700386597 (33.049%) in intersection

    #	reload track to get it clean
    hgLoadBed oryLat2 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 4798190 elements of size 4


    featureBits oryLat2 windowmaskerSdust
    #	231470832 bases of 700386597 (33.049%) in intersection

    #	mask the sequence with this clean mask
    cd /cluster/data/oryLat2
    zcat bed/windowMasker/cleanWMask.bed.gz \
	| twoBitMask oryLat2.unmasked.2bit stdin \
	    -type=.bed oryLat2.WMSdust.2bit
    twoBitToFa oryLat2.WMSdust.2bit stdout | faSize stdin \
        > oryLat2.WMSdust.faSize.txt
    cat oryLat2.WMSdust.faSize.txt
    #	869000216 bases (168613619 N's 700386597 real 468915765 upper
    #	231470832 lower) in 7189 sequences in 1 files
    #	%26.64 masked total, %33.05 masked real

    #	Add trfMask:
    twoBitMask oryLat2.WMSdust.2bit \
	-add bed/simpleRepeat/trfMask.bed oryLat2.2bit
    # You can safely ignore the warning about extra BED columns
    twoBitToFa oryLat2.2bit stdout | faSize stdin \
        > oryLat2.faSize.txt
    cat oryLat2.faSize.txt
    #	869000216 bases (168613619 N's 700386597 real 468647738 upper
    #	231738859 lower) in 7189 sequences in 1 files
    #	%26.67 masked total, %33.09 masked real
    #	This becomes our masked genome sequence:
    ln -s `pwd`/oryLat2.2bit /gbdb/oryLat2

#############################################################################
#  Verify all N's in sequence are marked in the gap track
#	(DONE - 2008-08-14 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/oryLat2/bed/gap
    cd /cluster/data/oryLat2/bed/gap
    time nice -n +19 findMotif -strand=+ -verbose=4 -motif=gattaca \
	../../oryLat2.unmasked.2bit > oryLat2.Ns.txt 2>&1
    #	real    0m13.949
    grep "^#GAP " oryLat2.Ns.txt | sed -e "s/#GAP //" \
	| sort -k1,1 -k2,2n > oryLat2.Ns.bed
    #	what are their sizes like:
    ave -col=5 oryLat2.Ns.bed
Q1 10.000000
median 10.000000
Q3 285.000000
average 1324.922554
min 10.000000
max 5149430.000000
count 127263
total 168613619.000000
standard deviation 26497.729501

    ssh hgwdev
    cd /cluster/data/oryLat2/bed/gap
    hgsql -N -e "select chrom,chromStart,chromEnd from gap;" oryLat2 \
	| awk '{print $1,$2,$3}' | sort -k1,1 -k2,2n | sum
    #	00926  2904
    awk '{print $1,$2,$3}'  oryLat2.Ns.bed | sort -k1,1 -k2,2n | sum
    #	00926  2904

    #	Identical sums.  All N's are included in the gaps.

    #	same exact definition.  If they were not, would have to figure
    #	out where the Ns were that were not in gaps, and add them to
    #	the gap table.  This can be achieved with featureBits.

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram)
    # Use -repMatch=200 (based on size -- for human we use 1024, and
    # medaka size is ~20% of human judging by gapless oryLat2 vs. hg18
    # genome sizes from featureBits.
    ssh kolossus
    cd /cluster/data/oryLat2
    blat oryLat2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/11.ooc -repMatch=200
# Wrote 46667 overused 11-mers to jkStuff/11.ooc
    mkdir /cluster/bluearc/scratch/data/oryLat2
    cp -p oryLat2.2bit chrom.sizes jkStuff/11.ooc \
	/cluster/bluearc/scratch/data/oryLat2
    #	Get this bluearc scratch data area copied to the kluster nodes

#########################################################################
# GENBANK ALIGNMENTS (DONE - 2008-08-14 - Hiram)
    ssh kkstore04
    cd /cluster/data/oryLat2
    simplePartition.pl oryLat2.2bit 5000000 /tmp/oryLat2P
    cat /tmp/oryLat2P/*/*.lft > jkStuff/liftAll.lft
    rm -r /tmp/oryLat2P

    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add oryLat1 just after gasAcu1

# oryLat2 (Oryzias latipes - Medaka)
oryLat2.serverGenome = /cluster/data/oryLat2/oryLat2.2bit
oryLat2.clusterGenome = /scratch/data/oryLat2/oryLat2.2bit
oryLat2.lift = /cluster/data/oryLat2/jkStuff/liftAll.lft
oryLat2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
oryLat2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
oryLat2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
oryLat2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
oryLat2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
oryLat2.ooc = /scratch/data/oryLat2/11.ooc
oryLat2.genbank.mrna.xeno.load = yes
oryLat2.refseq.mrna.native.load = yes
oryLat2.refseq.mrna.xeno.load = yes
oryLat2.downloadDir = oryLat2


    cvs ci -m "Added oryLat2." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen	# use screen to manage this long running job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial oryLat2 &
    #	logFile: var/build/logs/2008.08.14-16:39:51.oryLat2.initalign.log
    #	real    304m52.587s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oryLat2
    #	logFile: var/dbload/hgwdev/logs/2008.08.15-11:36:18.dbload.log
    #	real    10m58.023s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add oryLat2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added oryLat2." etc/align.dbs etc/hgwdev.dbs
    make etc-update

##########################################################################
## BLASTZ swap from Human Hg18 to Medaka OryLat2 (DONE - 2008-08-25 - Hiram)
    ssh kkstore02	# not too important since everything moved to the hive
    screen	# use a screen to control this job
    cd /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
    cat fb.hg18.chainOryLat2Link.txt
    #	52713428 bases of 2881515245 (1.829%) in intersection

    #	That is OK, now for the swap:
    mkdir /cluster/data/oryLat2/bed/blastz.hg18.swap
    cd /cluster/data/oryLat2/bed/blastz.hg18.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/hg18/bed/blastzOryLat2.2008-08-19/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust \
	-bigClusterHub=pk > swap.log 2>&1 &
    #	real    17m9.675s
    cat fb.oryLat2.chainHg18Link.txt
    #	46961822 bases of 700386597 (6.705%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET FR2 (DONE - 2008-08-25,27 - Hiram)
##  align to fr2 scaffolds, results lifted to fr2 chrUn coordinates
    ssh kkstore04	# not too important since everything moved to the hive
    screen	# use a screen to control this job
    mkdir /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25
    cd /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25
    cat << '_EOF_' > DEF
# Medaka vs. Fugu

# Using the "nearby" organism settings
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=200
SEQ1_LAP=10000

# QUERY: Fugu fr2
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/scratch/data/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/oryLat2/bed/blastz.fr2.2008-08-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -bigClusterHub=pk  > do.log 2>&1 &
    #	real    0m42.311s
    #	problems on pk, finish the kluster run manually, then continuing:
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -continue=cat -verbose=2 -bigClusterHub=pk \
	> cat.log 2>&1 &
    #	real    230m12.868s
    cat fb.oryLat2.chainFr2Link.txt
    #	180945351 bases of 700386597 (25.835%) in intersection

    mkdir /cluster/data/fr2/bed/blastz.oryLat2.swap
    cd /cluster/data/fr2/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/oryLat2/bed/blastz.fr2.2008-08-25/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    24m32.826s
    cat fb.fr2.chainOryLat2Link.txt
    #	153621820 bases of 393312790 (39.058%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET Medaka oryLat2 (DONE - 2008-08-26,27 - Hiram)
#	with contigs for Lamprey
    ssh kkstore01	# not too important since everything moved to hive
    screen # use screen to control this job
    mkdir /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26
    cd /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26

    cat << '_EOF_' > DEF
# Medaka vs. Lamprey

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=200
SEQ1_LAP=10000

# QUERY: Lamprey petMar1, sequence is in contigs
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=150
SEQ2_LAP=0

BASE=/cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 &
    #	real    736m11.276s
    cat fb.oryLat2.chainPetMar1Link.txt
    #	41568923 bases of 700386597 (5.935%) in intersection

    #	And, for the swap:
    mkdir /cluster/data/petMar1/bed/blastz.oryLat2.swap
    cd /cluster/data/petMar1/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=kk > swap.log 2>&1 &
    #	real    52m27.942s
    cat fb.petMar1.chainOryLat2Link.txt
    #	39181422 bases of 831696438 (4.711%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET Stickleback gasAcu1 (DONE - 2008-08-26,27 - Hiram)
#	with contigs for chrUn in Stickleback
    ssh kkstore02	# not too important since everything moved to hive
    screen # use screen to control this job
    mkdir /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26
    cd /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26

    cat << '_EOF_' > DEF
# Medaka vs. Stickleback

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=200
SEQ1_LAP=10000

# QUERY: Stickleback gasAcu1, chrUn is in contigs, lifted to chrUn
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/cluster/data/gasAcu1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=150
SEQ2_LAP=0

BASE=/cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 &
    #	real    698m10.533s
    cat fb.oryLat2.chainGasAcu1Link.txt
    #	211589769 bases of 700386597 (30.210%) in intersection

    #	And, for the swap:
    mkdir /cluster/data/gasAcu1/bed/blastz.oryLat2.swap
    cd /cluster/data/gasAcu1/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium -swap \
	/cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    31m44.711s
    cat fb.gasAcu1.chainOryLat2Link.txt
    #	189793612 bases of 446627861 (42.495%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET tetNig1 (DONE - 2008-08-26,27 - Hiram)
    ssh kkstore03	# not too important since everything moved to hive
    screen # use screen to control this job
    mkdir /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26
    cd /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26
    cat << '_EOF_' > DEF
# Medaka vs. Tetraodon

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=200
SEQ1_LAP=10000

# QUERY: Tetraodon tetNig1
SEQ2_DIR=/cluster/data/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 &
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-continue=cat -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > cat.log 2>&1 &
    #	had to finish netChains.csh manually since SEQ2_DIR was incorrect
    #	continuing:
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-continue=load -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > load.log 2>&1 &
    #	real    10m42.744s
    cat fb.oryLat2.chainTetNig1Link.txt
    #	163171121 bases of 700386597 (23.297%) in intersection

    #	And, for the swap:
    mkdir /cluster/data/tetNig1/bed/blastz.oryLat2.swap
    cd /cluster/data/tetNig1/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26/DEF \
	-swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    22m48.350s
    cat fb.tetNig1.chainOryLat2Link.txt
    #	139059872 bases of 342403326 (40.613%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET danRer5 (DONE - 2008-08-27,28 - Hiram)
    ssh kkstore01	# not too important since everything moved to hive
    screen # use screen to control this job
    mkdir /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
    cd /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
    cat << '_EOF_' > DEF
# Medaka vs. Zebrafish

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=200
SEQ1_LAP=10000

# Query - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/scratch/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-verbose=2 -tRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 &
    #	continuing after kluster difficulties
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-continue=cat -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > cat.log 2>&1 &
    #	real    744m38.748s
    cat fb.oryLat2.chainDanRer5Link.txt
    #	138070427 bases of 700386597 (19.713%) in intersection

    #	and for the swap
    mkdir /cluster/data/danRer5/bed/blastz.oryLat2.swap
    cd /cluster/data/danRer5/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
	-swap -tRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    58m15.303s
    #	had to finish the load nets manually, mistakenly included
    #	-qRepeats=windowmaskerSdust when that is not valid for danRer5
    cat fb.danRer5.chainOryLat2Link.txt
    #	158709399 bases of 1435609608 (11.055%) in intersection

    #	Then, continuing:
    doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
	-continue=download -swap -tRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > download.log 2>&1 &

#########################################################################
## 5-Way Multiz (DONE - 2008-09-03 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/oryLat2/bed/multiz5way
    cd /cluster/data/oryLat2/bed/multiz5way

    sed -e "s/oryLat1/oryLat2/; s/danRer4/danRer5/" \
	/cluster/data/oryLat1/bed/multiz5way/5way.nh > 5way.nh
    #	this file looks like:

    cat << '_EOF_' > 5way.nh
  (((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
    (stickleback_gasAcu1:0.2,medaka_oryLat2:0.2):0.2):0.292961,
        zebrafish_danRer5:0.782561);
'_EOF_'
    # << happy emacs

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for htdocs/images/phylo/oryLat2_5way.gif

    /cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt
    #	Use this output to create the table below
    grep -y oryLat2 5way.distances.txt | sort -k3,3n
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                        chainOryLat1Link   chain    linearGap
#    distance                      on oryLat2    on other   minScore
#  1  0.4000 - stickleback gasAcu1 (% 30.210) (% 42.495)       3000     medium
#  2  0.7994 - tetraodon tetNig1   (% 23.297) (% 40.613)       3000     medium
#  3  0.8399 - fugu fr2            (% 25.835) (% 39.058)       3000     medium
#  4  1.4755 - zebrafish danRer5   (% 19.713) (% 11.055)       3000     medium

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	5way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.list

    cd /cluster/data/oryLat2/bed/multiz5way
    #	bash shell syntax here ...
    export H=/cluster/data/oryLat2/bed
    mkdir mafLinks
    for G in gasAcu1 tetNig1 fr2 danRer5
    do
	mkdir mafLinks/$G
	if [ ! -d ${H}/blastz.${G}/mafNet ]; then
	echo "missing directory blastz.${G}/mafNet"
	else
	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
	fi
    done

    #	need to split these things up by chrom and scaffold names for
    #	efficient kluster run.  Using the new hive architecture.
    ssh hgwdev
    cd /hive/data/genomes/oryLat2/bed/multiz5way
    mkdir mafSplit
    for G in gasAcu1 tetNig1 fr2 danRer5
do
    echo -n "working ${G} ..."
    rm -fr mafSplit/${G}
    mkdir mafSplit/${G}
    cd mafSplit/${G}
    mafSplit /dev/null scaf ../../mafLinks/${G}/oryLat2.${G}.net.maf.gz \
	-verbose=2 -byTarget -useHashedName=16 -outDirDepth=2
    cd /hive/data/genomes/oryLat2/bed/multiz5way
    echo " done"
done
    #	create a run-time list of files to operate on, not all file names
    #	exist for all assemblies
    cd mafSplit
    for D in *
do
    cd "${D}"
    find . -type f
    cd ..
done | sort -u | sed -e "s#./##" > ../5-way.split.list
    wc -l ../5-way.split.list
    #	792 ../5-way.split.list

    # the autoMultiz cluster run
    ssh swarm
    cd /cluster/data/oryLat2/bed/multiz5way/

    mkdir splitRun
    cd splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = oryLat2
set subdir = $1
set c = $2
set result = $3
set resultDir = $result:h
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/oryLat2/bed/multiz5way/mafSplit
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp
pwd
ls -l
cat species.list
foreach s (`sed -e "s/ $db//" species.list`)
    set in = $pairs/$s/$subdir/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp -p $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
rm -f $result
cp -p $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty $run/tmp/$db
rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /hive/data/genomes/oryLat2/bed/multiz5way/splitRun/maf/$(dir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    gensub2 ../../5-way.split.list single template jobList
    para create jobList
# Completed: 792 of 792 jobs
# CPU time in finished jobs:       5423s      90.38m     1.51h    0.06d  0.000 y
# IO & Wait Time:                138287s    2304.79m    38.41h    1.60d  0.004 y
# Average job time:                 181s       3.02m     0.05h    0.00d
# Longest finished job:             404s       6.73m     0.11h    0.00d
# Submission to last job:           436s       7.27m     0.12h    0.01d
# Estimated complete:                 0s       0.00m     0.00h    0.00d

    # put the split maf results back together into a single maf file
    #	eliminate duplicate comments
    ssh hgwdev
    cd /hive/data/genomes/oryLat2/bed/multiz5way/splitRun
    grep "^##maf version" maf/0/0/scaf11300.maf \
	| sort -u > oryLat2.5way.maf
    for F in `find ./maf -depth -type f`
do
    grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
	| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> oryLat2.5way.maf
    for F in `find ./maf -depth -type f`
do
    grep -v -h "^#" "${F}"
done >> oryLat2.5way.maf
    grep "^##eof maf" maf/0/0/scaf11300.maf \
	| sort -u >> oryLat2.5way.maf

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/oryLat2/multiz5way/maf
    ln -s /hive/data/genomes/oryLat2/bed/multiz5way/splitRun/oryLat2.5way.maf \
                /gbdb/oryLat2/multiz5way/maf/multiz5way.maf
    # this generates an immense multiz5way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/oryLat2/multiz5way/maf oryLat2 multiz5way
    #	real    1m10.380s
    #	Loaded 1366931 mafs in 1 files from /gbdb/oryLat2/multiz5way/maf
    # load summary table
    time nice -n +19 cat /gbdb/oryLat2/multiz5way/maf/*.maf \
	| hgLoadMafSummary oryLat2 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz5waySummary stdin
    #	real    2m39.822
    #	Created 353577 summary blocks from 2852890 components and 1197504 mafs
    #	from stdin

    # Gap Annotation
    # prepare bed files with gap info
    mkdir /cluster/data/oryLat2/bed/multiz5way/anno
    cd /cluster/data/oryLat2/bed/multiz5way/anno
    mkdir maf run

    #	these actually already all exist from previous multiple alignments
    for DB in `cat ../species.list`
do
    CDIR="/cluster/data/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    cd run
    rm -f nBeds sizes
    for DB in `sed -e "s/ oryLat2//" ../../species.list`
do
    echo "${DB} "
    ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    ssh memk
    #	need to split up the single maf file into individual
    #	per-scaffold maf files to run annotation on
    mkdir /cluster/data/oryLat2/bed/multiz5way/anno/splitMaf
    cd /cluster/data/oryLat2/bed/multiz5way/anno/splitMaf
    #	create bed files to list approximately 112 scaffolds in
    #	a single list, approximately 64 lists
    cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $bedCount = 0;
my $i = 0;

my $bedFile = sprintf("file_%d.bed", $bedCount);

open (BF,">$bedFile") or die "can not write to $bedFile $!";

open (FH,"</cluster/data/oryLat2/chrom.sizes") or
        die "can not read /cluster/data/oryLat2/chrom.sizes $!";
while (my $line = <FH>) {
    chomp $line;
    if ( (($i + 1) % 111) == 0 )  {
        printf "%s\n", $line;
        close (BF);
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open (BF,">$bedFile") or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x mkBedLists.pl
    ./mkBedLists.pl

    #	now, run a mafsInRegion on each one of those lists
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = `pwd`
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/oryLat2/$resultDir
pushd /scratch/tmp/oryLat2/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
	/cluster/data/oryLat2/bed/multiz5way/splitRun/oryLat2.5way.maf
popd
rsync -q -a /scratch/tmp/oryLat2/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/oryLat2/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/oryLat2
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls file*.bed > runList
    gensub2 runList single template jobList
    para create jobList
    para try ... check ... push ... etc
# Completed: 65 of 65 jobs
# CPU time in finished jobs:       2615s      43.58m     0.73h    0.03d  0.000 y
# IO & Wait Time:                 11436s     190.60m     3.18h    0.13d  0.000 y
# Average job time:                 216s       3.60m     0.06h    0.00d
# Longest finished job:             493s       8.22m     0.14h    0.01d
# Submission to last job:          3875s      64.58m     1.08h    0.04d

    cd /cluster/data/oryLat2/bed/multiz5way/anno/run

    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /hive/data/genomes/oryLat2/oryLat2.2bit $result
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../splitMaf -type f -name "*.maf" > maf.list
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 5542 of 5542 jobs
# CPU time in finished jobs:        725s      12.09m     0.20h    0.01d  0.000 y
# IO & Wait Time:                 37945s     632.41m    10.54h    0.44d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest finished job:              53s       0.88m     0.01h    0.00d
# Submission to last job:           243s       4.05m     0.07h    0.00d

    ssh hgwdev
    cd /cluster/data/oryLat2/bed/multiz5way/anno
    grep "^##maf version" maf/file_0/chr1.maf \
	| sort -u > oryLat2.5way.maf
    find ./maf -type f -depth -name "*.maf" | while read F
do
    grep -v -h "^#" "${F}"
done >> oryLat2.5way.maf
    echo "##eof maf" >> oryLat2.5way.maf

    ssh hgwdev
    cd /cluster/data/oryLat2/bed/multiz5way/anno
    mkdir -p /gbdb/oryLat2/multiz5way/anno
    ln -s `pwd`/oryLat2.5way.maf \
                /gbdb/oryLat2/multiz5way/anno/multiz5way.maf
    #	by loading this into the table multiz5way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/oryLat2/multiz5way/anno \
                oryLat2 multiz5way
    #	Loaded 1620199 mafs in 1 files from /gbdb/oryLat2/multiz5way/anno
    #	real    3m19.209s

    #	normally filter this for chrom size > 1,000,000 and only load
    #	those chroms.  But this is a scaffold assembly, load everything:
    time nice -n +19 hgLoadMafSummary oryLat2 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000  multiz5waySummary \
	    /gbdb/oryLat2/multiz5way/anno/multiz5way.maf
    #	Created 353577 summary blocks from 2852890 components and 1428024 mafs
    #	from /gbdb/oryLat2/multiz5way/anno/multiz5way.maf
    #	real    3m49.692s

    #	by loading this into the table multiz5waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz5way*.tab files in this /scratch/tmp directory
    rm multiz5way*.tab
    #	And, you can remove the previously loaded non-annotated maf file link:
    rm /gbdb/oryLat2/multiz5way/maf/multiz5way.maf
    rmdir /gbdb/oryLat2/multiz5way/maf

###########################################################################
## Annotate 5-way multiple alignment with gene annotations
##		(DONE - 2008-09-03 - Hiram)
    # Gene frames, using the "best" gene set on each organism:
    #	use ensGene for oryLat2, gasAcu1, fr2, danRer5
    #	with Mrnas for tetNig1

    ssh hgwdev
    mkdir /cluster/data/oryLat2/bed/multiz5way/frames
    cd /cluster/data/oryLat2/bed/multiz5way/frames
    mkdir genes
    # ensGene
    for DB in oryLat2 gasAcu1 fr2 danRer5
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    # use the xenoMrna data for tetNig1
    for DB in tetNig1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
	   from xenoMrna,gbCdnaInfo,cds \
	   where (xenoMrna.qName = gbCdnaInfo.acc) and \
	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done

    ls -og genes
# -rw-rw-r--  1 1866277 Sep  3 14:02 oryLat2.gp.gz
# -rw-rw-r--  1 1984758 Sep  3 14:02 gasAcu1.gp.gz
# -rw-rw-r--  1 1909042 Sep  3 14:02 fr2.gp.gz
# -rw-rw-r--  1 1845404 Sep  3 14:02 danRer5.gp.gz
# -rw-rw-r--  1 1693359 Sep  3 14:13 tetNig1.gp.gz

    cd /cluster/data/oryLat2/bed/multiz5way/frames
    #	anything to annotate is in a pair, e.g.: oryLat2 genes/oryLat2.gp.gz
    time (cat ../anno/oryLat2.5way.maf | nice -n +19 genePredToMafFrames oryLat2 stdin stdout oryLat2 genes/oryLat2.gp.gz gasAcu1 genes/gasAcu1.gp.gz fr2 genes/fr2.gp.gz danRer5 genes/danRer5.gp.gz tetNig1 genes/tetNig1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
    # see what it looks like in terms of number of annotations per DB:
    zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
#	201595 oryLat2
#	213947 tetNig1
#	280670 danRer5
#	298913 gasAcu1
#	318664 fr2

    #	load the resulting file
    cd /cluster/data/oryLat2/bed/multiz5way/frames
    time nice -n +19 hgLoadMafFrames oryLat2 multiz5wayFrames \
	multiz5way.mafFrames.gz
    #	real    0m53.888s

    #	enable the trackDb entries:
# frames multiz5wayFrames
# irows on

#############################################################################
# phastCons 5-way (DONE - 2008-09-03,08 - Hiram)

    # split 5way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh memk
    mkdir /cluster/data/oryLat2/bed/multiz5way/msa.split
    cd /cluster/data/oryLat2/bed/multiz5way/msa.split
    mkdir -p /cluster/data/oryLat2/bed/multiz5way/cons/ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/oryLat2/bed/multiz5way/anno/maf
set WINDOWS = /cluster/data/oryLat2/bed/multiz5way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /cluster/data/oryLat2/oryLat2.2bit /scratch/tmp/oryLat2.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \
    -M /scratch/tmp/oryLat2.$c.fa \
    -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/oryLat2.$c.fa
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	create list of maf files:
    (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
# Completed: 5542 of 5542 jobs
# CPU time in finished jobs:        707s      11.79m     0.20h    0.01d  0.000 y
# IO & Wait Time:                 16217s     270.28m     4.50h    0.19d  0.001 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:             100s       1.67m     0.03h    0.00d
# Submission to last job:          7795s     129.92m     2.17h    0.09d

    #  create starting-tree with the largest ss file:
    cd /cluster/data/oryLat2/bed/multiz5way/cons
    time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS \
	ss/file_0/chr4.10000001-19997581.ss \
--tree "(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)" \
    --out-root starting-tree
    #	Fitting tree model to s1.1-39973033.ss using REV ...
    #	Writing model to starting-tree.mod ...
    #	real    0m8.798s

    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.417
    #	This 0.417 is used in the --gc argument below

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    cd /cluster/data/oryLat2/bed/multiz5way/cons
    find ./ss -type f | grep "/chr" | xargs ls -1l \
	| awk '$5 > 4000000 {print $9;}' \
	| sed -e "s#./ss/##" | randomLines stdin 50 randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh memk
    mkdir /cluster/data/oryLat2/bed/multiz5way/cons/treeRun1
    cd /cluster/data/oryLat2/bed/multiz5way/cons/treeRun1
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse, will be
    #	tuned exactly below
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
    /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/oryLat2/bed/multiz5way/cons/starting-tree.mod \
      --gc 0.417 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-length 10 --target-coverage 0.20 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	<< happy emacs
    chmod a+x makeTree.csh

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
./makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:       7629s     127.15m     2.12h    0.09d  0.000 y
# IO & Wait Time:                   367s       6.11m     0.10h    0.00d  0.000 y
# Average job time:                 160s       2.67m     0.04h    0.00d
# Longest finished job:             282s       4.70m     0.08h    0.00d
# Submission to last job:           518s       8.63m     0.14h    0.01d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ls -1 tree/file_0/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ave.cons.mod > cons_summary.txt
    ls -1 tree/file_0/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ave.noncons.mod > noncons_summary.txt
    cp -p ave.*.mod ..
    cd ..
    cp -p ave.*.mod /cluster/data/oryLat1/bed/multiz5way/cons

    #	measuring entropy, aiming for a PIT = 10
    #	consEntopy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	--expected-length 10 --target-coverage 0.20 \
    time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \
	0.20 10 ave.{cons,noncons}.mod
# ( Solving for new omega: 10.000000 10.103157 10.102202 )
# Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
# Relative entropy: H=0.861827 bits/site
# Expected min. length: L_min=11.328911 sites
# Expected max. length: L_max=7.054937 sites
# Phylogenetic information threshold: PIT=L_min*H=9.763558 bits
# Recommended expected length: omega=10.102202 sites (for L_min*H=9.783400)

    #	--expected-length 14 --target-coverage 0.20 \
# ( Solving for new omega: 14.000000 10.288426 9.355350 9.262718 9.261710 )
# Transition parameters: gamma=0.200000, omega=14.000000, mu=0.071429, nu=0.017857
# Relative entropy: H=0.783596 bits/site
# Expected min. length: L_min=13.493848 sites
# Expected max. length: L_max=9.090919 sites
# Phylogenetic information threshold: PIT=L_min*H=10.573722 bits
# Recommended expected length: omega=9.261710 sites (for L_min*H=9.783400)

    #	--expected-length 12 --target-coverage 0.20 \
    #	this one comes closest to PIT=10
# ( Solving for new omega: 12.000000 9.993483 9.668856 9.658335 9.658324 )
# Transition parameters: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
# Relative entropy: H=0.816888 bits/site
# Expected min. length: L_min=12.489452 sites
# Expected max. length: L_max=8.136430 sites
# Phylogenetic information threshold: PIT=L_min*H=10.202482 bits
# Recommended expected length: omega=9.658324 sites (for L_min*H=9.783400)

    #	--expected-length 13 --target-coverage 0.20 \
# ( Solving for new omega: 13.000000 10.112898 9.494883 9.455298 9.455126 )
# Transition parameters: gamma=0.200000, omega=13.000000, mu=0.076923, nu=0.019231
# Relative entropy: H=0.799098 bits/site
# Expected min. length: L_min=13.008620 sites
# Expected max. length: L_max=8.627917 sites
# Phylogenetic information threshold: PIT=L_min*H=10.395159 bits
# Recommended expected length: omega=9.455126 sites (for L_min*H=9.783400)


    featureBits oryLat2 *.bed
    #	61468061 bases of 700386597 (8.776%) in intersection

    # Run phastCons
    #	This job is I/O intensive in its output files, run it on
    #	a kluster without too many CPUs
    ssh memk
    mkdir -p /cluster/data/oryLat2/bed/multiz5way/cons/run.cons
    cd /cluster/data/oryLat2/bed/multiz5way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all gliers placentals
    #	Well, that's what it was when used in the Mm9 30-way,
    #	in this instance, there is only the directory "all"

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/oryLat2/bed/multiz5way/cons
mkdir -p $tmp
set san = /cluster/data/oryLat2/bed/multiz5way/cons
cp -p $cons/$grp/ave.cons.mod $cons/$grp/ave.noncons.mod $tmp
cp -p $san/ss/$subDir/${f}.ss $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons ${f}.ss ave.cons.mod,ave.noncons.mod \
    --expected-length $len --target-coverage $cov \
    --quiet --seqname ${c} --idpref ${c} --viterbi ${f}.bed --score > ${f}.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    # Create parasol batch and run it
    cd /cluster/data/oryLat2/bed/multiz5way/cons
    find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" > ss.list

    # run for all species
    mkdir -p all run.cons/all
    cd all
    ln -s ../ave.*cons.mod .
    cd run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 12 .2 {check out line+ /cluster/data/oryLat2/bed/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 4265 of 4265 jobs
# CPU time in finished jobs:       2722s      45.36m     0.76h    0.03d  0.000 y
# IO & Wait Time:                 28535s     475.59m     7.93h    0.33d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest finished job:              42s       0.70m     0.01h    0.00d
# Submission to last job:          1181s      19.68m     0.33h    0.01d

    # create Most Conserved track
    cd /cluster/data/oryLat2/bed/multiz5way/cons/all
    find ./bed -type f -name "*.bed" | xargs cat \
	| sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed

    # load into database
    ssh hgwdev
    cd /cluster/data/oryLat2/bed/multiz5way/cons/all
    time nice -n +19 hgLoadBed oryLat2 phastConsElements5way mostConserved.bed
    #	Loaded 1047037 elements of size 5
    #	real    0m33.582s

    # Try for 5% overall cov, and 70% CDS cov
    #	We don't have any gene tracks to compare CDS coverage
    #	--rho .31 --expected-length 45 --target-coverage .3
    featureBits oryLat2 phastConsElements5way
    #	61468061 bases of 700386597 (8.776%) in intersection

    #	creating the phastCons data files
    mkdir -p phastCons5wayScores
    #	take a look at the file names to see where the chrom files
    #	are:
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
    F=${D/pp\/}
    out=phastCons5wayScores/${F}.data.gz
    ls -S ${D}/*.pp
done | sed -e "s#/# s #g; s#\.# d #g; s#-# m #" \
    | sort -k5,5 -k7,7n | sed -e "s# s #/#g; s# d #.#g; s# m #-#" > fileList.txt

    #	the chrom files are all in file_0, with chrM in file_12
    #	therefore:
    for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 M
do
    find ./pp/file_0 ./pp/file_12 -type f | grep "/chr${C}\." \
	| sed -e "s#^./##" | sed -e "s#/# s #g; s#\.# d #g; s#-# m #" \
	| sort -k5,5 -k7,7n | sed -e "s# s #/#g; s# d #.#g; s# m #-#" \
	| while read F
    do
	cat "${F}"
    done | gzip >> phastCons5wayScores/chr${C}.data.txt.gz
    echo chr${C}.data.txt
done

    #	these files mistakenly got gzipped during the first run of
    #	this script, so now they are *.pp.gz and they are zcat
    #	And, for the rest, place them in their file_N groups
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
    F=${D/pp\/}
    out=phastCons5wayScores/${F}.data.gz
    ls -S ${D}/*.pp.gz | grep -v "/chr" \
	| sed -e "s#/# s #g; s#\.# d #g; s#-# m #" \
	| sort -k5,5 -k7,7n | sed -e "s# s #/#g; s# d #.#g; s# m #-#" \
	| xargs zcat > phastCons5wayScores/${F}.data.txt
    gzip phastCons5wayScores/${F}.data.txt
    ls -og phastCons5wayScores/${F}.data.txt.gz
done

    #	verify nothing got lost
    find ./pp -type f | xargs zcat | wc -l
    #	300140068
    zcat phastCons5wayScores/*.txt.gz | wc -l
    #	300140068

    #	can now convert to wiggle data
    ls -rt phastCons5wayScores/*.txt.gz | while read F
do
    zcat "${F}"
done | wigEncode stdin phastCons5way.wig phastCons5way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /hive/data/genomes/oryLat2/bed/multiz5way/cons/all
    ln -s `pwd`/phastCons5way.wib /gbdb/oryLat2/multiz5way/phastCons5way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/oryLat2/multiz5way oryLat2 \
	phastCons5way phastCons5way.wig
    #	real    0m13.164s
    # remove garbage
    rm wiggle.tab

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/oryLat2/bed/multiz5way/cons/all
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=oryLat2 phastCons5way > histogram.data 2>&1
    #	real    5m0.608s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Medaka oryLat2 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    #	These trackDb entries turn on the wiggle phastCons data track:
    #	type wigMaf 0.0 1.0
    #	maxHeightPixels 100:40:11
    #	wiggle phastCons5way
    #	spanList 1
    #	autoScale Off
    #	windowingFunction mean
    #	pairwiseHeight 12
    #	yLineOnOff Off

#############################################################################
# CREATE MICROSAT TRACK (DONE - 2008-09-08 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/oryLat2/bed/microsat
     cd /cluster/data/oryLat2/bed/microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
    hgLoadBed oryLat2 microsat microsat.bed

#############################################################################
# Create Quality Track (DONE - 2008-09-08 - Hiram)
    #	The quality scores obtained for oryLat1 are now correct since
    #	we haven't smashed the scaffolds and ultracontigs into chrUn
    #	So, using the file picked up for oryLat1:
    ssh hgwdev
    mkdir /cluster/data/oryLat2/bed/qual
    cd /cluster/data/oryLat2/bed/qual
    ln -s /cluster/data/oryLat1/bed/qual/medakaver1.0.UCSC.fasta.qual .

    # some of the numbers in this are incorrect, too large.  Truncate them
    #	to 90 as per advice from Budrul in email, and convert chrom numbers
    #	to UCSC chrom names
    time nice -n +19 sed -e "s/ dna:.*//; s/^>\([0-9][0-9]*\)/>chr\1/;" \
	medakaver1.0.UCSC.fasta.qual \
	| sed -e \
"s/ 1[0-9][0-9]/ 90/g; s/ 2[0-9][0-9]/ 90/g; s/^1[0-9][0-9]/90/; s/^2[0-9][0-9]/90/;" \
	| qaToQac stdin oryLat2.qual.qac
    #	real    3m0.928s

    time nice -n +19 qacToWig -fixed oryLat2.qual.qac stdout \
	| wigEncode stdin oryLat2.qual.wig oryLat2.qual.wib
    #	Converted stdin, upper limit 99.00, lower limit 0.00
    #	real    real    5m31.095s

    ssh hgwdev
    cd /cluster/data/oryLat2/bed/qual
    ln -s /cluster/data/oryLat2/bed/qual/oryLat2.qual.wib /gbdb/oryLat2/wib
    time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
	-pathPrefix=/gbdb/oryLat2/wib oryLat2 quality oryLat2.qual.wig
    #	real    0m20.513s

#############################################################################
# create downloads (DONE - 2008-09-08 - Hiram)
    ssh hgwdev
    cd /cluster/data/oryLat2
    ln -s bed/repeatMasker/oryLat2.fa.out .
    time nice -n +19 makeDownloads.pl oryLat2 > makeDownloads.log 2>&1
    #	real    7m39.978s
    #	Edit the resulting README files

    #	add upstream sequences for ensGenes version 50
    cd /cluster/data/oryLat2/goldenPath/bigZips
foreach size (1000 2000 5000)
  echo $size
  featureBits oryLat2 ensGene:upstream:$size -fa=stdout \
  | gzip -c > ensGene.upstream$size.fa.gz
end
    #	add those to the md5 sum file and the README file
    md5sum ensGene.up* >> md5sum.txt

#########################################################################
# Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram)
    # edit src/hg/makeDb/genbank/genbank.conf to add:

oryLat2.upstreamGeneTbl = ensGene
oryLat2.upstreamMaf = multiz5way /hive/data/genomes/oryLat2/bed/multiz5way/species.list

#########################################################################
# MULTIZ5WAY DOWNLOADABLES (DONE - 2008-09-08 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/oryLat2/bed/multiz5way/mafDownloads
    cd /cluster/data/oryLat2/bed/multiz5way

    # upstream mafs
    #	There isn't any refGene table, using ensGene instead
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    nice -n +19 featureBits -verbose=2 oryLat2 \
        ensGene:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags oryLat2 multiz5way \
                stdin stdout -orgs=species.list \
        | gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz
    echo "done ensGene.upstream${S}.maf.gz"
done

    cd /cluster/data/oryLat2/bed/multiz5way/mafDownloads
    cp -p ../anno/oryLat2.5way.maf .
    gzip oryLat2.5way.maf

    nice -n +19 md5sum *.maf.gz > md5sum.txt
    # Make a README.txt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way
    cd /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way
    ln -s /cluster/data/oryLat2/bed/multiz5way/mafDownloads/{*.gz,*.txt} .

#############################################################################
# making the pushQ entry (DONE - 2008-09-08 - Hiram)
    ssh hgwdev
    cd /cluster/data/oryLat2/jkStuff
    makePushQSql.pl oryLat2 > oryLat2.pushQ.sql
oryLat2 does not have seq
Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
and genbank tables) which tracks to assign these tables to:
  ensGtp
  ensPep
  multiz5way
  multiz5wayFrames
  multiz5waySummary
  phastCons5way
  phastConsElements5way
    scp -p oryLat2.pushQ.sql hiram@hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < oryLat2.pushQ.sql
    #	manually add the entry for the 5way business

#############################################################################
#  Create 5-way downloads (DONE - 2008-09-09 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/oryLat2/goldenPath/multiz5way
    mkdir /cluster/data/oryLat2/goldenPath/phastCons5way
    cd /cluster/data/oryLat2/goldenPath/phastCons5way
    ln -s ../../bed/multiz5way/cons/all/phastCons5wayScores/*.txt.gz .
    ln -s ../../bed/multiz5way/cons/ave.cons.mod 5way.conserved.mod
    ln -s ../../bed/multiz5way/cons/ave.noncons.mod 5way.non-conserved.mod
    md5sum *.gz *.mod > md5sum.txt

    #	copy a README and fix it up for here:
    cp /usr/local/apache/htdocs/goldenPath/oryLat1/phastCons5Scores/README.txt .

    mkdir /cluster/data/oryLat2/goldenPath/multiz5way
    cd /cluster/data/oryLat2/goldenPath/multiz5way
    ln -s ../../bed/multiz5way/mafDownloads/* .
    ln -s ../../bed/multiz5way/5way.nh ./oryLat2.5way.nh

    #	check the names in these upstream files to ensure sanity:
    zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
	| sort | uniq -c | sort -rn | less
    #	should be a list of the other 4 species with a high count,
    #	then Ensembl gene names, e.g.:
    #	4558 tetNig1
    #	4558 gasAcu1
    #	4558 fr2
    #	4558 danRer5
    #	1 ENSORLT00000025880
    #	1 ENSORLT00000025875
    #	1 ENSORLT00000025852
    #	... etc ...

    ssh hgwdev
    cd /cluster/data/oryLat2/goldenPath/phastCons5way
    mkdir /usr/local/apache/htdocs/goldenPath/oryLat2/phastCons5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/oryLat2/phastCons5way

    cd /cluster/data/oryLat2/goldenPath/multiz5way
    mkdir /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way

    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/oryLat2/multiz5way
    #	if your ln -s `pwd`/* made extra links to files you don't want there,
    #	check the goldenPath locations and remove those extra links

#############################################################################
##  Set a more interesting default position, location of IGHMBP2 gene
##	DONE - 2008-10-15 - Hiram
    ssh hgwdev
    hgsql -e \
'update dbDb set defaultPos="chr6:11,027,149-11,038,707" where name = "oryLat2";' \
	-h genome-testdb hgcentraltest

## note from QA changed position because default tracks were sparse in old
## position:
'update dbDb set defaultPos='chr18:14,435,198-14,444,829' where name="oryLat2";'

############################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
oryLat2.upstreamGeneTbl = refGene

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="NIG and Tokyo Univ MEDAKA1 (NCBI project 16702, BAAF04000000)" where name="oryLat2";' hgcentraltest
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-02 braney )
    # bash  if not using bash shell already

    cd /cluster/data/oryLat2
    mkdir /cluster/data/oryLat2/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  oryLat2.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  oryLat2.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 884

    mkdir -p /cluster/data/oryLat2/bed/tblastn.hg18KG
    cd /cluster/data/oryLat2/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 884

   # we want around 50000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\)

# 36727/(50000/884) = 649.333360

   mkdir -p kgfa
   split -l 649 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/oryLat2/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/oryLat2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit

    ssh swarm
    cd /cluster/data/oryLat2/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time


# Completed: 50388 of 50388 jobs
# CPU time in finished jobs:    4705256s   78420.94m  1307.02h   54.46d  0.149 y
# IO & Wait Time:                240976s    4016.26m    66.94h    2.79d  0.008 y
# Average job time:                  98s       1.64m     0.03h    0.00d
# Longest finished job:             297s       4.95m     0.08h    0.00d
# Submission to last job:          7579s     126.32m     2.11h    0.09d

    ssh swarm
    cd /cluster/data/oryLat2/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 57 of 57 jobs
# CPU time in finished jobs:        775s      12.91m     0.22h    0.01d  0.000 y
# IO & Wait Time:                  1481s      24.69m     0.41h    0.02d  0.000 y
# Average job time:                  40s       0.66m     0.01h    0.00d
# Longest finished job:              65s       1.08m     0.02h    0.00d
# Submission to last job:            66s       1.10m     0.02h    0.00d

    cd /cluster/data/oryLat2/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 40021 failed: 0 errors: 0

    # load table
    ssh hgwdev
    cd /cluster/data/oryLat2/bed/tblastn.hg18KG
    hgLoadPsl oryLat2 blastHg18KG.psl

    # check coverage
    featureBits oryLat2 blastHg18KG
# 19888142 bases of 700386597 (2.840%) in intersection

    featureBits oryLat1 blastHg18KG
# 18960975 bases of 700386597 (2.707%) in intersection

    featureBits oryLat2 all_mrna blastHg18KG  -enrichment
# all_mrna 0.131%, blastHg18KG 2.840%, both 0.058%, cover 44.55%, enrich 15.69x

    rm -rf blastOut
#end tblastn
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.

############################################################################
# lastz Zebrafish danRer6 (DONE - 2009-08-13 - Hiram)
    mkdir /hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13
    cd /hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13
    cat << '_EOF_' > DEF
# Medaka vs. Zebrafish

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LIMIT=100
SEQ1_LAP=10000

# Query - zebrafish (danRer6)
SEQ2_DIR=/scratch/data/danRer6/danRer6.2bit
SEQ2_LEN=/scratch/data/danRer6/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/danRer6/contigs/danRer6.contigs.2bit
SEQ2_CTGLEN=/hive/data/genomes/danRer6/contigs/danRer6.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/danRer6/contigs/danRer6.contigs.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-noLoadChainSplit -tRepeats=windowmaskerSdust \
	-smallClusterHub=encodek -bigClusterHub=swarm > do.log 2>&1 &
    #	real    232m4.673s
    #	kluster difficulties, after finishing lastz manually:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-noLoadChainSplit -tRepeats=windowmaskerSdust \
	-smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=cat > cat.log 2>&1 &
    #	about 1 hour
    cat fb.oryLat2.chainDanRer6Link.txt
    #	123547945 bases of 700386597 (17.640%) in intersection

    mkdir /hive/data/genomes/danRer6/bed/blastz.oryLat2.swap
    cd /hive/data/genomes/danRer6/bed/blastz.oryLat2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryLat2/bed/lastzDanRer6.2009-08-13/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-noLoadChainSplit -tRepeats=windowmaskerSdust \
	-smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    53m31.962s
    cat fb.danRer6.chainOryLat2Link.txt
    #	147597031 bases of 1506896106 (9.795%) in intersection

############################################################################
# BLASTZ/CHAIN/NET tetNig2 (DONE - 2008-08-26,27 - Hiram)
    screen # use screen to control this job
    mkdir /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14
    cd /hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14
    cat << '_EOF_' > DEF
# Medaka vs. Tetraodon

# using the "close" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=200

# QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-workhorse=hgwdev \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > do.log 2>&1 &
    #	real    159m0.619s - failed batch run
    #	recovered and continuing:
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF \
	-continue=cat -workhorse=hgwdev \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > cat.log 2>&1 &

    cat fb.oryLat2.chainTetNig2Link.txt
    #	162783854 bases of 700386597 (23.242%) in intersection

    #	And, for the swap:
    mkdir /hive/data/genomes/tetNig2/bed/blastz.oryLat2.swap
    cd /hive/data/genomes/tetNig2/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	/hive/data/genomes/oryLat2/bed/blastzTetNig2.2009-09-14/DEF \
	-swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	about 2h18m
    cat fb.tetNig2.chainOryLat2Link.txt
    #	136115939 bases of 302314788 (45.025%) in intersection

############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# lastz swap danRer7 (DONE - 2010-12-17 - Hiram)
    #	original alignment to danRer7
    cd /hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17/DEF
    cat fb.oryLat2.chainDanRer7Link.txt
    #	141428502 bases of 1409770109 (10.032%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap
    cd /hive/data/genomes/oryLat2/bed/blastz.danRer7.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/danRer7/bed/lastzOryLat2.2010-12-17/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-noLoadChainSplit -qRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    20m56.926s
    cat fb.oryLat2.chainDanRer7Link.txt
    #	124527579 bases of 700386597 (17.780%) in intersection

############################################################################

##########################################################################pubStart
# Publications track (DONE - 04-27-12 - Max)

# article download and conversion is run every night on hgwdev:
# 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh
# the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/
# then converts them to text into /hive/data/outside/literature/{pmc,elsevier}

# all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py

# data processing was run manually like this
export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/
# pmc
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/pmcBlat/
pubBlat steps:annot-tables
exit
pubBlat load

# elsevier
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier
ssh swarm
cd /hive/data/inside/literature/pubtools/runs/elsBlat/
pubBlat steps:annot-tables
exit
pubBlat load
#--pubEnd

#########################################################################
# LASTZ/CHAIN/NET Medaka oryLat2 (DONE - 2012-10-23 - Hiram)
    screen -S oryLat2PetMar2 # use screen to control this job
    mkdir /cluster/data/oryLat2/bed/lastzPetMar2.2012-10-23
    cd /cluster/data/oryLat2/bed/lastzPetMar2.2012-10-23

    cat << '_EOF_' > DEF
# Medaka vs. Lamprey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Medaka oryLat2 (40M chunks covers the largest chrom in one gulp)
SEQ1_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ1_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=25
SEQ1_LAP=10000

# QUERY: Lamprey petMar2, sequence is in contigs
SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit
SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/oryLat2/bed/lastzPetMar2.2012-10-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl \
	`pwd`/DEF \
	-workhorse=hgwdev -chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=swarm -verbose=2 > do.log 2>&1 &
    #	real    654m52.829s
    cat fb.oryLat2.chainPetMar2Link.txt
    #	42137127 bases of 700386597 (6.016%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/oryLat2/bed
    ln -s lastzPetMar2.2012-10-23 lastz.petMar2

    #	And, for the swap:
    mkdir /cluster/data/petMar2/bed/blastz.oryLat2.swap
    cd /cluster/data/petMar2/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/hive/data/genomes/oryLat2/bed/lastzPetMar2.2012-10-23/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-smallClusterHub=encodek -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    8m42.333s
    cat fb.petMar2.chainOryLat2Link.txt
    #	31889826 bases of 647368134 (4.926%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/petMar2/bed
    ln -s blastz.oryLat2.swap lastz.oryLat2

#########################################################################
# genscan - (DONE - 2013-11-12 - Hiram)
    mkdir /hive/data/genomes/oryLat2/bed/genscan
    cd /hive/data/genomes/oryLat2/bed/genscan
    time doGenscan.pl oryLat2 > do.log 2>&1
    #  real    25m6.013s
four broken jobs:
./lastRunGsBig.csh chr13 001 gtf/001/chr13.gtf pep/001/chr13.pep subopt/001/chr13.bed &
./lastRunGsBig.csh chr8 001 gtf/001/chr8.gtf pep/001/chr8.pep subopt/001/chr8.bed &
./lastRunGsBig.csh chr14 001 gtf/001/chr14.gtf pep/001/chr14.pep subopt/001/chr14.bed &
./lastRunGsBig.csh chr2 001 gtf/001/chr2.gtf pep/001/chr2.pep subopt/001/chr2.bed
XXX - running - Wed Nov 13 15:59:10 PST 2013
    # real    396m44.926s
XXX two still broken, running with 1800000 window size
    #  real    504m10.380s
XXX two still broken, running with 1600000 window size Fri Nov 15 08:10:22 PST 2013
real    404m29.012s
XXX two still broken, split and run

    export TOP="/hive/data/genomes/oryLat2/bed/genscan/splitRun"
    cd ${TOP}

for C in chr2 chr14
do
    mkdir ${C}
    cp -p ../hardMaskedFa/*/${C}.fa ${C}/${C}.fa
    cd ${C}
    mkdir split
    faSplit -lift=${C}.lift -minGapSize=100 gap ${C}.fa 500000 split/${C}_
    cd ${TOP}
done

    # run 'para time > run.time' on ku, then continuing:
    time doGenscan.pl -continue=makeBed oryLat2 > makeBed.log 2>&1
    #  real    4m12.345s

    cat fb.oryLat2.genscan.txt
    #   35480473 bases of 700386597 (5.066%) in intersection

    cat fb.oryLat2.genscanSubopt.txt
    #   26442347 bases of 700386597 (3.775%) in intersection

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
