# for emacs: -*- mode: sh; -*-

# Bos Taurus -- Baylor Release 4.0 (Oct 4 2007)
#	"$Id: bosTau4.txt,v 1.23 2010/04/01 17:33:10 hiram Exp $"
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    mkdir /cluster/store3/bosTau4
    ln -s /cluster/store3/bosTau4 /cluster/data/bosTau4
    mkdir /cluster/data/bosTau4/baylor
    cd /cluster/data/bosTau4/baylor

    for F in README.Btau20070913.txt Contigs/*
do
    wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/${F}"
done

    mkdir chroms
    cd chroms
    wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/LinearScaffolds/*"

    #	Looking up chrM in Entrez nucleotide search,
    #	from bosTau3 note: NC_006853 GI:60101824
    #	search for Bos taurus mitochondrion complete genome

    #	Note, their Contigs files are from their 2006 release.
    #	Their chrom files are new to this release.

    #	fixup the Chr case names in the agp file:
    sed -e "s/^Chr/chr/" Btau20070913.All.ac.agp > bosTau4.agp

    #	fixup the contig names in the contigs.fa and qual files, utilize
    #	the following perl scripts:
    cat << '_EOF_' > fixContigNames.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

if (1 != $argc) {
    print "usage ./fixContigNames.pl Btau20060815.contigs.fa.gz \\\n",
	" | gzip > bosTau4.contigs.fa.gz\n";
    exit 255;
}

my $fName = shift;

open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
	my @a = split('\|', $line);
	$a[3] =~ s/\.[0-9]+//;
	print ">$a[3]\n";
    } else {
	print "$line";
    }
}
close (FH)
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > fixQuals.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

if (1 != $argc) {
    print "usage ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \\\n",
	" | gzip > bosTau4.qual.gz\n";
    exit 255;
}

my $fName = shift;

open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
	$line =~ s/\.[0-9]+.*//;
    }
    print "$line";
}
close (FH)
'_EOF_'
    # << happy emacs

    #	There are extra records in the qual and contigs fa files.
    #	Only the sequences in the agp file should be used:
    grep "^chr" bosTau4.agp | egrep -v "clone|fragment" | cut -f6 \
	| sort > agp.names
    #	run those scripts, extract only the used records, and lift the
    #	qual scores via the AGP file to a qac file.
    chmod +x fixContigNames.pl fixQuals.pl
    ./fixContigNames.pl Btau20060815.contigs.fa.gz \
	| faSomeRecords stdin agp.names stdout | gzip > bosTau4.contigs.fa.gz
    ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \
	| faSomeRecords stdin agp.names stdout \
	| qaToQac stdin stdout \
	| qacAgpLift bosTau4.agp stdin bosTau4.qual.qac

#############################################################################
#  running makeGenomeDb.pl (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    cd /cluster/data/bosTau4
    cat << '_EOF_' > bosTau4.config.ra
db bosTau4
clade mammal
scientificName Bos Taurus
assemblyDate Oct. 2007
assemblyLabel Baylor Release 4.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
agpFiles /cluster/data/bosTau4/baylor/bosTau4.agp
fastaFiles /cluster/data/bosTau4/baylor/bosTau4.contigs.fa.gz
qualFiles /cluster/data/bosTau4/baylor/bosTau4.qual.qac
commonName Cow
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl bosTau4.config.ra > makeGenomeDb.log 2>&1

#########################################################################
# REPEATMASKER (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    screen # use a screen to manage this job
    mkdir /cluster/data/bosTau4/bed/repeatMasker
    cd /cluster/data/bosTau4/bed/repeatMasker
    # 
    #	This was going too slow on memk, on the order of 8 days.
    #	chilled that memk batch, then switched it to finish on pk
    doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \
	-bigClusterHub=memk bosTau4 > do.log 2>&1 &
    #	continuing
    doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \
	-continue=cat -bigClusterHub=pk bosTau4 > cat.log 2>&1 &

    time nice -n +19 featureBits bosTau4 rmsk > fb.bosTau4.rmsk.txt 2>&1 &
    #	1280927549 bases of 2731830700 (46.889%) in intersection




    # RepeatMasker and lib version from do.log:
    #	RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
    #   Jan 11 2008 (open-3-1-9) version of RepeatMasker
    #	CC   RELEASE 20071204;

    # Compare coverage to previous assembly:
    featureBits bosTau3 rmsk
    # 1200525422 bases of 2731807384 (43.946%) in intersection
    featureBits bosTau2 rmsk
    # 1291561904 bases of 2812203870 (45.927%) in intersection

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-03-07 - Hiram)
    ssh kkstore06
    screen # use a screen to manage this job
    mkdir /cluster/data/bosTau4/bed/simpleRepeat
    cd /cluster/data/bosTau4/bed/simpleRepeat
    # 
    doSimpleRepeat.pl -buildDir=/cluster/data/bosTau4/bed/simpleRepeat \
	bosTau4 > do.log 2>&1 &
    #	after RM run is done, add this mask:
    cd /cluster/data/bosTau4
    twoBitMask bosTau4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4.2bit

    twoBitToFa bosTau4.2bit stdout | faSize stdin
    #	2917974530 bases (186161294 N's 2731813236 real
    #	1450855409 upper 1280957827 lower) in 11900 sequences in 1 files
    #	%43.90 masked total, %46.89 masked real

    twoBitToFa bosTau4.rmsk.2bit stdout | faSize stdin
    #	2917974530 bases (186161294 N's 2731813236 real
    #	1451369861 upper 1280443375 lower) in 11900 sequences in 1 files
    #	%43.88 masked total, %46.87 masked real

#########################################################################
# Create OOC file for genbank runs (DONE - 2008-03-10 - Hiram)
# use same repMatch value as bosTau2
    ssh kkstore06
    cd /cluster/data/bosTau3
    blat bosTau4.2bit /dev/null /dev/null -tileSize=11 \
	-makeOoc=bosTau4.11.ooc -repMatch=1005

    ssh kkr1u00
    mkdir /iscratch/i/bosTau4
    cd /iscratch/i/bosTau4
    cp -p /cluster/data/bosTau4/bosTau4.2bit .
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau4/
done

#########################################################################
#  Starting Genbank
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    # edit etc/genbank.conf to add the following entry:

# bosTau4 (B. taurus) 31 chromosomes plus 11869 scaffolds
bosTau4.serverGenome = /cluster/data/bosTau4/bosTau4.2bit
bosTau4.clusterGenome = /iscratch/i/bosTau4/bosTau4.2bit
bosTau4.ooc = /cluster/data/bosTau4/bosTau4.11.ooc
bosTau4.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
bosTau4.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
bosTau4.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
bosTau4.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
bosTau4.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
bosTau4.lift = no
bosTau4.downloadDir = bosTau4
bosTau4.perChromTables = no
bosTau4.mgc = yes

    cvs ci -m "Added bosTau4." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    # This is a call to a script that will push our jobs out to the cluster
    # since it's a big job.  
    time nice -n +19 bin/gbAlignStep -initial bosTau4 &
    #	logFile: var/build/logs/2008.03.10-14:14:43.bosTau4.initalign.log
    #	real    567m7.431s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau4
    #	logFile: var/dbload/hgwdev/logs/2008.03.11-08:48:17.dbload.log
    #	real    34m55.565s

    # enable daily alignment and update of hgwdev (DONE - 2008-03-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add bosTau4 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added bosTau4." etc/align.dbs etc/hgwdev.dbs
    make etc-update

############################################################################
#	DONE - 2008-03-11 - Hiram
#	Reset default position to an area of chr3 with a number of genes
    hgsql -e \
'update dbDb set defaultPos="chr3:21083267-21232422" where name="bosTau4";' \
	hgcentraltest
    #	And there was a mistake in the description date entry
    hgsql -e \
'update dbDb set description="Oct. 2007" where name="bosTau4";' \
	hgcentraltest

#########################################################################
## genscan run (DONE - 2008-03-11 - Hiram)
##	create hard masked sequence
    ssh kkstore06
    cd /cluster/data/bosTau4
    mkdir hardMasked
    for C in `cut -f1 chrom.sizes`
do
    echo "hardMasked/${C}.hard.fa"
    twoBitToFa -seq=${C} bosTau4.2bit stdout \
	| maskOutFa stdin hard hardMasked/${C}.hard.fa
    ls -ld "hardMasked/${C}.hard.fa"
done

    #	And, make sure there aren't any sequences in this lot that have
    #	become all N's with no sequence left in them.  This drives genscan nuts
    echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt

    #	the lowest three are:
    egrep -v "^#|^total" faCount.hard.txt \
	| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
    #	chrUn.004.9957 0
    #	chrUn.004.9961 0
    #	chrUn.004.9996 0
    #	There are a whole bunch of these, and many with just a few bases.
    #	Actually, before removing these for genscan, run the cpgIsland
    #	business first since it can work on them all.
    #	So, remove any with less than 100 bases of sequence
    egrep -v "^#|^total" faCount.hard.txt \
| awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' \
    | xargs rm

    #	now get them over to a kluster location
    mkdir /san/sanvol1/scratch/bosTau4/hardChunks
    cd /san/sanvol1/scratch/bosTau4/hardChunks
    #	creating 4,000,000 sized chunks, the chroms stay together as
    #	single pieces.  The contigs get grouped together into 4,000,000
    #	sized fasta files.  You don't want to break these things up
    #	because genscan will be doing its own internal 2.4 million
    #	window on these pieces, and the gene names are going to be
    #	constructed from the sequence name in these fasta files.
    echo /cluster/data/bosTau4/hardMasked/*.hard.fa | xargs cat \
	| faSplit about stdin 4000000 c_

    ssh hgwdev
    mkdir /cluster/data/bosTau4/bed/genscan
    cd /cluster/data/bosTau4/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh memk
    cd /cluster/data/bosTau4/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1Sr /san/sanvol1/scratch/bosTau4/hardChunks/c_*.fa > genome.list

    # Create run-time script to operate gsBig in a cluster safe manner
    cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
    # << happy emacs
    chmod +x runGsBig
    cat << '_EOF_' > template
#LOOP
runGsBig /san/sanvol1/scratch/bosTau4/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed} 
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...
XXX - running 2008-03-11 11:20
# Completed: 97 of 97 jobs
# CPU time in finished jobs:      93952s    1565.87m    26.10h    1.09d  0.003 y
# IO & Wait Time:                   372s       6.19m     0.10h    0.00d  0.000 y
# Average job time:                 972s      16.21m     0.27h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            4870s      81.17m     1.35h    0.06d
# Submission to last job:          7902s     131.70m     2.19h    0.09d

    # cat and lift the results into single files
    ssh kkstore06
    cd /cluster/data/bosTau4/bed/genscan
    sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf
    sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed 
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/genscan
    ldHgGene bosTau4 -gtf genscan genscan.gtf
    #	Read 49598 transcripts in 346887 lines in 1 files
    #	49598 groups 5059 seqs 1 sources 1 feature types
    #	49598 gene predictions

    hgPepPred bosTau4 generic genscanPep genscan.pep
    hgLoadBed bosTau4 genscanSubopt genscanSubopt.bed
    #	Loaded 528092 elements of size 6

    #	let's check the numbers
    time nice -n +19 featureBits bosTau4 genscan
    #	58224594 bases of 2731830700 (2.131%) in intersection
    time nice -n +19 featureBits bosTau3 genscan
    #	59251085 bases of 2731807384 (2.169%) in intersection

#########################################################################
# CPGISLANDS (DONE - 2008-03-11 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/bosTau4/bed/cpgIsland
    cd /cluster/data/bosTau4/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	There was a problem in here, in both cpg.c and cpg_lh.c:
    #	warning: conflicting types for built-in function 'malloc'
    #	comment out the lines where malloc is declared to get this to build
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .
    
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    mkdir results
    echo ../../hardMasked/*.hard.fa | sed -e "s/ /\n/g" | while read F
    do
	FA=${F/*\/}
	C=${FA/.hard.fa/}
	echo "./cpglh.exe ${FA} > results/${C}.cpg"
	nice -n +19 ./cpglh.exe ${F} > results/${C}.cpg
    done > cpglh.out 2>&1 &
    #	about 5 minutes

    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    #	<< happy emacs
    catDir results \
    | awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/bosTau4/bed/cpgIsland
    hgLoadBed bosTau4 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 37595 elements of size 10
    featureBits bosTau4 cpgIslandExt
    #	24202824 bases of 2731830700 (0.886%) in intersection
    featureBits bosTau3 cpgIslandExt
    #	24374280 bases of 2731807384 (0.892%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
    ssh kkstore02
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/hg18/bed/blastzBosTau4.2008-03-11
    cd /cluster/data/hg18/bed/
    ln -s blastzBosTau4.2008-03-11 blastz.bosTau4
    cd blastzBosTau4.2008-03-11

    cat << '_EOF_' > DEF
BLASTZ_M=50

# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/hg18/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -syntenicNet > do.log 2>&1
    #	real    611m17.901s
    cat fb.hg18.chainBosTau4Link.txt
    #	1345348636 bases of 2881515245 (46.689%) in intersection

    mkdir /cluster/data/bosTau4/bed/blastz.hg18.swap
    cd /cluster/data/bosTau4/bed/blastz.hg18.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/hg18/bed/blastzBosTau4.2008-03-11/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -syntenicNet > do.log 2>&1
    #	broken down during netSynteny.csh due to too many open files on
    #	a chainSplit
    #	To get them split, set up a job on memk to do the split
    #	via chainFilter: chainFilter -t=${T} bigFile.chain > ${T}.chain
    #   Where the targets T are from the list:
    #   grep "^chain " bigFile.chain | awk '{print $3}' | sort -u
    #	see full example below in Platypus Blastz

###########################################################################
# Blastz Platypus ornAna1 (DONE - 2008-03-11,14 - Hiram)
    ssh kkstore06
    screen # use screen to control this job
    mkdir /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
    cd /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11

    cat << '_EOF_' > DEF
# Cow vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Cow bosTau4
SEQ1_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ1_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=0

# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -verbose=2 \
	-chainLinearGap=loose -bigClusterHub=kk > do.log 2>&1 &
    #	real    1233m16.397s
    #	the pk kluster became free as this job had a long time to go.
    #	Unsure how long it had to go because the batch became confused
    #	and couldn't calculate eta, it had waiting=0 jobs ?  So, chill
    #	out the batch, allow the running jobs to complete,
    #	go to pk and get the jobs completed there.
    #	There was a difficulty with the para.results file.  It lost track
    #	of 1158 jobs, but they were completed and results exist.
    #	then continuing:
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
	-continue=cat -chainLinearGap=loose -bigClusterHub=pk > cat.log 2>&1 &
    #	real    66m36.465s
    cat fb.bosTau4.chainOrnAna1Link.txt
    #	201799073 bases of 2731830700 (7.387%) in intersection

    mkdir /cluster/data/ornAna1/bed/blastz.bosTau4.swap
    cd /cluster/data/ornAna1/bed/blastz.bosTau4.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11/DEF \
	-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
	-swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 &
    #	real    106m17.385s
    cat fb.ornAna1.chainBosTau4Link.txt
    #	187868681 bases of 1842236818 (10.198%) in intersection

    #	need to split up the chain file to run the reciprocal net
    #	there are too many chroms to work with chain split
    #	do it with chainFilter as a kluster job on memk
    ssh memk
    mkdir /scratch/tmp/ornAna1
    cd /scratch/tmp/ornAna1
    for R in 0 1 2 3 4 5 6 7
do
    echo $R
    ssh mkr0u${R} mkdir /scratch/tmp/ornAna1
    ssh mkr0u${R} cp -p /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz /scratch/tmp/ornAna1
    ssh mkr0u${R} gunzip /scratch/tmp/ornAna1/bosTau4.ornAna1.all.chain.gz
done

    zcat /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz | grep "^chain "  | awk '{print $3}' | sort -u > chain.list
    cat << '_EOF_' > splitOne.csh
#!/bin/csh -fe
set tmpDir = /scratch/tmp/ornAna1
set T=$1
chainFilter -t=${T} $tmpDir/bosTau4.ornAna1.all.chain > $tmpDir/${T}.chain
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > template
#LOOP
./splitOne.csh $(file1)
'_EOF_'
    # << happy emacs

    gensub2 chain.list single template jobList
    para create jobList
    para try ... check ... push ... time ...
    #	fetch the results from tmp directories
    for R in 0 1 2 3 4 5 6 7
do
    ssh mkr0u${R} "(cd /scratch/tmp/ornAna1; cp -p -u chr*.chain /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/chain)"
done

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-03-12 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("bosTau4", "blat2", "17778", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("bosTau4", "blat2", "17779", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
## 5-Way Multiz (DONE - 2008-03-18 - Hiram)
##
    #	the all.chain.gz files were split up via kluster jobs on memk
    #	in order to get mafSynNet files.  Example above in ornAna1 blastz
    ssh hgwdev
    mkdir /cluster/data/bosTau4/bed/multiz5way
    cd /cluster/data/bosTau4/bed/multiz5way
    #	take the 30-way tree from mm9 and eliminate genomes not in
    #	this alignment
    #	rearrange to get bosTau4 on the top of the graph
    #	paste this tree into the on-line phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to create the image for the tree diagram

    #	select the 9 organisms from the 30-way recently done on mouse mm9
    /cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Dog_canFam2,Platypus_ornAna1,Cow_bosTau3 \
	/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
	| sed -e "s/bosTau3/bosTau4/g" > 5-way.fullNames.nh

    #	looks something like this:
(((Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763,
(Dog_canFam2:0.156637,Cow_bosTau4:0.163945):0.031326):0.332197,
Platypus_ornAna1:0.488110);

    #	rearrange to get Cow at the top:
    # this leaves us with:
    cat << '_EOF_' > bosTau4.5-way.nh
(((Cow_bosTau4:0.163945,Dog_canFam2:0.156637):0.031326,
   (Human_hg18:0.126901,Mouse_mm9:0.325818):0.019763):0.332197,
   Platypus_ornAna1:0.488110);
'_EOF_'
    #	<< happy emacs

    #	create a species list from that file:
    sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' bosTau4.5-way.nh \
        | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
        | sed -e "s/.*_//; s/:.*//" | sort > species.list
    #	verify that has 5 db names in it
    # create a stripped down nh file for use in autoMZ run
    echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' bosTau4.5-way.nh \
	| sed -e "s/  / /g"` > tree.5.nh
    #	that looks like, as a single line:
    #	(((bosTau4 canFam2) (hg18 mm9)) ornAna1)

    # verify all blastz's exists
    cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/bosTau4/bed/multiz5way
foreach db (`grep -v bosTau4 species.list`)
    set bdir = /cluster/data/bosTau4/bed/blastz.$db
    if (-e $bdir/mafRBestNet/chr1.maf.gz) then
	echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/chr1.maf.gz) then
	echo "$db mafSynNet"
    else if (-e $bdir/mafNet/chr1.maf.gz) then
	echo "$db mafNet"
    else
	echo "$db mafs not found"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./listMafs.csh
    #	see what it says:
    ./listMafs.csh
# canFam2 mafSynNet
# hg18 mafSynNet
# mm9 mafSynNet
# ornAna1 mafSynNet

    /cluster/bin/phast/all_dists bosTau4.5-way.nh > 5way.distances.txt
    grep -i bostau 5way.distances.txt | sort -k3,3n
# Cow_bosTau4     Dog_canFam2     0.320582
# Cow_bosTau4     Human_hg18      0.341935
# Cow_bosTau4     Mouse_mm9       0.540852
# Cow_bosTau4     Platypus_ornAna1        1.015578

    #	use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                       chainCalJac1Link   chain   linearGap
#    distance                     on CalJac1    on other   minScore
#  1  0.320582 Dog_canFam2        (% 82.846)   (% 78.351)   3000     medium
#  2  0.341935 Human_hg18         (% 74.810)   (% 77.648)   3000     medium
#  3  0.540852 Mouse_mm9          (% 76.925)   (% 74.694)   3000     medium
#  4  1.015578 Platypus_ornAna1   (% 77.296)   (% 76.308)   3000     medium

    #	create a coherent set of all the mafs involved in this run
    mkdir mafLinks
    cd mafLinks
    ln -s ../../blastz.canFam2/mafSynNet ./canFam2
    ln -s ../../blastz.hg18/mafSynNet ./hg18
    ln -s ../../blastz.mm9/mafSynNet ./mm9
    ln -s ../../blastz.ornAna1/mafSynNet ./ornAna1

    #	check data size:
    du -hscL *
# 982M    canFam2
# 940M    hg18
# 483M    mm9
# 79M     ornAna1
# 2.5G    total

    # copy net mafs to cluster-friendly storage
    ssh kkstore06
    cd /cluster/data/bosTau4/bed/multiz5way/mafLinks
    mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/mafs
    rsync -a --progress --copy-links ./ \
	/san/sanvol1/scratch/bosTau4/multiz5way/mafs/
    #	create a run-time list of contigs to operate on, not all contigs
    #	exist in all alignments, but we want all contig names used in any
    #	alignment:
    cd /san/sanvol1/scratch/bosTau4/multiz5way/mafs
    for D in *
do
    cd "${D}"
    find . -type f
    cd ..
done | sed -e "s#^./##; s#.maf##" | sort -u > /tmp/5-way.contig.list
    wc -l /tmp/5-way.contig.list
    #	2321 /tmp/5-way.contig.list

    # ready for the multiz run
    ssh pk
    mkdir /cluster/data/bosTau4/bed/multiz5way/splitRun
    cd /cluster/data/bosTau4/bed/multiz5way/splitRun
    scp -p kkstore06:/tmp/5-way.contig.list .
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    #	set the db and pairs directories here
    cat << '_EOF_' > autoMultiz.csh
#!/bin/csh -ef
set db = bosTau4
set c = $1
set result = $2
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way/mafs
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.5.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/bosTau4/bed/multiz5way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs

    gensub2 ../5-way.contig.list single template jobList

    para create jobList
    para try ... check ... push ... etc
# Completed: 2321 of 2321 jobs
# CPU time in finished jobs:      35680s     594.67m     9.91h    0.41d  0.001 y
# IO & Wait Time:                  9398s     156.63m     2.61h    0.11d  0.000 y
# Average job time:                  19s       0.32m     0.01h    0.00d
# Longest finished job:            1982s      33.03m     0.55h    0.02d
# Submission to last job:          3983s      66.38m     1.11h    0.05d

    # put the split maf results back together into a single maf file
    #	eliminate duplicate comments
    ssh kkstore06
    cd /cluster/data/bosTau4/bed/multiz5way
    grep "^##maf version" splitRun/maf/chr1.maf \
	| sort -u > bosTau4.5way.maf
    for F in `find ./splitRun/maf -type f -depth`
do
    grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
	| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> bosTau4.5way.maf
    for F in `find ./splitRun/maf -type f -depth`
do
    grep -v -h "^#" "${F}"
done >> bosTau4.5way.maf
    grep "^##eof maf" splitRun/maf/chr1.maf \
	| sort -u >> bosTau4.5way.maf

    # load tables for a look
    ssh hgwdev
    mkdir -p /gbdb/bosTau4/multiz5way/maf
    ln -s /cluster/data/bosTau4/bed/multiz5way/*.maf \
                /gbdb/bosTau4/multiz5way/maf/multiz5way.maf
    # this generates an immense multiz5way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/bosTau4/multiz5way/maf bosTau4 multiz5way
    #	real    3m31.593s
    #	Loaded 5392296 mafs in 1 files from /gbdb/bosTau4/multiz5way/maf
    # load summary table
    time nice -n +19 cat /gbdb/bosTau4/multiz5way/maf/*.maf \
	| hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz5waySummary stdin
    #	real    3m41.663s
    #	Created 779088 summary blocks from 10697864 components and 5164426 mafs from stdin

    # Gap Annotation
    # prepare bed files with gap info
    ssh kkstore06
    mkdir /cluster/data/bosTau4/bed/multiz5way/anno
    cd /cluster/data/bosTau4/bed/multiz5way/anno
    mkdir maf run

    #	these actually already all exist from previous multiple alignments
    #	remove the echo in front of the twoBitInfo to actually make it work
    for DB in `cat ../species.list`
do
    CDIR="/cluster/data/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    cd run
    rm -f nBeds sizes
    for DB in `grep -v bosTau4 ../../species.list`
do
    echo "${DB} "
    ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    ssh memk
    #	temporarily copy the bosTau4.5way.maf file onto the memk
    #	nodes /scratch/data/bosTau4/maf/ directory
    for R in 0 1 2 3 4 5 6 7
do
    ssh mkr0u${R} rsync -a --progress \
/cluster/data/bosTau4/bed/multiz5way/bosTau4.5way.maf \
	/scratch/data/bosTau4/maf/
done
    mkdir /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
    #	need to split up the single maf file into individual
    #	per-scaffold maf files to run annotation on
    cd /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
    #	create bed files to list approximately 1553 scaffolds in
    #	a single list, approximately 33 lists
    cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $bedCount = 0;
my $i = 0;

my $bedFile = sprintf("file_%d.bed", $bedCount);

open (BF,">$bedFile") or die "can not write to $bedFile $!";

open (FH,"</cluster/data/bosTau4/chrom.sizes") or
        die "can not read /cluster/data/bosTau4/chrom.sizes $!";
while (my $line = <FH>) {
    chomp $line;
    if ( (($i + 1) % 1553) == 0 )  {
        printf "%s\n", $line;
        close (BF);
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open (BF,">$bedFile") or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BH);
'_EOF_'
    # << happy emacs
    chmod +x mkBedLists.pl
    ./mkBedLists.pl

    #	now, run a mafsInRegion on each one of those lists
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/bosTau4/bed/multiz5way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/bosTau4/$resultDir
pushd /scratch/tmp/bosTau4/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
        /scratch/data/bosTau4/maf/bosTau4.5way.maf
popd
rsync -q -a /scratch/tmp/bosTau4/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/bosTau4/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/bosTau4
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls file*.bed > runList
    gensub2 runList single template jobList
    para create jobList
    para try ... check ... push ... etc
# Completed: 8 of 8 jobs
# CPU time in finished jobs:       1137s      18.96m     0.32h    0.01d  0.000 y
# IO & Wait Time:                   552s       9.19m     0.15h    0.01d  0.000 y
# Average job time:                 211s       3.52m     0.06h    0.00d
# Longest finished job:             599s       9.98m     0.17h    0.01d
# Submission to last job:           599s       9.98m     0.17h    0.01d

    cd /cluster/data/bosTau4/bed/multiz5way/anno/run

    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/bosTau4/bosTau4.2bit $result
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../splitMaf -type f -name "*.maf" > maf.list
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs:       8405s     140.09m     2.33h    0.10d  0.000 y
# IO & Wait Time:                  6934s     115.56m     1.93h    0.08d  0.000 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest finished job:             369s       6.15m     0.10h    0.00d
# Submission to last job:           834s      13.90m     0.23h    0.01d

    #	put the results back together into a single file
    ssh kkstore06
    cd /cluster/data/bosTau4/bed/multiz5way/anno
    grep "^##maf version" maf/file_0/chr1.maf \
	| sort -u > bosTau4.anno.5way.maf
    find ./maf -type f -depth -name "*.maf" | while read F
do
    grep -v -h "^#" "${F}"
done >> bosTau4.anno.5way.maf
    echo "##eof maf" >> bosTau4.anno.5way.maf

    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way/anno
    mkdir -p /gbdb/bosTau4/multiz5way/anno
    ln -s `pwd`/bosTau4.anno.5way.maf \
                /gbdb/bosTau4/multiz5way/anno/multiz5way.maf
    #	by loading this into the table multiz5way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/bosTau4/multiz5way/anno \
                bosTau4 multiz5way
    #	Loaded 6711605 mafs in 1 files from /gbdb/bosTau4/multiz5way/anno
    #	real    4m54.025s

    #	normally filter this for chrom size > 1,000,000 and only load
    #	those chroms.  But this is a scaffold assembly, load everything:
    time nice -n +19 hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000  multiz5waySummary \
	    /gbdb/bosTau4/multiz5way/anno/multiz5way.maf
    #	Created 779088 summary blocks from 10697864 components
    #	and 6412983 mafs from /gbdb/bosTau4/multiz5way/anno/multiz5way.maf
    #	real    4m6.782s

    #	by loading this into the table multiz5waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	remove the multiz5way*.tab files in this /scratch/tmp directory
    rm multiz5way*.tab
    #	And, you can remove the previously loaded non-annotated maf file link:
    rm /gbdb/bosTau4/multiz5way/maf/bosTau4.5way.maf
    rmdir /gbdb/bosTau4/multiz5way/maf

###########################################################################
## Annotate 5-way multiple alignment with gene annotations
##		(DONE - 2008-03-18 - Hiram)
    # Gene frames
    ## given previous survey done for 9-way alignment on Marmoset
    ## and the appearance of new ensGene tables on everything
    #	use knownGene for hg18, mm9
    #	use ensGene for canFam2, ornAna1
    #	and refGene for bosTau4

    ssh hgwdev
    mkdir /cluster/data/bosTau4/bed/multiz5way/frames
    cd /cluster/data/bosTau4/bed/multiz5way/frames
    mkdir genes
    # knownGene
    for DB in hg18 mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # ensGene
    for DB in canFam2 ornAna1
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # refGene
    for DB in bosTau4
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done

    ls -og genes
# -rw-rw-r--  1  861210 Mar 18 15:40 bosTau4.gp.gz
# -rw-rw-r--  1 1865308 Mar 18 15:40 canFam2.gp.gz
# -rw-rw-r--  1 2008806 Mar 18 15:39 hg18.gp.gz
# -rw-rw-r--  1 1965274 Mar 18 15:39 mm9.gp.gz
# -rw-rw-r--  1 1347532 Mar 18 15:40 ornAna1.gp.gz

    ssh kkstore06
    cd /cluster/data/bosTau4/bed/multiz5way/frames
    #	anything to annotate is in a pair, e.g.: bosTau4 genes/bosTau4.gp.gz
    time (cat  ../anno/bosTau4.anno.5way.maf | nice -n +19 genePredToMafFrames bosTau4 stdin stdout bosTau4 genes/bosTau4.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
    # see what it looks like in terms of number of annotations per DB:
    zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
#  77526 bosTau4
# 110768 ornAna1
# 221524 mm9
# 230010 hg18
# 243396 canFam2

    #	load the resulting file
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way/frames
    time nice -n +19 hgLoadMafFrames bosTau4 multiz5wayFrames \
	multiz5way.mafFrames.gz
    #	real    0m21.968s

    #	enable the trackDb entries:
# frames multiz5wayFrames
# irows on
#############################################################################
# phastCons 5-way (DONE - 2008-03-19 - Hiram)

    # split 5way mafs into 10M chunks and generate sufficient statistics 
    # files for # phastCons
    ssh memk
    mkdir /cluster/data/bosTau4/bed/multiz5way/msa.split
    cd /cluster/data/bosTau4/bed/multiz5way/msa.split
    mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/bosTau4/bed/multiz5way/anno/maf
set WINDOWS = /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /scratch/data/bosTau4/bosTau4.2bit /scratch/tmp/bosTau4.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set TMP = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \
    -M /scratch/tmp/bosTau4.$c.fa \
    -o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/bosTau4.$c.fa
rm -f $TMP
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	create list of maf files:
    (cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs:       1710s      28.50m     0.47h    0.02d  0.000 y
# IO & Wait Time:                  6951s     115.85m     1.93h    0.08d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:             128s       2.13m     0.04h    0.00d
# Submission to last job:          1048s      17.47m     0.29h    0.01d

    # take the cons and noncons trees from the mouse 30-way

    #	Estimates are not easy to make, probably more correctly,
    #	take the 30-way .mod file, and re-use it here.
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way
    cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    ssh memk
    mkdir -p /cluster/data/bosTau4/bed/multiz5way/cons/run.cons
    cd /cluster/data/bosTau4/bed/multiz5way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all gliers placentals
    #	Well, that's what it was when used in the Mm9 30-way,
    #	in this instance, there is only the directory "all"

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/bosTau4/bed/multiz5way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/bosTau4/multiz5way/cons
if (-s $cons/$grp/$grp.non-inf) then
  cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
  cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
  cp -p $cons/$grp/$grp.mod $tmp
  cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    # Create parasol batch and run it
    pushd /san/sanvol1/scratch/bosTau4/multiz5way/cons
    find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
	> /cluster/data/bosTau4/bed/multiz5way/cons/ss.list
    popd

    # run for all species
    cd ..
    mkdir -p all run.cons/all
    cd all
    /cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \
--prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \
	| sed -e "s/bosTau3/bosTau4/" > all.mod
    cd ../run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/bosTau4/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 2569 of 2569 jobs
# CPU time in finished jobs:       8636s     143.93m     2.40h    0.10d  0.000 y
# IO & Wait Time:                 17371s     289.52m     4.83h    0.20d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest finished job:              44s       0.73m     0.01h    0.00d
# Submission to last job:          1008s      16.80m     0.28h    0.01d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
    find ./bed -type f -name "chr*.bed" | xargs cat \
	| sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 3 minutes
    cp -p mostConserved.bed /cluster/data/bosTau4/bed/multiz5way/cons/all

    # load into database
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way/cons/all
    time nice -n +19 hgLoadBed bosTau4 phastConsElements5way mostConserved.bed
    #	Loaded 1005876 elements of size 5

    # Try for 5% overall cov, and 70% CDS cov 
    #	We don't have any gene tracks to compare CDS coverage
    #	--rho .31 --expected-length 45 --target-coverage .3
    featureBits bosTau4 phastConsElements5way
    #	132010504 bases of 2731830700 (4.832%) in intersection

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
    mkdir -p phastCons5wayScores

for D in `ls -1d pp/file* | sort -t_ -k2n`
do
    TOP=`pwd`
    F=${D/pp\/}
    out=${TOP}/phastCons5wayScores/${F}.data.gz
    echo "${D} > ${F}.data.gz"
    cd ${D}
    find . -name "*.pp" -type f \
	| sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \
	| sort -t '.' -k1,1 -k3.3n \
	| sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \
	| gzip > ${out}
    cd "${TOP}"
done

    #	copy those files to the downloads area:
# /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way/phastConsScores
    #	for hgdownload downloads

    # Create merged posterier probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
    ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
	| wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib
    # Converted stdin, upper limit 1.00, lower limit 0.00
    time nice -n +19 cp -p *.wi? /cluster/data/bosTau4/bed/multiz5way/cons/all
    #	real    0m40.875s

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way/cons/all
    ln -s `pwd`/phastCons5way.wib /gbdb/bosTau4/multiz5way/phastCons5way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/bosTau4/multiz5way bosTau4 \
	phastCons5way phastCons5way.wig
    #	real    1m5.667s
    # remove garbage
    rm wiggle.tab

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way/cons/all
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=bosTau4 phastCons5way > histogram.data 2>&1
    #	real    3m37.316s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Cow BosTau4 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    #	These trackDb entries turn on the wiggle phastCons data track:
    #	type wigMaf 0.0 1.0
    #	maxHeightPixels 100:40:11
    #	wiggle phastCons5way
    #	spanList 1
    #	autoScale Off
    #	windowingFunction mean
    #	pairwiseHeight 12
    #	yLineOnOff Off

#############################################################################
#  Downloads (DONE - 2008-01-11 - Hiram)
    #	Let's see if the downloads will work
    ssh hgwdev
    /cluster/data/bosTau4
    #	expecting to find repeat masker .out file here:
    ln -s bed/RepeatMasker/bosTau4.fa.out .
    time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
	-workhorse=hgwdev bosTau4 > jkStuff/downloads.log 2>&1
    #	real    24m3.210s
    #	failed making upstream sequences:
    #	featureBits bosTau4 mgcGenes:upstream:1000 -fa=stdout
    #	setpriority: Permission denied.
    #	the 'nice' from my bash shell causes trouble inside the csh
    #	script which uses nice.  Finish off the install step manually
    #	with the mgcGenes upstreams ...

#############################################################################
#  PushQ entries (DONE - 2008-01-11 - Hiram)
    ssh hgwdev
    /cluster/data/bosTau4
    /cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql
    #	output warnings:
# bosTau4 does not have seq
# bosTau4 does not have gbMiscDiff
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
#	genscanPep

#############################################################################
#  create download files (DONE - 2008-03-19 - Hiram)
    ssh hgwdev
    cd /cluster/data/bosTau4
    ln -s /cluster/data/bosTau4/bed/repeatMasker/bosTau4.fa.out .
    makeDownloads.pl bosTau4 > makeDownloads.log 2>&1

    #	*EDIT* the README files and ensure they are correct

#############################################################################
#  PushQ entries (DONE - 2008-03-19 - Hiram)
    ssh hgwdev
    /cluster/data/bosTau4
    /cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql
    #	output warnings:
# hgwdev does not have /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/bosTau4ToBosTau*
# bosTau4 does not have seq

# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
#	genscanPep

    #	looks like there should be a bosTau3 to bosTau4 liftOver run

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28)
    ssh kkstore06
    # bash  if not using bash shell already

    mkdir /cluster/data/bosTau4/blastDb
    cd /cluster/data/bosTau4
    grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst
    for i in `cat chr.lst`; 
    do twoBitToFa  bosTau4.unmasked.2bit -seq=$i stdout; 
    done > temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft

    grep chrUn chrom.sizes | awk '{print $1}' > chr.lst
    for i in `cat chr.lst`; 
    do twoBitToFa  bosTau4.unmasked.2bit -seq=$i stdout; 
    done > temp.fa
    faSplit sequence temp.fa 150 blastDb/y
    rm temp.fa chr.lst

    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
#   3440

    mkdir -p /san/sanvol1/scratch/bosTau4/blastDb
    cd /cluster/data/bosTau4/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/bosTau4/blastDb
    done

    mkdir -p /cluster/data/bosTau4/bed/tblastn.hg18KG
    cd /cluster/data/bosTau4/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/bosTau4/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 3440 query.lst

   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)

# 36727/(350000/3440) = 360.973943

   mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa
   split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/bosTau4/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/bosTau4/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit 
    
    ssh pk
    cd /cluster/data/bosTau4/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

Completed: 350880 of 350880 jobs
CPU time in finished jobs:   27082816s  451380.27m  7523.00h  313.46d  0.859 y
IO & Wait Time:               2334990s   38916.50m   648.61h   27.03d  0.074 y
Average job time:                  84s       1.40m     0.02h    0.00d
Longest finished job:             578s       9.63m     0.16h    0.01d
Submission to last job:         96125s    1602.08m    26.70h    1.11d


    ssh kkstore06
    cd /cluster/data/bosTau4/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/bosTau4/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.

# Completed: 99 of 102 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:     113248s    1887.47m    31.46h    1.31d  0.004 y
# IO & Wait Time:                 86043s    1434.04m    23.90h    1.00d  0.003 y
# Average job time:                2013s      33.55m     0.56h    0.02d
# Longest finished job:            6139s     102.32m     1.71h    0.07d
# Submission to last job:         10416s     173.60m     2.89h    0.12d

# ran three crashed jobs on kolossus

    ssh kkstore06
    cd /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau4/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/bosTau4/bed/tblastn.hg18KG
    hgLoadPsl bosTau4 blastHg18KG.psl

    # check coverage
    featureBits bosTau4 blastHg18KG 
# 40254923 bases of 2731830700 (1.474%) in intersection

    featureBits bosTau4 refGene:cds blastHg18KG  -enrichment
# refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x

    ssh kkstore06
    rm -rf /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
#end tblastn

##########################################################################
#  Create 5-way downloads (DONE - 2008-03-28 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way
    cd /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way
    cp -p \
/san/sanvol1/scratch/bosTau4/multiz5way/cons/all/phastCons5wayScores/* .
    ln -s ../../cons/all/all.mod ./5way.mod
    cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
    # edit that README.txt to be correct for this 5-way alignment
    cd ..
    mkdir multiz5way
    cd multiz5way
    cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
    # edit that README.txt to be correct for this 5-way alignment
    ssh kkstore06
    cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
    ln -s ../../bosTau4.5-way.nh 5way.nh

    time gzip -c ../../anno/bosTau4.anno.5way.maf > bosTau4.5way.maf.gz
    #	real    34m59.295s

    ssh hgwdev
    cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
    #	creating upstream files from refGene, bash script:
    cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=bosTau4
GENE=refGene
NWAY=multiz5way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
                stdin stdout \
                -orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x ./mkUpstream.sh
    time nice -n +19 ./mkUpstream.sh
-rw-rw-r--  1    9883443 Mar 28 13:02 upstream1000.maf.gz
-rw-rw-r--  1   17938570 Mar 28 13:06 upstream2000.maf.gz
-rw-rw-r--  1   40384656 Mar 28 13:10 upstream5000.maf.gz
    #
    #	check the names in these upstream files to ensure sanity:
    zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
	| sort | uniq -c | sort -rn | less
    #	should be a list of the other 4 species with a high count,
    #	then refGene names, e.g.:
    #	8806 ornAna1
    #	8806 mm9
    #	8806 hg18
    #	8806 canFam2
    #      7 NM_001077006
    #      3 NM_001113231
    #      3 NM_001105381
    #      3 NM_001102527
    #      3 NM_001102322
    #	...

    ssh kkstore06
    cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
    md5sum *.maf.gz > md5sum.txt
    cd ../phastCons5way
    md5sum *.data.gz *.mod > md5sum.txt

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way
    mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way
    cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way
    cd ../phastCons5way
    ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way
    #	if your ln -s `pwd`/* made extra links to files you don't want there,
    #	check the goldenPath locations and remove those extra links

#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-04-21 markd)

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    cd /cluster/data/bosTau4/bed/nscan/
    wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.gtf
    wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.prot.fa
    wget http://mblab.wustl.edu/predictions/Cow/bosTau4/readme.html
    bzip2 bosTau4.*
    chmod a-w *

    # load track
    gtfToGenePred -genePredExt bosTau4.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt bosTau4 nscanGene stdin
    hgPepPred bosTau4 generic nscanPep  bosTau4.prot.fa.bz2
    rm *.tab

    # update trackDb; need a bosTau4-specific page to describe informants
    cow/bosTau4/nscanGene.html   (copy from readme.html)
    cow/bosTau4/trackDb.ra
    # set search regex to
        termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9]+)*

#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.

############################################################################
# SELF BLASTZ (DONE - 2008-06-27 - Hiram)
    #	do not want the noise from the contigs
    ssh kkstore06
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/bosTau4/noContigs
    cd /cluster/data/bosTau4/noContigs
    cut -f1 ../chrom.sizes | grep -v chrUn | while read C
do
    echo "twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa"
    twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa
done
    faToTwoBit chr*.fa bosTau4.noContigs.2bit

    twoBitToFa bosTau4.noContigs.2bit stdout | faSize stdin
# 2634429662 bases (167456864 N's 2466972798 real
#	1326108891 upper 1140863907 lower) in 31 sequences in 1 files
# %43.31 masked total, %46.25 masked real
    grep -v chrUn ../chrom.sizes | ave -col=2 stdin
# count 31
# total 2634429662.000000

    twoBitInfo bosTau4.noContigs.2bit stdout | sort -k2nr \
	> bosTau4.noContigs.chrom.sizes
    cp -p bosTau4.noContigs.2bit /cluster/bluearc/scratch/data/bosTau4
    cp -p bosTau4.noContigs.chrom.sizes /cluster/bluearc/scratch/data/bosTau4

    mkdir /cluster/data/bosTau4/bed/blastzSelf.2008-08-27
    cd /cluster/data/bosTau4/bed
    ln -s blastzSelf.2008-08-27 blastzSelf
    cd blastzSelf.2008-08-27

    cat << '_EOF_' > DEF
BLASTZ_M=400

# TARGET: Cow bosTau4
SEQ1_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit
SEQ1_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau4
SEQ2_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/bosTau4/bed/blastzSelf.2008-08-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    cd /cluster/data/bosTau4/bed/blastzSelf.2008-08-27
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    3537m49.719s
    #	had to fix a slight problem in the make downloads step, then:
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-continue=cleanup `pwd`/DEF > cleanup.out 2>&1 &

############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
bosTau4.upstreamGeneTbl = mgcGenes
bosTau4.upstreamMaf = multiz5way /hive/data/genomes/bosTau4/bed/multiz5way/species.list
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################

#############################################################################
# LIFTOVER TO bosTauMd3 (DONE - 2009-12-16 - galt)
    ssh hgwdev
    cd /hive/data/genomes/bosTau4
    ln -s bosTau4.11.ooc 11.ooc
    time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
         bosTau4 bosTauMd3 >& doLiftOverToBosTauMd3.log

    pushd /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/
    md5sum bosTau4ToBosTauMd3.over.chain.gz >> md5sum.txt
    popd

#########################################################################
# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-25 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
    cd /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25

    cat << '_EOF_' > DEF
# Pig vs. Cow
BLASTZ_M=50

# TARGET: Cow BosTau4
SEQ1_DIR=/scratch/data/bosTau4/bosTau4.2bit
SEQ1_LEN=/scratch/data/bosTau4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    2377m21.473s
    #	never did get this to work.  susScr2 as target bosTau4 as query
    #	did finish but with immense chain pileups
    #	failed second time too
    #	XXX - re-running as 01-25 (was 01-21) Mon Jan 25 11:29:32 PST 2010
    #	real    3811m31.428s
    #	failed
    #	XXX - running Thu Jan 25 14:49:43 PST 2010

    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	`pwd`/DEF -verbose=2 \
	-bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
	    -syntenicNet > do.log 2>&1

#########################################################################
# lastz swap Rat Rn4 (DONE - 2010-01-22 - Hiram)
    # original run
    cd /cluster/data/rn4/bed/blastzBosTau4.2010-01-19
    cat fb.rn4.chainBosTau4Link.txt 
    #	649931321 bases of 2571531505 (25.274%) in intersection

    mkdir /cluster/data/bosTau4/bed/blastz.rn4.swap
    cd /cluster/data/bosTau4/bed/blastz.rn4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/rn4/bed/blastzBosTau4.2010-01-19/DEF \
	-noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -syntenicNet > swap.log 2>&1 &
    #	real    77m18.645s
    cat fb.bosTau4.chainRn4Link.txt 
    #	664253901 bases of 2731830700 (24.315%) in intersection

#########################################################################
# lastz swap Pig SusScr2 (DONE - 2010-03-31 - Hiram)
    #	the primary lastz run on susScr2
    cd /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27
    cat fb.susScr2.chainBosTau4Link.txt 
    #	1478903080 bases of 2231298548 (66.280%) in intersection

    #	and the swap
    mkdir /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
    cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	most interesting, this failed on the first step chainSwap
    #	but the failure didn't make it stop, it continued and produced
    #	zero results to the end.  Running manually:
    cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap/axtChain
export sizeG=188743680
ulimit -d $sizeG
ulimit -v $sizeG

chainSwap /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/axtChain/susScr2.bosTau4.all.chain.gz stdout \
| nice chainSort stdin stdout | nice gzip -c > bosTau4.susScr2.all.chain.gz

    #	it also runs out of memory in the lift over file creation:
    export sizeG=188743680
    ulimit -d $sizeG
    ulimit -v $sizeG

    netChainSubset -verbose=0 noClass.net bosTau4.susScr2.all.chain.gz stdout \
	| chainStitchId stdin stdout | gzip -c > bosTau4.susScr2.over.chain.gz

    #	and netChains.csh is finished manually with this added:
# memory limit 160 Gb
limit datasize 163840m
limit vmemoryuse 163840m

    #	manually run the loadUp.csh with this added:
# memory limit 160 Gb
limit datasize 163840m
limit vmemoryuse 163840m
    #	real    498m5.861s

    cat fb.bosTau4.chainSusScr2Link.txt
    #	1383557633 bases of 2731830700 (50.646%) in intersection


#############################################################################
# LIFTOVER TO bosTau5 (Btau_4.2) (DONE - 2010-07-19 - Chin)
    ssh hgwdev
    cd /hive/data/genomes/bosTau4
    ln -s bosTau4.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
         bosTau4 bosTau5 > doLiftOverToBosTau5.log 2>&1 &
    #   real    245m42.802s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau4/liftOver/
    md5sum bosTau4ToBosTau5.over.chain.gz >> md5sum.txt
    popd

#########################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/17/2011 Fan)
# Original raw data file sent by email from Arthur Ko [a5ko@uw.edu]
        
mkdir /hive/data/genomes/bosTau4/bed/genomicSuperDups
cd /hive/data/genomes/bosTau4/bed/genomicSuperDups
		    
# place the data file, bosTau4-genomicSuperDups_2008Oct02.tab.gz, here

gzip -d bosTau4-genomicSuperDups_2008Oct02.tab.gz
awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' bosTau4-genomicSuperDups_2008Oct02.tab \
| hgLoadBed bosTau4 genomicSuperDups stdin \
-tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql

# Kayla found that the strand values were "+" and "_" -- fix:
hgsql bosTau4 -e 'update genomicSuperDups set strand = "-" where strand = "_";'

#############################################################################
# LIFTOVER TO bosTau6 (DONE - 2011-10-20 - Chin)
    ssh hgwdev
    cd /hive/data/genomes/bosTau4

    ln -s bosTau4.11.ooc 11.ooc
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         bosTau4 bosTau6 >& doLiftOverToBosTau6.log 2>&1 &
    # real    119m16.934s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau4/liftOver/
    md5sum bosTau4ToBosTau6.over.chain.gz >> md5sum.txt
    popd

##############################################################################
# LIFTOVER TO bosTau9 (DONE - 2022-12-06 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/bosTau4/bed/blat.bosTau9.2022-12-06
    cd /hive/data/genomes/bosTau4/bed/blat.bosTau9.2022-12-06
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/bosTau4/bosTau4.2bit \
        -targetSizes=/hive/data/genomes/bosTau4/chrom.sizes \
        -query2Bit=/hive/data/genomes/bosTau9/bosTau9.2bit \
        -querySizes=/hive/data/genomes/bosTau9/chrom.sizes \
        -ooc=/hive/data/genomes/bosTau4/bosTau4.11.ooc \
         bosTau4 bosTau9
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -target2Bit=/hive/data/genomes/bosTau4/bosTau4.2bit \
        -targetSizes=/hive/data/genomes/bosTau4/chrom.sizes \
        -query2Bit=/hive/data/genomes/bosTau9/bosTau9.2bit \
        -querySizes=/hive/data/genomes/bosTau9/chrom.sizes \
        -ooc=/hive/data/genomes/bosTau4/bosTau4.11.ooc \
         bosTau4 bosTau9) > doLiftOverToBosTau9.log 2>&1
    # real    963m57.013s

    # see if the liftOver menus function in the browser from bosTau4 to bosTau9

##############################################################################
