# for emacs: -*- mode: sh; -*-

#	Takifugu rubripes
# ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Takifugu_rubripes/FUGU5/
#
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CAAB02
#	WGS: CAAB02000001:CAAB02030857
#	http://www.ncbi.nlm.nih.gov/nuccore/CAAB00000000
#	http://www.fugu-sg.org/
#
##########################################################################
# Download sequence (DONE - 2011-11-09 - Hiram)
    mkdir -p /hive/data/genomes/fr3/genbank
    cd /hive/data/genomes/fr3/genbank
    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
	--no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Takifugu_rubripes/FUGU5/*"

    faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz
# 281572362 bases (12721879 N's 268850483 real 268850483 upper 0 lower) in 22
sequences in 22 files

    cat << '_EOF_' > mkUcsc.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %chrFrNameToChrN;

open (FH, "< Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $frName) = split('\s+', $line);
    $chrFrNameToChrN{$frName} = $chrN;
}
close (FH);

my $firstHeader = 1;
open (UC, "|gzip -c > ucsc.agp.gz") or die "can not write to ucsc.agp.gz";
open (FA, "|gzip -c > ucsc.fa.gz") or die "can not write to uscs.fa.gz";
foreach my $key (sort keys %chrFrNameToChrN) {
    my $chrN = $chrFrNameToChrN{$key};
    printf "%s\tchr%s\n", $key, $chrN;
    my $fastaFile = "Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz";
    my $agpFile = "Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz";
    open (FH, "zcat $agpFile|") or die "can not read $agpFile";
    while (my $line = <FH>) {
	if ($line =~ m/^#/) {
	    if ($firstHeader) {
		printf UC "%s", $line;
	    }
	    next;
	}
	$firstHeader = 0;
	$line =~ s/$key/chr${chrN}/;
	printf UC "%s", $line;
    }
    close (FH);
    open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
    while (my $line = <FH>) {
	if ($line =~ m/^>/) {
	    die "can not match fasta name $key in $fastaFile\n$line" if ($line !~ m/$key/);
	    printf FA ">chr${chrN}\n";
	    next;
	}
	printf FA "%s", $line;
    }
    close (FH);
}
close (UC);
close (FA);
'_EOF_'
    # << happy emacs
    chmod +x mkUcsc.pl

    time ./mkUcsc.pl 
    #	real    18m3.244s

    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | \
	sed -e 's/\.1//' | gzip -c > ucsc.unplaced.agp.gz

    zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
	| sed -e 's/^>.*emb|\([A-Z0-9]*\).*/>\1/' | gzip -c \
	> ucsc.unplaced.fa.gz

###########################################################################
# Initial genome build (DONE - 2011-11-16 - Hiram)
    cd /hive/data/genomes/fr3
    cat << '_EOF_' > fr3.config.ra
# Config parameters for makeGenomeDb.pl:
db fr3
clade vertebrate
genomeCladePriority 110
scientificName Takifugu Rubripes 
commonName Fugu
assemblyDate Oct. 2011
assemblyLabel Fugu Genome Sequencing Consortium - FUGU5 (NCBI project 1434, GCA_000180615.2)
assemblyShortLabel fr3.1
# orderKey = fr2.orderKey - 1
orderKey 463
#       GI: 23397366
mitoAcc NC_004299
fastaFiles /hive/data/genomes/fr3/genbank/ucsc.*fa.gz
agpFiles /hive/data/genomes/fr3/genbank/ucsc.*agp.gz
dbDbSpeciesDir fugu
# qualFiles none
taxId 31033
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp -workhorse=hgwdev -fileServer=hgwdev \
	fr3.config.ra > step.agp.log 2>&1 &
    #	real    0m29.258s
    # take a look at the constructed AGP file to see if it has the names
    #	desired, then continuing:
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
	fr3.config.ra > step.db.log 2>&1 &
    #	about 3 minutes

################################################
## WINDOWMASKER (DONE - 2011-11-16 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/windowMasker
    cd /hive/data/genomes/fr3/bed/windowMasker
    time doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev fr3 > do.log 2>&1 &
    #	real    21m47.182s

    # Masking statistics
    twoBitToFa fr3.wmsk.sdust.2bit stdout | faSize stdin
    #	391484715 bases (40522884 N's 350961831 real 284458798 upper
    #	66503033 lower) in 6835 sequences in 1 files
    #	%16.99 masked total, %18.95 masked real

    hgLoadBed fr3 windowmaskerSdust windowmasker.sdust.bed.gz
    # Loaded 1742235 elements of size 3

    featureBits -countGaps fr3 windowmaskerSdust
    #	107012497 bases of 391484715 (27.335%) in intersection

    #	eliminate the gaps from the masking
    featureBits fr3 -not gap -bed=notGap.bed
    #	350961831 bases of 350961831 (100.000%) in intersection
    time nice -n +19 featureBits fr3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	66503033 bases of 350961831 (18.949%) in intersection

    #	reload track to get it clean
    hgLoadBed fr3 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 1743810 elements of size 4
    featureBits -countGaps fr3 windowmaskerSdust
    #	66503033 bases of 391484715 (16.987%) in intersection
    #	mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../fr3.unmasked.2bit stdin \
	    -type=.bed fr3.cleanWMSdust.2bit
    twoBitToFa fr3.cleanWMSdust.2bit stdout | faSize stdin \
        > fr3.cleanWMSdust.faSize.txt
    cat fr3.cleanWMSdust.faSize.txt
    #	391484715 bases (40522884 N's 350961831 real 284458798 upper 66503033 lower) in 6835 sequences in 1 files
    #	%16.99 masked total, %18.95 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps fr3 rmsk windowmaskerSdust
    #	21610292 bases of 391484715 (5.520%) in intersection

##########################################################################
# running repeat masker (DONE - 2011-11-16 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/repeatMasker
    cd /hive/data/genomes/fr3/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk fr3 > do.log 2>&1 &
    #	real       46m48.252s
    cat faSize.rmsk.txt
    #	391484715 bases (40522884 N's 350961831 real 322500133 upper
    #	28461698 lower) in 6835 sequences in 1 files
    #	%7.27 masked total, %8.11 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
# April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps fr3 rmsk
    #	28570382 bases of 391484715 (7.298%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2011-11-16 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/simpleRepeat
    cd /hive/data/genomes/fr3/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	fr3 > do.log 2>&1 &
    #	real  14m11.20

    cat fb.simpleRepeat 
    #	10415803 bases of 350961831 (2.968%) in intersection

    # We are not going to use the RepeatMasker business since Window Masker
    #	covers much more.  It it was just TRF and rmsk, the following would
    #	take place.
    # when repeatMasker above is completed, add this mask:
    cd /hive/data/genomes/fr3
    twoBitMask fr3.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed fr3.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa fr3.2bit stdout | faSize stdin > faSize.fr3.2bit.txt
    cat faSize.fr3.2bit.txt
    #	391484715 bases (40522884 N's 350961831 real 321862602 upper
    #	29099229 lower) in 6835 sequences in 1 files
    #	%7.43 masked total, %8.29 masked real

    #  *** REMEMBER *** to reset they symLink in gbdb:
    rm /gbdb/fr3/fr3.2bit
    ln -s `pwd`/fr3.2bit /gbdb/fr3/fr3.2bit

#########################################################################
## Add TRF mask to WindowMasker masked sequence
    cd /cluster/data/fr3
    twoBitMask bed/windowMasker/fr3.cleanWMSdust.2bit \
      -add bed/simpleRepeat/trfMask.bed fr3.2bit
    # you can safely ignore the warnings about >= 13 fields

    # check the total masking
    twoBitToFa fr3.2bit stdout | faSize stdin
    #	391484715 bases (40522884 N's 350961831 real 284231237 upper
    #	66730594 lower) in 6835 sequences in 1 files
    #	%17.05 masked total, %19.01 masked real

    # as an experiment, see what rmsk adds:
    twoBitMask fr3.2bit \
      -add bed/repeatMasker/fr3.sorted.fa.out fr3.trf.wm.rmsk.2bit

    twoBitToFa fr3.trf.wm.rmsk.2bit stdout | faSize stdin
    #	391484715 bases (40522884 N's 350961831 real 277411829 upper
    #	73550002 lower) in 6835 sequences in 1 files
    #	%18.79 masked total, %20.96 masked real
    # which is an extra 6.8 million bases: 73550002-66730594 = 6819408

    # Make this be the actual file for the genome browser
    rm /gbdb/fr3/fr3.2bit
    ln -s /hive/data/genomes/fr3/fr3.2bit /gbdb/fr3/fr3.2bit 

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2011-11-16 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/gap
    cd /hive/data/genomes/fr3/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../fr3.unmasked.2bit > findMotif.txt 2>&1
    #	real    1m8.647s

    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits fr3 -not gap -bed=notGap.bed
    #	363677527 bases of 363677527 (100.000%) in intersection
    featureBits fr3 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	12715696 bases of 363677527 (3.496%) in intersection
    #	Wow, that's a lot of unmarked gap

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" fr3 | sort -n | tail -1
    #	110
    #	that number is used below in the script to mark all the new gaps
    #	with a higher ix than that
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" fr3 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps fr3 other.bed
    #	12715696 bases of 363677527 (3.248%) in intersection
    wc -l other.bed
    #	24141
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad fr3 otherGap other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" fr3
    #	14303
    hgsql fr3 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" fr3
    #	38444
    # == 24141 + 14303
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 fr3 nonBridged.lift -bedFile=nonBridged.bed
    # see example in danRer7.txt

    # there are non-bridged gaps here:

    hgsql -N -e "select bridge from gap;" fr3 | sort | uniq -c
    #	256 no
    #	38188 yes

##########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2011-11-17 - Hiram)
    # Use -repMatch=128 (based on size -- for human we use 1024, and
    # fugu size is ~12% of human judging by gapless fr3 vs. hg18
    # genome sizes from featureBits.  Use the "real" number from
    #	the faSize measurements

    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 350961831 / 2897316137 \) \* 1024
    #	( 350961831 / 2897316137 ) * 1024 = 124.040629
    # round up to 128

    cd /hive/data/genomes/fr3
    blat fr3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/fr3.11.ooc -repMatch=128
    #	Wrote 8853 overused 11-mers to jkStuff/fr3.11.ooc

    #	copy all of this stuff to the klusters:
    # there are some non-bridged gaps
    hgsql -N -e "select bridge from gap;" fr3 | sort | uniq -c
    #	256 no
    #	38188 yes

    cd /hive/data/genomes/fr3/jkStuff
    gapToLift fr3 fr3.nonBridged.lift -bedFile=fr3.nonBridged.bed
    cd /hive/data/genomes/fr3
    mkdir /hive/data/staging/data/fr3
    cp -p jkStuff/fr3.11.ooc chrom.sizes fr3.2bit jkStuff/fr3.nonBridged.lift \
	/hive/data/staging/data/fr3/
    # request rsync copy from cluster admin

#########################################################################
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("fr3", "blat1", "17814", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("fr3", "blat1", "17815", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
## Default position set same as fr2 (DONE - 2011-11-17 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr21:5807962-5832802"
	where name="fr3";' hgcentraltest

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2011-11-17,28 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Takifugu rubripes       1271    26069   429
    # to decide which "native" mrna or ests you want to specify in genbank.conf
    # this appears that fr3 has plenty of native est's

    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add fr3 just after fr2 and commit to GIT
# fr3
fr3.serverGenome = /hive/data/genomes/fr3/fr3.2bit
fr3.clusterGenome = /scratch/data/fr3/fr3.2bit
fr3.ooc = /scratch/data/fr3/fr3.11.ooc
fr3.align.unplacedChroms = HE*
fr3.lift = /scratch/data/fr3/fr3.nonBridged.lift
fr3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
fr3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
fr3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
fr3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
fr3.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
fr3.genbank.mrna.xeno.loadDesc = yes
fr3.refseq.mrna.xeno.load  = no
# fr3.upstreamGeneTbl = ensGene
# fr3.upstreamMaf = multiz5way

    # end of section added to etc/genbank.conf
    git commit -m "adding fr3 Takifugu rubripes" genbank.conf
    git push
    make etc-update

    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # fr genome information, if this is a new species, need to add stuff there

    ssh hgwdev			# used to do this on "genbank" machine
    screen			# long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial fr3 &
    #	var/build/logs/2011.11.28-09:01:11.fr3.initalign.log
    #	real    111m30.317s

    # tail the log to make sure it is successful:
# hgwdev 2011.11.28-10:52:34 fr3.initalign: Succeeded: fr3
# hgwdev 2011.11.28-10:52:41 fr3.initalign: finish

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad fr3 &
    #	logFile: var/dbload/hgwdev/logs/2011.11.28-18:56:26.dbload.log
    #	real    56m28.463s

    # enable daily alignment and update of hgwdev (DONE - 2011-04-14 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add fr3 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added fr3." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

###########################################################################
# lastz Zebrafish danRer7 (DONE - 2011-12-01 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01
    cd /hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01

    cat << '_EOF_' > DEF
# Fugu vs. Zebrafish

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Zebrafish danRer7
SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ2_LEN=/scratch/data/danRer7/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=10
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    764m6.061s

    cat fb.fr3.chainDanRer7Link.txt 
    #	80101694 bases of 350961831 (22.823%) in intersection

    mkdir /hive/data/genomes/danRer7/bed/blastz.fr3.swap
    cd /hive/data/genomes/danRer7/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzDanRer7.2011-12-01/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-workhorse=hgwdev -tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    9m54.286s
    cat fb.danRer7.chainFr3Link.txt 
    #	103831209 bases of 1409770109 (7.365%) in intersection

############################################################################
# lastz Nile tilapia oreNil1 (DONE - 2011-12-01,02 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01
    cd /hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01

    cat << '_EOF_' > DEF
# Fugu vs. Nile tilapia

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Nile tilapia oreNil1
SEQ2_DIR=/scratch/data/oreNil1/oreNil1.2bit
SEQ2_LEN=/scratch/data/oreNil1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=10
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    1023m19.470s

    cat fb.fr3.chainOreNil1Link.txt 
    #	200890618 bases of 350961831 (57.240%) in intersection

    mkdir /hive/data/genomes/oreNil1/bed/blastz.fr3.swap
    cd /hive/data/genomes/oreNil1/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzOreNil1.2011-12-01/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    17m29.215s
    cat fb.oreNil1.chainFr3Link.txt 
    #	250828022 bases of 816084674 (30.736%) in intersection

############################################################################
# lastz Tetraodon tetNig2 (DONE - 2011-12-01,02 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01
    cd /hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01

    cat << '_EOF_' > DEF
# Fugu vs. Tetraodon

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Tetraodon tetNig2
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=1
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    468m6.556s

    cat fb.fr3.chainTetNig2Link.txt 
    #	248874754 bases of 350961831 (70.912%) in intersection

    mkdir /hive/data/genomes/tetNig2/bed/blastz.fr3.swap
    cd /hive/data/genomes/tetNig2/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzTetNig2.2011-12-01/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    10m59.521s
    cat fb.tetNig2.chainFr3Link.txt 
    #	243890006 bases of 302314788 (80.674%) in intersection

############################################################################
# lastz Stickleback gasAcu1 (DONE - 2011-12-01 - Hiram)
#	experiment here, two runs on Stickleback, one with gasAcu1
#	as it stands with little masking, and a second with it masked
#	with WindowMasker.  Result: we get more coverage with the less masked
#	sequence

    twoBitToFa /scratch/data/gasAcu1/gasAcu1.2bit stdout | faSize stdin
    #	463354448 bases (16726587 N's 446627861 real 435123380 upper
    #	11504481 lower) in 23 sequences in 1 files
    #	%2.48 masked total, %2.58 masked real
    twoBitToFa /hive/data/genomes/gasAcu1/bed/windowMasker/gasAcu1.TRF.WMSdust.2bit stdout | faSize stdin

    mkdir /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02
    cd /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02

    cat << '_EOF_' > DEF
# Fugu vs. Stickleback

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Stickleback gasAcu1
SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=1
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    267m15.622s

    cat fb.fr3.chainGasAcu1Link.txt 
    #	194876988 bases of 350961831 (55.527%) in intersection
    # vs. the more masked result:
    #	188878416 bases of 350961831 (53.817%) in intersection

    mkdir /hive/data/genomes/gasAcu1/bed/blastz.fr3.swap
    cd /hive/data/genomes/gasAcu1/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    15m25.221s

    cat fb.gasAcu1.chainFr3Link.txt 
    #	214549957 bases of 446627861 (48.038%) in intersection

##############################################################################
# experiment lastz test on Stickleback with window masker sequence
#	(DONE - Hiram - 2011-12-02)
    mkdir /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02maskTest
    cd /hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02maskTest

    cat << '_EOF_' > DEF
# Fugu vs. Stickleback

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Stickleback gasAcu1
SEQ2_DIR=/hive/data/genomes/gasAcu1/bed/windowMasker/gasAcu1.TRF.WMSdust.2bit
SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=1
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzGasAcu1.2011-12-02maskTest
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    253m13.215s

    cat fb.fr3.chainGasAcu1Link.txt 
    #	188878416 bases of 350961831 (53.817%) in intersection
    # vs. the less masked result:
    #	194876988 bases of 350961831 (55.527%) in intersection

############################################################################
# lastz Medaka oryLat2 (DONE - 2011-12-01,02 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01
    cd /hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01

    cat << '_EOF_' > DEF
# Fugu vs. Medaka

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Medaka oryLat2
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    725m19.856s

    cat fb.fr3.chainOryLat2Link.txt 
    #	153943192 bases of 350961831 (43.863%) in intersection

    mkdir /hive/data/genomes/oryLat2/bed/blastz.fr3.swap
    cd /hive/data/genomes/oryLat2/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzOryLat2.2011-12-01/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    15m6.162s
    cat fb.oryLat2.chainFr3Link.txt 
    #	181436751 bases of 700386597 (25.905%) in intersection

############################################################################
# lastz Atlantic Cod gadMor1 (DONE - 2012-01-11,17 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11
    cd /hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11

    cat << '_EOF_' > DEF
# Fugu vs. Atlantic Cod

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Atlantic Cod gadMor1
SEQ2_DIR=/hive/data/genomes/gadMor1/gadMor1.2bit
SEQ2_LEN=/hive/data/genomes/gadMor1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	Elapsed time: 5107m6s == 3 days 13 hours 7m

    cat fb.fr3.chainGadMor1Link.txt 
    #	106436551 bases of 350961831 (30.327%) in intersection

    mkdir /hive/data/genomes/gadMor1/bed/blastz.fr3.swap
    cd /hive/data/genomes/gadMor1/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzGadMor1.2012-01-11/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    160m13.224s
    cat fb.gadMor1.chainFr3Link.txt 
    #	115839168 bases of 608038597 (19.051%) in intersection

############################################################################
# lastz Coelacanth latCha1 (DONE - 2011-12-12,17 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12
    cd /hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12

    cat << '_EOF_' > DEF
# Fugu vs. Coelacanth

# using the "close" genome alignment parameters
#       see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Fugu fr3
#	CHUNK of 24,000,000 is big enough to fit chr1 the largest in one go
SEQ1_DIR=/scratch/data/fr3/fr3.2bit
SEQ1_LEN=/scratch/data/fr3/chrom.sizes
SEQ1_CHUNK=24000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Coelacanth latCha1
SEQ2_DIR=/hive/data/genomes/latCha1/latCha1.2bit
SEQ2_LEN=/hive/data/genomes/latCha1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen # use screen to manage this long running job
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=2000 -chainLinearGap=medium -workhorse=hgwdev \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-smallClusterHub=memk -bigClusterHub=swarm > do.log 2>&1 &
    #	real    1626m11.140s

    cat fb.fr3.chainLatCha1Link.txt 
    #	48206149 bases of 350961831 (13.735%) in intersection

    mkdir /hive/data/genomes/latCha1/bed/blastz.fr3.swap
    cd /hive/data/genomes/latCha1/bed/blastz.fr3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/fr3/bed/lastzLatCha1.2012-01-12/DEF \
	-chainMinScore=2000 -chainLinearGap=medium \
	-qRepeats=windowmaskerSdust -tRepeats=windowmaskerSdust \
	-swap -smallClusterHub=memk -bigClusterHub=swarm > swap.log 2>&1 &
    #	real    8m47.103s
    cat fb.latCha1.chainFr3Link.txt 
    #	62854403 bases of 2183592768 (2.878%) in intersection

############################################################################
## 8-Way Multiz (WORKING - 2012-01-18 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/fr3/bed/multiz8way
    cd /hive/data/genomes/fr3/bed/multiz8way

    # species list:
    cd /hive/data/genomes/fr3/bed
    ls -d lastz.* | sed -e "s/lastz.//" | sort | xargs echo
# danRer7 gadMor1 gasAcu1 latCha1 oreNil1 oryLat2 tetNig2
    # plus this one
# fr3

    #	All distances remain as specified in the 60way.nh
    /cluster/bin/phast/tree_doctor --prune-all-but \
	fr3,danRer7,gadMor1,gasAcu1,latCha1,oreNil1,oryLat2,tetNig2 \
	    $HOME/kent/src/hg/utils/phyloTrees/60way.nh > 8way.nh
    #	what that looks like:
    cat 8way.nh
# (latCha1:1.166927,((((((tetNig2:0.224159,fr3:0.203847):0.097590,
# oreNil1:0.200000):0.097590,gasAcu1:0.316413):0.030000,
# oryLat2:0.511970):0.030000,gadMor1:0.350000):0.225640,
# danRer7:0.730752):0.147949);

    #	rearrange to get fr3 on top:
    cat << '_EOF_' > fr3.8way.nh
(((((((fr3:0.203847,tetNig2:0.224159):0.097590,oreNil1:0.200000):0.097590,
gasAcu1:0.316413):0.030000,oryLat2:0.511970):0.030000,
gadMor1:0.350000):0.225640,danRer7:0.730752):0.147949,latCha1:1.166927);
'_EOF_'
    # << happy emacs

    # convert to species names
    /cluster/bin/phast/tree_doctor --rename \
"fr3->Fugu; tetNig2->Tetraodon; oreNil1->Nile_Tilapia; gasAcu1->Stickleback; oryLat2->Medaka; gadMor1->Atlantic_cod; danRer7->Zebrafish; latCha1->Coelacanth;" \
fr3.8way.nh > fr3.commonNames.8way.nh

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a png image for src/hg/htdocs/images/phylo/fr3_8way.png

    /cluster/bin/phast/all_dists fr3.8way.nh > 8way.distances.txt
    #	Use this output to create the table below
    grep -i fr3 8way.distances.txt | sort -k3,3n
# fr3     tetNig2 0.428006
# fr3     oreNil1 0.501437
# fr3     gasAcu1 0.715440
# fr3     gadMor1 0.809027
# fr3     oryLat2 0.940997
# fr3     danRer7 1.415419
# fr3     latCha1 1.999543

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure

#                         featureBits chainLink measures
#                                     chainFr3Link          chain  linearGap
#    distance                           on fr3  on other   minScore
#  1  0.428 - tetraodon tetNig2      (% 70.91) (% 80.67)     2000     medium
#  2  0.501 - nile tilapia oreNil1   (% 57.24) (% 30.74)     2000     medium
#  3  0.715 - stickleback gasAcu1    (% 55.53) (% 48.04)     2000     medium
#  4  0.809 - altantic cod gadMor1   (% 30.33) (% 19.05)     2000     medium
#  5  0.941 - medaka oryLat2         (% 43.86) (% 25.91)     2000     medium
#  6  1.415 - zebrafish danRer7      (% 22.82) (%  7.37)     2000     medium
#  7  2.000 - coelacanth latCha1     (% 13.74) (%  2.88)     2000     medium

# None of this concern for distances matters in building the first step, the
# maf files.

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	fr3.8way.nh > tmp.nh
    echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.list

    # break up maf files into smaller pieces:
    mkdir /hive/data/genomes/fr3/bed/multiz8way/mafInputs
    cd /hive/data/genomes/fr3/bed/multiz8way/mafInputs

    for S in `cat ../species.list | sed -e "s/fr3 //"`
do
    echo $S
    rm -fr ./${S}/
    mkdir -p ./${S}/
    netMaf="../../lastz.${S}/mafNet/fr3.${S}.net.maf.gz"
    ls -l "${netMaf}"
    mafSplit -byTarget -useHashedName=5 /dev/null ./${S}/ ${netMaf}
done

    # construct a list of all possible maf file names.
    # they do not all exist in each of the species directories
    find . -type f | wc -l
    # 255
    find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list
    wc -l maf.list
    # 32 maf.list

    mkdir /hive/data/genomes/fr3/bed/multiz8way/splitRun
    cd /hive/data/genomes/fr3/bed/multiz8way/splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = fr3
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/fr3/bed/multiz8way/mafInputs
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
	if (! -s $out) then
	    echo "##maf version=1 scoring=autoMZ" > $out
	endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
	> /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/fr3/bed/multiz8way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    ln -s ../../mafInputs/maf.list maf.list
    ssh swarm
    cd /hive/data/genomes/fr3/bed/multiz8way/splitRun/run
    # the tac reverses the list to get the small jobs first
    gensub2 maf.list single template stdout | tac > jobList
    # the ram=8g will allow only one job on a node to slow them down
    para -ram=8g create jobList
# Completed: 32 of 32 jobs
# CPU time in finished jobs:      22781s     379.69m     6.33h    0.26d  0.001 y
# IO & Wait Time:                   967s      16.11m     0.27h    0.01d  0.000 y
# Average job time:                 742s      12.37m     0.21h    0.01d
# Longest finished job:            2113s      35.22m     0.59h    0.02d
# Submission to last job:         17092s     284.87m     4.75h    0.20d

    # assemble into a single maf file
    cd /hive/data/genomes/fr3/bed/multiz8way
    head -1 splitRun/maf/001.maf > multiz8way.maf
    for F in splitRun/maf/*.maf
do
    egrep -v "^#" ${F}
done >> multiz8way.maf
    tail -1 splitRun/maf/001.maf >> multiz8way.maf

# -rw-rw-r-- 1 2491426291 Feb 21 12:06 multiz8way.maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/fr3/bed/multiz8way
    mkdir /gbdb/fr3/multiz8way
    ln -s `pwd`/multiz8way.maf /gbdb/fr3/multiz8way
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf fr3 multiz8way
    #	Indexing and tabulating /gbdb/fr3/multiz8way/multiz8way.maf
    #	Loaded 2735007 mafs in 1 files from /gbdb/fr3/multiz8way
    #	real    1m48.011s

    time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 fr3 multiz8waySummary \
	/gbdb/fr3/multiz8way/multiz8way.maf
    #	Created 179258 summary blocks from 8283246 components and 2735007 mafs
    #	from /gbdb/fr3/multiz8way/multiz8way.maf

    #	real    1m51.378s

    wc -l multiz8way*.tab
    #	2735007 multiz8way.tab
    #	179258 multiz8waySummary.tab
    #	2914265 total

    rm multiz8way*.tab

#######################################################################
# GAP ANNOTATE MULTIZ9WAY MAF AND LOAD TABLES (DONE - 2012-02-21 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.
    mkdir -p /hive/data/genomes/fr3/bed/multiz8way/anno/mafSplit
    cd /hive/data/genomes/fr3/bed/multiz8way/anno/mafSplit
    time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \
	/dev/null . ../../multiz8way.maf
    #	real    1m58.310s

    find . -type f | wc
    #	6214 6214 117987

    cd /hive/data/genomes/fr3/bed/multiz8way/anno
    # a couple of these do not yet have an N.bed file
    for DB in `cat ../species.list`
do
    cd /hive/data/genomes/${DB}
    if [ ! -s "${DB}.N.bed" ]; then
	twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
	ls -og ${DB}.2bit ${DB}.N.bed
    fi
done

for DB in `cat ../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done
    # make sure they all are successful symLinks:
    ls -ogrtL

    ssh swarm
    cd /hive/data/genomes/fr3/bed/multiz8way/anno
    mkdir result
    # NEXT TIME: this template should have a check out exists+ statement
    cat << '_EOF_' > template
#LOOP
runOne $(path1) {check out exists+ result/$(path1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > runOne
#!/bin/sh
set -beEu -o pipefail
D=`dirname $2`
mkdir -p "${D}"
mafAddIRows -nBeds=nBeds $1 /hive/data/genomes/fr3/fr3.2bit $2
'_EOF_'
    # << happy emacs

    find ./mafSplit -type f | sed -e 's#^./##' > maf.list
    gensub2 maf.list single template jobList
    # limit jobs to one per node with the ram=8g requirement
    para -ram=8g create jobList
    para try ... check ... push ...
# Completed: 6214 of 6214 jobs
# CPU time in finished jobs:       1542s      25.70m     0.43h    0.02d  0.000 y
# IO & Wait Time:                 16187s     269.78m     4.50h    0.19d  0.001 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              16s       0.27m     0.00h    0.00d
# Submission to last job:           331s       5.52m     0.09h    0.00d

    # verify all result files have some content, look for 0 size files:
    find ./result -type f -size 0
    # should see none

    # combine into one file
    head -q -n 1 result/mafSplit/0/0/HE591688.maf > fr3.8way.maf
    find ./result -type f | while read F
do
    grep -h -v "^#" ${F}
done >> fr3.8way.maf
    #	these maf files do not have the end marker, this does nothing:
    #	tail -q -n 1 result/GL172637.maf >> fr3.8way.maf
    # How about an official end marker:
    echo "##eof maf" >> fr3.8way.maf

    # Load into database
    rm /gbdb/fr3/multiz8way/multiz8way.maf   # remove old symlink
    ln -s `pwd`/fr3.8way.maf /gbdb/fr3/multiz8way/multiz8way.maf
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf fr3 multiz8way
    #	Loaded 2844956 mafs in 1 files from /gbdb/fr3/multiz8way
    #	real    1m58.565s

    time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 fr3 multiz8waySummary \
	/gbdb/fr3/multiz8way/multiz8way.maf
    #	Created 179258 summary blocks from 8283246 components and 2844956 mafs
    #	from /gbdb/fr3/multiz8way/multiz8way.maf

    #	real    2m26.270s

    wc -l multiz8way*.tab
    #	2844956 multiz8way.tab
    #	179258 multiz8waySummary.tab
    #	3024214 total

    rm multiz8way*.tab

#########################################################################
# MULTIZ9WAY MAF FRAMES (DONE - 2012-02-21 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/fr3/bed/multiz8way/frames
    cd /hive/data/genomes/fr3/bed/multiz8way/frames
    mkdir genes

    #	instead of knownGene, so, using ensGene for hg19 and mm9
fr3 - the lifted ensGene from fr2
tetNig2 - ensGene
oreNil1 - none
gasAcu1 - ensGene
oryLat2 - ensGene
gadMor1 - none
danRer7 - ensGene
latCha1 - none

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using ensGene for:
for qDB in tetNig2 gasAcu1 oryLat2 danRer7
do
  echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB}
      hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \
      | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
done

    # xenoMrna
     hgsql -N -e "select * from xenoMrna" fr3 | cut -f2- \
	| pslToBed stdin stdout | bedToGenePred stdin stdout \
	| genePredSingleCover stdin stdout | gzip -2c > genes/fr3.gp.gz

    # verify counts for genes are reasonable:
    for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done

# genes/danRer7.gp.gz: 25919
# genes/fr3.gp.gz: 29280
# genes/gasAcu1.gp.gz: 20631
# genes/oryLat2.gp.gz: 19576
# genes/tetNig2.gp.gz: 19539

    ssh hgwdev
    cd /hive/data/genomes/fr3/bed/multiz8way/frames
    time (cat ../anno/fr3.8way.maf \
	| nice -n +19 genePredToMafFrames fr3 stdin stdout \
	    `echo "fr3 tetNig2 gasAcu1 oryLat2 danRer7" | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g"` \
		| gzip > multiz8way.mafFrames.gz)
    #	real    2m22.546s
    #	-rw-rw-r-- 1 17609757 Feb 21 14:29 multiz8way.mafFrames.gz

    # verify there are frames on everything:
    zcat multiz8way.mafFrames.gz | awk '{print $4}' | sort | uniq -c
    #	352686 danRer7
    #	233699 fr3
    #	298187 gasAcu1
    #	278448 oryLat2
    #	273743 tetNig2

    ssh hgwdev
    cd /hive/data/genomes/fr3/bed/multiz8way/frames
    time hgLoadMafFrames fr3 multiz8wayFrames multiz8way.mafFrames.gz
    #	real    0m12.716s

############################################################################
# creating upstream mafs (STALLED - 2012-02-21 - Hiram)
    # will need to wait until there are some ensGenes here
    ssh hgwdev
    mkdir -p /hive/data/genomes/fr3/goldenPath/multiz8way
    cd /hive/data/genomes/fr3/goldenPath/multiz8way
    # run this bash script
    cat << '_EOF_' > mkUpstream.sh
DB=fr3
GENE=ensGene
NWAY=multiz8way
export DB GENE

for S in 1000 2000 5000
do
    echo "making upstream${S}.${GENE}.maf"
    echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout"
    featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | mafFrags ${DB} ${NWAY} stdin stdout \
                -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \
        | gzip -c > upstream${S}.${GENE}.maf.gz
    echo "done upstream${S}.${GENE}.maf.gz"
done
'_EOF_'
    # << happy emacs
    chmod +x mkUpstream.sh
    time ./mkUpstream.sh
    #	real    17m4.037s

    # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the
    #	upstream automatic generation

#########################################################################
# Phylogenetic tree from 8-way (DONE - 2012-02-21 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/multiz8way/4d
    cd /hive/data/genomes/fr3/bed/multiz8way/4d

    # the split annotated maf's are in:
    ../anno/result/mafSplit/*/*/*.maf

    cd /hive/data/genomes/fr3/bed/multiz8way/4d

    # no gene tracks yet on fr3.  liftUp fr2 ensGenes to fr3 for this:
    hgsql -N -e "select * from ensGene;" fr2 | cut -f2- > fr2.ensGene.gp
    liftOver -genePred fr2.ensGene.gp \
	/gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz \
	fr3.lifted.ensGene.gp unmapped.ensGene.gp

    wc -l fr2.ensGene.gp fr3.lifted.ensGene.gp unmapped.ensGene.gp
    #	48706 fr2.ensGene.gp
    #	48484 fr3.lifted.ensGene.gp
    #	444 unmapped.ensGene.gp
    #	97634 total

    genePredSingleCover fr3.lifted.ensGene.gp stdout | sort > fr3.ensGene.NR.gp
    wc -l fr3.ensGene.NR.gp
    #	18248 fr3.ensGene.NR.gp

    # (this was attempted with all_est which give a much different estimate)
    # (the distances were much smaller)

    ssh swarm
    mkdir /hive/data/genomes/fr3/bed/multiz8way/4d/run
    cd /hive/data/genomes/fr3/bed/multiz8way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set r = "/hive/data/genomes/fr3/bed/multiz8way"
set c = $1
set infile = $r/anno/result/mafSplit/$2
set resultDir = $3
mkdir -p ../mfa/$resultDir
set outfile = $r/4d/mfa/$resultDir/$1.mfa
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
awk -v C=$c '$2 == C {print}' $r/4d/fr3.ensGene.NR.gp | sed -e "s/\t$c\t/\tfr3.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
else
    echo "" > $outfile
endif
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    find ../../anno/result/mafSplit -type f \
	| sed -e "s#../../anno/result/mafSplit/##" > maf.list

    cat << '_EOF_' > template
#LOOP
4d.csh $(root1) $(path1) $(dir1) {check out exists ../mfa/$(dir1)/$(root1).mfa}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check
    para -maxJob=5 push
    para time
    # some of the jobs can not complete
# Completed: 6180 of 6214 jobs
# Crashed: 24 jobs
# Other count: 10 jobs
# CPU time in finished jobs:        565s       9.42m     0.16h    0.01d  0.000 y
# IO & Wait Time:                 15923s     265.38m     4.42h    0.18d  0.001 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest finished job:              21s       0.35m     0.01h    0.00d
# Submission to last job:           861s      14.35m     0.24h    0.01d

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/fr3/bed/multiz8way/4d
    # remove the broken empty files, size 0 and size 1:
    find ./mfa -type f -size 0 | xargs rm -f
    # most interesting, this did not identify files of size 1:
#    find ./mfa -type f -size 1
    find ./mfa -type f | xargs ls -og | awk '$3 == 1' | awk '{print $NF}' \
	> empty.list
    cat empty.list | xargs rm

    #want comma-less species.list
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" \
	`find ./mfa -type f | xargs echo` | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # check they are all in there:
    grep "^>" 4d.all.mfa
    #	>fr3
    #	>tetNig2
    #	>oreNil1
    #	>gasAcu1
    #	>oryLat2
    #	>gadMor1
    #	>danRer7
    #	>latCha1

    # tree-commas.nh:
# (((((((fr3,tetNig2),oreNil1),gasAcu1),oryLat2),gadMor1),danRer7),latCha1)

    # use phyloFit to create tree model (output is phyloFit.mod)
    time nice -n +19 \
	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree-commas.nh 4d.all.mfa
    #	real    0m51.111s
    mv phyloFit.mod all.mod

    grep TREE all.mod
    # TREE:
    # (((((((fr3:0.156632,tetNig2:0.195181):0.256538,oreNil1:0.28154):0.00848527,gasAcu1:0.305909):0.0519573,oryLat2:0.38542):0.140656,gadMor1:0.618515):0.26134,danRer7:0.587935):0.460809,latCha1:0.460809);

    /cluster/bin/phast/all_dists new8way.nh | grep "^fr3"
fr3     tetNig2 0.351813
fr3     oreNil1 0.694710
fr3     gasAcu1 0.727564
fr3     oryLat2 0.859033
fr3     gadMor1 1.232784
fr3     danRer7 1.463544
fr3     latCha1 1.797227

    # existing distances from the 60-way prototype tree, with
    # several branch lengths being pure guesses:
(((((((fr3:0.203847,tetNig2:0.224159):0.09759,oreNil1:0.2):0.09759,
gasAcu1:0.316413):0.03,oryLat2:0.511970):0.03,gadMor1:0.35):0.22564,
danRer7:0.730752):0.147949,latCha1:0.855573);
fr3     tetNig2 0.428006
fr3     oreNil1 0.501437
fr3     gasAcu1 0.715440
fr3     gadMor1 0.809027
fr3     oryLat2 0.940997
fr3     danRer7 1.415419
fr3     latCha1 1.688189

#########################################################################
# phastCons 8-way (DONE - 2012-02-22 - Hiram)

    # split 8way mafs into 10M chunks and generate sufficient statistics 
    # files for # phastCons
    ssh encodek
    mkdir -p /hive/data/genomes/fr3/bed/multiz8way/cons/SS
    cd /hive/data/genomes/fr3/bed/multiz8way/cons/SS
    mkdir result done

    cat << '_EOF_' > mkSS.csh
#!/bin/csh -ef
set dirPath = $1
set mafFile = $2
set MAF = /hive/data/genomes/fr3/bed/multiz8way/anno/result/mafSplit/$dirPath/$mafFile.maf
set WINDOWS = /hive/data/genomes/fr3/bed/multiz8way/cons/SS/result/$dirPath
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $3 ) then
    exit 0
endif
mkdir -p done/$dirPath
if ( -s done/$dirPath/$mafFile.running ) then
    exit 0
endif

date >> done/$dirPath/$mafFile.running

rm -f $WINDOWS/$mafFile
mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$mafFile -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $3
rm -f done/$dirPath/$mafFile.running
'_EOF_'
    # << happy emacs
    chmod +x mkSS.csh

    cat << '_EOF_' > template
#LOOP
mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #
    find ../../anno/result/mafSplit -type f \
	| sed -e 's#../../anno/result/mafSplit/##' > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
# Completed: 6214 of 6214 jobs
# CPU time in finished jobs:        562s       9.36m     0.16h    0.01d  0.000 y
# IO & Wait Time:                 16125s     268.76m     4.48h    0.19d  0.001 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest finished job:              29s       0.48m     0.01h    0.00d
# Submission to last job:           914s      15.23m     0.25h    0.01d

    # not all of them produce results
    #	 there isn't much aligned in these MAF files
    find ./result -type f | wc
    #	 5472 5472 176757

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh encodek
    mkdir -p /hive/data/genomes/fr3/bed/multiz8way/cons/run.cons
    cd /hive/data/genomes/fr3/bed/multiz8way/cons/run.cons

    #	there are going to be only one phastCons run using
    #	this same script.  It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is:
    #	all 

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set dir = $1
set c = $2
set f = $3
set len = $4
set cov = $5
set rho = $6
set resultFile = pp/$dir/$c.pp
set grp = $cwd:t
set cons = /hive/data/genomes/fr3/bed/multiz8way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
set useGrp = "$grp.mod"
ln -s $ssSrc/SS/result/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $cons/SS/result/$dir/$f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p pp/$dir bed/$dir
sleep 4
touch pp/$dir bed/$dir
rm -f $resultFile
rm -f bed/$dir/$f.bed
mv $tmp/$f.pp pp/$dir
mv $tmp/$f.bed bed/$dir
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(dir1) $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    find ../SS -type f | grep ".ss$" | sed -e 's#../SS/result/##; s/.ss$//' \
	> ss.list
    wc -l ss.list
    #	3291 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/fr3/bed/multiz8way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para create jobList
    para try ... check ...
    para push
# Completed: 5472 of 5472 jobs
# CPU time in finished jobs:       1187s      19.78m     0.33h    0.01d  0.000 y
# IO & Wait Time:                 35801s     596.69m     9.94h    0.41d  0.001 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest finished job:              40s       0.67m     0.01h    0.00d
# Submission to last job:          2238s      37.30m     0.62h    0.03d

    cd /hive/data/genomes/fr3/bed/multiz8way/cons/all

    find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	    > tmpMostConserved.bed

    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    # load into database
    time nice -n +19 hgLoadBed fr3 phastConsElements8way mostConserved.bed
    #	Loaded 1126332 elements of size 5
    #	real    0m7.677s

    # on human we often try for 5% overall cov, and 70% CDS cov 
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    featureBits fr3 -enrichment ensGene:cds phastConsElements8way
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
# ensGene:cds 2.244%, phastConsElements8way 10.055%, both 1.669%, cover 74.39%, enrich 7.40x
    featureBits fr3 -enrichment mgcGenes:cds phastConsElements8way
# mgcGenes:cds 0.741%, phastConsElements8way 10.055%, both 0.619%, cover 83.53%, enrich 8.31x

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/fr3/bed/multiz8way/cons/all
    mkdir downloads

    find ./pp -type f | sed -e "s#/# dir #g; s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k9,9 -k11,11n | sed -e "s# dir #/#g; s# d #.#g; s# m #-#g;" \
	| xargs cat \
	| gzip -c > downloads/phastCons8way.wigFix.gz

    # check integrity of data with wigToBigWig
    time (zcat downloads/phastCons8way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/fr3/chrom.sizes \
	    phastCons8way.bw) > bigWig.log 2>&1 &
    tail bigWig.log
    # pid=25817: VmPeak:     3413828 kB
    #	real    3m47.314s

    bigWigInfo phastCons8way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 606,493,400
# primaryIndexSize: 11,794,172
# zoomLevels: 10
# chromCount: 5452
# basesCovered: 305,346,303
# mean: 0.363248
# min: 0.000000
# max: 1.000000
# std: 0.412709

    #	encode those files into wiggle data
    time (zcat downloads/phastCons8way.wigFix.gz \
	| wigEncode stdin phastCons8way.wig phastCons8way.wib) &
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    1m28.386s

    du -hsc *.wi?
    #	292M    phastCons8way.wib
    #	34M     phastCons8way.wig
    #	325M    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phastCons8way.wib /gbdb/fr3/multiz8way/phastCons8way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/fr3/multiz8way \
	fr3 phastCons8way phastCons8way.wig
    #	real    0m5.850s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh fr3 phastCons8way
# db.table         min max mean       count sumData      stdDev viewLimits
# fr3.phastCons8way  0 1 0.363248 305346303 1.10917e+08 0.412709 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=fr3 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons8way > histogram.data 2>&1
    #	real    0m46.993s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr3 Histogram phastCons8way track"
set xlabel " phastCons8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# phyloP for 8-way (DONE - 2012-02-22 - Hiram)
    # run phyloP with score=LRT 
    ssh encodek
    mkdir /cluster/data/fr3/bed/multiz8way/consPhyloP
    cd /cluster/data/fr3/bed/multiz8way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.613
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../cons/all/all.mod 0.613 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod 
    #	BACKGROUND: 0.193500 0.306500 0.306500 0.193500

    #	following the pattern from panTro3 with grp: "all" (no other grp)
    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set dir = $1
set f = $2
set file1 = $f:t
set out = $3
set cName = $f:t:r
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/fr3/bed/multiz8way/consPhyloP
set tmp = $cons/tmp/$grp/$cName
rm -fr $tmp
mkdir -p $tmp
set ssSrc = "/hive/data/genomes/fr3/bed/multiz8way/cons/SS/result/$f"
set useGrp = "$grp.mod"
ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
mkdir -p $out:h
sleep 4
mv $tmp/$file1.wigFix $out
rm -fr $tmp
rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
rmdir --ignore-fail-on-non-empty $cons/tmp
'_EOF_'
    # << happy emacs

    # Create list of chunks
    find ../../cons/SS/result -type f | grep ".ss$" \
	| sed -e 's#../../cons/SS/result/##; s/.ss$//'  > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	5472 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.phyloP/doPhyloP.csh $(dir1) $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all
    cd /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    # the -ram=8g will allow only one job per node to slow this down since
    #	it would run too fast otherwise.  Either run on one of the small
    #	klusters or use the -ram=8g on the para create
    para -ram=8g create jobList
    para try ... check ... etc ...
    para -maxJob=64 push
    para time > run.time
# Completed: 5690 of 5690 jobs
# CPU time in finished jobs:       9670s     161.17m     2.69h    0.11d  0.000 y
# IO & Wait Time:                 38146s     635.76m    10.60h    0.44d  0.001 y
# Average job time:                   8s       0.14m     0.00h    0.00d
# Longest finished job:              63s       1.05m     0.02h    0.00d
# Submission to last job:          2448s      40.80m     0.68h    0.03d

    # make downloads
    mkdir downloads

    time (find ./wigFix -type f \
	| sed -e "s#/# dir #g; s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k9,9 -k11,11n | sed -e "s# dir #/#g; s# d #.#g; s# m #-#g;" \
	| xargs cat \
	| gzip -c > downloads/phyloP8way.wigFix.gz) &
    #	real    5m50.668s

    # check integrity of data with wigToBigWig
    time (zcat downloads/phyloP8way.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/fr3/chrom.sizes \
	phyloP8way.bw) > bigWig.log 2>&1 &
    egrep "real|VmPeak" bigWig.log
    #   pid=32533: VmPeak:     3413824 kB
    #	real    4m20.055s

    bigWigInfo phyloP8way.bw  | sed -e "s/^/#\t/"
#       version: 4
#       isCompressed: yes
#       isSwapped: 0
#       primaryDataSize: 613,964,662
#       primaryIndexSize: 11,794,172
#       zoomLevels: 10
#       chromCount: 5452
#       basesCovered: 305,346,303
#       mean: 0.340267
#       min: -2.059000
#       max: 2.645000
#       std: 0.813855

    #	encode those files into wiggle data
    time (zcat downloads/phyloP8way.wigFix.gz \
	| wigEncode stdin phyloP8way.wig phyloP8way.wib) &
    #	Converted stdin, upper limit 2.65, lower limit -2.06
    #	real    1m34.170s

    du -hsc *.wi?
    #	292M    phyloP8way.wib
    #	35M     phyloP8way.wig
    #	326M    total

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP8way.wib /gbdb/fr3/multiz8way/phyloP8way.wib
    nice hgLoadWiggle -pathPrefix=/gbdb/fr3/multiz8way fr3 \
	phyloP8way phyloP8way.wig

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh fr3 phyloP8way
# db.table      min max mean count sumData stdDev viewLimits
# fr3.phyloP8way  -2.059 2.645 0.340267 305346303 1.03899e+08 0.813855
# viewLimits=-2.059:2.645

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram -db=fr3 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phyloP8way > histogram.data 2>&1
    #	real    0m46.775s

    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
#	Q1 0.000392
#	median 0.000683
#	Q3 0.001184
#	average 0.001000
#	min 0.000086
#	max 0.029306
#	count 1000
#	total 0.999978
#	standard deviation 0.001531

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr3 Histogram phyloP8way track"
set xlabel " phyloP8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.007]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# lifting ensGene track from fr2 (DONE - 2012-02-22 - Hiram)
    # no gene tracks yet on fr3.  liftUp fr2 ensGenes to fr3
    # history of fr2 ensGene indicates it has been the same there since
    #	v60 of ensGene - thru today same as v65
    mkdir /hive/data/genomes/fr3/bed/ensGene
    cd /hive/data/genomes/fr3/bed/ensGene
    hgsql -N -e "select * from ensGene;" fr2 | cut -f2- > fr2.ensGene.gp
    liftOver -genePred fr2.ensGene.gp \
	/gbdb/fr2/liftOver/fr2ToFr3.over.chain.gz \
	fr3.lifted.ensGene.gp unmapped.ensGene.gp
    ls -ogrt
# -rw-rw-r-- 1 18680443 Feb 22 16:22 fr2.ensGene.gp
# -rw-rw-r-- 1 15869141 Feb 22 16:22 fr3.lifted.ensGene.gp
# -rw-rw-r-- 1   126578 Feb 22 16:22 unmapped.ensGene.gp

    hgLoadGenePred -skipInvalid -genePredExt fr3 ensGene fr3.lifted.ensGene.gp
    #	Warning: skipping 782 invalid genePreds

    # make a list of what did get loaded:
    hgsql -N -e "select name from ensGene;" fr3 | sort -u > fr3.name.ensGene.txt
    wc -l fr3.name.ensGene.txt 
    #	47702 fr3.name.ensGene.txt

    hgsql -N -e "select * from ensPep;" fr2 | sort > fr2.ensPep.tab
    hgsql -N -e "select * from ensGtp;" fr2 | sort -k2,2 > fr2.ensGtp.tab
    hgsql -N -e "select * from ensemblSource;" fr2 | sort -k1,1 \
	> fr2.ensemblSource.tab
    hgsql -N -e "select * from ensemblToGeneName;" fr2 | sort -k1,1 \
	> fr2.ensemblToGeneName.tab

    # select out ensGtp records that match with the names in fr3 ensGene:
    join -1 2 -2 1 -o "1.1,1.2,1.3" fr2.ensGtp.tab fr3.name.ensGene.txt \
	| tr '[ ]' '[\t]' > fr3.ensGtp.tab
    wc -l *.ensGtp.tab
    #	48706 fr2.ensGtp.tab
    #	47702 fr3.ensGtp.tab

    # select out ensPep records that match with the names in fr3 ensGene:
    join -1 1 -2 2 -o "1.1,1.2" fr2.ensPep.tab fr3.ensGtp.tab \
	| tr '[ ]' '[\t]' > fr3.ensPep.tab

    wc -l fr2.ensPep.tab fr3.ensPep.tab
    #	47841 fr2.ensPep.tab
    #	46842 fr3.ensPep.tab

    # select out ensemblSource records that match the fr3 ensGene names:
    join -1 1 -2 1 -o "1.1,1.2" fr2.ensemblSource.tab fr3.name.ensGene.txt \
	| tr '[ ]' '[\t]' > fr3.ensemblSource.tab
    wc -l fr2.ensemblSource.tab fr3.ensemblSource.tab 
  48706 fr2.ensemblSource.tab
  47702 fr3.ensemblSource.tab

    # select out ensemblToGeneName records that match the fr3 ensGene names:
    join -1 1 -2 1 -o "1.1,1.2" fr2.ensemblToGeneName.tab \
	fr3.name.ensGene.txt | tr '[ ]' '[\t]' > fr3.ensemblToGeneName.tab
    wc -l fr2.ensemblToGeneName.tab fr3.ensemblToGeneName.tab
    #	40067 fr2.ensemblToGeneName.tab
    #	39200 fr3.ensemblToGeneName.tab

    hgPepPred fr3 tab ensPep fr3.ensPep.tab
    hgLoadSqlTab fr3 ensGtp ~/kent/src/hg/lib/ensGtp.sql fr3.ensGtp.tab

    # verify size of the names
    cut -f1 fr3.ensemblSource.tab | awk '{print length($0)}' \
	| sort | uniq -c | head
    #	47702 18

    sed -e "s/15/18/" ~/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql
    hgLoadSqlTab fr3 ensemblSource ensemblSource.sql fr3.ensemblSource.tab

    # find sizes for indexes
  NL=`awk '{print length($1)}' fr3.ensemblToGeneName.tab | sort -rn | head -1`
  VL=`awk '{print length($2)}' fr3.ensemblToGeneName.tab | sort -rn | head -1`
    # construct sql definition with appropriate index sizes
    sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \
	~/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql

    hgLoadSqlTab fr3 ensemblToGeneName ensemblToGeneName.sql \
	fr3.ensemblToGeneName.tab

    hgsql -e 'INSERT INTO trackVersion \
    (db, name, who, version, updateTime, comment, source, dateReference) \
    VALUES("fr3", "ensGene", "hiram", "65", now(), \
        "lifted from mm9 ensGene 65", \
        "lifted from mm9 ensGene 65", \
        "dec2011" );' hgFixed

#############################################################################
# hgPal downloads (DONE - Hiram - 2011-12-08)
#   FASTA from 8way for ensGene

    ssh hgwdev
    rm -rf /hive/data/genomes/fr3/bed/multiz8way/pal
    mkdir /hive/data/genomes/fr3/bed/multiz8way/pal
    cd /hive/data/genomes/fr3/bed/multiz8way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    mz=multiz8way
    gp=ensGene
    db=fr3
    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /hive/data/genomes/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -chrom=$j  $db $mz $gp order.list stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.list stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.list stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -chrom=$j -exons $db $mz $gp order.list stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.jobs

    time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
    sleep 1
    tail -f $gp.jobs.log
    #	real    49m45.740s

    mz=multiz8way
    gp=ensGene
    db=fr3
    zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
    # about 12 minutes

    rm -rf exonAA exonNuc ppredAA ppredNuc &

    md5sum $gp.$mz.exonAA.fa.gz $gp.$mz.exonNuc.fa.gz > md5sum.txt

    # we're only distributing exons at the moment
    mz=multiz8way
    gp=ensGene
    db=fr3
    pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    # obtain a copy of a most-recent README.txt file
    cp -p /usr/local/apache/htdocs-hgdownload/goldenPath/gorGor3/multiz11way/alignments/README.txt .

    # edit to bring up to date for fr3
    ln -s `pwd`/README.txt $pd/

#############################################################################
# download data for 8-way (DONE - 2012-02-23 - Hiram)
    # use a README from the most recent multiz like this
    #	this one is from gorGor3
    cd /hive/data/genomes/fr3/bed/multiz8way/cons
    cp -p /hive/data/genomes/gorGor3/bed/multiz11way/cons/README.txt .
    # need to edit this to finish it off

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phastCons8way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phastCons8way
    ln -s \
/hive/data/genomes/fr3/bed/multiz8way/cons/all/downloads/phastCons8way.wigFix.gz \
	./phastCons8way.wigFix.gz
    ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/all/phastCons8way.bw \
	./phastCons8way.bw

    ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/all/all.mod \
	./fish.mod

    ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/README.txt .

    md5sum phastCons8way.bw phastCons8way.wigFix.gz fish.mod \
	> /hive/data/genomes/fr3/bed/multiz8way/cons/md5sum.txt

    ln -s /hive/data/genomes/fr3/bed/multiz8way/cons/md5sum.txt .

    # use a README from the most recent multiz like this
    #	this one is from xenTro3
    cd /hive/data/genomes/fr3/bed/multiz8way/consPhyloP
    cp /hive/data/genomes/gorGor3/bed/multiz11way/consPhyloP/README.txt .
    # need to edit this to finish it off

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phyloP8way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/phyloP8way
    ln -s \
/hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all/downloads/phyloP8way.wigFix.gz \
	./phyloP8way.wigFix.gz
    ln -s \
/hive/data/genomes/fr3/bed/multiz8way/consPhyloP/all/phyloP8way.bw \
	./phyloP8way.bw
    ln -s \
/hive/data/genomes/fr3/bed/multiz8way/consPhyloP/run.phyloP/all.mod \
	./fish.mod

    ln -s /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/README.txt .
    md5sum README.txt phyloP8way.bw phyloP8way.wigFix.gz fish.mod \
	> /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/md5sum.txt
    ln -s /hive/data/genomes/fr3/bed/multiz8way/consPhyloP/md5sum.txt .

    mkdir /hive/data/genomes/fr3/bed/multiz8way/downloads
    cd /hive/data/genomes/fr3/bed/multiz8way/downloads
    # use a README from the most recent multiz like this
    #	this one is from xenTro3
    cp -p /hive/data/genomes/gorGor3/bed/multiz11way/downloads/README.txt .
    # need to edit this README.txt to finish it off
    # you can find this common name translation above in the section that
    # began the multiz8way work:
    grep TREE ../cons/all/all.mod | sed -e "s/TREE: //" > fr3.8way.nh

    /cluster/bin/phast/tree_doctor --rename \
"fr3->Fugu; tetNig2->Tetraodon; oreNil1->Nile_Tilapia; gasAcu1->Stickleback; oryLat2->Medaka; gadMor1->Atlantic_cod; danRer7->Zebrafish; latCha1->Coelacanth;" \
fr3.8way.nh > fr3.commonNames.8way.nh

    time cp -p ../anno/fr3.8way.maf ./multiz8way.maf
    #	real    1m38.305s
# -rw-rw-r-- 1 3235224846 Feb 21 13:55 multiz8way.maf
    time gzip multiz8way.maf
    #	real    7m24.097s
# -rw-rw-r-- 1 738827448 Feb 21 13:55 multiz8way.maf.gz

    md5sum *.nh *.gz *.txt > md5sum.txt

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/multiz8way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/fr3/multiz8way
    ln -s /hive/data/genomes/fr3/bed/multiz8way/downloads/fr3.8way.nh .
    ln -s \
/hive/data/genomes/fr3/bed/multiz8way/downloads/fr3.commonNames.8way.nh .
    ln -s \
/hive/data/genomes/fr3/bed/multiz8way/downloads/multiz8way.maf.gz .

    ln -s /hive/data/genomes/fr3/bed/multiz8way/downloads/README.txt .

    ln -s /hive/data/genomes/fr3/bed/multiz8way/downloads/md5sum.txt .

#############################################################################
# Construct download files (DONE - 2012-02-23 - Hiram)
    cd /hive/data/genomes/fr3
    # verify joinerCheck is good and fr3 is in all.joiner
    joinerCheck -keys -database=fr3 \
	$HOME/kent/src/hg/makeDb/schema/all.joiner

    time makeDownloads.pl fr3 -workhorse=hgwdev > downloads.log 2>&1 &
    #	real    24m7.903s
    # you must edit the */README files in ./goldenPath/

############################################################################
# after downloads, construct pushQ (DONE - 2012-02-23 - Hiram)
    mkdir /hive/data/genomes/fr3/pushQ
    cd /hive/data/genomes/fr3/pushQ
    time makePushQSql.pl fr3 > fr3.sql 2> stderr.log
    # check stderr.log for anything not categorized

    scp -p fr3.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < fr3.sql
    # after loading, view the subQ in the WEB interface, go through each
    #	item and verify it can measure the tables and files

############################################################################
# running cpgIsland business (DONE - 2012-04-19 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/cpgIsland
    cd /hive/data/genomes/fr3/bed/cpgIsland
    # use a previous binary for this program
    ln -s ../../../mm9/bed/cpgIsland/hg3rdParty/cpgIslands/cpglh.exe .

    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../fr3.2bit:$C stdout \
	| maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    ssh swarm
    cd /hive/data/genomes/fr3/bed/cpgIsland
    mkdir results
    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out exists results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	the faCount business is to make sure there is enough sequence to
    #	work with in the fasta.  cpglh.exe does not like files with too many
    #	N's - it gets stuck.
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | egrep -v "^#seq|^total" | awk '{print  $2 - $7 }'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs
    chmod +x runOne

    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
    para time
# Completed: 6835 of 6835 jobs
# CPU time in finished jobs:         45s       0.75m     0.01h    0.00d  0.000 y
# IO & Wait Time:                 18494s     308.23m     5.14h    0.21d  0.001 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:              10s       0.17m     0.00h    0.00d
# Submission to last job:           266s       4.43m     0.07h    0.00d

    # Transform cpglh output to bed +
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    # verify longest unique chrom name:
    cut -f1 cpgIsland.bed | awk '{print length($0)}' | sort -rn | head -1
    #	8
    # update the length 14 in the template to be 8:
    sed -e "s/14/8/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql

    cd /hive/data/genomes/fr3/bed/cpgIsland
    hgLoadBed fr3 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed
    #   Read 36504 elements of size 10 from cpgIsland.bed

    featureBits fr3 cpgIslandExt
    #   18279877 bases of 350961831 (5.209%) in intersection

    # there should be no output from checkTableCoords:
    checkTableCoords -verboseBlocks -table=cpgIslandExt fr3

    #	cleanup, unless you want to move them to the genscan procedure below
    rm -fr hardMaskedFa

#########################################################################
# GENSCAN GENE PREDICTIONS (DONE - 2012-04-09 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/genscan
    cd /hive/data/genomes/fr3/bed/genscan
    # use a previously existing genscan binary
    ln -s ../../../mm9/bed/genscan/hg3rdParty .

    # create hard masked .fa files
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../fr3.2bit:$C stdout \
	| maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    # Generate a list file, genome.list, of all the hard-masked contig chunks:
    find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list

    wc -l genome.list
    #	6835 genome.list

    # OK to run on swarm, small chroms will not need the large memory
    ssh swarm
    cd /hive/data/genomes/fr3/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Create template file, template, for gensub2.  For example (3-line file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 genome.list single template jobList
    para create jobList
    para try
    para check ... etc...
    para time
# Completed: 6835 of 6835 jobs
# CPU time in finished jobs:       9926s     165.43m     2.76h    0.11d  0.000 y
# IO & Wait Time:                 20476s     341.27m     5.69h    0.24d  0.001 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:             786s      13.10m     0.22h    0.01d
# Submission to last job:          1236s      20.60m     0.34h    0.01d

    # Concatenate results:
    cd /hive/data/genomes/fr3/bed/genscan
    find ./gtf -type f | xargs cat > genscan.gtf
    find ./pep -type f | xargs cat > genscan.pep
    find ./subopt -type f | xargs cat > genscanSubopt.bed

    # Load into the data/genomesbase (without -genePredExt because no frame info):
    # Don't load the Pep anymore -- redundant since it's from genomic.
    ssh hgwdev
    cd /hive/data/genomes/fr3/bed/genscan
    # to construct a local file with the genePred business:
    gtfToGenePred genscan.gtf genscan.gp
    genePredCheck -db=fr3 genscan.gp
    #   checked: 20354 failed: 0

    # this produces exactly the same thing and loads the table:
    ldHgGene -gtf fr3 genscan genscan.gtf
    #   Read 20354 transcripts in 159927 lines in 1 files
    #   20354 groups 4323 seqs 1 sources 1 feature types
    #   20354 gene predictions

    hgLoadBed fr3 genscanSubopt genscanSubopt.bed
    #	Read 526572 elements of size 6 from genscanSubopt.bed
    featureBits fr3 genscan
    #   25082192 bases of 350961831 (7.147%) in intersection

#########################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/fr3/bed/ucscToINSDC
    cd /hive/data/genomes/fr3/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_004299.1
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/8/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab fr3 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords fr3 ucscToINSDC
    featureBits -countGaps fr3 ucscToINSDC
    # 391484715 bases of 391484715 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# lastz chainNet xenTro9/X. tropicalis (DONE - 2017-04-06 - Hiram)

    # original alignment
    cd /hive/data/genomes/xenTro9/bed/lastzFr3.2017-04-05
    cat fb.xenTro9.chainFr3Link.txt
    #	51743020 bases of 1369865365 (3.777%) in intersection

    #	and for the swap
    mkdir /hive/data/genomes/fr3/bed/blastz.xenTro9.swap
    cd /hive/data/genomes/fr3/bed/blastz.xenTro9.swap
    time (doBlastzChainNet.pl -verbose=2 -syntenicNet \
	/hive/data/genomes/xenTro9/bed/lastzFr3.2017-04-05/DEF \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 &
    #	real    17m36.964s

    cat  fb.fr3.chainXenTro9Link.txt
    #	44999785 bases of 350961831 (12.822%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` fr3 xenTro9) \
	> rbest.log 2>&1 &
    # real    161m34.026s

#########################################################################
# add chromAlias table (DONE - 2016-10-20 - Hiram)

    mkdir /hive/data/genomes/fr3/bed/chromAlias
    cd /hive/data/genomes/fr3/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' fr3 \
        | sort -k1,1 > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' fr3 \
        | sort -k1,1 > ucsc.genbank.tab

    ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16
    join -t$'\t' ../idKeys/fr3.idKeys.txt \
        ../../ens95/ensFr3.idKeys.txt | cut -f2- \
          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
               | sort -k1,1 -k2,2n > ucscToEns.bed
    # Ensembl has an error in chr20 - they have an extra 10 N's on the end
    # of the sequence.  Otherwise, it is identical.  Add chr20 alias manually
    # to ucsc.ensembl.bed
    cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab
    # and chrM name for genbank is: AJ421455.1
    wc -l *.tab
#    6835 ucsc.ensembl.tab
#    6835 ucsc.genbank.tab
#    6835 ucsc.refseq.tab

    grep chrM *.tab
# ucsc.ensembl.tab:chrM   MT
# ucsc.genbank.tab:chrM   AJ421455.1
# ucsc.refseq.tab:chrM    NC_004299.1

    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
	> fr3.chromAlias.tab

for t in refseq genbank ensembl
do
  c0=`cat ucsc.$t.tab | wc -l`
  c1=`grep $t fr3.chromAlias.tab | wc -l`
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 6835 =? 6835 OK
# checking genbank: 6835 =? 6835 OK
# checking ensembl: 6835 =? 6835 OK

    hgLoadSqlTab fr3 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        fr3.chromAlias.tab

#########################################################################
