# for emacs: -*- mode: sh; -*-

# This file describes browser build for the panTro4
#	Chimp - Pan_troglodytes-2.1.4 - Feb 2011

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AACZ03
#	6X coverage via a variety of methods

#	http://www.ncbi.nlm.nih.gov/bioproject/10627

#############################################################################
# Fetch sequence from genbank (DONE - 2012-01-06 - Hiram)

    mkdir -p /hive/data/genomes/panTro4/genbank
    cd /hive/data/genomes/panTro4/genbank

    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Pan_troglodytes/Pan_troglodytes-2.1.4/*"
    #	Downloaded: 268 files, 2.2G in 10m 14s (3.62 MB/s)
    #	real    11m47.645s

    # measure sequence to be used here
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
	Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \
	Primary_Assembly/unlocalized_scaffolds/FASTA//*.fa.gz
    #	3309561368 bases (407238955 N's 2902322413 real 2902322413 upper
    #	0 lower) in 24128 sequences in 49 files
    #   Total size: mean 137166.8 sd 4455884.0 min 373
    #	(gi|284234151|gb|AACZ03151841.1|) max 247518478
    #	(gi|305434869|gb|CM000316.2|) median 2299

#############################################################################
# process into UCSC naming scheme (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/panTro4/ucsc
    cd /hive/data/genomes/panTro4/ucsc

    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
    open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            $line =~ s/^$acc/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
    open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;
my %chrNames;

open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close (FH);

foreach my $chrN (keys %chrNames) {
    my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
    open (FH, "zcat $agpFile|") or die "can not read $agpFile";
    open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            chomp $line;
            my (@a) = split('\t', $line);
            my $acc = $a[0];
            my $accNo1 = $acc;
            $accNo1 =~ s/.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
	    # these names became too long, limit is 31 in browser:
	    if ($chrN =~ m/LGE22C19W28_E50C23/) {
my $before = $ucscName;
		$ucscName =~ s/_E50C23//;
		$ucscName =~ s/AAD//;
printf STDERR "shorter: $before -> $ucscName\n";
	    }
            printf UC "%s", $ucscName;
            for (my $i = 1; $i < scalar(@a); ++$i) {
                printf UC "\t%s", $a[$i];
            }
            printf UC "\n";
        }
    }
    close (FH);
    close (UC);
    printf "chr%s\n", $chrN;
    open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
    open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;
            $acc =~ s/\|.*//;
            my $accNo1 = $acc;
            $accNo1 =~ s/.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
	    # these names became too long, limit is 31 in browser:
	    if ($chrN =~ m/LGE22C19W28_E50C23/) {
		$ucscName =~ s/_E50C23//;
		$ucscName =~ s/AAD//;
	    }
            printf UC ">$ucscName\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl

    ./toUcsc.pl
    ./unlocalized.pl
    ./unplaced.pl

    gzip *.fa *.agp
    # this takes a few minutes

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa
# 1046915324 bases (14077289 N's 1032838035 real 1032838035 upper 0 lower) in 15931 sequences in 65 files
# Total size: mean 65715.6 sd 2473072.9 min 253 (chr2_AADN03009880_random) max 195276750 (chr1) median 1206

#############################################################################
#   Initial browser build (DONE - 2012-01-06 - Hiram)
    cd /hive/data/genomes/panTro4
    cat << '_EOF_' > panTro4.config.ra
# Config parameters for makeGenomeDb.pl:
db panTro4
clade mammal
genomeCladePriority 10
scientificName Pan troglodytes
commonName Chimp
assemblyDate Feb. 2011
assemblyLabel CSAC Pan_troglodytes-2.1.4 (GCA_000001515.4)
assemblyShortLabel Pan_troglodytes-2.1.4
orderKey 22
mitoAcc NC_001643
fastaFiles /hive/data/genomes/panTro4/ucsc/*.fa.gz
agpFiles /hive/data/genomes/panTro4/ucsc/*.agp.gz
dbDbSpeciesDir chimp
taxId   9598
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp panTro4.config.ra > agp.log 2>&1
    #	real    3m25.355s
    # check the end of agp.log to verify it is OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \
	-continue=db panTro4.config.ra > db.log 2>&1
    #	about 27 minutes

#############################################################################
# running repeat masker (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/repeatMasker
    cd /hive/data/genomes/panTro4/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk panTro4 > do.log 2>&1 &
    #	real    761m6.538s
    # missing IDs:
# 313  12.3  3.5  1.8  chr14     102404757 102404806 (4140132) +  AluSz6         SINE/Alu                85  135  (177)
# 315  16.5  0.0  0.5  chr14     102405794 102405850 (4139088) +  AluJb          SINE/Alu                 1   42  (270)
# 1558  14.1  1.2  2.3  chr14     102406616 102406871 (4138067) +  AluJb          SINE/Alu                53  305    (7)
# 411  17.2  0.0  0.0  chr22     20433770 20433795 (29304189) +  AluJb          SINE/Alu                44   69  (243)
# 1093  15.4  0.0  8.5  chr22     20434607 20434807 (29303177) +  AluJb          SINE/Alu               120  303    (9)

    # slight problem, 5 entries without IDs, removed them:
    egrep -v "102404757 102404806|102405794 102405850|102406616 102406871|20433770 20433795|20434607 20434807" panTro4.fa.out > panTro4.clean.fa.out
    # finish the doCat.csh script:
    /cluster/bin/scripts/extractNestedRepeats.pl panTro4.clean.fa.out \
	| sort -k1,1 -k2,2n > panTro4.nestedRepeats.bed
    # continuing with masking
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-continue=mask -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk panTro4 > mask.log 2>&1 &
    #	real    43m48.192s

    cat faSize.rmsk.txt
    #	3309577922 bases (407238955 N's 2902338967 real 1415372307 upper
    #	1486966660 lower) in 24129 sequences in 1 files
    #	Total size: mean 137161.8 sd 4455791.7
    #	min 373 (chr1_AACZ03151841_random) max 247518478 (chr2B) median 2299
    #	%10.65 masked total, %10.79 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps panTro4 rmsk
    #	1490804137 bases of 3309577922 (45.045%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/simpleRepeat
    cd /hive/data/genomes/panTro4/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	panTro4 > do.log 2>&1 &
    #	batch failed in 22 minutes, last running job in about 3 hours
    # two failed jobs are on sequence:
    #	chr2B:0-50000000 and chr2B:50000000-100000000
    # which are all N's - this confuses trf, construct empty results:
    touch /hive/data/genomes/panTro4/TrfPart/049/049.lst.bed
    touch /hive/data/genomes/panTro4/TrfPart/050/050.lst.bed

    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	-continue=filter panTro4 > filter.log 2>&1 &
    #	real    0m56.241s

    cat fb.simpleRepeat
    #	95896404 bases of 2902338967 (3.304%) in intersection

    cd /hive/data/genomes/panTro4
    twoBitMask panTro4.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed panTro4.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa panTro4.2bit stdout | faSize stdin > faSize.panTro4.2bit.txt
    cat faSize.panTro4.2bit.txt
    #	3309577922 bases (407238955 N's 2902338967 real 1413870150 upper
    #	1488468817 lower) in 24129 sequences in 1 files
    #	Total size: mean 137161.8 sd 4455791.7
    #	min 373 (chr1_AACZ03151841_random) max 247518478 (chr2B) median 2299

    rm /gbdb/panTro4/panTro4.2bit
    ln -s `pwd`/panTro4.2bit /gbdb/panTro4/panTro4.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/gap
    cd /hive/data/genomes/panTro4/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../panTro4.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m17.839s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits -countGaps panTro4 -not gap -bed=notGap.bed
    #	2969184009 bases of 3309577922 (89.715%) in intersection
    #	real    0m26.597s

    time featureBits -countGaps panTro4 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	66845042 bases of 3309577922 (2.020%) in intersection
    #	real    14m32.610s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" panTro4 | sort -n | tail -1
    #	1215
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" panTro4 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    wc -l other.bed
    #	161551
    featureBits -countGaps panTro4 other.bed
    #	66845042 bases of 3309577922 (2.020%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad panTro4 otherGap other.bed
    # verify no overlap with gap table:
    time featureBits -countGaps panTro4 gap other.bed
    #	0 bases of 3309577922 (0.000%) in intersection
    #	real    41m50.742s

    # verify no errors before adding to the table:
    time gapToLift -minGap=1 panTro4 nonBridged.before.lift \
	-bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 &
    #	real    0m7.205s
    # check for warnings in before.gapToLift.txt, should be empty:
    #	-rw-rw-r-- 1     1633 Jan  6 15:20 before.gapToLift.txt
    # it indicates that there are telomere's adjacent to centromere's
    #	and heterochromatin
    #	starting with this many:
    hgsql -e "select count(*) from gap;" panTro4
    #	21559
    hgsql panTro4 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" panTro4
    #	183110
    # == 21559 + 161551
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 panTro4 nonBridged.lift -bedFile=nonBridged.bed
    #same set of warnings as before, telomere's centromere's and heterochromatin
    # there should be no errors or other output, checked bridged gaps:
    hgsql -N -e "select bridge from gap;" panTro4 | sort | uniq -c
    #	2936 no
    #	180174 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/windowMasker
    cd /hive/data/genomes/panTro4/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev panTro4 > do.log 2>&1 &
    # this failed the first time with some kind of network error
    # it was restarted completely and finished normally
    #	real    268m21.572s

    #	about 45 minutes

    # Masking statistics
    twoBitToFa panTro4.wmsk.2bit stdout | faSize stdin
    #	3309577922 bases (407238955 N's 2902338967 real 1830845011 upper
    #	1071493956 lower) in 24129 sequences in 1 files
    #	Total size: mean 137161.8 sd 4455791.7
    #	min 373 (chr1_AACZ03151841_random) max 247518478 (chr2B) median 2299
    #	%32.38 masked total, %36.92 masked real

    twoBitToFa panTro4.wmsk.sdust.2bit stdout | faSize stdin
    #	3309577922 bases (407238955 N's 2902338967 real 1814008749 upper
    #	1088330218 lower) in 24129 sequences in 1 files
    #	Total size: mean 137161.8 sd 4455791.7
    #	min 373 (chr1_AACZ03151841_random) max 247518478 (chr2B) median 2299
    #	%32.88 masked total, %37.50 masked real

    hgLoadBed panTro4 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 16464562 elements of size 3

    featureBits -countGaps panTro4 windowmaskerSdust
    #	1495546550 bases of 3309577922 (45.188%) in intersection

    #	eliminate the gaps from the masking
    featureBits panTro4 -not gap -bed=notGap.bed
    #	2902338967 bases of 2902338967 (100.000%) in intersection
    time nice -n +19 featureBits panTro4 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	1088330218 bases of 2902338967 (37.498%) in intersection
    #	real    49m22.981s

    #	reload track to get it clean
    hgLoadBed panTro4 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 16467228  elements of size 4
    time featureBits -countGaps panTro4 windowmaskerSdust
    #	1088330218 bases of 3309577922 (32.884%) in intersection
    #	real    1m45.898s

    #	mask with this clean result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../panTro4.unmasked.2bit stdin \
	    -type=.bed panTro4.cleanWMSdust.2bit
    twoBitToFa panTro4.cleanWMSdust.2bit stdout | faSize stdin \
        > panTro4.cleanWMSdust.faSize.txt
    cat panTro4.cleanWMSdust.faSize.txt
    #	3309577922 bases (407238955 N's 2902338967 real 1814008749 upper
    #	1088330218 lower) in 24129 sequences in 1 files
    #	Total size: mean 137161.8 sd 4455791.7
    #	min 373 (chr1_AACZ03151841_random) max 247518478 (chr2B) median 2299
    #	%32.88 masked total, %37.50 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps panTro4 rmsk windowmaskerSdust
    #	854923909 bases of 3309577922 (25.832%) in intersection

#########################################################################
# create ucscToEnsembl name mapping (DONE - 2012-03-09 - Hiram)
    # this allows the "ensembl" blue bar button to appear
    mkdir /hive/data/genomes/panTro4/bed/ucscToEnsembl
    cd /hive/data/genomes/panTro4/bed/ucscToEnsembl

cut -f1 ../../chrom.sizes | while read C
do
ucName=${C}
ensName=`echo $C | sed -e 's/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^\([GA][LA][CZ0-9]*\)/\1.1/;' | awk '{print $1}'`
echo -e "$ucName\t$ensName"
done > ucscToEnsembl.tab

    cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(18))
);
'_EOF_'

    hgsql panTro4 < ucscToEnsembl.sql
    hgsql panTro4 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'

    # verify the blue bar "ensembl" link is now available

#########################################################################
# construct lift file Ensembl names to UCSC names (DONE - 2012-03-09 - Hiram)
    cd /hive/data/genomes/panTro4/jkStuff

cat ../chrom.sizes | while read L
do
ucName=`echo "${L}" | awk '{print $1}'`
ucSize=`echo "${L}" | awk '{print $2}'`
ensName=`echo $L | sed -e 's/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^\([GA][LA][CZ0-9]*\)/\1.1/;' | awk '{print $1}'`
ensSize=`echo $L | sed -e 's/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^\([GA][LA][CZ0-9]*\)/\1.1/;' | awk '{print $2}'`
echo -e "0\t$ensName\t$ensSize\t$ucName\t$ucSize"
done > ensToUcsc.lift

#########################################################################
#  panTro4 - Chimp - Ensembl Genes version 65  (DONE - 2012-03-09 - hiram)
    ssh hgwdev
    cd /hive/data/genomes/panTro4
    cat << '_EOF_' > panTro4.ensGene.ra
# required db variable
db panTro4
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
# the liftUp also translates Ensembl names to UCSC names
liftUp  /hive/data/genomes/panTro4/jkStuff/ensToUcsc.lift
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=65 panTro4.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/panTro4/bed/ensGene.65
    featureBits panTro4 ensGene
    # 49232086 bases of 2902338967 (1.696%) in intersection

 *** All done!  (through the 'makeDoc' step)
 *** Steps were performed in /hive/data/genomes/panTro4/bed/ensGene.65

############################################################################
# cpgIslands - (DONE - 2011-04-24 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/cpgIslands
    cd /hive/data/genomes/panTro4/bed/cpgIslands
    time doCpgIslands.pl panTro4 > do.log 2>&1
    #   real    54m16.533s

    cat fb.panTro4.cpgIslandExt.txt
    #   19196639 bases of 2902338967 (0.661%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/genscan
    cd /hive/data/genomes/panTro4/bed/genscan
    time doGenscan.pl panTro4 > do.log 2>&1
    # recovering after a power failure
    time ./lastJobs.sh
    #   real    431m5.600s

    #   This is not working.  Need to run these split
    mkdir /hive/data/genomes/panTro4/bed/genscan/splitRun
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun
    gapToLift panTro4 panTro4.nonBridged.lift -bedFile=panTro4.nonBridged.bed

    for C in 1 3 5 6 7 9 X 11 13 15 16 19 2B
do
    echo chr${C}
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun
    grep -w "chr${C}" panTro4.nonBridged.lift | grep -v random \
        | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift
    mkdir chr${C}
    faToTwoBit ../hardMaskedFa/00?/chr${C}.fa chr${C}/chr${C}.2bit
    ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \
        chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun/chr${C}
    mkdir split${C}
    faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_
done
    # verify lift files are sane:
    awk '{print $4}' chr*.nonBridged.lift | sort | uniq -c
    # should see the list of chroms:
    #   240 chr1
    #   75 chr11
    #   51 chr13
    #   88 chr15
    #   259 chr16
    #   144 chr19
    #   59 chr2B
    #   59 chr3
    #   89 chr5
    #   80 chr6
    #   33 chr7
    #   246 chr9
    #   607 chrX

    for C in 1 3 5 6 7 9 X 11 13 15 16 19 2B
do
    echo chr${C}
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun/chr${C}
    echo '#!/bin/sh' > cmdList.sh
    export NL=-1
    ls split${C} | while read F
  do
  NL=`echo $NL | awk '{print $1+1}'`
  if [ "${NL}" -gt 7 ]; then
    NL=0
    echo "echo waiting before $F" >> cmdList.sh
    echo wait >> cmdList.sh
  fi
  echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &"
  done >> cmdList.sh
    echo "echo waiting at end" >> cmdList.sh
    echo "wait" >> cmdList.sh
    chmod +x cmdList.sh
    rm -fr gtf pep subopt
    mkdir gtf pep subopt
done

chr1 cmdList.sh real    36m59.341s
# running the rest:
for C in chr11 chr13 chr15 chr16 chr19 chr2B chr3 chr5 chr6 chr7 chr9 chrX
do
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun/${C}
    time ./cmdList.sh > ../${C}.log 2>&1
done

    # collecting the results:
for C in chr1 chr11 chr13 chr15 chr16 chr19 chr2B chr3 chr5 chr6 chr7 chr9 chrX
do
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun/${C}
    cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \
        | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.gtf
cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \
        | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.subopt.bed
cat pep/${C}_*.pep | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.pep
ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
done
    # after verifying the sizes of the files seem same compared to what
    #   happened in the main run:
for C in chr1 chr11 chr13 chr15 chr16 chr19 chr2B chr3 chr5 chr6 chr7 chr9 chrX
do
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun/${C}
   ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
    ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun
done

    # this is tricky, it is counting on the file existing, empty or otherwise
for C in chr1 chr11 chr13 chr15 chr16 chr19 chr2B chr3 chr5 chr6 chr7 chr9 chrX
do
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun/${C}
    D=`ls ../../gtf/00?/${C}.gtf`
    rm -f "${D}"
    cp -p ${C}.gtf "${D}"
    D=`ls ../../pep/00?/${C}.pep`
    rm -f "${D}"
    cp -p ${C}.pep "${D}"
    D=`ls ../../subopt/00?/${C}.bed`
    rm -f "${D}"
    cp -p ${C}.subopt.bed "${D}"
    cd /hive/data/genomes/panTro4/bed/genscan/splitRun
done

    # Now, we can continue
    time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \
        panTro4 > makeBed.log 2>&1
    #   real    8m55.453s

    cat fb.panTro4.genscan.txt
    #   52641598 bases of 2902338967 (1.814%) in intersection
    cat fb.panTro4.genscanSubopt.txt
    #   54088961 bases of 2902338967 (1.864%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2902338967 / 2897316137 \) \* 1024
    #	( 2902338967 / 2897316137 ) * 1024 = 1025.775222

    # round up to 1050  (panTro3 was 1024)

    cd /hive/data/genomes/panTro4
    time blat panTro4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/panTro4.11.ooc -repMatch=1050
    #   Wrote 29671 overused 11-mers to jkStuff/panTro4.11.ooc
    #   panTro3 had: Wrote 31038 overused 11-mers to jkStuff/panTro3.11.ooc
    #	real     0m59.729s

    # there are non-bridged gaps, create lift file needed for genbank
    hgsql -N -e "select bridge from gap;" panTro4 | sort | uniq -c
    #   2936 no
    #   180174 yes

    cd /hive/data/genomes/panTro4/jkStuff
    gapToLift panTro4 panTro4.nonBridged.lift -bedFile=panTro4.nonBridged.bed
    # this assembly has gaps abutting each other which produces warnings
    #   from this gapToLift program.
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' panTro4.nonBridged.bed | sort -nr | head
    #   44492345 chr6   107033290       151525635       chr6.60

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Felis catus	1081	919	354
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add:
# panTro4 (chimp)
panTro4.serverGenome = /hive/data/genomes/panTro4/panTro4.2bit
panTro4.clusterGenome = /hive/data/genomes/panTro4/panTro4.2bit
panTro4.ooc = /hive/data/genomes/panTro4/jkStuff/panTro4.11.ooc
panTro4.lift = /hive/data/genomes/panTro4/jkStuff/panTro4.nonBridged.lift
panTro4.perChromTables = no
panTro4.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
panTro4.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
panTro4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
panTro4.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
panTro4.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
panTro4.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
panTro4.downloadDir = panTro4
panTro4.refseq.mrna.native.load  = yes
panTro4.refseq.mrna.xeno.load  = yes
panTro4.genbank.mrna.xeno.load = yes
panTro4.genbank.mrna.xeno.loadDesc = yes
panTro4.genbank.est.native.load = yes
# panTro4.upstreamGeneTbl = ensGene
# panTro4.upstreamMaf = multiz12way
# /hive/data/genomes/panTro4/bed/multiz12way/species.list

    # end of section added to etc/genbank.conf
    git commit -m "adding panTro4 chimp" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S panTro4           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial panTro4 &
# real    3m23.828s
    #	  var/build/logs/2012.05.04-17:28:16.panTro4.initalign.log
    #   real    2021m4.989s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad panTro4 &
    #   var/dbload/hgwdev/logs/2012.05.06-22:07:50.dbload.log
    #   real    194m11.702s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add panTro4 to:
        etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added panTro4." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
# LASTZ Rhesus rheMac3 run (DONE 2012-07-12 - Chin)
    mkdir /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19
    cd /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19
    cat << '_EOF_' > DEF
# chimp vs rhesus
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Chimp PanTro4
SEQ1_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ1_LEN=//hive/data/genomes/panTro4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0

# QUERY: Rhesus RheMac3
SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen -S panTro4_rheMac3 # use screen to manage this long-running job
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
    -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -syntenicNet > do.log 2>&1 &
    # real    4327m41.345s

    cat fb.panTro4.chainRheMac3Link.txt
    #   2362219217 bases of 2902338967 (81.390%) in intersection
    cd /hive/data/genomes/panTro4/bed
    ln -s lastzRheMac3.2012-07-19 lastz.rheMac3

    #   running the swap
    mkdir /hive/data/genomes/rheMac3/bed/blastz.panTro4.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.panTro4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    -swap /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19/DEF \
    -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
    -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
    -bigClusterHub=swarm > swap.log 2>&1 &
    #   real    82m21.660s
    cat fb.rheMac3.chainPanTro4Link.txt
    #   2278587464 bases of 2639145830 (86.338%) in intersection
   # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.panTro4.swap lastz.panTro4

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chr3:132959918-132967918" where name="panTro4";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/panTro4/pushQ
    cd /hive/data/genomes/panTro4/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl panTro4 2> stderr.txt | grep -v transMap > panTro4.sql
    #   real    3m46.246s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/panTro4/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/panTro4/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/panTro4/bbi/quality.bw
# WARNING: panTro4 does not have seq
# WARNING: panTro4 does not have extFile
# WARNING: panTro4 does not have estOrientInfo

    scp -p panTro4.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/panTro4.sql"

############################################################################
# lastz mouse Mm10 (DONE - 2012-03-10 - Hiram)
    # the original alignment
    cd /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09
    cat fb.mm10.chainPanTro4Link.txt
    #	919836299 bases of 2652783500 (34.674%) in intersection

    # and this swap
    mkdir /hive/data/genomes/panTro4/bed/blastz.mm10.swap
    cd /hive/data/genomes/panTro4/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    73m23.855s
    cat fb.panTro4.chainMm10Link.txt
    #	926540065 bases of 2902338967 (31.924%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/panTro4/bed
    ln -s blastz.mm10.swap lastz.mm10

##############################################################################
# lastz dog canFam3 (DONE - 2014-05-02 - Hiram)
    # the original alignment on dog:
    cd /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30
    cat fb.canFam3.chainPanTro4Link.txt
    #	1435182107 bases of 2392715236 (59.981%) in intersection

    #	running the swap - DONE - 2013-05-02
    mkdir /hive/data/genomes/panTro4/bed/blastz.canFam3.swap
    cd /hive/data/genomes/panTro4/bed/blastz.canFam3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30/DEF \
	-swap -syntenicNet -noLoadChainSplit \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    113m28.428s
    cat fb.panTro4.chainCanFam3Link.txt
    # 1490574959 bases of 2902338967 (51.358%) in intersection

#########################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-15 - Hiram)
    # this allows the "ensembl" blue bar button to appear
    mkdir /hive/data/genomes/panTro4/bed/ucscToINSDC
    cd /hive/data/genomes/panTro4/bed/ucscToINSDC

    cat << '_EOF_' > translateNames.sh
#!/bin/sh

grep -v "^#" ../../genbank/Primary_Assembly/assembled_chromosomes/chr2acc \
   | sed -e 's/^/chr/'

if [ -s ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz ]; then
  zcat ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | grep -v "^#" | cut -f1 | sort -u \
     | sed -e 's/^\([A-Za-z0-9]*\).\([0-9]*\)/chrUn_\1\t\1.\2/;'
fi

if [ -s ../../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf ]; then
grep -v "^#" \
  ../../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf \
    | sed -e 's/^\([A-Za-z0-9]*\)\t\([A-Za-z0-9]*\).\([0-9]*\)/chr\1_\2_random\t\2.\3/;'

fi

echo -e "chrM\tNC_001643.1"
'_EOF_'
    # << happy emacs

    chmod +x translateNames.sh
    ./translateNames.sh | sort > ucscToINSDC.txt
    join <(sort ../../chrom.sizes) ucscToINSDC.txt \
        | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' > ucscToINSDC.tab

    # maximum size of UCSC chrom name for SQL index
    cut -f1 ucscToINSDC.tab | awk '{print length($0)}' | sort -n | tail -1
    #   25

    sed -e 's/21/25/' $HOME/kent/src/hg/lib/ucscToINSDC.sql \
      | hgLoadSqlTab panTro4 ucscToINSDC stdin ucscToINSDC.tab

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
# lastz swap calJac3/panTro4 (DONE - 2015-02-10 - Hiram)
    # original alignment
    cd /hive/data/genomes/calJac3/bed/lastzPanTro4.2015-02-09
    cat fb.calJac3.chainPanTro4Link.txt
    #	2076330774 bases of 2752505800 (75.434%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/panTro4/bed/blastz.calJac3.swap
    cd /hive/data/genomes/panTro4/bed/blastz.calJac3.swap
    time (doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/calJac3/bed/lastzPanTro4.2015-02-09/DEF \
	-swap -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=ku \
	-bigClusterHub=ku) > swap.log 2>&1
    #	real    114m27.647s

    cat fb.panTro4.chainCalJac3Link.txt
    #	2113187925 bases of 2902338967 (72.810%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` panTro4 calJac3) > rbest.log 2>&1 &
    # real    44m18.149s

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panTro4/bed/geneid
    cd /hive/data/genomes/panTro4/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/P.troglodytes/panTro4/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/P.troglodytes/panTro4/geneid_v1.4/panTro4.geneid.gtf

    ldHgGene -gtf -genePredExt panTro4 geneid panTro4.geneid.gtf
    # Read 52164 transcripts in 299208 lines in 1 files
    #  52164 groups 20829 seqs 1 sources 3 feature types
    # 52164 gene predictions

    featureBits -enrichment panTro4 augustusGene:CDS geneid
# augustusGene:CDS 1.122%, geneid 1.307%, both 0.916%, cover 81.67%,
#    enrich 62.50x

##########################################################################
# LASTZ chimp/panTro4 Gorilla/gorGor4 - (DONE - 2016-06-09 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/lastzGorGor4.2016-06-09
    cd /hive/data/genomes/panTro4/bed/lastzGorGor4.2016-06-09

    printf '# chimp vs. gorilla
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_T=2
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_M=254
BLASTZ_K=4500
BLASTZ_L=4500
BLASTZ_Y=15000
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
#       A     C     G     T
# A    90  -330  -236  -356
# C  -330   100  -318  -236
# G  -236  -318   100  -330
# T  -356  -236  -330    90

# TARGET: chimp panTro4
SEQ1_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ1_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=100
SEQ1_LAP=10000

# QUERY: Gorilla gorGor4
SEQ2_DIR=/hive/data/genomes/gorGor4/gorGor4.2bit
SEQ2_LEN=/hive/data/genomes/gorGor4/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/panTro4/bed/lastzGorGor4.2016-06-09
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    147m1.230s

    cat fb.panTro4.chainGorGor4Link.txt
    # 2695509206 bases of 2902338967 (92.874%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` panTro4 gorGor4) \
      > rbest.log 2>&1 &
    # real    273m52.556s

    # and for the swap:
    mkdir /hive/data/genomes/gorGor4/bed/blastz.panTro4.swap
    cd /hive/data/genomes/gorGor4/bed/blastz.panTro4.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/panTro4/bed/lastzGorGor4.2016-06-09/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    91m7.838s

    cat fb.gorGor4.chainPanTro4Link.txt
    # 2663327780 bases of 2917333143 (91.293%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` gorGor4 panTro4) \
       > rbest.log 2>&1
    # real    288m9.024s

#########################################################################
# LIFTOVER TO panTro5 (DONE - 2016-08-02 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/panTro4/bed/blat.panTro5.2016-08-02
    cd /hive/data/genomes/panTro4/bed/blat.panTro5.2016-08-02
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro4/jkStuff/panTro4.11.ooc \
         panTro4 panTro5
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/panTro4/jkStuff/panTro4.11.ooc \
         panTro4 panTro5) > doLiftOverToGalGal4.log 2>&1
    # real    1170m40.479s

    # see if the liftOver menus function in the browser from panTro4 to panTro5

##########################################################################
# LASTZ chimp/panTro4 Gorilla/rheMac8 - (DONE - 2016-08-02 - Hiram)
    # had incorrect year on this directory name
    mkdir /hive/data/genomes/panTro4/bed/lastzRheMac8.2017-08-02
    cd /hive/data/genomes/panTro4/bed/lastzRheMac8.2017-08-02

    printf '# chimp vs rhesus
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_T=2
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_M=254
BLASTZ_K=4500
BLASTZ_L=4500
BLASTZ_Y=15000
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
#       A     C     G     T
# A    90  -330  -236  -356
# C  -330   100  -318  -236
# G  -236  -318   100  -330
# T  -356  -236  -330    90

# TARGET: Chimp PanTro4
SEQ1_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ1_LEN=//hive/data/genomes/panTro4/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=200

# QUERY: Rhesus RheMac8
SEQ2_DIR=/hive/data/genomes/rheMac8/rheMac8.2bit
SEQ2_LEN=/hive/data/genomes/rheMac8/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=1000
SEQ2_IN_CONTIGS=0

BASE=/hive/data/genomes/panTro4/bed/lastzRheMac8.2017-08-02
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    520m46.562s

    cat fb.panTro4.chainRheMac8Link.txt
    # 2464376195 bases of 2902338967 (84.910%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` panTro4 rheMac8) \
      > rbest.log 2>&1 &
    # real    522m24.688s

    # and for the swap:
    mkdir /hive/data/genomes/rheMac8/bed/blastz.panTro4.swap
    cd /hive/data/genomes/rheMac8/bed/blastz.panTro4.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/panTro4/bed/lastzRheMac8.2017-08-02/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    256m2.899s

    cat fb.rheMac8.chainPanTro4Link.txt
    # 2440747905 bases of 3142093174 (77.679%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` rheMac8 panTro4) \
       > rbest.log 2>&1
    # real    806m12.036s

#########################################################################
# LASTZ chimp/panTro4 Gorilla/gorGor5 - (DONE - 2016-08-02 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/lastzGorGor5.2016-08-02
    cd /hive/data/genomes/panTro4/bed/lastzGorGor5.2016-08-02

    printf '# chimp vs. gorilla
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_T=2
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_M=254
BLASTZ_K=4500
BLASTZ_L=4500
BLASTZ_Y=15000
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
#       A     C     G     T
# A    90  -330  -236  -356
# C  -330   100  -318  -236
# G  -236  -318   100  -330
# T  -356  -236  -330    90

# TARGET: chimp panTro4
SEQ1_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ1_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=100
SEQ1_LAP=10000

# QUERY: Gorilla gorGor5
SEQ2_DIR=/hive/data/genomes/gorGor5/gorGor5.2bit
SEQ2_LEN=/hive/data/genomes/gorGor5/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/panTro4/bed/lastzGorGor5.2016-08-02
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # 489m52.825s

    cat fb.panTro4.chainGorGor5Link.txt
    # 2742736579 bases of 2902338967 (94.501%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` panTro4 gorGor5) \
      > rbest.log 2>&1 &
    # real    139m21.411s

    # and for the swap:
    mkdir /hive/data/genomes/gorGor5/bed/blastz.panTro4.swap
    cd /hive/data/genomes/gorGor5/bed/blastz.panTro4.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/panTro4/bed/lastzGorGor5.2016-08-02/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    90m4.932s

    cat fb.gorGor5.chainPanTro4Link.txt
    # 2690786592 bases of 3080431298 (87.351%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` gorGor5 panTro4) \
       > rbest.log 2>&1
    # real    148m7.108s

#########################################################################
# LASTZ chimp/panTro4 Orangutan/ponAbe2 - (DONE - 2018-02-14 - Hiram)
    mkdir /hive/data/genomes/panTro4/bed/lastzPonAbe2.2018-02-14
    cd /hive/data/genomes/panTro4/bed/lastzPonAbe2.2018-02-14

    printf '# chimp vs orangutan
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
#       A     C     G     T
# A    90  -330  -236  -356
# C  -330   100  -318  -236
# G  -236  -318   100  -330
# T  -356  -236  -330    90

# TARGET: Chimp PanTro4
SEQ1_DIR=/hive/data/genomes/panTro4/panTro4.2bit
SEQ1_LEN=/hive/data/genomes/panTro4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=200

# QUERY: orangutan PonAbe2
SEQ2_DIR=/hive/data/genomes/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/hive/data/genomes/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/panTro4/bed/lastzPonAbe2.2018-02-14
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    141m47.089s

    cat fb.panTro4.chainPonAbe2Link.txt
    # 2598435357 bases of 2902338967 (89.529%) in intersection
    cat  fb.panTro4.chainSynPonAbe2Link.txt
    # 2496865923 bases of 2902338967 (86.029%) in intersection


    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
	panTro4 ponAbe2) > rbest.log 2>&1 &
    # real    318m18.574s

    cat fb.panTro4.chainRBestPonAbe2Link.txt
    # 2481241682 bases of 2902338967 (85.491%) in intersection

    # and for the swap:
    mkdir /hive/data/genomes/ponAbe2/bed/blastz.panTro4.swap
    cd /hive/data/genomes/ponAbe2/bed/blastz.panTro4.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/panTro4/bed/lastzPonAbe2.2018-02-14/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    85m37.535s

    cat fb.ponAbe2.chainPanTro4Link.txt
    # 2713148032 bases of 3093572278 (87.703%) in intersection

    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
	ponAbe2 panTro4) > rbest.log 2>&1
    # real    502m37.175s

    cat fb.ponAbe2.chainRBestPanTro4Link.txt
    # 2485372112 bases of 3093572278 (80.340%) in intersection

#########################################################################
