# for emacs: -*- mode: sh; -*-

# This file describes browser build for the rheMac3
#	Macaca mulatta genome: October 2010

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEHK00
#	50X WGS coverage

#############################################################################
# Fetch sequence from genbank (DONE - 2011-12-22 - Hiram)

    mkdir -p /hive/data/genomes/rheMac3/genbank
    cd /hive/data/genomes/rheMac3/genbank

    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Macaca_mulatta/CR_1.0/*"

    # measure total sequence in this assembly
    faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \
	Primary_Assembly/unplaced_scaffolds/FASTA//un*.fa.gz
# 2969971616 bases (330842350 N's 2639129266 real 2639129266 upper 0 lower) in
# 34102 sequences in 22 files
# Total size: mean 87090.8 sd 3586193.4 min 200 (gi|353154774|gb|JH298929.1|)
# max 229590362 (gi|353351477|gb|CM001253.1|) median 442
# N count: mean 9701.6 sd 410667.9
# U count: mean 77389.3 sd 3195078.2
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real


#############################################################################
# process into UCSC naming scheme (DONE - 2011-12-22 - Hiram)
    mkdir /hive/data/genomes/rheMac3/ucsc
    cd /hive/data/genomes/rheMac3/ucsc

    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;

open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close (FH);

foreach my $acc (keys %accToChr) {
    my $chrN =  $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
    open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            $line =~ s/^$acc/chr${chrN}/;
            print UC $line;
        }
    }
    close (FH);
    close (UC);
    open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
    open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            printf UC ">chr${chrN}\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl
    time ./toUcsc.pl

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #	real    0m7.730s

    # compress these files
    gzip *.fa *.agp

    # verify nothing has changed in the sequence, should be the same as above:
    faSize *.fa.gz
# 2969971616 bases (330842350 N's 2639129266 real 2639129266 upper 0 lower)
#	in 34102 sequences in 22 files
# Total size: mean 87090.8 sd 3586193.4 min 200 (chrUn_JH298929)
#	max 229590362 (chr1) median 442
# N count: mean 9701.6 sd 410667.9
# U count: mean 77389.3 sd 3195078.2
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

#############################################################################
#  Initial database build (DONE - 2011-12-22 - Hiram)
    cd /hive/data/genomes/rheMac3
    cat << '_EOF_' > rheMac3.config.ra
# Config parameters for makeGenomeDb.pl:
db rheMac3
clade vertebrate
genomeCladePriority 15
scientificName Macaca mulatta
commonName Rhesus
assemblyDate Oct. 2010
assemblyLabel Beijing Genomics Institute, Shenzhen (GCA_000230795.1)
assemblyShortLabel BGI CR_1.0
orderKey 35
mitoAcc NC_005943
fastaFiles /hive/data/genomes/rheMac3/ucsc/*.fa.gz
agpFiles /hive/data/genomes/rheMac3/ucsc/*.agp.gz
dbDbSpeciesDir rhesus
taxId 9544
'_EOF_'
    # << happy emacs

    # first verify the sequence and AGP files are OK
    time makeGenomeDb.pl -stop=agp -workhorse=hgwdev rheMac3.config.ra \
	> agp.log 2>&1
    #	real    3m18.816s
    # verify that was OK, look at the agp.log file
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev rheMac3.config.ra \
	> db.log 2>&1
    #	real    20m51.021s
    # verify the end of do.log indicates a successful completion

    # copy the trackDb business to the source tree, check it in and add
    #	to the trackDb/makefile

#############################################################################
# running repeat masker (DONE - 2011-12-29,30 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/repeatMasker
    cd /hive/data/genomes/rheMac3/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk rheMac3 > do.log 2>&1 &
    #	real    682m32.394s
    cat faSize.rmsk.txt
    #	2969988180 bases (330842350 N's 2639145830 real 1359180091 upper
    #	1279965739 lower) in 34103 sequences in 1 files
    #	%43.10 masked total, %48.50 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps rheMac3 rmsk
    #	1290741445 bases of 2969988180 (43.459%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2011-12-29,2012-01-04 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/simpleRepeat
    cd /hive/data/genomes/rheMac3/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	rheMac3 > do.log 2>&1 &
    #	real    113m25.254s

    cat fb.simpleRepeat
    #	61072049 bases of 2792623296 (2.187%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/rheMac3
    twoBitMask rheMac3.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed rheMac3.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa rheMac3.2bit stdout | faSize stdin > faSize.rheMac3.2bit.txt
    cat faSize.rheMac3.2bit.txt
    #	2969988180 bases (330842350 N's 2639145830 real 1358178889 upper
    #	1280966941 lower) in 34103 sequences in 1 files
    #	%43.13 masked total, %48.54 masked real

    rm /gbdb/rheMac3/rheMac3.2bit
    ln -s `pwd`/rheMac3.2bit /gbdb/rheMac3/rheMac3.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-01-03 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/gap
    cd /hive/data/genomes/rheMac3/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../rheMac3.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m31.596s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits rheMac3 -not gap -bed=notGap.bed
    #	real    0m31.496s

    #	2792623296 bases of 2792623296 (100.000%) in intersection
    featureBits rheMac3 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	about 30 minutes
    #	153477466 bases of 2969988180 (5.168%) in intersection

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" rheMac3 | sort -n | tail -1
    #	658
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" rheMac3 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    wc -l other.bed
    #	395439
    featureBits -countGaps rheMac3 other.bed
    #	153477466 bases of 2969988180 (5.168%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad rheMac3 otherGap other.bed
    # verify no overlap:
    time featureBits -countGaps rheMac3 gap other.bed
    #	real    36m1.614s
    0 bases of 2969988180 (0.000%) in intersection
    # verify no errors:
    time gapToLift -minGap=1 rheMac3 nonBridged.lift \
	-bedFile=nonBridged.bed > before.gapToLift.txt 2>&1 &
    #	real    0m29.947s
    #	starting with this many
    hgsql -e "select count(*) from gap;" rheMac3
    #	19148
    hgsql rheMac3 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" rheMac3
    #	414587
    # == 395439 + 19148
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 rheMac3 nonBridged.lift -bedFile=nonBridged.bed
    # see example in danRer7.txt if there are errors
    hgsql -N -e "select bridge from gap;" rheMac3 | sort | uniq -c
    #	4122 no
    #	410465 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-01-03 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/windowMasker
    cd /hive/data/genomes/rheMac3/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev rheMac3 > do.log 2>&1 &
    #	real    221m45.041s

    # Masking statistics
    twoBitToFa rheMac3.wmsk.2bit stdout | faSize stdin
    #	2969988180 bases (330842350 N's 2639145830 real 1746831363 upper
    #	892314467 lower) in 34103 sequences in 1 files
    #	%30.04 masked total, %33.81 masked real

    twoBitToFa rheMac3.wmsk.sdust.2bit stdout | faSize stdin
    #	2969988180 bases (330842350 N's 2639145830 real 1732647656 upper
    #	906498174 lower) in 34103 sequences in 1 files
    #	%30.52 masked total, %34.35 masked real

    hgLoadBed rheMac3 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 15866314 elements of size 3

    featureBits -countGaps rheMac3 windowmaskerSdust
    #	1237340524 bases of 2969988180 (41.661%) in intersection

    #	eliminate the gaps from the masking
    featureBits rheMac3 -not gap -bed=notGap.bed
    #	2969988180 bases of 2969988180 (100.000%) in intersection
    time nice -n +19 featureBits rheMac3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	real    7m34.303s
    #	1059975640 bases of 2792623296 (37.956%) in intersection
    #	reload track to get it clean
    hgLoadBed rheMac3 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 15874168 elements of size 4
    time featureBits -countGaps rheMac3 windowmaskerSdust
    #	real    2m1.415s
    #	1059975640 bases of 2969988180 (35.690%) in intersection

    #	do *not* need to mask the sequence with this clean mask
    #	since RepeatMasker matched well enough
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../rheMac3.unmasked.2bit stdin \
	    -type=.bed rheMac3.cleanWMSdust.2bit
    twoBitToFa rheMac3.cleanWMSdust.2bit stdout | faSize stdin \
        > rheMac3.cleanWMSdust.faSize.txt
    cat rheMac3.cleanWMSdust.faSize.txt
    #	1799143587 bases (97789820 N's 1701353767 real 987278282 upper
    #	714075485 lower) in 6457 sequences in 1 files
    #	%39.69 masked total, %41.97 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps rheMac3 rmsk windowmaskerSdust
    #	280229203 bases of 1511735326 (18.537%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-04 - Hiram)
    cd /hive/data/genomes/rheMac3
    twoBitMask -add bed/windowMasker/rheMac3.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed rheMac3.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa rheMac3.2bit stdout | faSize stdin > faSize.rheMac3.txt
    cat faSize.rheMac3.txt
    #	1799143587 bases (97789820 N's 1701353767 real 987044085 upper
    #	714309682 lower) in 6457 sequences in 1 files
    #	%39.70 masked total, %41.98 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/rheMac3/rheMac3.2bit
    ln -s `pwd`/rheMac3.2bit /gbdb/rheMac3/rheMac3.2bit


############################################################################
# set this as defaultDb (DONE - 2012-01-06 - Chin)
#   and make this the default genome for Rhesus
    hgsql -e 'update defaultDb set name="rheMac3" where name="rheMac2";' \
        hgcentraltest

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE 2012-01-09 - Chin)
# Create kluster run files (
    # numerator is rheMac3 gapless bases "real" as reported by:
    featureBits -noRandom -noHap rheMac3 gap
    # 323705474 bases of 2562964352 (12.630%) in intersection

    # Use -repMatch=650, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \(  2562964352 / 2897316137 \) \* 1024
    # ( 2562964352 / 2897316137 ) * 1024 = 905.829869
    # ==> use -repMatch=900 according to size scaled down from 1024 for human.
    #   and rounded down to nearest 50 (900 in this case)

    cd /hive/data/genomes/rheMac3
    blat rheMac3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/rheMac3.11.ooc -repMatch=900
    #   Wrote 26800 overused 11-mers to jkStuff/rheMac3.11.ooc

    # there are non-bridged gaps.
    hgsql -N -e "select bridge from gap;" rheMac3 | sort | uniq -c
    #    4122 no
    #  410465 yes

    # copy the stuff to kluster
    mkdir /hive/data/staging/data/rheMac3
    cp -p rheMac3.2bit chrom.sizes jkStuff/rheMac3.11.ooc \
        /hive/data/staging/data/rheMac3

    gapToLift -bedFile=jkStuff/nonBridgedGaps.bed rheMac3 \
        jkStuff/rheMac3.nonBridged.lft
    cp -p jkStuff/rheMac3.nonBridged.lft /hive/data/staging/data/rheMac3

    # ask cluster-admin to copy (evry time if any file changed)
    #    /hive/data/staging/data/rheMac3 directory to
    #    /scratch/data/rheMac3 on cluster nodes
    # before porceed to genbank step


#########################################################################
# AUTO UPDATE GENBANK (DONE 2012-01-19 - Chin)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Macaca mulatta  6156    60360   2302
# Macaca mulatta cytomegalovirus  1       0       0

# Gorilla gorilla       546     4       0
# Gorilla gorilla gorilla       1       0       0
    # to decide which "native" mrna or ests you want to specify in
    # genbank.conf
    # this appears that rheMac3 has almost no native est's

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add rheMac3 after rheMac2 and commit to
    # GIT
# Rhesus macaque (macaca mulatta)
rheMac3.serverGenome = /hive/data/genomes/rheMac3/rheMac3.2bit
rheMac3.clusterGenome = /scratch/data/rheMac3/rheMac3.2bit
rheMac3.ooc = /scratch/data/rheMac3/rheMac3.11.ooc
rheMac3.lift = no
rheMac3.perChromTables = no
rheMac3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
rheMac3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
rheMac3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
rheMac3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
rheMac3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
rheMac3.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
rheMac3.downloadDir = rheMac3
rheMac3.genbank.mrna.xeno.load = yes
rheMac3.refseq.mrna.native.load  = yes
rheMac3.refseq.mrna.xeno.load  = yes
rheMac3.upstreamGeneTbl = refGene

    # end of section added to etc/genbank.conf
    git commit -m "adding rheMac3 Rhesus macaque" etc/genbank.conf
    git push
    make etc-update

    # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
    # rheMac genome information, if this is a new species, need to add
    # stuff there

    ssh hgwdev                  # used to do this on "genbank" machine
    screen                      # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial rheMac3 &
    # logFile: var/build/logs/2012.01.11-08:40:51.rheMac3.initalign.log
    #   real    1496m10.582s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad rheMac3 &
    #   logFile: var/dbload/hgwdev/logs/2012.01.12-11:34:20.dbload.log
    #   real    49m3.743s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add rheMac3 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "daily update for rheMac3 replaces daily update for gorGor2" \
        etc/align.dbs etc/hgwdev.dbs

    git push
    make etc-update

#########################################################################
#  BLATSERVERS ENTRY (DONE 2012-01-13 - Chin)
#       After getting a blat server assigned by the Blat Server Gods,
#  rheMac3 (trans) on blat1 port 17818
#  rheMac3 (untrans) on blat1 port 17819
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("rheMac3", "blat1", "17818", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("rheMac3", "blat1", "17819", "0", "1");' \
            hgcentraltest
    #   test it with some sequence

#########################################################################
# reset default position (DONE - 2012-01-13  - Chin)
#       Reset default position to an area contains RHOG and a number of
#       other genes
    hgsql -e \
'update dbDb set defaultPos="chr14:70,034,487-70,048,046" where name="rheMac3";' \
        hgcentraltest
############################################################################
# ctgPos2 track - showing clone sequence locations on chromosomes
#       (DONE 2012-01-18 - Chin)
# NOTE - create rheMac3 entry in all.joiner since this is a new species
    mkdir /hive/data/genomes/rheMac3/bed/ctgPos2
    cd /hive/data/genomes/rheMac3/bed/ctgPos2
    cat << '_EOF_' > agpToCtgPos2.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

if ($argc != 1) {
    printf STDERR "usage: zcat your.files.agp.gz | agpToCtgPos2.pl /dev/stdin > ctgPos2.tab\n";
    exit 255;
}

my $agpFile = shift;

open (FH, "<$agpFile") or die "can not read $agpFile";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my @a = split('\s+', $line);
    next if ($a[4] =~ m/^U$/);
    next if ($a[4] =~ m/^N$/);
    my $chrSize = $a[2]-$a[1]+1;
    my $ctgSize = $a[7]-$a[6]+1;
    die "sizes differ $chrSize != $ctgSize\n$line\n" if ($chrSize != $ctgSize);
    printf "%s\t%d\t%s\t%d\t%d\t%s\n", $a[5], $chrSize, $a[0], $a[1]-1, $a[2], $a[4];
}
close (FH);
'_EOF_'
    # << happy emacs

chmod 775 agpToCtgPos2.pl

export S=../../genbank/Primary_Assembly/assembled_chromosomes
cat  ${S}/chr2acc | grep -v "^#Chromosome" | cut -f2 | while read ACC
do
    C=`grep "${ACC}" ${S}/chr2acc | cut -f1`
    zcat ${S}/AGP/chr${C}.agp.gz \
        | sed -e "s/^${ACC}/chr${C}/"
done | ./agpToCtgPos2.pl /dev/stdin > ctgPos2.tab

    hgLoadSqlTab rheMac3 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
    # add the track ctgPos2 to src/hg/makeDb/trackDb/cow/rheMac3/trackDb.ra
    # at src/makeDb/trackdb do "make update DBS=rheMac3" or/and "make alpha"
    # based on result of
    # hgsql -N -e "select type from ctgPos2;" rheMac3 | sort | uniq
    # W
    # prepare the src/hg/makeDb/trackDb/cow/rheMac3/ctgPos2.html

############################################################################
# running cpgIsland business (DONE -2012-01-18 - Chin)
    mkdir /hive/data/genomes/rheMac3/bed/cpgIsland
    cd /hive/data/genomes/rheMac3/bed/cpgIsland
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    #   needed to fixup this source, adding include to readseq.c:
#include "string.h"
    #   and to cpg_lh.c:
#include "unistd.h"
#include "stdlib.h"
    # and fixing a declaration in cpg_lh.c
    sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
    mv tmp.c cpg_lh.c
    make
    cd ../../
    ln -s hg3rdParty/cpgIslands/cpglh.exe
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../rheMac3.2bit:$C stdout \
        | maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    ssh swarm
    cd /hive/data/genomes/rheMac3/bed/cpgIsland
    mkdir results
    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out exists results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #   the faCount business is to make sure there is enough sequence to
    #   work with in the fasta.  cpglh.exe does not like files with too many
    #   N's - it gets stuck
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | grep ^chr | awk '{print  $2 - $7 }'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs


    chmod 775 runOne
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check ... etc
    para push
    para time
    para problems
    para status
    # then, kick it with para push
    # check it with plb
    # when all are done, para time shows:
    # Checking finished jobs
    # Completed: 34103 of 34103 jobs
    # CPU time in finished jobs:    209s       3.49m     0.06h    0.00d  0.000 y
    # IO & Wait Time:             88578s    1476.30m    24.60h    1.03d  0.003 y
    # Average job time:                   3s       0.04m     0.00h    0.00d
    # Longest finished job:              23s       0.38m     0.01h    0.00d
    # Submission to last job:           146s       2.43m     0.04h    0.00d

    # Transform cpglh output to bed +
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    ssh hgwdev
    cd /hive/data/genomes/rheMac3/bed/cpgIsland
    hgLoadBed rheMac3 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    # Reading cpgIsland.bed
    # Read 25189 elements of size 10 from cpgIsland.bed
    # Sorted
    # Creating table definition for cpgIslandExt
    # Saving bed.tab
    # Loading rheMac3

    #   cleanup
    rm -fr hardMaskedFa

##############################################################################
# LASTZ Mouse mm10 Swap (DONE - 2012-03-08 - Hiram)
    cd /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08
    cat fb.mm10.chainRheMac3Link.txt
    #   900117108 bases of 2652783500 (33.931%) in intersection

    mkdir /hive/data/genomes/rheMac3/bed/blastz.mm10.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    69m5.839s
    cat fb.rheMac3.chainMm10Link.txt
    #   883164992 bases of 2639145830 (33.464%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# LASTZ Human Swap (DONE - 2012-03-15 - Chin)
    cd /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15
    cat fb.hg19.chainRheMac3Link.txt
    #   2400694407 bases of 2897316137 (82.859%) in intersection

    #   running the swap - DONE - 20i12-03-16
    mkdir /hive/data/genomes/rheMac3/bed/blastz.hg19.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/hg19/bed/lastzRheMac3.2012-03-15/DEF \
        -swap \
        -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        > swap.log 2>&1 &
    #    58m38.594s
    cat fb.rheMac3.chainHg19Link.txt
    #   2313806886 bases of 2646704109 (87.422%) in intersection
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.hg19.swap lastz.hg19

##########################################################################
# LASTZ  Orangutan ponAbe2 (DONE - 2012-04-18 - Chin)
    screen # use screen to control this job
    mkdir /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18
    cd /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18

    cat << '_EOF_' > DEF
# Rhesus vs Orangutan
BLASTZ_M=50

# TARGET: Rhesus RheMac3
SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Orangutan ponAbe2
SEQ2_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/scratch/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
      -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
      -chainMinScore=3000 -chainLinearGap=medium \
      -syntenicNet `pwd`/DEF > do.log 2>&1 &
    #   real    9679m37.253

    cat fb.rheMac3.chainPonAbe2Link.txt
    #   2354152230 bases of 2639145830 (89.201%) in intersection

    #   and, for the swap
    mkdir /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap
    cd /hive/data/genomes/ponAbe2/bed/blastz.rheMac3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/rheMac3/bed/blastzPonAbe2.2012-04-18/DEF \
       -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -swap -chainMinScore=3000 -chainLinearGap=medium  > swap.log 2>&1 &
    #   real    289m9.479s
    cat fb.ponAbe2.chainRheMac3Link.txt
    #   2565157739 bases of 3093572278 (82.919%) in intersection
    cd /hive/data/genomes/ponAbe2/bed
    ln -s blastz.rheMac3.swap lastz.rheMac3

##########################################################################
# LASTZ Gorilla gorGor3 (DONE - 2012-04-18 - Chin)
    screen # use screen to control this job
    mkdir /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18
    cd /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18

    cat << '_EOF_' > DEF
# Rhesus vs Gorilla
BLASTZ_M=50

# TARGET: Rhesus RheMac3
SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Gorilla gorGor3
SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit
SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
      -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
      -chainMinScore=3000 -chainLinearGap=medium \
      -syntenicNet `pwd`/DEF > do.log 2>&1 &
    #   real    9670m18.338s

    cat fb.rheMac3.chainGorGor3Link.txt
    #   2279633399 bases of 2639145830 (86.378%) in intersection

    #   and, for the swap
    mkdir /hive/data/genomes/gorGor3/bed/blastz.rheMac3.swap
    cd /hive/data/genomes/gorGor3/bed/blastz.rheMac3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/rheMac3/bed/blastzGorGor3.2012-04-18/DEF \
       -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -swap -chainMinScore=3000 -chainLinearGap=medium  > swap.log 2>&1 &
    #   real    289m9.479s
    cat fb.gorGor3.chainRheMac3Link.txt
    #   2335269818 bases of 2822760080 (82.730%) in intersection
    cd /hive/data/genomes/gorGor3/bed
    ln -s blastz.rheMac3.swap lastz.rheMac3


#########################################################################
# LASTZ Rat rn5 Swap (DONE - 2012-07-10 - Chin)
    cd /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10
    cat fb.rn5.chainRheMac3Link.txt
    # 896256761 bases of 2572853723 (34.835%) in intersection

    # then swap
    mkdir /cluster/data/rheMac3/bed/blastz.rn5.swap
    cd /cluster/data/rheMac3/bed/blastz.rn5.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10/DEF \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium \
        -swap -syntenicNet > swap.log 2>&1 &
    #   real    69m12.752s
    cat fb.rheMac3.chainRn5Link.txt
    # 858153767 bases of 2639145830 (32.516%) in intersection
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.rn5.swap lastz.rn5

############################################################################
# LASTZ Opossum monDom5 (DONE - 2012-07-19 - Chin)
    mkdir /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19
    cd /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    #   number of jobs, 50,000 to something under 100,000

    cat << '_EOF_' > DEF
# opossum vs rhesus
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
# TARGET: Opossum MonDom5
SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ1_LEN=/scratch/data/monDom5/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rhesus RheMac3
SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=1000

BASE=/hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #   establish a screen to control this job
    screen -S monDom5RheMac3
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    2473m43.189s
    cat fb.monDom5.chainRheMac3Link.txt
    #   221659579 bases of 3501660299 (6.330%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/monDom5/bed
    ln -s lastzRheMac3.2012-07-19 lastz.rheMac3

    mkdir /hive/data/genomes/rheMac3/bed/blastz.monDom5.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.monDom5.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/monDom5/bed/lastzRheMac3.2012-07-19/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
    -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    106m24.321s
    cat fb.rheMac3.chainMonDom5Link.txt
    #   214801189 bases of 2639145830 (8.139%) in intersection
    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.monDom5.swap lastz.monDom5

############################################################################
# LASTZ Chimp panTro4 Swap (DONE 2012-07-12 - Chin)
    cd /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19
    cat fb.panTro4.chainRheMac3Link.txt
    #   2362219217 bases of 2902338967 (81.390%) in intersection
    cd /hive/data/genomes/panTro4/bed
    ln -s lastzRheMac3.2012-07-19 lastz.rheMac3

    #   running the swap
    mkdir /hive/data/genomes/rheMac3/bed/blastz.panTro4.swap
    cd /hive/data/genomes/rheMac3/bed/blastz.panTro4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    -swap /hive/data/genomes/panTro4/bed/lastzRheMac3.2012-07-19/DEF \
    -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
    -syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
    -bigClusterHub=swarm > swap.log 2>&1 &
    #   real    82m21.660s
    cat fb.rheMac3.chainPanTro4Link.txt
    #   2278587464 bases of 2639145830 (86.338%) in intersection
   # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/rheMac3/bed
    ln -s blastz.panTro4.swap lastz.panTro4

############################################################################
# LASTZ Marmoset calJac3 (DONE - 2012-07-19 - Chin)
    screen -S rhesus_vs_marmoset # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19
    cd /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19

    #   same kind of parameters as used in human<->marmoset
    cat << '_EOF_' > DEF
# Rhesus vs. marmoset
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
# and place those items here
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2

# TARGET: Rhesus RheMac3
SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=50
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -syntenicNet \
    -chainMinScore=5000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    > do.log 2>&1 &
    #   real    5720m34.726s
    cat fb.rheMac3.chainCalJac3Link.txt
    #   1893656467 bases of 2639145830 (71.753%) in intersection

    cd /hive/data/genomes/rheMac3/bed
    ln -s lastzCalJac3.2012-07-19 lastz.calJac3

    #   running the swap
    mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac3.swap
    cd /hive/data/genomes/calJac3/bed/blastz.rheMac3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/rheMac3/bed/lastzCalJac3.2012-07-19/DEF \
    -swap -syntenicNet \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
    -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    84m31.344s
    cat fb.calJac3.chainRheMac3Link.txt
    #   1931533122 bases of 2752505800 (70.174%) in intersection

    cd /hive/data/genomes/calJac3/bed
    ln -s blastz.rheMac3.swap lastz.rheMac3


***********************************************************************
* TODOain/net
 *** Make sure that goldenPath/ponAbe2/vsRheMac3/README.txt is accurate.
 *** Add {chain,net}RheMac3 tracks to trackDb.ra if necessary.

***********************************************************************

############################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chr3:132959918-132967918" where name="rheMac3";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/rheMac3/pushQ
    cd /hive/data/genomes/rheMac3/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl rheMac3 2> stderr.txt | grep -v transMap > rheMac3.sql
    #   real    4m8.890s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/rheMac3/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/rheMac3/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/rheMac3/bbi/quality.bw
# WARNING: rheMac3 does not have seq
# WARNING: rheMac3 does not have extFile
# WARNING: rheMac3 does not have estOrientInfo

    # eliminated the chainMm9 and chainRn4 chainNet tracks from the .sql
    # and the rheMac3 chainNet tracks on mm9 and rn4
    # the mm10 and rn5 tracks are the new ones

    scp -p rheMac3.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/rheMac3.sql"

############################################################################
# LIFTOVER TO rheMac2 (DONE - 2012-08-28 - Hiram )
    mkdir /hive/data/genomes/rheMac3/bed/blat.rheMac2.2012-08-28
    cd /hive/data/genomes/rheMac3/bed/blat.rheMac2.2012-08-28
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug \
        -ooc /hive/data/genomes/rheMac3/jkStuff/rheMac3.11.ooc rheMac3 rheMac2
    # Real run:
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -ooc /hive/data/genomes/rheMac3/jkStuff/rheMac3.11.ooc \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 rheMac3 rheMac2 > do.log 2>&1
    #   real    99m53.205s


#########################################################################
# GENSCAN GENE PREDICTIONS (DONE 2012-10-15 - Chin)

    mkdir /hive/data/genomes/rheMac3/bed/genscan
    cd /hive/data/genomes/rheMac3/bed/genscan
    time doGenscan.pl rheMac3 > do.log 2>&1
    #   real    284m5.600s

    #   This is not working.  Need to run these split
    mkdir /hive/data/genomes/rheMac3/bed/genscan/splitRun
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun
    cp ../../../jkStuff/rheMac3.nonBridged.lift .
    gapToLift rheMac3 rheMac3.nonBridged.lift -bedFile=rheMac3.nonBridged.bed

    for C in 3 8 13 14 19 20 X
do
    echo chr${C}
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun
    grep -w "chr${C}" rheMac3.nonBridged.lift | grep -v random \
        | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift
    mkdir chr${C}
    faToTwoBit ../hardMaskedFa/00?/chr${C}.fa chr${C}/chr${C}.2bit
    ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \
        chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr${C}
    mkdir split${C}
    faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_
done
    # verify lift files are sane:
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun
    awk '{print $4}' chr*.nonBridged.lift | sort | uniq -c
    # should see the list of chroms:
    #    246 chr13
    #    127 chr14
    #    139 chr19
    #     98 chr20
    #    317 chr3
    #    169 chr8
    #    330 chrX

   for C in 3 8 13 14 19 20 X
do
    echo chr${C}
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr${C}
    echo '#!/bin/sh' > cmdList.sh
    export NL=-1
    ls split${C} | while read F
  do
  NL=`echo $NL | awk '{print $1+1}'`
  if [ "${NL}" -gt 7 ]; then
    NL=0
    echo "echo waiting before $F" >> cmdList.sh
    echo wait >> cmdList.sh
  fi
  echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &"
  done >> cmdList.sh
    echo "echo waiting at end" >> cmdList.sh
    echo "wait" >> cmdList.sh
    chmod +x cmdList.sh
    rm -fr gtf pep subopt
    mkdir gtf pep subopt
done

    # run chr20 first to test script
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr20
    time ./cmdList.sh > ../chr20.log 2>&1
    # real    6m59.465s
# running the rest: 3 8 13 14 19 X
for C in chr3  chr8 chr13 chr14 chr19 chrX
do
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C}
    time ./cmdList.sh > ../${C}.log 2>&1
done
    # chr3 real    9m41.925s
    # chr8 real    18m50.934s
    # chr13 real    11m1.742s
    # chr14 real    16m32.590s
    # chr19 real    5m8.588s
    # chrX real    8m48.507s

    # collecting the results:
    # run chr20 first:
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/chr20
    cat gtf/chr20_*.gtf | liftUp -type=.gtf stdout ../chr20.nonBridged.lift error stdin \
        | sed -e "s/chr20_0\([0-4]\)\./chr20.\1/g" > chr20.gtf
cat subopt/chr20_*.bed | liftUp -type=.bed stdout ../chr20.nonBridged.lift error stdin \
        | sed -e "s/chr20_0\([0-4]\)\./chr20.\1/g" > chr20.subopt.bed
cat pep/chr20_*.pep | sed -e "s/chr20_0\([0-4]\)\./chr20.\1/g" > chr20.pep
ls -l ../../gtf/00?/chr20.gtf ../../pep/00?/chr20.pep ../../subopt/00?/chr20.bed
ls -l chr20.gtf chr20.pep chr20.subopt.bed

    # run the rest
for C in chr3  chr8 chr13 chr14 chr19 chrX
do
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C}
    cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \
        | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.gtf
cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \
        | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.subopt.bed
cat pep/${C}_*.pep | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.pep
ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
done
    # after verifying the sizes of the files seem same compared to what
    #   happened in the main run:
for C in chr3  chr8 chr13 chr14 chr19 chr20 chrX
do
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C}
   ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
    ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun
done
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 14:05 ../../gtf/000/chr3.gtf
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 14:05 ../../pep/000/chr3.pep
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 14:05 ../../subopt/000/chr3.bed
-rw-rw-r-- 1 chinhli genecats 2622960 Oct 16 15:51 chr3.gtf
-rw-rw-r-- 1 chinhli genecats 1108696 Oct 16 15:51 chr3.pep
-rw-rw-r-- 1 chinhli genecats 1419193 Oct 16 15:51 chr3.subopt.bed
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../gtf/000/chr8.gtf
....
....
-rw-rw-r-- 1 chinhli genecats  619333 Oct 16 15:49 chr20.subopt.bed
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../gtf/000/chrX.gtf
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../pep/000/chrX.pep
-rw-rw-r-- 1 chinhli genecats 0 Oct 15 16:46 ../../subopt/000/chrX.bed
-rw-rw-r-- 1 chinhli genecats 1347712 Oct 16 15:52 chrX.gtf
-rw-rw-r-- 1 chinhli genecats  641978 Oct 16 15:52 chrX.pep
-rw-rw-r-- 1 chinhli genecats  784120 Oct 16 15:52 chrX.subopt.bed


    # this is tricky, it is counting on the file existing, empty or
    # otherwise
for C in chr3  chr8 chr13 chr14 chr19 chr20 chrX
do
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C}
    D=`ls ../../gtf/00?/${C}.gtf`
    rm -f "${D}"
    cp -p ${C}.gtf "${D}"
    D=`ls ../../pep/00?/${C}.pep`
    rm -f "${D}"
    cp -p ${C}.pep "${D}"
    D=`ls ../../subopt/00?/${C}.bed`
    rm -f "${D}"
    cp -p ${C}.subopt.bed "${D}"
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun
done

    # verify size again:
for C in chr3  chr8 chr13 chr14 chr19 chr20 chrX
do
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun/${C}
   ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
    ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
    cd /hive/data/genomes/rheMac3/bed/genscan/splitRun
done
-rw-rw-r-- 1 chinhli genecats 2622960 Oct 16 15:51 ../../gtf/000/chr3.gtf
-rw-rw-r-- 1 chinhli genecats 1108696 Oct 16 15:51 ../../pep/000/chr3.pep
-rw-rw-r-- 1 chinhli genecats 1419193 Oct 16 15:51 ../../subopt/000/chr3.bed
-rw-rw-r-- 1 chinhli genecats 2622960 Oct 16 15:51 chr3.gtf
-rw-rw-r-- 1 chinhli genecats 1108696 Oct 16 15:51 chr3.pep
-rw-rw-r-- 1 chinhli genecats 1419193 Oct 16 15:51 chr3.subopt.bed
-rw-rw-r-- 1 chinhli genecats  973693 Oct 16 15:51 chr8.subopt.bed
....
...
...
-rw-rw-r-- 1 chinhli genecats  619333 Oct 16 15:49 chr20.subopt.bed
-rw-rw-r-- 1 chinhli genecats 1347712 Oct 16 15:52 ../../gtf/000/chrX.gtf
-rw-rw-r-- 1 chinhli genecats  641978 Oct 16 15:52 ../../pep/000/chrX.pep
-rw-rw-r-- 1 chinhli genecats  784120 Oct 16 15:52 ../../subopt/000/chrX.bed
-rw-rw-r-- 1 chinhli genecats 1347712 Oct 16 15:52 chrX.gtf
-rw-rw-r-- 1 chinhli genecats  641978 Oct 16 15:52 chrX.pep
-rw-rw-r-- 1 chinhli genecats  784120 Oct 16 15:52 chrX.subopt.bed

    # genes found have been copied over to ../../gtf. pep and s
    # Now, we can continue
    cd /hive/data/genomes/rheMac3/bed/genscan
    time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \
        rheMac3 > makeBed.log 2>&1
    #   real    7m58.322s

    cat fb.rheMac3.genscan.txt
    #   50606442 bases of 2639145830 (1.918%) in intersection
    cat fb.rheMac3.genscanSubopt.txt
    #   51718033 bases of 2639145830 (1.960%) in intersection


#############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/ucscToINSDC
    cd /hive/data/genomes/rheMac3/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_005943.1
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab rheMac3 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords rheMac3 ucscToINSDC
    featureBits -countGaps rheMac3 ucscToINSDC
    # 2969988180 bases of 2969988180 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/rheMac3/bed/geneid
    cd /hive/data/genomes/rheMac3/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/M.mulatta/rheMac3/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/M.mulatta/rheMac3/geneid_v1.4/rheMac3.geneid.gtf

    ldHgGene -gtf -genePredExt rheMac3 geneid rheMac3.geneid.gtf
    # Read 57842 transcripts in 303973 lines in 1 files
    #  57842 groups 28416 seqs 1 sources 3 feature types
    # 57842 gene predictions

    featureBits -enrichment rheMac3 augustusGene:CDS geneid
# augustusGene:CDS 1.177%, geneid 1.378%, both 0.962%, cover 81.74%,
#    enrich 59.31x

##########################################################################
# LASTZ rhesus/rheMac3 dog/canFam3 - (DONE - 2015-09-25 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/lastzCanFam3.2015-09-25
    cd /hive/data/genomes/rheMac3/bed/lastzCanFam3.2015-09-25

    printf "%s\n" \
'# rhesus vs dog
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz

# TARGET: Rhesus RheMac3
SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Dog CanFam3
SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit
SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/hive/data/genomes/rheMac3/bed/lastzCanFam3.2015-09-25
TMPDIR=/dev/shm' > DEF

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -syntenicNet -workhorse=hgwdev -smallClusterHub=ku \
        -bigClusterHub=ku -fileServer=hgwdev \
        -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1
    # real    849m2.453s

    cat fb.rheMac3.chainCanFam3Link.txt
    # 1421649657 bases of 2639145830 (53.868%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` rheMac3 canFam3) > rbest.log 2>&1 &
    # lost end of rbest.log file due to hive difficulties

    # and for the swap:
    mkdir /hive/data/genomes/canFam3/bed/blastz.rheMac3.swap
    cd /hive/data/genomes/canFam3/bed/blastz.rheMac3.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/rheMac3/bed/lastzCanFam3.2015-09-25/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    # continuing after hive difficulties and finish the net step manually
    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/rheMac3/bed/lastzCanFam3.2015-09-25/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -continue=load -syntenicNet) > load.log 2>&1
    #  real    48m13.671s

    cat fb.canFam3.chainRheMac3Link.txt
    # 1415605354 bases of 2392715236 (59.163%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` canFam3 rheMac3) \
        > rbest.log 2>&1
XXX - running - Tue Sep 29 12:22:16 PDT 2015
    # real    64m14.945s


#########################################################################
# LASTZ rhesus/rheMac3 vs. chicken/galGal4 - (DONE - 2015-09-25 - Hiram)
    mkdir /hive/data/genomes/rheMac3/bed/lastzGalGal4.2015-09-25
    cd /hive/data/genomes/rheMac3/bed/lastzGalGal4.2015-09-25

    printf "%s\n" \
'# rhesus vs chicken
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_O=400
BLASTZ_E=30
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
#      A    C    G    T
#     91  -90  -25 -100
#    -90  100 -100  -25
#    -25 -100  100  -90
#   -100  -25  -90  91

# TARGET: rhesus rheMac3
SEQ1_DIR=/scratch/data/rheMac3/rheMac3.2bit
SEQ1_LEN=/scratch/data/rheMac3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: chicken galGal4
SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit
SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/hive/data/genomes/rheMac3/bed/lastzGalGal4.2015-09-25
TMPDIR=/dev/shm' > DEF

    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > do.log 2>&1
    # real    214m36.652s

    cat fb.rheMac3.chainGalGal4Link.txt
    # 114693597 bases of 2639145830 (4.346%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` rheMac3 galGal4) > rbest.log 2>&1 &
    # real    15m38.646s

    # and for the swap:
    mkdir /hive/data/genomes/galGal4/bed/blastz.rheMac3.swap
    cd /hive/data/genomes/galGal4/bed/blastz.rheMac3.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/rheMac3/bed/lastzGalGal4.2015-09-25/DEF \
        -swap -chainMinScore=5000 -chainLinearGap=loose \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    11m39.434s

    cat fb.galGal4.chainRheMac3Link.txt
    # 102687176 bases of 1032854810 (9.942%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` galGal4 rheMac3) > rbest.log 2>&1
    # continuing after hive difficulties and finishing the recipBest step
    # manually
    time (doRecipBest.pl -workhorse=hgwdev -continue=download  -buildDir=`pwd` \
      galGal4 rheMac3) > download.log 2>&1
    # real    0m4.349s

#########################################################################
# LIFTOVER TO rheMac8 (DONE - 2016-02-05 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/rheMac3/bed/blat.rheMac8.2016-02-05
    cd /hive/data/genomes/rheMac3/bed/blat.rheMac8.2016-02-05
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/rheMac3/jkStuff/rheMac3.11.ooc \
         rheMac3 rheMac8
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/hive/data/genomes/rheMac3/jkStuff/rheMac3.11.ooc \
         rheMac3 rheMac8) > doLiftOverToRheMac3.log 2>&1
    # real    816m59.070s

    # see if the liftOver menus function in the browser from rheMac2 to Mac8

#########################################################################
