# for emacs: -*- mode: sh; -*-

#	DATE:   08-Feb-2013
#	ORGANISM:       Taeniopygia guttata
#	TAXID:  59729
#	ASSEMBLY LONG NAME:     Taeniopygia_guttata-3.2.4
#	ASSEMBLY SHORT NAME:    Taeniopygia_guttata-3.2.4
#	ASSEMBLY SUBMITTER:     Washington University Genome Sequencing Center
#	ASSEMBLY TYPE:  Haploid
#	NUMBER OF ASSEMBLY-UNITS:       1
#	ASSEMBLY ACCESSION:     GCA_000151805.2
#	FTP-RELEASE DATE: 12-Feb-2013


#       http://www.ncbi.nlm.nih.gov/genome/367
#       http://www.ncbi.nlm.nih.gov/genome/assembly/524908
#       http://www.ncbi.nlm.nih.gov/bioproject/17289

#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABQF01
#       Genome Coverage : 5.5x  PCAP v. 2008

#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=59729

# rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Taeniopygia_guttata/Taeniopygia_guttata-3.2.4/

##########################################################################
# Download sequence (DONE - 2013-06-05 - Hiram)
    # fetch the GenBank assembly release into the genbank/ work directory
    mkdir -p /hive/data/genomes/taeGut2/genbank
    cd /hive/data/genomes/taeGut2/genbank

    # the trailing ./ destination is required: rsync given only a source
    # merely lists the remote files instead of copying them
    time rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Taeniopygia_guttata/Taeniopygia_guttata-3.2.4/ \
        ./
    #   real    22m54.139s

    # verify the size of the sequence here:
    faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \
Primary_Assembly/unlocalized_scaffolds/FASTA/chr*.unlocalized.scaf.fa.gz \
Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz

# 1232118738 bases (9270900 N's 1222847838 real 1222847838 upper 0 lower)
#    in 37095 sequences in 69 files
# Total size: mean 33215.2 sd 1455020.7
#   min 783 (gi|197861094|gb|ABQF01053133.1|)
#   max 156412533 (gi|209446643|gb|CM000518.1|) median 3541

##########################################################################
# fixup names for UCSC standards (DONE - 2013-06-05 - Hiram)
    mkdir /hive/data/genomes/taeGut2/ucsc
    cd /hive/data/genomes/taeGut2/ucsc

    ########################  Assembled Chromosomes
    cat << '_EOF_' > ucscCompositeAgp.pl
#!/bin/env perl

# ucscCompositeAgp.pl - rename the NCBI assembled chromosomes to UCSC names.
#
# Reads the chromosome-name/accession pairs from chr2acc, then for every
# chromosome N writes into the current directory:
#   chrN.agp - chrN.comp.agp.gz with the accession in column one replaced
#              by chrN (comment lines are copied through unchanged)
#   chrN.fa  - chrN.fa.gz with every header line replaced by >chrN
# One "accession N" line is printed to stdout per chromosome.

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/assembled_chromosomes";

# Open a gzipped file for reading through zcat.  The list form of open
# runs zcat directly, so the file name is never parsed by a shell.
sub zcatOpen {
    my ($file) = @_;
    open (my $fh, "-|", "zcat", $file) or die "can not read $file: $!";
    return $fh;
}

# Close a zcat pipe.  A missing or corrupt .gz file is only visible here,
# as a non-zero exit status from zcat.
sub zcatClose {
    my ($fh, $file) = @_;
    close ($fh) or die "zcat failed on $file (wait status $?)";
    return;
}

# accession => chromosome name (without the chr prefix)
my %accToChr;

open (my $chr2acc, "<", "$srcDir/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc: $!";
while (my $line = <$chr2acc>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    # blank or incomplete lines carry no mapping
    next if (!defined($chrN) || !defined($acc));
    $accToChr{$acc} = $chrN;
}
close ($chr2acc);

# sorted so the progress output is in the same order on every run
foreach my $acc (sort keys %accToChr) {
    my $chrN = $accToChr{$acc};
    print "$acc $chrN\n";

    my $agpIn = "$srcDir/AGP/chr${chrN}.comp.agp.gz";
    my $agpOut = "chr${chrN}.agp";
    my $in = zcatOpen($agpIn);
    open (my $out, ">", $agpOut) or die "can not write to $agpOut: $!";
    while (my $line = <$in>) {
        if ($line !~ m/^#/) {
            # \Q..\E: the accession contains a '.', which must match
            # literally; (?=\t) requires it to be the whole first column
            $line =~ s/^\Q$acc\E(?=\t)/chr${chrN}/;
        }
        print $out $line;
    }
    zcatClose($in, $agpIn);
    close ($out) or die "error writing $agpOut: $!";

    my $faIn = "$srcDir/FASTA/chr${chrN}.fa.gz";
    my $faOut = "chr${chrN}.fa";
    $in = zcatOpen($faIn);
    open ($out, ">", $faOut) or die "can not write to $faOut: $!";
    while (my $line = <$in>) {
        if ($line =~ m/^>/) {
            # print, not printf: the name is data, not a format string
            print $out ">chr${chrN}\n";
        } else {
            print $out $line;
        }
    }
    zcatClose($in, $faIn);
    close ($out) or die "error writing $faOut: $!";
}
'_EOF_'

    # << happy emacs
    chmod +x ucscCompositeAgp.pl
    time ./ucscCompositeAgp.pl
    # real    0m16.041s

    gzip *.fa *.agp &

    ########################  Unplaced scaffolds
    # verify we don't have any .acc numbers different from .1
    zcat \
    ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| cut -f1 | grep -v '^#' \
	| sed -e 's/[A-Z0-9]*//i' | sort | uniq -c
    #   81153 .1

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

# unplaced.pl - rename the NCBI unplaced scaffolds to UCSC chrUn_ names.
#
# Writes into the current directory:
#   chrUn.agp - unplaced.scaf.agp.gz with the .1 version removed from the
#               scaffold name in column one and chrUn_ prefixed to it
#               (comment lines are copied through unchanged)
#   chrUn.fa  - unplaced.scaf.fa.gz with each header reduced to
#               >chrUn_<accession without version>

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# list-form pipe open: zcat is run directly, no shell parses the file name
open (my $agpIn, "-|", "zcat", $agpFile) or die "can not read $agpFile: $!";
open (my $agpOut, ">", "chrUn.agp") or die "can not write to chrUn.agp: $!";
while (my $line = <$agpIn>) {
    if ($line =~ m/^#/) {
        print $agpOut $line;
    } else {
        # remove the version from column one only; an unanchored s/\.1//
        # could edit a later column when column one carries no version
        $line =~ s/^([^\t]+)\.1(?=\t)/$1/;
        print $agpOut "chrUn_$line";
    }
}
# a missing or corrupt .gz file shows up as a failed close on the pipe
close ($agpIn) or die "zcat failed on $agpFile (wait status $?)";
close ($agpOut) or die "error writing chrUn.agp: $!";

open (my $faIn, "-|", "zcat", $fastaFile) or die "can not read $fastaFile: $!";
open (my $faOut, ">", "chrUn.fa") or die "can not write to chrUn.fa: $!";
while (my $line = <$faIn>) {
    if ($line =~ m/^>/) {
        chomp $line;
        # keep what follows the last "gb|", then cut from the ".1" that
        # is followed by one character and a space to the end of the line
        $line =~ s/.*gb\|//;
        $line =~ s/\.1. .*//;
        # print, not printf: the name is data, not a format string
        print $faOut ">chrUn_$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat failed on $fastaFile (wait status $?)";
close ($faOut) or die "error writing chrUn.fa: $!";
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    # real    0m5.659s

    # compress in the background, then wait for all background gzip jobs
    # to finish; the check below reads chrUn.agp.gz and must not run
    # against a partially written file
    gzip chrUn.agp chrUn.fa &
    wait

    # make sure none of the names got to be over 31 characters long:
    zcat chrUn.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($1)}' | sort -rn | head -1
    # 18

    # Unlocalized scaffolds

    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

# unlocalized.pl - rename the NCBI unlocalized scaffolds to UCSC
# chrN_<accession>_random names.
#
# Reads the chromosome/scaffold pairs from unlocalized.chr2scaf, then for
# every chromosome N listed there writes into the current directory:
#   chrN_random.agp - chrN.unlocalized.scaf.agp.gz with column one renamed
#                     (comment lines are copied through unchanged)
#   chrN_random.fa  - chrN.unlocalized.scaf.fa.gz with renamed headers
# Dies if a scaffold turns up in the file of a chromosome other than the
# one chr2scaf assigns it to.  Prints "chrN" to stdout per chromosome.

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/unlocalized_scaffolds";

# accession (version removed) => chromosome name
my %accToChr;
# chromosome names that have unlocalized scaffolds
my %chrNames;

# Return the UCSC name for scaffold $acc on chromosome $chrN, after
# checking the pair against the chr2scaf mapping.  A trailing .1 version
# on $acc is removed first.
sub ucscRandomName {
    my ($chrN, $acc) = @_;
    $acc =~ s/\.1$//;
    # exists test first: an unknown accession must not reach the 'ne'
    # comparison as an undefined value
    die "ERROR: chrN $chrN not correct for $acc"
        if (!exists($accToChr{$acc}) || $accToChr{$acc} ne $chrN);
    return "chr${chrN}_${acc}_random";
}

open (my $chr2scaf, "<", "$srcDir/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf: $!";
while (my $line = <$chr2scaf>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    # blank or incomplete lines carry no mapping
    next if (!defined($chrN) || !defined($acc));
    $acc =~ s/\.1$//;
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close ($chr2scaf);

# sorted so the progress output is in the same order on every run
foreach my $chrN (sort keys %chrNames) {
    my $agpFile =  "$srcDir/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "$srcDir/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
    my $agpOut = "chr${chrN}_random.agp";
    my $faOut = "chr${chrN}_random.fa";

    # list-form pipe open: zcat is run directly, no shell is involved
    open (my $in, "-|", "zcat", $agpFile) or die "can not read $agpFile: $!";
    open (my $out, ">", $agpOut) or die "can not write to $agpOut: $!";
    while (my $line = <$in>) {
        if ($line =~ m/^#/) {
            print $out $line;
        } else {
            chomp $line;
            my ($acc, @rest) = split('\t', $line);
            die "ERROR: empty line in $agpFile" if (!defined($acc));
            print $out join("\t", ucscRandomName($chrN, $acc), @rest), "\n";
        }
    }
    # a missing or corrupt .gz file shows up as a failed close on the pipe
    close ($in) or die "zcat failed on $agpFile (wait status $?)";
    close ($out) or die "error writing $agpOut: $!";

    printf "chr%s\n", $chrN;

    open ($in, "-|", "zcat", $fastaFile) or die "can not read $fastaFile: $!";
    open ($out, ">", $faOut) or die "can not write to $faOut: $!";
    while (my $line = <$in>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            # keep what follows the last "gb|", then cut from the ".1"
            # that is followed by one character and a space to the end
            $acc =~ s/.*gb\|//;
            $acc =~ s/\.1. .*//;
            # print, not printf: the name is data, not a format string
            print $out ">", ucscRandomName($chrN, $acc), "\n";
        } else {
            print $out $line;
        }
    }
    close ($in) or die "zcat failed on $fastaFile (wait status $?)";
    close ($out) or die "error writing $faOut: $!";
}
'_EOF_'
    # << happy emacs

    chmod +x unlocalized.pl
    time ./unlocalized.pl
    # real    0m2.140s

    gzip *.fa *.agp

    # verify nothing lost from original:
    time faSize *.fa.gz
    # real    0m32.541s

# 1232118738 bases (9270900 N's 1222847838 real 1222847838 upper 0 lower)
#    in 37095 sequences in 69 files
# Total size: mean 33215.2 sd 1455020.7 min 783 (chr5_ABQF01053133_random)
#    max 156412533 (chr2) median 3541


    # make sure none of the names got to be over 31 characters long:
    zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head

##########################################################################
# Initial makeGenomeDb.pl (DONE - 2013-06-05 - Hiram)
    cd /hive/data/genomes/taeGut2
    cat << '_EOF_' > taeGut2.config.ra
# Config parameters for makeGenomeDb.pl:
db taeGut2
clade vertebrate
genomeCladePriority 68
scientificName Taeniopygia guttata
commonName Zebra finch
assemblyDate Feb. 2013
assemblyLabel Washington University School of Medicine
assemblyShortLabel WashU taeGut324
orderKey 436
mitoAcc NC_007897
fastaFiles /cluster/data/taeGut2/ucsc/chr*.fa.gz
agpFiles /cluster/data/taeGut2/ucsc/chr*.agp.gz
# qualFiles none
dbDbSpeciesDir zebraFinch
photoCreditURL http://www.physci.ucla.edu/html/arnold.htm
photoCreditName UCLA Department of Physiological Science
ncbiGenomeId 367
ncbiAssemblyId 524908
ncbiAssemblyName Taeniopygia_guttata-3.2.4
ncbiBioProject 17289
genBankAccessionID GCA_000151805.2
taxId   59729
'_EOF_'
    # << happy emacs

    # verify sequence and agp are OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp taeGut2.config.ra > agp.log 2>&1
    # verify no problem:
    tail -1 agp.log
    #  *** All done!  (through the 'agp' step)

    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -continue=db taeGut2.config.ra > db.log 2>&1
    # real    10m29.132s
    #
    #	add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/taeGut2.unmasked.2bit /gbdb/taeGut2/taeGut2.2bit
    #	browser should function now in sandbox
    #   trackDb files here:
    #   /hive/data/genomes/taeGut2/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebraFinch/taeGut2/
    #   into source tree
    #   now browser should function on hgwdev

#########################################################################
# running repeat masker (DONE - 2013-06-05 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/repeatMasker
    cd /hive/data/genomes/taeGut2/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek taeGut2 > do.log 2>&1 &
    # real    220m20.455s

    cat faSize.rmsk.txt
    # 1232135591 bases (9270900 N's 1222864691 real 1125277738 upper
    #   97586953 lower) in 37096 sequences in 1 files
    # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random)
    #   max 156412533 (chr2) median 3541
    # %7.92 masked total, %7.98 masked real

    egrep -i "versi|relea" do.log
    #    January 10 2013 (open-4-0-0) version of RepeatMasker
    #  CC   RELEASE 20120418;

    time featureBits -countGaps taeGut2 rmsk
    # 97640353 bases of 1232135591 (7.924%) in intersection
    # real    0m16.657s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2013-06-05 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/simpleRepeat
    cd /hive/data/genomes/taeGut2/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	taeGut2 > do.log 2>&1
    # about 14m30s

    cat fb.simpleRepeat
    #   25800814 bases of 1222864691 (2.110%) in intersection

    # considering rmsk %8 vs. WM %20 and this is a small genome,
    # use the rmsk result in order to have the classifications from
    # that available
    # add to rmsk after it is done:
    cd /hive/data/genomes/taeGut2
    twoBitMask taeGut2.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed taeGut2.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa taeGut2.2bit stdout | faSize stdin > faSize.taeGut2.2bit.txt
    cat faSize.taeGut2.2bit.txt

    # 1232135591 bases (9270900 N's 1222864691 real 1123697742 upper
    #    99166949 lower) in 37096 sequences in 1 files
    # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random)
    #    max 156412533 (chr2) median 3541
    # %8.05 masked total, %8.11 masked real

    rm /gbdb/taeGut2/taeGut2.2bit
    ln -s `pwd`/taeGut2.2bit /gbdb/taeGut2/taeGut2.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2013-06-05 - Hiram)

    mkdir /hive/data/genomes/taeGut2/bed/gap
    cd /hive/data/genomes/taeGut2/bed/gap

    time nice findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../taeGut2.unmasked.2bit > findMotif.txt 2>&1
    #   real	1m2.760s

    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits taeGut2 -not gap -bed=notGap.bed
    # 1222864691 bases of 1222864691 (100.000%) in intersection
    # real    0m9.762s

    time featureBits taeGut2 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 0 bases of 1222864691 (0.000%) in intersection
    #  real    19m53.371s

    # are there non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" taeGut2 | sort | uniq -c
    #     326 no
    #   87384 yes

##########################################################################
## WINDOWMASKER (DONE- 2013-06-05 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/windowMasker
    cd /hive/data/genomes/taeGut2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev taeGut2 > do.log 2>&1 &
    # real    66m52.863s

    # Masking statistics (display the file; the bare file name alone
    # would be run as a command)
    cat faSize.taeGut2.cleanWMSdust.txt
# 1232135591 bases (9270900 N's 1222864691 real 973147255 upper
#     249717436 lower) in 37096 sequences in 1 files
# Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random)
#     max 156412533 (chr2) median 3541
# %20.27 masked total, %20.42 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps taeGut2 rmsk windowmaskerSdust
    #   54554911 bases of 1232135591 (4.428%) in intersection

########################################################################
# cpgIslands - (DONE - 2013-06-06 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/cpgIslands
    cd /hive/data/genomes/taeGut2/bed/cpgIslands
    time doCpgIslands.pl taeGut2 > do.log 2>&1
    #  real    17m40.265s

    cat fb.taeGut2.cpgIslandExt.txt
    #   16473980 bases of 1222864691 (1.347%) in intersection

#############################################################################
# CPG Islands Unmasked track

mkdir /hive/data/genomes/taeGut2/bed/cpgIslandsUnmasked
cd /hive/data/genomes/taeGut2/bed/cpgIslandsUnmasked
time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
   -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
    -workhorse=hgwdev \
    -maskedSeq=/hive/data/genomes/taeGut2/taeGut2.unmasked.2bit \
        taeGut2 > do.log 2>&1

#############################################################################

#########################################################################
# genscan - (DONE - 2013-06-06 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/genscan
    cd /hive/data/genomes/taeGut2/bed/genscan
    time doGenscan.pl taeGut2 > do.log 2>&1
    #  real    47m7.823s
    # XXX failed four jobs, running manually on hgwdev: chr9 chr1A chr3 chr2
    # with -window=2000000 - all but chr9 finished
# real    57m16.383s
# user    133m46.445s
# sys     2m55.584s
    # running chr9 with -window=1000000
# real    10m14.291s
# user    9m59.182s
# sys     0m13.720s

    time doGenscan.pl -continue=makeBed taeGut2 > makeBed.log 2>&1
    # real    30m4.119s

    cat fb.taeGut2.genscan.txt
    #   52986693 bases of 1222864691 (4.333%) in intersection
    cat fb.taeGut2.genscanSubopt.txt
    #   28313661 bases of 1222864691 (2.315%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-06 - Hiram)
    # Use -repMatch=500, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1222864691 / 2897316137 \) \* 1024
    #  ( 1222864691 / 2897316137 ) * 1024 = 432.197725

    # round up to 500 (taeGut1 used 1024)

    cd /hive/data/genomes/taeGut2
    time blat taeGut2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/taeGut2.11.ooc -repMatch=500
    # Wrote 12162 overused 11-mers to jkStuff/taeGut2.11.ooc
    # real    0m32.994s

    # there are non-bridged gaps, therefore, a lift file is needed for genbank
    hgsql -N -e "select bridge from gap;" taeGut2 | sort | uniq -c
    #      326 no
    #    87384 yes

    cd /hive/data/genomes/taeGut2/jkStuff
    gapToLift taeGut2 taeGut2.nonBridged.lift -bedFile=taeGut2.nonBridged.bed
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' taeGut2.nonBridged.bed | sort -nr | head
    #   56928224 chr5   4758199 61686423        chr5.07

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2013-03-08 - Pauline)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Taeniopygia guttata     5086    92144   987

    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add taeGut2 just after ce2

# taeGut2 - Zebra Finch
taeGut2.serverGenome = /hive/data/genomes/taeGut2/taeGut2.2bit
taeGut2.clusterGenome = /hive/data/genomes/taeGut2/taeGut2.2bit
taeGut2.ooc = /hive/data/genomes/taeGut2/jkStuff/taeGut2.11.ooc
taeGut2.align.unplacedChroms = chrUn_*,chr*_random
taeGut2.lift = /hive/data/genomes/taeGut2/jkStuff/taeGut2.nonBridged.lift
taeGut2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
taeGut2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
taeGut2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
taeGut2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
taeGut2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
taeGut2.refseq.mrna.native.load = yes
taeGut2.refseq.mrna.xeno.load = yes
taeGut2.genbank.mrna.xeno.load = yes
taeGut2.downloadDir = taeGut2
taeGut2.upstreamGeneTbl = refGene

    # end of section added to etc/genbank.conf
    git commit -m "adding taeGut2 Zebra finch refs #10190" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S taeGut2           # long running job managed in screen
    cd /cluster/data/genbank
    time ./bin/gbAlignStep -initial taeGut2 &
#  logFile: var/build/logs/2013.06.18-11:56:48.taeGut2.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank

    time ./bin/gbDbLoadStep -drop -initialLoad taeGut2 &
    #   logFile: var/dbload/hgwdev/logs/2013.06.08-18:59:10.dbload.log
   #   real    29m9.092s

    # enable daily alignment and update of hgwdev (DONE - 2013-03-08 - Pauline)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add taeGut2 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added taeGut2. refs #10190" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position similar to taeGut1 (DONE - 2013-06-19 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chrZ:56283356-56343168" where name="taeGut2";' \
	hgcentraltest

#########################################################################
#  BLATSERVERS ENTRY (DONE - 2013-06-19 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev

     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("taeGut2", "blat4c", "17848", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("taeGut2", "blat4c", "17849", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence

############################################################################
# reset dbDb description to the "date (label/db)" form
#       (DONE - 2013-03-21 - Hiram)
    # the date and label are the assemblyDate and assemblyShortLabel
    # values from taeGut2.config.ra
    # NOTE(review): this entry was adapted from another assembly's makeDoc;
    # confirm the description text against the live dbDb row
    hgsql -e \
'update dbDb set description="Feb. 2013 (WashU taeGut324/taeGut2)" where name="taeGut2";' \
	hgcentraltest

############################################################################
# construct liftOver from taeGut1 to taeGut2 (DONE - 2013-06-19 - Hiram)
    # documentation for this step is in taeGut1 -  been done

############################################################################
# downloads and pushQ entry (DONE - 2014-04-16 - Hiram)
    # after adding taeGut2 to the all.joiner file and verifying that
    #   joinerCheck is clean (i.e. run joinerCheck w -times and -keys flags
    #   to make sure there are no errors), can construct the downloads:
    cd /hive/data/genomes/taeGut2
    time  makeDownloads.pl -workhorse=hgwdev taeGut2
    #   real 14m53.730s

    mkdir /hive/data/genomes/taeGut2/pushQ
    cd /hive/data/genomes/taeGut2/pushQ
    time makePushQSql.pl taeGut2 2> stderr.txt > taeGut2.sql
    #  real	2m4.300s

    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/taeGut2/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/taeGut2/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/taeGut2/bbi/quality.bw
# WARNING: taeGut2 does not have seq
# WARNING: taeGut2 does not have extFile
# WARNING: taeGut2 does not have estOrientInfo

    # copy the pushQ SQL to hgwbeta and load it into the qapushq database
    scp -p taeGut2.sql qateam@hgwbeta:/tmp/
    # the redirect is inside the quotes so it is evaluated on hgwbeta
    ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/taeGut2.sql"

############################################################################
#  BLASTZ/CHAIN/NET Zebra finch vs Turkey (DONE - 2013-10-18,25 - Hiram)
#   use a screen to manage this multi-day job
    screen -S melGal1

    mkdir /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18
    cd /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18

    cat << '_EOF_' > DEF
# Zebra finch vs. Turkey
BLASTZ_M=50

# TARGET: Zebra finch TaeGut2
SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100

# QUERY: Turkey MelGal1
SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18
TMPDIR=/dev/shm
'_EOF_'
    # << this line keeps emacs coloring happy
    # screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real    4063m8.278s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -continue=cat -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1
    # real    66m17.485s

    cat fb.taeGut2.chainMelGal1Link.txt
    # 781765606 bases of 1222864691 (63.929%) in intersection

    cd /hive/data/genomes/taeGut2/bed
    ln -s lastzMelGal1.2013-10-18 lastz.melGal1

    #   and then swap
    mkdir /hive/data/genomes/melGal1/bed/blastz.taeGut2.swap
    cd /hive/data/genomes/melGal1/bed/blastz.taeGut2.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18/DEF \
        -swap -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    # real    43m24.666s
    cat fb.melGal1.chainTaeGut2Link.txt
    #   661376423 bases of 935922386 (70.666%) in intersection

    cd /hive/data/genomes/melGal1/bed
    ln -s blastz.taeGut2.swap lastz.taeGut2

############################################################################
# lifting ensGene from taeGut1 (DONE - 2011-03-23 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/ensGene.73
    cd /hive/data/genomes/taeGut2/bed/ensGene.73

    # start with taeGut1 ensGene
    hgsql -N -e "select * from ensGene;" taeGut1 | cut -f2- > taeGut1.ensGene.gp
    # lift over to taeGut2
    liftOver -genePred  taeGut1.ensGene.gp \
      /hive/data/genomes/taeGut1/bed/liftOver/taeGut1ToTaeGut2.over.chain.gz \
         taeGut2.ensGene.gp unMapped.txt
    # check to see which genes have gone bad
    genePredCheck -db=taeGut2 taeGut2.ensGene.gp > badOnes.txt 2>&1
    #	checked: 19226 failed: 2
    awk '{print $3}' badOnes.txt | sort -u | grep -v failed > toRemove.txt
    # remove the bad ones
    cp -p taeGut2.ensGene.gp taeGut2.unfiltered.ensGene.gp
for N in `cat toRemove.txt`
do
    rm -f t.gp
    grep -v "${N}" taeGut2.ensGene.gp > t.gp
    rm -f taeGut2.ensGene.gp
    mv t.gp taeGut2.ensGene.gp
done
    genePredCheck -db=taeGut2 taeGut2.ensGene.gp
    #	checked: 19225 failed: 0
    # OK to load this since all are OK now
    hgLoadGenePred  -genePredExt taeGut2 ensGene taeGut2.ensGene.gp
    genePredCheck -db=taeGut2 ensGene
    #	checked: 19225 failed: 0
    # use the ensPep and ensGtp from taeGut1, but remove the bad ones
    zcat /hive/data/genomes/taeGut1/bed/ensGene.73/download/Taeniopygia_guttata.taeGut3.2.4.73.pep.all.fa.gz \
        | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//; s/ gene_biotype.*$//;' | gzip > ensPep.txt.gz
    zcat ensPep.txt.gz \
    | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
         | sed -e '/^$/d; s/*$//' | sort > ensPep.taeGut1.fa.tab

    cp -p ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab
    for N in `cat toRemove.txt`
do
    rm -f t.fa.tab
    grep -v "${N}" ensPep.taeGut2.fa.tab > t.fa.tab
    rm -f ensPep.taeGut2.fa.tab
    mv t.fa.tab ensPep.taeGut2.fa.tab
    echo "'${N}'"
done

    wc -l ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab
    # 18204 ensPep.taeGut1.fa.tab
    # 18203 ensPep.taeGut2.fa.tab

    awk '{print $1}' taeGut1.ensGene.gp | sort -u > taeGut1.name.list
    awk '{print $1}' taeGut2.ensGene.gp | sort -u > taeGut2.name.list
    cp -p /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensGtp.tab \
        ./taeGut1.ensGtp.tab

    cp -p taeGut1.ensGtp.tab taeGut2.ensGtp.tab
    comm -23 taeGut1.name.list taeGut2.name.list | while read N
do
    rm -f t.gtp.tab
    grep -v "${N}" taeGut2.ensGtp.tab > t.gtp.tab
    rm -f taeGut2.ensGtp.tab
    mv t.gtp.tab taeGut2.ensGtp.tab
    echo "${N}"
done
    wc -l *ensGtp*
    # 19334 taeGut1.ensGtp.tab
    # 19225 taeGut2.ensGtp.tab

    comm -23 taeGut1.name.list taeGut2.name.list | while read N
do
    rm -f t.fa.tab
    grep -v "${N}" ensPep.taeGut2.fa.tab > t.fa.tab
    rm -f ensPep.taeGut2.fa.tab
    mv t.fa.tab ensPep.taeGut2.fa.tab
    echo "'${N}'"
done
    wc -l ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab
    # 18204 ensPep.taeGut1.fa.tab
    # 18097 ensPep.taeGut2.fa.tab

    # now that those are clean, load them up
    hgPepPred taeGut2 tab ensPep ensPep.taeGut2.fa.tab
    hgLoadSqlTab taeGut2 ensGtp ~/kent/src/hg/lib/ensGtp.sql taeGut2.ensGtp.tab
    # and record version in trackVersion
hgsql -e 'INSERT INTO trackVersion \
    (db, name, who, version, updateTime, comment, source, dateReference) \
    VALUES("taeGut2", "ensGene", "hiram", "73", now(), \
        "peptides lifted from taeGut1", \
        "genes lifted from taeGut1 ensGene v73", "current" );' hgFixed

    /cluster/bin/scripts/ensemblInfo.pl /hive/data/genomes/taeGut1/bed/ensGene.73/process/infoOut.txt | sort > ensemblToGeneName.taeGut1.tab
    /cluster/bin/scripts/ensemblInfo.pl -field2=source /hive/data/genomes/taeGut1/bed/ensGene.73/process/infoOut.txt | sort > ensemblSource.taeGut1.tab

    # default join output separates fields by space
    # we need tab separated files for hgLoadSqlTab
    # if it was join -t'\t' - it doesn't recognize \t as a tab
    # if it was the hidden tab character join -t'^I' it wouldn't cut and paste
    # so use the sed to change space into tab since we know our names do
    # not have space
    join taeGut2.name.list ensemblToGeneName.taeGut1.tab \
          | sed -e 's/ /\t/g' > ensemblToGeneName.tab
    join taeGut2.name.list ensemblSource.taeGut1.tab \
          | sed -e 's/ /\t/g' > ensemblSource.tab

    hgLoadSqlTab taeGut2 ensemblToGeneName \
      /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensemblToGeneName.sql \
           ensemblToGeneName.tab
    hgLoadSqlTab taeGut2 ensemblSource \
       /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensemblSource.sql \
           ensemblSource.tab

#########################################################################
# create ucscToINSDC name mapping (DONE - 2014-04-15 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/ucscToINSDC
    cd /hive/data/genomes/taeGut2/bed/ucscToINSDC

    # this script has been maturing over time, it is close to complete.
    # to find a latest copy of it:
    # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh

    cp -p /hive/data/genomes/eriEur2/bed/ucscToINSDC/translateNames.sh .
    ./translateNames.sh
    # it says:
# need to find chrM accessions
    # so add this one:
    echo -e 'chrM\tNC_007897.1' >> ucscToINSDC.txt
    # needs to be sorted to work with join
    sort ucscToINSDC.txt > ucscToINSDC.tab

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \
        > name.coordinate.tab

    join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
# 28

    # use the 28 in this sed:
    sed -e "s/21/28/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab taeGut2 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords taeGut2 ucscToINSDC
    # should cover all bases
    featureBits -countGaps taeGut2 ucscToINSDC
    # 1232135591 bases of 1232135591 (100.000%) in intersection

##############################################################################
# cytoBandIdeo - (DONE - 2014-04-15 - Hiram)
    mkdir /hive/data/genomes/taeGut2/bed/cytoBand
    cd /hive/data/genomes/taeGut2/bed/cytoBand
    makeCytoBandIdeo.csh taeGut2

##############################################################################
# fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram)
    hgsql -N -e "select frag from gold;" taeGut2 | sort | head -1
ABQF01000001.1

    hgsql -N -e "select frag from gold;" taeGut2 | sort | tail -2
ABQF01124805.1
NC_007897

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" taeGut2 | wc -l
    # 124806

    hgsql -N -e "select frag from gold;" taeGut2 | egrep -e '[AN][BC][Q_][F0]0[0-9]+(\.1)?' | wc -l
    # 124806

    hgsql -N -e "select frag from gold;" taeGut2 | egrep -v -e '[AN][BC][Q_][F0]0[0-9]+(\.1)?' | wc -l
    # 0

    # hence, add to trackDb/zebraFinch/taeGut2/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [AN][BC][Q_][F0]0[0-9]+(\.1)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

############################################################################
#  BLASTZ/CHAIN/NET Zebra finch vs Turkey (DONE - 2017-01-19,25 - Hiram)
#   use a screen to manage this multi-day job
    screen -S melGal5

    mkdir /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19
    cd /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19

    printf '# Zebra finch vs. Turkey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
BLASTZ_M=254

# TARGET: Zebra finch TaeGut2
SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=300

# QUERY: Turkey MelGal5
SEQ2_DIR=/scratch/data/melGal5/melGal5.2bit
SEQ2_LEN=/scratch/data/melGal5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=800
SEQ2_LAP=0

BASE=/hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19
TMPDIR=/dev/shm
' > DEF

    time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1
    # real    266m59.980s

    cat fb.taeGut2.chainMelGal5Link.txt
    # 807888817 bases of 1222864691 (66.065%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` taeGut2 melGal5) \
           > rbest.log 2>&1 &
    # real    413m28.452s

    #   and then swap
    mkdir /hive/data/genomes/melGal5/bed/blastz.taeGut2.swap
    cd /hive/data/genomes/melGal5/bed/blastz.taeGut2.swap
    time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19/DEF \
        -swap -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1
    # real    119m13.393s

    cat fb.melGal5.chainTaeGut2Link.txt
    #   706578355 bases of 1093044709 (64.643%) in intersection

    time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` melGal5 taeGut2) \
           > rbest.log 2>&1
    # real    354m40.975s

############################################################################
# LASTZ Zebra finch TaeGut2 vs. Medium ground finch GeoFor1 (DONE - 2022-02-03 - Gerardo)

# should be able to run this from anywhere, this time it was run from:
    cd ~/kent/src/hg/utils/automation

  time (~/kent/src/hg/utils/automation/pairLastz.sh \
        taeGut2 geoFor1 other other) \
           > zebraFinch_medFinch_20220202_secondTime.log 2>&1 &
  # check the total time
grep -w real  zebraFinch_medFinch_20220202_secondTime.log  | tail -1 | sed -e 's/^/    # /;'
    # real      107m34.019s

  # the zebraFinch_medFinch_20220202_secondTime.log file contains a copy of
  # the make doc; another copy of the make doc is left in the target
  # assembly directory:
# /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02/makeDoc.txt

    # this command outputs this makeDoc text:

    cat ~/kent/src/hg/utils/automation/zebraFinch_medFinch_20220202_secondTime.log

##############################################################################
# LASTZ Zebra finch TaeGut2 vs. Medium ground finch GeoFor1
#    (DONE - 2022-02-03 - Gerardo)

    mkdir /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02
    cd /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02

    printf '# Medium ground finch GeoFor1 vs. Zebra finch TaeGut2
BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz

# TARGET: Zebra finch TaeGut2
SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit
SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: Medium ground finch GeoFor1
SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit
SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02
TMPDIR=/dev/shm

' > DEF

    time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl  -verbose=2 `pwd`/DEF -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
        -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1
    grep -w real do.log | sed -e 's/^/    # /;'
    # real      159m10.037s

    sed -e 's/^/    # /;' fb.taeGut2.chainGeoFor1Link.txt
    # 1125215195 bases of 1222864691 (92.015%) in intersection
    sed -e 's/^/    # /;' fb.taeGut2.chainSynGeoFor1Link.txt
    # 1024055100 bases of 1222864691 (83.742%) in intersection

    time (~/kent/src/hg/utils/automation/doRecipBest.pl  -load -workhorse=hgwdev -buildDir=`pwd` \
       \
       \
        taeGut2 geoFor1) > rbest.log 2>&1

    grep -w real rbest.log | sed -e 's/^/    # /;'

    sed -e 's/^/    # /;' fb.taeGut2.chainRBest.GeoFor1.txt

    ### and for the swap

    cd /hive/data/genomes/geoFor1/bed/blastz.taeGut2.swap

   time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl  -swap -verbose=2 \
    /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02/DEF -swapDir=`pwd` \
  -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
    -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1

    grep -w real swap.log | sed -e 's/^/    # /;'
    # real      42m46.900s

    sed -e 's/^/    # /;' fb.geoFor1.chainTaeGut2Link.txt
    # 954457151 bases of 1041286029 (91.661%) in intersection
    sed -e 's/^/    # /;' fb.geoFor1.chainSynTaeGut2Link.txt
    # 912902982 bases of 1041286029 (87.671%) in intersection
    time (~/kent/src/hg/utils/automation/doRecipBest.pl  -load -workhorse=hgwdev -buildDir=`pwd` \
    \
    \
   geoFor1 taeGut2) > rbest.log 2>&1

    grep -w real rbest.log | sed -e 's/^/    # /;'
    # real      64m46.642s

    sed -e 's/^/    # /;' fb.geoFor1.chainRBest.TaeGut2.txt
    # 919870835 bases of 1041286029 (88.340%) in intersection

##############################################################################

# real    107m34.019s
# user    0m1.109s
# sys     0m1.771s
