# for emacs: -*- mode: sh; -*-

# This file describes browser build for the papAnu2
#	Olive Baboon - Papio anubis - Mar 2012
#
# Note: The previous assembly papHam1 is from the same animal
#        used for this assembly.


#	http://www.ncbi.nlm.nih.gov/genome/394
#	http://www.ncbi.nlm.nih.gov/genome/assembly/383538/
#	http://www.ncbi.nlm.nih.gov/bioproject/54005 - Baylor
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AHZZ01
#   Genome Coverage : 2.5x Sanger; 4.5x 454; 85x Illumina

#   no result for "Papio anubis mitochondrion complete genome" search

#   DATE:   21-Mar-2012
#   ORGANISM:   Papio anubis
#   TAXID:  9555
#   ASSEMBLY LONG NAME: Panu_2.0
#   ASSEMBLY SHORT NAME:    Panu_2.0
#   ASSEMBLY SUBMITTER: Baylor College of Medicine
#   ASSEMBLY TYPE:  Haploid
#   NUMBER OF ASSEMBLY-UNITS:   1
#   ASSEMBLY ACCESSION: GCA_000264685.1
#
#   FTP-RELEASE DATE: 07-Jun-2012


#   Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia;
#   Eutheria; Euarchontoglires; Primates; Haplorrhini; Catarrhini;
#   Cercopithecidae; Cercopithecinae; Papio; Papio anubis

#############################################################################
# Fetch sequence from genbank (DONE - 2012-07-19 - Chin)

    mkdir -p /hive/data/genomes/papAnu2/genbank
    cd /hive/data/genomes/papAnu2/genbank

    time rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Papio_anubis/Panu_2.0/ ./
    # real    33m19.751s

    # measure total sequence in this assembly
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
        Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz
    #  2948380710 bases (55130419 N's 2893250291 real 2893250291 upper 0 lower)
    #  in 63250 sequences in 22 files
    #  Total size: mean 46614.7 sd 2497441.4
    #  min 200 (gi|388614875|gb|AHZZ01198925.1|)
    #  max 220367699 (gi|390124480|gb|CM001491.1|) median 1416
    #  %0.00 masked total, %0.00 masked real

#############################################################################
# process into UCSC naming scheme (DONE - 2012-07-19 - Chin)
    mkdir /hive/data/genomes/papAnu2/ucsc
    cd /hive/data/genomes/papAnu2/ucsc

    ########################  Assembled Chromosomes
    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl

# toUcsc.pl - rename the GenBank assembled chromosomes to UCSC chrN names.
#
# Reads the chr2acc table (chromosome name, accession) and, for each
# chromosome listed there, writes uncompressed chrN.agp and chrN.fa files
# into the current directory:
#   - AGP:   the accession at the start of each non-comment line becomes chrN
#   - FASTA: every header line becomes ">chrN"
# Prints "accession chrName" to stdout for each chromosome processed.
# Dies if an input can not be read, zcat fails, or an output can not be
# written.

use strict;
use warnings;

my $asmDir = "../genbank/Primary_Assembly/assembled_chromosomes";

# key: accession from chr2acc, value: chromosome name
my %accToChr;

open (my $chr2acc, "<", "$asmDir/chr2acc") or
        die "can not read Primary_Assembly/assembled_chromosomes/chr2acc: $!";
while (my $line = <$chr2acc>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    # skip blank or short lines instead of creating an undefined hash key
    next unless (defined $chrN && defined $acc);
    $accToChr{$acc} = $chrN;
}
close ($chr2acc);

foreach my $acc (keys %accToChr) {
    my $chrN = $accToChr{$acc};
    print "$acc $chrN\n";

    # AGP: swap the accession in column one for the UCSC name
    my $agpGz = "$asmDir/AGP/chr${chrN}.agp.gz";
    # list-form pipe open: no shell is involved in running zcat
    open (my $agpIn, "-|", "zcat", $agpGz)
        or die "can not read chr${chrN}.agp.gz: $!";
    open (my $agpOut, ">", "chr${chrN}.agp")
        or die "can not write to chr${chrN}.agp: $!";
    while (my $line = <$agpIn>) {
        # comment lines pass through untouched; \Q..\E keeps the '.' in the
        # accession from acting as a regex wildcard
        $line =~ s/^\Q$acc\E/chr${chrN}/ unless ($line =~ m/^#/);
        print $agpOut $line;
    }
    # closing a pipe reports a non-zero exit from zcat
    close ($agpIn) or die "zcat failed on chr${chrN}.agp.gz (status $?)";
    close ($agpOut) or die "can not finish writing chr${chrN}.agp: $!";

    # FASTA: replace every header line with the UCSC name
    my $faGz = "$asmDir/FASTA/chr${chrN}.fa.gz";
    open (my $faIn, "-|", "zcat", $faGz)
        or die "can not read chr${chrN}.fa.gz: $!";
    open (my $faOut, ">", "chr${chrN}.fa")
        or die "can not write to chr${chrN}.fa: $!";
    while (my $line = <$faIn>) {
        if ($line =~ m/^>/) {
            # print, not printf: the name must never be read as a format
            print $faOut ">chr${chrN}\n";
        } else {
            print $faOut $line;
        }
    }
    close ($faIn) or die "zcat failed on chr${chrN}.fa.gz (status $?)";
    close ($faOut) or die "can not finish writing chr${chrN}.fa: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl
    time ./toUcsc.pl
    #    real   1m26.505s
    time gzip *.fa *.agp
    #   real    13m40.080s
    faSize chr*.fa.gz
    #  2724327674 bases (42062585 N's 2682265089 real 2682265089 upper 0 lower)
    #  in 21 sequences in 21 files
    #  Total size: mean 129729889.2 sd 45381077.9 min 51301725 (chr19)
    #  max 220367699 (chr1) median 128036923
    #  %0.00 masked total, %0.00 masked real

    ########################  Unplaced scaffolds
    # verify we don't have any .acc numbers different from .1
    # see oreNil2 for xxxx.1 and xxxx.2 cases
    zcat ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	 | cut -f1 | egrep "^JH|AHZZ" \
	| sed -e 's/^JH[0-9][0-9]*//; s/^AHZZ[0-9][0-9]*//' | sort | uniq -c
    #	96735 .1

    # this is like the unplaced.pl script in other assemblies except it
    # does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

# unplaced.pl - convert the GenBank unplaced scaffolds to UCSC names.
#
# Writes unplaced.agp and unplaced.fa into the current directory.  The
# scaffold names keep their accession but lose the ".1" version suffix:
#   - AGP:   the ".1" is removed from the name in the first column only
#   - FASTA: a header is reduced to the accession found after "gb|",
#            with the ".1|..." tail removed
# Dies if an input can not be read, zcat fails, or an output can not be
# written.

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# list-form pipe open: no shell is involved in running zcat
open (my $agpIn, "-|", "zcat", $agpFile) or die "can not read $agpFile: $!";
open (my $agpOut, ">", "unplaced.agp")
    or die "can not write to unplaced.agp: $!";
while (my $line = <$agpIn>) {
    # comment lines pass through untouched.  The substitution is anchored
    # to the first column so a component accession later in the line is
    # never altered when the scaffold name itself has no ".1"
    $line =~ s/^(\S+?)\.1(\s)/$1$2/ unless ($line =~ m/^#/);
    print $agpOut $line;
}
# closing a pipe reports a non-zero exit from zcat
close ($agpIn) or die "zcat failed on $agpFile (status $?)";
close ($agpOut) or die "can not finish writing unplaced.agp: $!";

open (my $faIn, "-|", "zcat", $fastaFile)
    or die "can not read $fastaFile: $!";
open (my $faOut, ">", "unplaced.fa")
    or die "can not write to unplaced.fa: $!";
while (my $line = <$faIn>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        # print, not printf: the name must never be read as a format
        print $faOut ">$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat failed on $fastaFile (status $?)";
close ($faOut) or die "can not finish writing unplaced.fa: $!";
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    time ./unplaced.pl
    # real    0m6.265s
# -rw-rw-r-- 1   4703550 Jul 20 10:47 unplaced.agp
# -rw-rw-r-- 1 228138466 Jul 20 10:47 unplaced.fa

    time gzip *.fa *.agp
    # real    0m50.019s

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa.gz
    # 2948380710 bases (55130419 N's 2893250291 real 2893250291 upper 0 lower)
    # in 63250 sequences in 22 files
    # Total size: mean 46614.7 sd 2497441.4 min 200 (AHZZ01198925)
    # max 220367699 (chr1) median 1416

#############################################################################
#   Initial browser build (DONE - 2012-07-20 - Chin)
    cd /hive/data/genomes/papAnu2
    cat << '_EOF_' > papAnu2.config.ra
# Config parameters for makeGenomeDb.pl:
db papAnu2
clade mammal
genomeCladePriority 16
scientificName Papio anubis
commonName Baboon
assemblyDate Mar. 2012
assemblyLabel Baylor College of Medicine
assemblyShortLabel Baylor Panu_2.0
orderKey 391
mitoAcc none
fastaFiles /hive/data/genomes/papAnu2/ucsc/*.fa.gz
agpFiles /hive/data/genomes/papAnu2/ucsc/*.agp.gz
dbDbSpeciesDir baboon
photoCreditURL http://www.oumedicine.com/pathology/general-program-info/faculty-staff/roman-f-wolf-dvm
photoCreditName Roman Wolf, University of Oklahoma Health Sciences Center
ncbiGenomeId 394
ncbiAssemblyId 383538
ncbiAssemblyName Panu_2.0
ncbiBioProject 54005
genBankAccessionID GCA_000264685.1
taxId   9555
'_EOF_'
    # << happy emacs

   # verify sequence and agp are OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp papAnu2.config.ra > agp.log 2>&1 &
    #   real     4m34.068s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    # finish it off
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev papAnu2.config.ra > db.log 2>&1 &
    #   real    22m27.336s
    #   add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/papAnu2.unmasked.2bit /gbdb/papAnu2/papAnu2.2bit
    #  browser should function now
    #  manually run twoBitDup to detect dup
    twoBitDup /gbdb/papAnu2/papAnu2.2bit

   # cd ~/kent/src/hg/makeDb/trackDb
   # edit makefile to add papAnu2 to DBS.
   # git add baboon/papAnu2/*.{ra,html}
   # git commit -m "Added papAnu2 to DBS." makefile
   # git commit -m "Initial descriptions for papAnu2." baboon/papAnu2/*.{ra,html}
   # git pull; git push
   # Run make update DBS=papAnu2 and make alpha when done.
   # (optional) Clean up /cluster/data/papAnu2/TemporaryTrackDbCheckout

   #  browser should function now
   #check in papAnu2.config.ra to src/hg/utils/automation/configFiles

##########################################################################
# running repeat masker (DONE - 2012-07-21 - Chin)
    mkdir /hive/data/genomes/papAnu2/bed/repeatMasker
    cd /hive/data/genomes/papAnu2/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek papAnu2 > do.log 2>&1 &
    #   real    1354m59.921s

    cat faSize.rmsk.txt
    #   2948380710 bases (55130419 N's 2893250291 real 1378673208 upper
    #   1514577083 lower) in 63250 sequences in 1 files
    #   Total size: mean 46614.7 sd 2497441.4 min 200 (AHZZ01198925)
    #   max 220367699 (chr1) median 1416
    #   %51.37 masked total, %52.35 masked real

    egrep -i "versi|relea" do.log
    # RepeatMasker version development-$Id: RepeatMasker,v 1.26
    # 2011/09/26 16:19:44 angie Exp $
    # grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
    #    April 26 2011 (open-3-3-0) version of RepeatMasker
    # grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
    # CC   RELEASE 20110920;

    featureBits -countGaps papAnu2 rmsk
    #   1516030289 bases of 2948380710 (51.419%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-07-23 - Chin)
    mkdir /hive/data/genomes/papAnu2/bed/simpleRepeat
    cd /hive/data/genomes/papAnu2/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	papAnu2 > do.log 2>&1 &
    #   real    58m51.627s
    cat fb.simpleRepeat
    #   162539257 bases of 2934388335 (5.539%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/papAnu2
    twoBitMask papAnu2.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed papAnu2.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa papAnu2.2bit stdout | faSize stdin > faSize.papAnu2.2bit.txt
    cat faSize.papAnu2.2bit.txt
    #   2948380710 bases (55130419 N's 2893250291 real 1377491621 upper
    #   1515758670 lower) in 63250 sequences in 1 files
    #   %51.41 masked total, %52.39 masked real

    rm /gbdb/papAnu2/papAnu2.2bit
    ln -s `pwd`/papAnu2.2bit /gbdb/papAnu2/papAnu2.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-07-24 - Chin)
    mkdir /hive/data/genomes/papAnu2/bed/gap
    cd /hive/data/genomes/papAnu2/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../papAnu2.unmasked.2bit > findMotif.txt 2>&1
    #	real    1m17.322s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits papAnu2 -not gap -bed=notGap.bed
    #   2934388335 bases of 2934388335 (100.000%) in intersection

    time featureBits papAnu2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   41138044 bases of 2934388335 (1.402%) in intersection
    #   real    32m23.676s

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" papAnu2 | sort -n | tail -1
    #	1430
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl

# mkGap.pl - turn new.gaps.bed into rows for the papAnu2 gap table.
#
# Reads chrom, start, end from new.gaps.bed and prints one tab-separated
# row per input line to stdout:
#   chrom  start  end  ix  N  size  other  yes
# where ix continues from the highest ix already in the gap table and
# size is end - start.
# Dies if the hgsql query fails or new.gaps.bed can not be read.

use strict;
use warnings;
use List::Util qw(max);

# run hgsql on its own so its exit status is visible; piping it through
# sort and tail would only report the status of the last command
my @ixList = `hgsql -N -e "select ix from gap;" papAnu2`;
die "hgsql failed reading ix from papAnu2.gap (status $?)" if ($? != 0);
chomp @ixList;
# an empty gap table starts the numbering at 1
my $ix = @ixList ? max(@ixList) : 0;

open (my $gaps, "<", "new.gaps.bed") or die "can not read new.gaps.bed: $!";
while (my $line = <$gaps>) {
    chomp $line;
    my ($chrom, $chromStart, $chromEnd) = split('\s+', $line);
    # skip blank or short lines rather than emit a bogus gap row
    next unless (defined $chromEnd);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close ($gaps);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps papAnu2 other.bed
    #   41138044 bases of 2948380710 (1.395%) in intersection
    wc -l other.bed
    #	113636
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad papAnu2 otherGap other.bed
    #	starting with this many
    hgsql -e "select count(*) from gap;" papAnu2
    #    26003
    hgsql papAnu2 -e 'load data local infile "bed.tab" into table gap;'
    #	result count:
    hgsql -e "select count(*) from gap;" papAnu2
    #    139639
    # == 26003 + 113636
    # verify we aren't adding gaps where gaps already exist
    # this would output errors if that were true:
    gapToLift -minGap=1 papAnu2 nonBridged.lift -bedFile=nonBridged.bed
    # see example in danRer7.txt
    # there are non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" papAnu2 | sort | uniq -c
    #   9250 no
    #   130389 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-07-24 - Chin)
    mkdir /hive/data/genomes/papAnu2/bed/windowMasker
    cd /hive/data/genomes/papAnu2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev papAnu2 > do.log 2>&1 &
    #   real    225m4.246s

    # Masking statistics
    twoBitToFa papAnu2.wmsk.2bit stdout | faSize stdin
    #  2948380710 bases (55130419 N's 2893250291 real 1783065601 upper
    #  1110184690 lower) in 63250 sequences in 1 files
    #  Total size: mean 46614.7 sd 2497441.4 min 200 (AHZZ01198925)
    #  max 220367699 (chr1) median 1416
    #  %37.65 masked total, %38.37 masked real

    hgLoadBed papAnu2 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Read 15750029 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps papAnu2 windowmaskerSdust
    #   1181777644 bases of 2948380710 (40.082%) in intersection

    #	eliminate the gaps from the masking
    featureBits papAnu2 -not gap -bed=notGap.bed
    #	2893250291 bases of 2893250291 (100.000%) in intersection
    time nice -n +19 featureBits papAnu2 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #   1126649532 bases of 2893250291 (38.941%) in intersection
    #	real    35m51.398s
    #	reload track to get it clean
    hgLoadBed papAnu2 windowmaskerSdust cleanWMask.bed.gz
    # Read 15783704 elements of size 4 from cleanWMask.bed.gz
    # Sorted
    # Creating table definition for windowmaskerSdust
    # Saving bed.tab
    # Loading papAnu2
    featureBits -countGaps papAnu2 windowmaskerSdust
    #   1126649532 bases of 2948380710 (38.212%) in intersection
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../papAnu2.unmasked.2bit stdin \
	    -type=.bed papAnu2.cleanWMSdust.2bit
    twoBitToFa papAnu2.cleanWMSdust.2bit stdout | faSize stdin \
        > papAnu2.cleanWMSdust.faSize.txt
    cat papAnu2.cleanWMSdust.faSize.txt
    # 2948380710 bases (55130419 N's 2893250291 real 1766600759 upper
    # 1126649532 lower) in 63250 sequences in 1 files
    # Total size: mean 46614.7 sd 2497441.4 min 200 (AHZZ01198925)
    # max 220367699 (chr1) median 1416
    # %38.21 masked total, %38.94 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps papAnu2 rmsk windowmaskerSdust
    #   901904342 bases of 2948380710 (30.590%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-08-07 - Chin)
    #	since rmsk masks so very little of the genome, use the clean WM mask
    #	on the genome sequence
    cd /hive/data/genomes/papAnu2
    twoBitMask -add bed/windowMasker/papAnu2.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed papAnu2.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa papAnu2.2bit stdout | faSize stdin > faSize.papAnu2.txt
    cat faSize.papAnu2.txt
    #   2948380710 bases (55130419 N's 2893250291 real 1766238769 upper
    #   1127011522 lower) in 63250 sequences in 1 files
    #   Total size: mean 46614.7 sd 2497441.4 min 200 (AHZZ01198925)
    #   max 220367699 (chr1) median 1416
    #   %38.22 masked total, %38.95 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/papAnu2/papAnu2.2bit
    ln -s `pwd`/papAnu2.2bit /gbdb/papAnu2/papAnu2.2bit

########################################################################
# cpgIslands - (DONE 2012-08-08 - Chin)
    mkdir /hive/data/genomes/papAnu2/bed/cpgIslands
    cd /hive/data/genomes/papAnu2/bed/cpgIslands
    time doCpgIslands.pl papAnu2 > do.log 2>&1
    #   Elapsed time: 20m50s

    time featureBits papAnu2 cpgIslandExt > fb.papAnu2.cpgIslandExt.txt 2>&1
    #   real    0m22.345s
    cat fb.papAnu2.cpgIslandExt.txt
    #   18257085 bases of 2893250291 (0.631%) in intersection

#########################################################################
# genscan - (DONE 2012-09-06 - Chin)
    mkdir /hive/data/genomes/papAnu2/bed/genscan
    cd /hive/data/genomes/papAnu2/bed/genscan
    time doGenscan.pl papAnu2 > do.log 2>&1
    #   real    85m40.061s

    para time
    # 63250 jobs in batch
    # 74 jobs (including everybody's) in Parasol queue or running.
    # Checking finished jobs
    # Completed: 63249 of 63250 jobs
    # Crashed: 1 jobs
    # Failed job:
    # ./runGsBig.csh chr19 000 gtf/000/chr19.gtf pep/000/chr19.pep subopt/000/chr19.bed
    # set -window=1200000 in runGsBig.csh and manually run again. Success.

    # before continue, create a run.time file on swarm
    ssh swarm
    cd /hive/data/genomes/papAnu2/bed/genscan
    para check
    para time > run.time

    # continuing:
    time doGenscan.pl -continue=makeBed papAnu2 > makeBed.log 2>&1
    #   Elapsed time: 29m46s
    cat fb.papAnu2.genscan.txt
    #   40920336 bases of 2893250291 (1.414%) in intersection
    cat fb.papAnu2.genscanSubopt.txt
    #   49808228 bases of 2893250291 (1.722%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-10-23 - Chin)
    # Use -repMatch=1025, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2893250291 / 2897316137 \) \* 1024
    #   ( 2893250291 / 2897316137 ) * 1024 = 1022.563006

    # round up to 1025

    cd /hive/data/genomes/papAnu2
    time blat papAnu2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/papAnu2.11.ooc -repMatch=1025
    # Wrote 29128 overused 11-mers to jkStuff/papAnu2.11.ooc
    # Done making jkStuff/papAnu2.11.ooc
    # real    1m41.286s

    # there are non-bridged gaps, construct lift file needed for genbank
    hgsql -N -e "select bridge from gap;" papAnu2 | sort | uniq -c
    #    9250 no
    #  130389 yes

    cd /hive/data/genomes/papAnu2/jkStuff
    gapToLift papAnu2 papAnu2.nonBridged.lift -bedFile=papAnu2.nonBridged.bed
    # largest non-bridged contig:
    awk '{print $3-$2,$0}' papAnu2.nonBridged.bed | sort -nr | head
    #  3614795 chr4    105018890       108633685       chr4.306

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-10-23 - Chin)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
    # organism       mrnaCnt estCnt  refSeqCnt
    # Papio anubis    227     145582  462

    # to decide which "native" mrna or ests you want to specify in genbank.conf
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add papAnu2 just before papHam1
# papAnu2 (baboon)
papAnu2.serverGenome = /hive/data/genomes/papAnu2/papAnu2.2bit
papAnu2.clusterGenome = /hive/data/genomes/papAnu2/papAnu2.2bit
papAnu2.ooc = /hive/data/genomes/papAnu2/jkStuff/papAnu2.11.ooc
papAnu2.lift = /hive/data/genomes/papAnu2/jkStuff/papAnu2.nonBridged.lift
papAnu2.perChromTables = no
papAnu2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
papAnu2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
papAnu2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
papAnu2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
papAnu2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
papAnu2.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
papAnu2.downloadDir = papAnu2
papAnu2.refseq.mrna.native.load  = yes
papAnu2.refseq.mrna.xeno.load  = yes
papAnu2.genbank.mrna.xeno.load = yes
papAnu2.genbank.mrna.xeno.loadDesc = yes
papAnu2.genbank.est.native.load = yes

    # end of section added to etc/genbank.conf
    git commit -m "adding papAnu2 Papio anubis" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for papAnulNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S papAnu2           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial papAnu2 &
    #   var/build/logs/2012.06.08-09:52:19.papAnu2.initalign.log
    #   real    1641m58.577s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad papAnu2 &
    #	logFile: var/dbload/hgwdev/logs/2012.10.26-10:34:02.dbload.log
    #   real    106m43.697s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add papAnu2 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added papAnu2." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

##########################################################################
#  BLATSERVERS ENTRY (DONE - 2012-10-19 - Chin)
#       After getting a blat server assigned by the Blat Server Gods,
# blat4c
# port 17838 trans
# port 17839 untrans

    ssh hgwdev
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("papAnu2", "blat4c", "17838", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("papAnu2", "blat4c", "17839", "0", "1");' \
            hgcentraltest
    #   test it with some sequence


#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2012-10-30 - Chin)

    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    # add melGal after galGal
    joinerCheck -database=papAnu2 -all all.joiner

    cd /hive/data/genomes/papAnu2
    time makeDownloads.pl -workhorse=hgwdev papAnu2 > makeDownloads.log 2>&1
    #   real    12m40.799s
 *** All done!
 *** Please take a look at the downloads for papAnu2 using a web browser.
 *** The downloads url is: http://hgdownload-test.soe.ucsc.edu/goldenPath/papAnu2.
 *** Edit each README.txt to resolve any notes marked with "***":
     /cluster/data/papAnu2/goldenPath/database/README.txt
     /cluster/data/papAnu2/goldenPath/bigZips/README.txt
     (The htdocs/goldenPath/papAnu2/*/README.txt "files" are just links to those.)
 *** If you have to make any edits that would always apply to future
     assemblies from the same sequencing center, please edit them into
     ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help).

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/papAnu2/pushQ
    cd /hive/data/genomes/papAnu2/pushQ
    time makePushQSql.pl papAnu2 > papAnu2.pushQ.sql 2> stderr.out
    #   real    3m39.043s
    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/papAnu2/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/papAnu2/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/papAnu2/bbi/quality.bw
    # WARNING: papAnu2 does not have seq
    # WARNING: papAnu2 does not have extFile

    #   copy it to hgwbeta
    scp -p papAnu2.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/papAnu2.pushQ.sql"
    #   in that pushQ entry walk through each track entry and see if the table
    #   sizes will set properly
#############################################################################
# LIFTOVER TO papHam1 (DONE - 2013-01-16 - Chin)
    # log in to hgwdev; the run below uses it as dbHost and workhorse
    ssh hgwdev
    cd /hive/data/genomes/papAnu2
    # link to the 11.ooc file made earlier in jkStuff
    ln -s jkStuff/papAnu2.11.ooc 11.ooc
    # long running job, keep it in a screen session
    screen
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         papAnu2 papHam1 > doLiftOverToPapHam1.log 2>&1 &
    # real  2563m6.343s

    pushd /usr/local/apache/htdocs-hgdownload/goldenPath/papAnu2/liftOver/
    md5sum papAnu2ToPapHam1.over.chain.gz >> md5sum.txt
    popd

#########################################################################
# lastz swap from hg19 (DONE - 2014-07-25 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg19/bed/lastzPapAnu2.2013-08-19
    cat fb.hg19.chainPapAnu2Link.txt
    # 2467659115 bases of 2897316137 (85.171%) in intersection

    #	running the swap - DONE - 2014-07-25
    mkdir /hive/data/genomes/papAnu2/bed/blastz.hg19.swap
    cd /hive/data/genomes/papAnu2/bed/blastz.hg19.swap

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-swap /hive/data/genomes/hg19/bed/lastzPapAnu2.2013-08-19/DEF \
	-syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
	> swap.log 2>&1
    #  real    99m36.864s

    cat fb.papAnu2.chainHg19Link.txt
    #  2410008689 bases of 2893250291 (83.298%) in intersection

############################################################################
#  papAnu2 - Baboon - Ensembl Genes version 76  (DONE - 2014-08-08 - hiram)
    ssh hgwdev
    cd /hive/data/genomes/papAnu2
    cat << '_EOF_' > papAnu2.ensGene.ra
# required db variable
db papAnu2

# remove the first .1 from the contig names in the Ensembl GTF file
# UCSC does not have the chrMt, and add chr to the chrom names
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; /MT/d; s/\.1//"
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl  -ensVersion=76 papAnu2.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/papAnu2/bed/ensGene.76
    featureBits papAnu2 ensGene
    # 45154927 bases of 2893250291 (1.561%) in intersection

############################################################################

##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

#########################################################################
# create ucscToEnsembl scaffold name mapping (DONE - 2015-03-31 - Hiram)
    # this can be done when Ensembl releases genes for this organism
    # this allows the "ensembl" blue bar button to appear
    mkdir /hive/data/genomes/papAnu2/bed/ucscToEnsembl
    cd /hive/data/genomes/papAnu2/bed/ucscToEnsembl
    # survey the Ensembl names to see what we are dealing with:
    zcat ../ensGene.79/download/Papio_anubis.PapAnu2.0.79.gtf.gz \
       | grep -v "^#" | cut -f1 | sort | uniq -c > ens.name.list
    # it looks like we remove our chr and add a .1 to the contigs
    # chromosomes: UCSC chrN <-> Ensembl N.  The names are not all numeric
    # (chrX), so they are printed with %s; %d would turn X into 0.
    grep "^chr" ../../chrom.sizes | cut -f1 | sed -e 's/^chr//;' \
      | awk '{printf "chr%s\t%s\n", $1, $1}' > t.tab
    # scaffolds: the Ensembl name is the UCSC name with .1 appended
    grep -v "^chr" ../../chrom.sizes | cut -f1 \
      | awk '{printf "%s\t%s.1\n", $1, $1}' >> t.tab
    sort t.tab > ucscToEnsembl.tab
    rm -f t.tab
    # determine size of largest UCSC name:
    awk '{print length($1)}' ucscToEnsembl.tab | sort -rn | head -1
    # 12

    # use this size 12 in the sql PRIMARY KEY:

    cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(12))
);
'_EOF_'

    hgsql papAnu2 < ucscToEnsembl.sql
    hgsql papAnu2 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'

    # verify the blue bar "ensembl" link is now available and works to there

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/papAnu2/bed/geneid
    cd /hive/data/genomes/papAnu2/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/P.anubis/papAnu2/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/P.anubis/papAnu2/geneid_v1.4/papAnu2.geneid.gtf

    ldHgGene -gtf -genePredExt papAnu2 geneid papAnu2.geneid.gtf
    # Read 52364 transcripts in 241824 lines in 1 files
    #   52364 groups 24207 seqs 1 sources 3 feature types
    # 52364 gene predictions

    featureBits -enrichment papAnu2 augustusGene:CDS geneid
# augustusGene:CDS 1.001%, geneid 0.858%, both 0.538%, cover 53.72%,
#    enrich 62.64x

##########################################################################
# LIFTOVER TO papAnu3 (DONE - 2017-06-23 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/papAnu2/bed/blat.papAnu3.2017-06-23
    cd /hive/data/genomes/papAnu2/bed/blat.papAnu3.2017-06-23
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/papAnu2/jkStuff/papAnu2.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         papAnu2 papAnu3) > do.log 2>&1
    # real    393m53.436s

    # verify the convert link on the test browser is now active from papAnu2 to
    # papAnu3

#########################################################################
# LIFTOVER TO papAnu4 (DONE - 2018-01-08 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/papAnu2/bed/blat.papAnu4.2018-01-08
    cd /hive/data/genomes/papAnu2/bed/blat.papAnu4.2018-01-08
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/papAnu2/jkStuff/papAnu2.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         papAnu2 papAnu4) > do.log 2>&1 &
    # real    577m17.830s

    # verify the convert link on the test browser is now active from papAnu2 to
    # papAnu4

#########################################################################
