# for emacs: -*- mode: sh; -*-

# This file describes browser build for the sarHar1
# Sarcophilus harrisii (Tasmanian Devil)

ASSEMBLY_INFO:
DATE:   07-Feb-2011
ORGANISM:       Sarcophilus harrisii
TAXID:  9305
ASSEMBLY LONG NAME:     Devil_ref v7.0
ASSEMBLY SHORT NAME:    Devil_ref v7.0
ASSEMBLY SUBMITTER:     Wellcome Trust Sanger Institute
ASSEMBLY TYPE:  Haploid
NUMBER OF ASSEMBLY-UNITS:       1
Assembly Accession:     GCA_000189315.1

FTP-RELEASE DATE: 20-Jan-2012

# ftp:
# ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sarcophilus_harrisii/Devil_ref_v7.0/

# Assembly ID: 32874
# http://www.ncbi.nlm.nih.gov/genome/assembly/328478/
ncbiGenomeID=51853
# http://www.ncbi.nlm.nih.gov/bioproject/51853


# WGS Project: AEFK01
# http://www.ncbi.nlm.nih.gov/nuccore/AEFK00000000.1

#	http://www.ncbi.nlm.nih.gov/genome/3066
#	http://www.ncbi.nlm.nih.gov/bioproject/51853 - Sanger
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEFK01

#	http://www.ncbi.nlm.nih.gov/bioproject/65325 - Venter
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AFEY00

#############################################################################
# Fetch sequence from genbank (DONE - 2012-01-27 - Chin)

    mkdir -p /hive/data/genomes/sarHar1/genbank
    cd /hive/data/genomes/sarHar1/genbank

    wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Sarcophilus_harrisii/Devil_ref_v7.0/*"
    # FINISHED --2012-01-24
    # Read ASSEMBLY_INFO 

    # measure sequence to be used here
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \
	Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz
    #   3174693010 bases (243153308 N's 2931539702 real 2931539702 upper 
    #   0 lower) in 35974 sequences in 8 files
    #   Total size: mean 88249.7 sd 413850.2 min 642 
    #   (gi|323539178|gb|GL867959.1|) max 5315331 
    #   (gi|323557296|gb|GL849905.1|) median 2948

#############################################################################
# process into UCSC naming scheme (DONE - 2012-01-30 - Chin)
    mkdir /hive/data/genomes/sarHar1/ucsc
    cd /hive/data/genomes/sarHar1/ucsc

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;    
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;
my %chrNames;

open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close (FH);

foreach my $chrN (keys %chrNames) {
    my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
    open (FH, "zcat $agpFile|") or die "can not read $agpFile";
    open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            chomp $line;
            my (@a) = split('\t', $line);
            my $acc = $a[0];
            my $accNo1 = $acc;
            $accNo1 =~ s/.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
            printf UC "%s", $ucscName;
            for (my $i = 1; $i < scalar(@a); ++$i) {
                printf UC "\t%s", $a[$i];
            }
            printf UC "\n";
        }
    }
    close (FH);
    close (UC);
    printf "chr%s\n", $chrN;
    open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
    open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;
            $acc =~ s/\|.*//;
            my $accNo1 = $acc;
            $accNo1 =~ s/.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if ($accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
            printf UC ">$ucscName\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl

    ./unlocalized.pl
    ./unplaced.pl

    gzip *.fa *.agp
    # this takes a few minutes

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa.gz
    #   3174693010 bases (243153308 N's 2931539702 real 2931539702 upper 
    #   0 lower) in 35974 sequences in 8 files
    #   Total size: mean 88249.7 sd 413850.2 min 642 (chrX_GL867959_random) 
    #   max 5315331 (chr3_GL849905_random) median 2948

#############################################################################
#   Initial browser build (DONE - 2012-01-30 - Chin)
    cd /hive/data/genomes/sarHar1
    cat << '_EOF_' > sarHar1.config.ra
# Config parameters for makeGenomeDb.pl:
db sarHar1
clade mammal
genomeCladePriority 64
scientificName Sarcophilus harrisii
commonName Tasmanian devil
assemblyDate Feb. 2011
assemblyLabel WTSI Devil_ref v7.0
assemblyShortLabel WTSI Devil_ref v7.0
orderKey 3590
mitoAcc none
fastaFiles /hive/data/genomes/sarHar1/ucsc/*.fa.gz
agpFiles /hive/data/genomes/sarHar1/ucsc/*.agp.gz
dbDbSpeciesDir tasmanianDevil
photoCreditURL http://commons.wikimedia.org/wiki/File:Sarcophilus_harrisii_taranna.jpg
photoCreditName Courtesy of JJ Harrison and Wikimedia Commons
ncbiGenomeId 3066
ncbiAssemblyId 328478
ncbiAssemblyName Devil_ref v7.0
ncbiBioProject 51853
genBankAccessionID GCA_000189315.1
taxId   9305
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp -dbHost=hgwdev -fileServer=hgwdev \
       -workhorse=hgwdev -noGoldGapSplit sarHar1.config.ra \
       > makeGenomeDb.agp.log 2>&1 & 
    #	real    2m59.472s
    # check the end of agp.log to verify it is OK
    time makeGenomeDb.pl -continue=db -dbHost=hgwdev -fileServer=hgwdev \
       -workhorse=hgwdev -noGoldGapSplit sarHar1.config.ra \
        > makeGenomeDb.db.log 2>&1 &
    #	real    21m56.285s
    #  Per instructions in makeGenomeDb.db.log:
Search for '***' notes in each file in and make corrections (sometimes the
files used for a previous assembly might make a better template):
  description.html /cluster/data/sarHar1/html/{trackDb.ra,gap.html,gold.html}

Then copy these files to your ~/kent/src/hg/makeDb/trackDb/tasmanianDevil/sarHar1
 - cd ~/kent/src/hg/makeDb/trackDb
 - edit makefile to add sarHar1 to DBS.
 - git add tasmanianDevil/sarHar1/*.{ra,html}
 - git commit -m "Added sarHar1 to DBS." makefile
 - git commit -m "Initial descriptions for sarHar1." 
   tasmanianDevil/sarHar1/*.{ra,html}
 - git pull; git push
 - Run make update DBS=sarHar1 and make alpha when done.
 - (optional) Clean up /cluster/data/sarHar1/TemporaryTrackDbCheckout

    # re-do the description.html (2012-06-11 Chin)
    cd /hive/data/genomes/sarHar1
    makeGenomeDb.pl -forceDescription -continue=trackDb sarHar1.config.ra > makeGenomeDb.descrition.log 2>&1 &
    cp html/description.html ~/kent/src/hg/makeDb/trackDb/tasmanianDevil/sarHar1
    # check in descrition.html
    cp sarHar1.config.ra ~/kent/src/hg/utils/automation/configFiles
    cd ~/kent/src/hg/utils/automation/configFiles
    git add sarHar1.config.ra
    git commit -m "Use new description.html format for sarHar1." sarHar1.config.ra

#########################################################################
# RepeatMasker (DONE 2012-02-02 - Chin)
    mkdir /hive/data/genomes/sarHar1/bed/repeatMasker
    cd /hive/data/genomes/sarHar1/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \
	-workhorse=hgwdev -bigClusterHub=swarm -noSplit sarHar1 > do.log 2>&1 &
    #   real     257m53.975
    cat faSize.rmsk.txt
    # 3174693010 bases (243153308 N's 2931539702 real 1678166674 upper 
      1253373028 lower) in 35974 sequences in 1 files
    # Total size: mean 88249.7 sd 413850.2 
    # min 642 (chrX_GL867959_random) max 5315331 (chr3_GL849905_random)
    # median 2948
    # %39.48 masked total, %42.75 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker
    featureBits -countGaps sarHar1 rmsk
    # 1257548613 bases of 3174693010 (39.612%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #   separates out the N's from the bases, it doesn't show lower case N's

#########################################################################
# simpleRepeats (DONE - 2012-02-02 - Chin)
    mkdir /hive/data/genomes/sarHar1/bed/simpleRepeat
    cd /hive/data/genomes/sarHar1/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev -bigClusterHub=swarm -smallClusterHub=memk \
        sarHar1 > do.log 2>&1 &
    #   real    83m58.271s

    cat fb.simpleRepeat 
    # 64997640 bases of 2931539806 (2.217%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/sarHar1
    twoBitMask sarHar1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed sarHar1.2bit
    #	safe to ignore warnings about >=13 fields
    twoBitToFa sarHar1.2bit stdout | faSize stdin > sarHar1.2bit.faSize.txt
    cat sarHar1.2bit.faSize.txt
    #3174693010 bases (243153308 N's 2931539702 real 1676952423 upper 
    # 1254587279 lower) in 35974 sequences in 1 files
    # Total size: mean 88249.7 sd 413850.2 min 642 (chrX_GL867959_random) 
    # max 5315331 (chr3_GL849905_random) median 2948
    # %39.52 masked total, %42.80 masked real

    # double check with featureBits
    featureBits -countGaps sarHar1 gap
    # 243153204 bases of 3174693010 (7.659%) in intersection

    rm /gbdb/sarHar1/sarHar1.2bit
    ln -s `pwd`/sarHar1.2bit /gbdb/sarHar1/sarHar1.2bit

#########################################################################
# Marking *all* gaps - they are not all in the AGP file
#	(DONE - 2012-02-02 - Chin)
    mkdir /hive/data/genomes/sarHar1/bed/allGaps
    cd /hive/data/genomes/sarHar1/bed/allGaps

    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../sarHar1.unmasked.2bit > findMotif.txt 2>&1 &
    #   real    0m37.373
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits sarHar1 -not gap -bed=notGap.bed
    # 2931539806 bases of 2931539806 (100.000%) in intersection
    featureBits sarHar1 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 104 bases of 2931539806 (0.000%) in intersection

    #	what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" sarHar1 | sort -n | tail -1
    #    798	

    # use tcsh and ctrl-c to create the here doc
    cat << '_EOF_' > mkGap.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" sarHar1 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    featureBits -countGaps sarHar1 other.bed
    # 0 bases of 3174693010 (0.000%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad sarHar1 otherGap other.bed
    # Reading other.bed
    # Read 86 elements of size 8 from other.bed
    # Sorted
    # Saving bed.tab
    # No load option selected, see file: bed.tab

    #   adding this many:
    wc -l bed.tab
    #   86
    #   starting with this many
    hgsql -e "select count(*) from gap;" sarHar1
    #   201317
    hgsql sarHar1 -e 'load data local infile "bed.tab" into table gap;'
    #   result count:
    hgsql -e "select count(*) from gap;" sarHar1
    #  201403 == 201317 + 86 


##########################################################################
## WINDOWMASKER (DONE - 2012-02-06 - Chin)
    mkdir /hive/data/genomes/sarHar1/bed/windowMasker
    cd /hive/data/genomes/sarHar1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev sarHar1 > do.log 2>&1 &
    #   real     235m3.575s

    # Masking statistics
    twoBitToFa sarHar1.wmsk.2bit stdout | faSize stdin
    # 3174693010 bases (243153308 N's 2931539702 real 1789262669 upper 
    # 1142277033 lower) in 35974 sequences in 1 files
    # Total size: mean 88249.7 sd 413850.2 min 642 (chrX_GL867959_random) 
    # max 5315331 (chr3_GL849905_random) median 2948
    # %35.98 masked total, %38.97 masked real


    twoBitToFa sarHar1.wmsk.sdust.2bit stdout | faSize stdin
    # 3174693010 bases (243153308 N's 2931539702 real 1769157397 upper 
    # 1162382305 lower) in 35974 sequences in 1 files
    # Total size: mean 88249.7 sd 413850.2 min 642 (chrX_GL867959_random) 
    # max 5315331 (chr3_GL849905_random) median 2948
    # %36.61 masked total, %39.65 masked real

    hgLoadBed sarHar1 windowmaskerSdust windowmasker.sdust.bed.gz
    # Reading windowmasker.sdust.bed.gz
    # Read 20892844 elements of size 3 from windowmasker.sdust.bed.gz
    # Sorted
    # Creating table definition for windowmaskerSdust
    # Saving bed.tab
    # Loading sarHar1

    featureBits -countGaps sarHar1 windowmaskerSdust
    #   1405535533 bases of 3174693010 (44.273%) in intersection

    #   eliminate the gaps from the masking
    featureBits sarHar1 -not gap -bed=notGap.bed
    #   2931539702 bases of 2931539702 (100.000%) in intersection
    time nice -n +19 featureBits sarHar1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz &
    # 1162382305 bases of 2931539702 (39.651%) in intersection
    # real    32m2.289s

    #   reload track to get it clean
    hgLoadBed sarHar1 windowmaskerSdust cleanWMask.bed.gz
    # Reading cleanWMask.bed.gz
    # Read 20998733 elements of size 4 from cleanWMask.bed.gz
    # Sorted
    # Creating table definition for windowmaskerSdust
    # Saving bed.tab
    # Loading sarHar1

    featureBits -countGaps sarHar1 windowmaskerSdust
    # 1162382305 bases of 3174693010 (36.614%) in intersection
    #   mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
        | twoBitMask ../../sarHar1.unmasked.2bit stdin \
            -type=.bed sarHar1.cleanWMSdust.2bit
    twoBitToFa sarHar1.cleanWMSdust.2bit stdout | faSize stdin \
        > sarHar1.cleanWMSdust.faSize.txt
    cat sarHar1.cleanWMSdust.faSize.txt
    # 174693010 bases (243153308 N's 2931539702 real 1769157397 upper 
    # 1162382305 lower) in 35974 sequences in 1 files
    # Total size: mean 88249.7 sd 413850.2 min 642 (chrX_GL867959_random) 
    # max 5315331 (chr3_GL849905_random) median 2948
    # %36.61 masked total, %39.65 masked real


    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps sarHar1 rmsk windowmaskerSdust
    # 732613957 bases of 3174693010 (23.077%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-02-08 - Chin)
    cd /hive/data/genomes/sarHar1
    twoBitMask -add bed/windowMasker/sarHar1.cleanWMSdust.2bit \
        bed/simpleRepeat/trfMask.bed sarHar1.2bit
    #   safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa sarHar1.2bit stdout | faSize stdin > faSize.sarHar1.txt
    cat faSize.sarHar1.txt
    # 3174693010 bases (243153308 N's 2931539702 real 1768770389 upper 
    # 1162769313 lower) in 35974 sequences in 1 files
    # Total size: mean 88249.7 sd 413850.2 min 642 (chrX_GL867959_random) 
    # max 5315331 (chr3_GL849905_random) median 2948
    # %36.63 masked total, %39.66 masked real

    #   create symlink to gbdb
    ssh hgwdev
    rm /gbdb/sarHar1/sarHar1.2bit
    ln -s `pwd`/sarHar1.2bit /gbdb/sarHar1/sarHar1.2bit


########################################################################
# Create kluster run files (DONE - 2012-02-08 - Chin)
    # numerator is sarHar1 gapless bases "real" as reported by: 
    cat sarHar1.2bit.faSize.txt
    # 3174693010 bases (243153308 N's 2931539702 real 1676952423 upper 
    # or 
    featureBits -noHap sarHar1 gap
    # 243153308 bases of 2931539702 (8.294%) in intersection

    featureBits -noRandom -noHap sarHar1 gap
    # 213913418 bases of 2430064805 (8.803%) in intersection

    # denominator is hg19 gapless bases as reported by:
    #	featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 2931539702 / 2861349177 \) \* 1024
    ( 2931539702 / 2861349177 ) * 1024 = 1049.119303

    # ==> use -repMatch=1000 according to size scaled down from 1024 for human.
    #	and rounded down to nearest 50
    cd /hive/data/genomes/sarHar1
    blat sarHar1.2bit \
	 /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/sarHar1.11.ooc \
	-repMatch=1000 &
    # Wrote 47237 overused 11-mers to jkStuff/sarHar1.11.ooc
    mkdir /hive/data/staging/data/sarHar1
    cp -p sarHar1.2bit jkStuff/sarHar1.11.ooc /hive/data/staging/data/sarHar1
    cp -p chrom.sizes /hive/data/staging/data/sarHar1

    #	check non-bridged gaps to see what the typical size is:
    hgsql -N \
	-e 'select * from gap where bridge="no" order by size;' sarHar1 \
	| sort -k7,7nr
    # all gaps are bridged 
    hgsql -e "select bridge from gap;" sarHar1 | sort | uniq
    # bridge
    # yes

    # skip the following steps
    # gapToLift -verbose=2 -minGap=100 sarHar1 jkStuff/nonBridged.lft \
    #	-bedFile=jkStuff/nonBridged.bed
    # cp -p jkStuff/nonBridged.lft \
    #	/hive/data/staging/data/sarHar1/sarHar1.nonBridged.lft
    # ask cluster-admin to copy (evry time if any file changed)
    #    /hive/data/staging/data/sarHar1 directory to
    #    /scratch/data/sarHar1 on cluster nodes
    # before porceed to genbank step

########################################################################
# GENBANK AUTO UPDATE (DONE - 2012-02-09 - Chin)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull

    # check /cluster/data/genbank/data/organism.lst
    # to find out  number of RefSeqs, genbank mRNAs, and ESTs available
    # for sarHar1 first
    # organism             mrnaCnt estCnt  refSeqCnt
    # Sarcophilus harrisii    27      0       0
    # As hetGla1's 90X the sarHar1 is 85X, make it lowCover
    # edit etc/genbank.conf to add sarHar1 after rn4

# sarHar1 (Tasmanian Devil)
sarHar1.serverGenome = /hive/data/genomes/sarHar1/sarHar1.2bit
sarHar1.clusterGenome = /scratch/data/sarHar1/sarHar1.2bit
sarHar1.ooc = /scratch/data/sarHar1/sarHar1.11.ooc
sarHar1.lift = no
sarHar1.perChromTables = no
sarHar1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
sarHar1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
sarHar1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
sarHar1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
sarHar1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
sarHar1.genbank.est.xeno.pslCDnaFilter    = ${lowCover.genbank.est.xeno.pslCDnaFilter}
sarHar1.downloadDir = sarHar1
sarHar1.refSeq.native.load  = no
sarHar1.refseq.mrna.xeno.load = yes
sarHar1.refseq.mrna.xeno.loadDesc  = yes
sarHar1.genbank.mrna.native.load = yes
sarHar1.genbank.mrna.native.loadDesc = yes
sarHar1.genbank.mrna.xeno.load = yes
sarHar1.genbank.mrna.xeno.loadDesc = yes
sarHar1.genbank.est.native.load = no
sarHar1.genbank.est.native.loadDesc = no

    git add etc/genbank.conf
    git commit -m "Added sarHar1" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

# Edit src/lib/gbGenome.c to add new species.  With these two lines:
# static char *sarHarNames[] = {"Sarcophilus harrisii", NULL};
#   ... later ...
#    {"sarHar", sarHarNames},
#  gbGenome.c is  in
#  /cluster/home/chinhli/kent/src/hg/makeDb/genbank/src/lib
# make and checkin

    make install-server
    git add src/lib/gbGenome.c
    git commit -m "adding sarHar1 Tasmanian Devil" src/lib/gbGenome.c
    git pull
    git push	    

    ssh hgwdev                  # used to do this on "genbank" machine
    screen	#  control this business with a screen since it takes a while

    cd /cluster/data/genbank
     time nice -n +19 ./bin/gbAlignStep -initial sarHar1 &
    #   logFile: var/build/logs/2012.02.09-11:07:02.sarHar1.initalign.log
    #   real    885m16.517s
    # load database when finished

    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sarHar1 &
    #   logFile: var/dbload/hgwdev/logs/2012.02.13-12:22:56.dbload.log
    #   real    70m12.894s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add sarHar1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git add  etc/align.dbs
    git add  etc/hgwdev.dbs
    git commit  -m "Added sarHar1 - Tasmanian devil" etc/align.dbs etc/hgwdev.dbs
    git pull
    git push
    make etc-update

#########################################################################
# reset default position as  chr4_GL856884_random:746,443-749,687 Saha-I gene.
# (DONE 2012-02-13 - Chin)
#
    hgsql -e \
'update dbDb set defaultPos="chr4_GL856884_random:746,443-749,687" where name="sarHar1";' \
	hgcentraltest

############################################################################
# running cpgIsland business (DONE -2012-02-13 - Chin)
    mkdir /hive/data/genomes/sarHar1/bed/cpgIsland
    cd /hive/data/genomes/sarHar1/bed/cpgIsland
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    #	needed to fixup this source, adding include to readseq.c:
#include "string.h"
    #	and to cpg_lh.c:
#include "unistd.h"
#include "unistd.h"
#include "stdlib.h"
    # and fixing a declaration in cpg_lh.c
    sed -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c > tmp.c
    mv tmp.c cpg_lh.c
    make
    cd ../../ 
    ln -s hg3rdParty/cpgIslands/cpglh.exe
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../sarHar1.2bit:$C stdout \
	| maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    ssh swarm
    cd /hive/data/genomes/sarHar1/bed/cpgIsland
    mkdir results
    cut -f1 ../../chrom.sizes > chr.list
    cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out exists results/$(root1).cpg}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	the faCount business is to make sure there is enough sequence to
    #	work with in the fasta.  cpglh.exe does not like files with too many
    #	N's - it gets stuck
    cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = `faCount hardMaskedFa/$1.fa | grep "^JH\|^chr" | awk '{print  $2 - $7 }'`
if ( $C > 200 ) then
    ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
    mv /scratch/tmp/$1.$$ $2
else
    touch $2
endif
'_EOF_'
    # << happy emacs

    chmod 775 runOne 
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para push 
    para check ... etc
    para time
    para problems
    para status
    # then, kick it with para push
    # check it with plb
    # when all are done, para time shows:
    # para time
    # Checking finished jobs
    # Completed: 35974 of 35974 jobs
    # CPU time in finished jobs:    200s       3.34m     0.06h    0.00d  0.000 y
    # IO & Wait Time:             90743s    1512.38m    25.21h    1.05d  0.003 y
    # Average job time:               3s       0.04m     0.00h    0.00d
    # Longest finished job:           6s       0.10m     0.00h    0.00d
    # Submission to last job:       542s       9.03m     0.15h    0.01d

    # Transform cpglh output to bed +
    catDir results | awk '{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}' > cpgIsland.bed

    ssh hgwdev
    cd /hive/data/genomes/sarHar1/bed/cpgIsland
    hgLoadBed sarHar1 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    # Read 20283 elements of size 10 from cpgIsland.bed
    # Sorted
    # Creating table definition for cpgIslandExt
    # Saving bed.tab
    # Loading sarHar1

    #	cleanup
    rm -fr hardMaskedFa

##########################################################################
#  BLATSERVERS ENTRY (DONE 2012-02-14 - Chin)
#       After getting a blat server assigned by the Blat Server Gods,
#  
# blat1 
# port 17820 (trans) and port 17821 (untrans).

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("sarHar1", "blat1", "17820", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("sarHar1", "blat1", "17821", "0", "1");' \
            hgcentraltest
    #   test it with some sequence

#########################################################################
# all.joiner update, downloads and in pushQ - (DONE 2012-06-19 - Chin)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    # add sarHar after hetGla1
    joinerCheck -database=sarHar1 -all all.joiner

    cd /hive/data/genomes/sarHar1
    makeDownloads.pl sarHar1 > do.log 2>&1

    #	now ready for pushQ entry
    mkdir /hive/data/genomes/sarHar1/pushQ
    cd /hive/data/genomes/sarHar1/pushQ
    makePushQSql.pl sarHar1 > sarHar1.pushQ.sql 2> stderr.out
    #	check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/sarHar1/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/sarHar1/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/sarHar1/bbi/quality.bw
    # WARNING: sarHar1 does not have seq
    # WARNING: sarHar1 does not have extFile
    # 
    # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
    # supporting and genbank tables) which tracks to assign these tables to:
    #   ctgPos2

    #	copy it to hgwbeta
    scp -p sarHar1.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < sarHar1.pushQ.sql
    #	in that pushQ entry walk through each entry and see if the
    #	sizes will set properly

#########################################################################
# GENSCAN GENE PREDICTIONS (DONE 2012-03-15 - Chin)
    mkdir /hive/data/genomes/sarHar1/bed/genscan
    cd /hive/data/genomes/sarHar1/bed/genscan

    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/genscanlinux

    # create hard masked .fa files
    mkdir -p hardMaskedFa
    cut -f1 ../../chrom.sizes | while read C
do
    echo ${C}
    twoBitToFa ../../sarHar1.2bit:$C stdout \
        | maskOutFa stdin hard hardMaskedFa/${C}.fa
done

    # Generate a list file, genome.list, of all the hard-masked contig
    # chunks:
    find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list

    wc -l genome.list
    #   35974 genome.list

    # Run on small cluster (more mem than big cluster).
    ssh swarm
    cd /hive/data/genomes/sarHar1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Create template file, template, for gensub2.  For example (3-line
    # file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 genome.list single template jobList
    para create jobList
    para try
    para check ... etc...
    para time
    # Completed: 35974 of 35974 jobs
    # CPU time in finished jobs:  42777s     712.95m    11.88h    0.50d 0.001 y
    # IO & Wait Time:            129825s    2163.75m    36.06h    1.50d 0.004 y
    # Average job time:               5s       0.08m     0.00h    0.00d
    # Longest finished job:         112s       1.87m     0.03h    0.00d
    # Submission to last job        306s       5.10m     0.09h    0.00d

    # Make sure all files ended with linefeed 
    # this did not work, runs out of file handles ?
    # run endsInLf in batch of 256 files
    find ./gtf -type f | xargs -n 256 endsInLf -zeroOk

    # Concatenate results:
    cd /hive/data/genomes/sarHar1/bed/genscan
    find ./gtf -type f | xargs cat > genscan.gtf
    find ./pep -type f | xargs cat > genscan.pep
    find ./subopt -type f | xargs cat > genscanSubopt.bed

    # Load into the database (without -genePredExt because no frame
    # info):
    ssh hgwdev
    cd /hive/data/genomes/sarHar1/bed/genscan
    # to construct a local file with the genePred business:
    gtfToGenePred genscan.gtf genscan.gp
    # this produces exactly the same thing and loads the table:
    ldHgGene -gtf sarHar1 genscan genscan.gtf
    # Read 29306 transcripts in 167329 lines in 1 files
    #  29306 groups 6967 seqs 1 sources 1 feature types
    #  29306 gene predictions
    #  Don't load the Pep anymore -- redundant since it's from
    # genomic.
    hgPepPred sarHar1 generic genscanPep genscan.pep
    # Processing genscan.pep
    hgLoadBed sarHar1 genscanSubopt genscanSubopt.bed
    # Reading genscanSubopt.bed
    # Read 347922 elements of size 6 from genscanSubopt.bed
    featureBits sarHar1 genscan
    # 26054613 bases of 2931539702 (0.889%) in intersection

#########################################################################
# construct lift file Ensembl names to UCSC names (DONE - 2012-09-07 - Chin)
    cd /hive/data/genomes/sarHar1/jkStuff

cat ../chrom.sizes | while read L
do
ucName=`echo "${L}" | awk '{print $1}'`
ucSize=`echo "${L}" | awk '{print $2}'`
ensName=`echo $L | sed -e 's/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//;
s/^\([GA][LA][CZ0-9]*\)/\1.1/;' | awk '{print $1}'`
ensSize=`echo $L | sed -e 's/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//;
s/^\([GA][LA][CZ0-9]*\)/\1.1/;' | awk '{print $2}'`
echo -e "0\t$ensName\t$ensSize\t$ucName\t$ucSize"
done > ensToUcsc.lift

#########################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/sarHar1/bed/ucscToINSDC
    cd /hive/data/genomes/sarHar1/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab sarHar1 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords sarHar1 ucscToINSDC
    featureBits -countGaps sarHar1 ucscToINSDC
    #  3174693010 bases of 3174693010 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
