# for emacs: -*- mode: sh; -*-

#       DATE:	23-Jan-2012
#       ORGANISM:	Heterocephalus glaber
#       TAXID:	10181
#       ASSEMBLY LONG NAME:	HetGla_female_1.0
#       ASSEMBLY SHORT NAME:	HetGla_female_1.0
#       ASSEMBLY SUBMITTER:	Broad Institute
#       ASSEMBLY TYPE:	Haploid
#       NUMBER OF ASSEMBLY-UNITS:	1
#       ASSEMBLY ACCESSION:	GCA_000247695.1

#       FTP-RELEASE DATE: 10-Mar-2012

#       http://www.ncbi.nlm.nih.gov/genome/6995
#       http://www.ncbi.nlm.nih.gov/genome/assembly/362148/
#       http://www.ncbi.nlm.nih.gov/bioproject/72441
#       http://www.ncbi.nlm.nih.gov/bioproject/62663 - chrMt - NC_015112.1

#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AHKG01
#       Genome Coverage : 90x

#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10181

# rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Heterocephalus_glaber/HetGla_female_1.0/

##########################################################################
# Download sequence (DONE - 2012-04-13 - Hiram)
    mkdir /hive/data/genomes/hetGla2
    cd /hive/data/genomes/hetGla2
    mkdir genbank
    cd genbank
    time rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Heterocephalus_glaber/HetGla_female_1.0/ ./
    #   real    15m42.132s

    # verify the size of the sequence here:
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
    #   2618188253 bases (303433536 N's 2314754717 real 2314754717 upper
    #   0 lower) in 4228 sequences in 1 files
    #   Total size: mean 619249.8 sd 3685910.4
    #   min 1000 (gi|375378318|gb|AHKG01114633.1|)
    #   max 78885850 (gi|376403570|gb|JH602043.1|) median 1736

##########################################################################
# fixup names for UCSC standards (DONE - 2012-04-13 - Hiram)
    mkdir /hive/data/genomes/hetGla2/ucsc
    cd /hive/data/genomes/hetGla2/ucsc

    ########################  Unplaced scaffolds
    # verify we don't have any .acc numbers different from .1
    zcat \
    ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| cut -f1 | egrep "^JH|^AHKG" \
	| sed -e 's/^JH[0-9][0-9]*//; s/^AHKG[0-9][0-9]*//' | sort | uniq -c
    #   225076 .1

    # this is like the unplaced.pl script in other assemblies except it
    #	does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;
        printf UC "%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\|.*//;
        $line =~ s/\.1//;
        printf UC ">$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #   real    11m32.064s

    # verify nothing lost from original:
    time faSize *.fa.gz
    #   2618188253 bases (303433536 N's 2314754717 real 2314754717 upper 0
    #   lower) in 4228 sequences in 1 files
    #   Total size: mean 619249.8 sd 3685910.4 min 1000 (AHKG01114633)
    #   max 78885850 (JH602043) median 1736
    #   real    0m45.073s

    # make sure none of the names got to be over 31 characers long:
    zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head
    #     3134 12
    #   221942 8

##########################################################################
# Initial makeGenomeDb.pl (DONE - 2012-04-13 - Hiram)
    cd /hive/data/genomes/hetGla2
    # mitoAcc - chrMt sequence is included in the download files
    cat << '_EOF_' > hetGla2.config.ra
# Config parameters for makeGenomeDb.pl:
db hetGla2
clade mammal
genomeCladePriority 35
scientificName Heterocephalus glaber
commonName Naked Mole Rat
assemblyDate Jan. 2012
assemblyLabel Broad Institute HetGla_female_1.0 (NCBI project 72441, GCA_000247695.1, WGS AHKG01)
assemblyShortLabel Broad HetGla_female_1.0
ncbiAssemblyName HetGla_female_1.0
ncbiAssemblyId 362148
orderKey 1649
mitoAcc NC_015112
fastaFiles /hive/data/genomes/hetGla2/ucsc/*.fa.gz
agpFiles /hive/data/genomes/hetGla2/ucsc/*.agp.gz
# qualFiles none
dbDbSpeciesDir moleRat
taxId 10181
'_EOF_'
    # << happy emacs

    # verify sequence and agp are OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp hetGla2.config.ra > agp.log 2>&1
    #   real    1m59.793s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    # finish it off
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev hetGla2.config.ra > db.log 2>&1
    #   real    17m39.702s
    #	add the trackDb entries to the source tree, and the 2bit link:
    ln -s `pwd`/hetGla2.unmasked.2bit /gbdb/hetGla2/hetGla2.2bit
    #	browser should function now

#########################################################################
# running repeat masker (DONE - 2012-04-13 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/repeatMasker
    cd /hive/data/genomes/hetGla2/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek hetGla2 > do.log 2>&1 &
    #   real    1042m39.868s

    cat faSize.rmsk.txt
    #   2618204639 bases (303433536 N's 2314771103 real 1567576531 upper
    #   747194572 lower) in 4229 sequences in 1 files
    #   Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633)
    #   max 78885850 (JH602043) median 1736
    #   %28.54 masked total, %32.28 masked real

    egrep -i "versi|relea" do.log
#    April 26 2011 (open-3-3-0) version of RepeatMasker
# CC   RELEASE 20110920;
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps hetGla2 rmsk
    #   748037030 bases of 2618204639 (28.571%) in intersection
    #   real    0m25.109s

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-04-13 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/simpleRepeat
    cd /hive/data/genomes/hetGla2/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	hetGla2 > do.log 2>&1 &
    #   real    14m26.766s
    cat fb.simpleRepeat
    #   23745957 bases of 2314771103 (1.026%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/hetGla2
    twoBitMask hetGla2.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed hetGla2.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa hetGla2.2bit stdout | faSize stdin > faSize.hetGla2.2bit.txt
    cat faSize.hetGla2.2bit.txt
    #   2618204639 bases (303433536 N's 2314771103 real 1566940532 upper
    #   747830571 lower) in 4229 sequences in 1 files
    #   Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633)
    #   max 78885850 (JH602043) median 1736
    #   %28.56 masked total, %32.31 masked real

    rm /gbdb/hetGla2/hetGla2.2bit
    ln -s `pwd`/hetGla2.2bit /gbdb/hetGla2/hetGla2.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-04-13 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/gap
    cd /hive/data/genomes/hetGla2/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../hetGla2.unmasked.2bit > findMotif.txt 2>&1
    #   real    0m20.133s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits hetGla2 -not gap -bed=notGap.bed
    #   2314771103 bases of 2314771103 (100.000%) in intersection
    #   real    0m16.020s

    time featureBits hetGla2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   0 bases of 2314771103 (0.000%) in intersection
    #   real    2m28.844s
    # no new gaps, nothing to do here

    # are there non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" hetGla2 | sort | uniq -c
    #  110424 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-04-13 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/windowMasker
    cd /hive/data/genomes/hetGla2/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev hetGla2 > do.log 2>&1 &
    #   real    198m5.178s

    # Masking statistics
    twoBitToFa hetGla2.wmsk.2bit stdout | faSize stdin
    #   2618204639 bases (303433536 N's 2314771103 real 1603612350 upper
    #   711158753 lower) in 4229 sequences in 1 files
    #   Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633)
    #   max 78885850 (JH602043) median 1736
    #   %27.16 masked total, %30.72 masked real

    twoBitToFa hetGla2.wmsk.sdust.2bit stdout | faSize stdin
    #   2618204639 bases (303433536 N's 2314771103 real 1591491612 upper
    #   723279491 lower) in 4229 sequences in 1 files
    #   Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633)
    #   max 78885850 (JH602043) median 1736
    #   %27.63 masked total, %31.25 masked real

    hgLoadBed hetGla2 windowmaskerSdust windowmasker.sdust.bed.gz
    #   Read 13599339 elements of size 3 from windowmasker.sdust.bed.gz

    time featureBits -countGaps hetGla2 windowmaskerSdust
    #   1185785055 bases of 2808525991 (42.221%) in intersection

    #	eliminate the gaps from the masking
    time featureBits hetGla2 -not gap -bed=notGap.bed
    #   2314771103 bases of 2314771103 (100.000%) in intersection
    #   real    0m15.302s

    #   2525294057 bases of 2525294057 (100.000%) in intersection
    time nice -n +19 featureBits hetGla2 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #   723279491 bases of 2314771103 (31.246%) in intersection
    #   real    3m10.554s

    #	reload track to get it clean
    hgLoadBed hetGla2 windowmaskerSdust cleanWMask.bed.gz
    #   Read 13672561 elements of size 4 from cleanWMask.bed.gz
    time featureBits -countGaps hetGla2 windowmaskerSdust
    #   723279491 bases of 2618204639 (27.625%) in intersection
    #   real    1m20.945s

    zcat cleanWMask.bed.gz \
	| twoBitMask ../../hetGla2.unmasked.2bit stdin \
	    -type=.bed hetGla2.cleanWMSdust.2bit
    twoBitToFa hetGla2.cleanWMSdust.2bit stdout | faSize stdin \
        > hetGla2.cleanWMSdust.faSize.txt
    cat hetGla2.cleanWMSdust.faSize.txt
    #   2618204639 bases (303433536 N's 2314771103 real 1591491612 upper
    #   723279491 lower) in 4229 sequences in 1 files
    #   Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633)
    #   max 78885850 (JH602043) median 1736
    #   %27.63 masked total, %31.25 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps hetGla2 rmsk windowmaskerSdust
    #   360810714 bases of 2618204639 (13.781%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-13 - Hiram)
    #	since rmsk masks so very little of the genome, use the clean WM mask
    #	on the genome sequence
#    cd /hive/data/genomes/hetGla2
#    twoBitMask -add bed/windowMasker/hetGla2.cleanWMSdust.2bit \
#	bed/simpleRepeat/trfMask.bed hetGla2.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
#    twoBitToFa hetGla2.2bit stdout | faSize stdin > faSize.hetGla2.txt
#    cat faSize.hetGla2.txt
    #   927696114 bases (111611440 N's 816084674 real 607935500 upper
    #   208149174 lower) in 5678 sequences in 1 files
    #   Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1)
    #   max 51042256 (chrLG7) median 2009
    #   %22.44 masked total, %25.51 masked real

    #	create symlink to gbdb
#    rm /gbdb/hetGla2/hetGla2.2bit
#    ln -s `pwd`/hetGla2.2bit /gbdb/hetGla2/hetGla2.2bit

########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/cpgIslands
    cd /hive/data/genomes/hetGla2/bed/cpgIslands
    time doCpgIslands.pl hetGla2 > do.log 2>&1
    #   real    7m57.656s

    cat fb.hetGla2.cpgIslandExt.txt
    #   21205888 bases of 2314771103 (0.916%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/genscan
    cd /hive/data/genomes/hetGla2/bed/genscan
    time doGenscan.pl hetGla2 > do.log 2>&1
    #   real    32m10.608s
# Completed: 4227 of 4229 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:      52691s     878.18m    14.64h    0.61d  0.002 y
# IO & Wait Time:                 15228s     253.80m     4.23h    0.18d  0.000 y
# Average job time:                  16s       0.27m     0.00h    0.00d
# Longest finished job:            1770s      29.50m     0.49h    0.02d
# Submission to last job:          1993s      33.22m     0.55h    0.02d

    # two broken jobs, reducing the window size to 2000000 seems to work:
./runGsBig.csh JH602052 000 gtf/000/JH602052.gtf pep/000/JH602052.pep subopt/000/JH602052.bed &
./runGsBig.csh JH602053 000 gtf/000/JH602053.gtf pep/000/JH602053.pep subopt/000/JH602053.bed
    #   real    real    12m3.901s

    # continuing:
    time doGenscan.pl -continue=makeBed hetGla2 > makeBed.log 2>&1
    #   real    9m49.111s

    cat fb.hetGla2.genscan.txt
    #   65077826 bases of 2314771103 (2.811%) in intersection
    cat fb.hetGla2.genscanSubopt.txt
    #   60777320 bases of 2314771103 (2.626%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-07 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2314771103 / 2897316137 \) \* 1024
    #	( 2314771103 / 2897316137 ) * 1024 = 818.110795

    # round up to 850 (hetGla1 was 850)

    cd /hive/data/genomes/hetGla2
    time blat hetGla2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/hetGla2.11.ooc -repMatch=850
    #   Wrote 26730 overused 11-mers to jkStuff/hetGla2.11.ooc
    #   real    0m55.389s
    #	hetGla1 was: Wrote 33104 overused 11-mers to jkStuff/hetGla1.11.ooc

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" hetGla2 | sort | uniq -c
    #   110424 yes
#    cd /hive/data/genomes/hetGla2/jkStuff
#    gapToLift hetGla2 hetGla2.nonBridged.lift -bedFile=hetGla2.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' hetGla2.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-07 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Heterocephalus glaber	45	0	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add hetGla2 just after ce2
# hetGla2 (naked mole rat)
hetGla2.serverGenome = /hive/data/genomes/hetGla2/hetGla2.2bit
hetGla2.clusterGenome = /hive/data/genomes/hetGla2/hetGla2.2bit
hetGla2.ooc = /hive/data/genomes/hetGla2/jkStuff/hetGla2.11.ooc
hetGla2.lift = no
hetGla2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
hetGla2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
hetGla2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
hetGla2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
hetGla2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
hetGla2.refseq.mrna.native.load = no
hetGla2.refseq.mrna.xeno.load = yes
hetGla2.genbank.mrna.native.load = no
hetGla2.genbank.mrna.xeno.load = yes
hetGla2.genbank.est.native.load = no
hetGla2.downloadDir = hetGla2
hetGla2.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding hetGla2 naked mole rat" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S hetGla2           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial hetGla2 &
    #	 var/build/logs/2012.05.07-10:10:42.hetGla2.initalign.log
    #   real    1716m59.594s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hetGla2 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.08-16:05:34.dbload.log
    #   real    64m32.488s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add hetGla2 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added hetGla2." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-23 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH602185:522994-528500" where name="hetGla2";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-23 - Hiram)
    mkdir /hive/data/genomes/hetGla2/pushQ
    cd /hive/data/genomes/hetGla2/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl hetGla2 2> stderr.txt | grep -v transMap > hetGla2.sql
    #   real    3m59.755s

    scp -p hetGla2.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < hetGla2.sql

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/hetGla2/bed/ucscToINSDC
    cd /hive/data/genomes/hetGla2/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | hgLoadSqlTab hetGla2 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords hetGla2 ucscToINSDC
    featureBits -countGaps hetGla2 ucscToINSDC
    # 2618204639 bases of 2618204639 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
