# for emacs: -*- mode: sh; -*-

# This file describes browser build for the dasNov3
# Dasypus novemcinctus - armadillo

#       DATE:   01-Dec-2011
#       ORGANISM:       Dasypus novemcinctus
#       TAXID:  9361
#       ASSEMBLY LONG NAME:     Dasnov3.0
#       ASSEMBLY SHORT NAME:    Dasnov3.0
#       ASSEMBLY SUBMITTER:     Baylor College of Medicine
#       ASSEMBLY TYPE:  Haploid
#       NUMBER OF ASSEMBLY-UNITS:       1
#       ASSEMBLY ACCESSION:     GCA_000208655.2

#       FTP-RELEASE DATE: 07-Jan-2012

#       rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Dasypus_novemcinctus/Dasnov3.0/ ./

#	Genome ID:
#       http://www.ncbi.nlm.nih.gov/genome/235
#       http://www.ncbi.nlm.nih.gov/bioproject/12594
#       http://www.ncbi.nlm.nih.gov/nuccore/AAGV00000000.3
#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAGV03
#       Genome Coverage : 6x

#	Taxonomy:
#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9361

#	Mitochondrial sequence:
#       http://www.ncbi.nlm.nih.gov/bioproject/11837 - chrMt NC_001821.1

#	Assembly ID: 326198
#       http://www.ncbi.nlm.nih.gov/genome/assembly/326198/


#############################################################################
# fetch sequence from genbank (DONE - 2012-04-11 - Hiram)
    # the download lives in the genbank/ directory of the dasNov3 build
    # hierarchy; later steps refer to it as ../genbank/ from dasNov3/ucsc
    mkdir -p /hive/data/genomes/dasNov3/genbank
    cd /hive/data/genomes/dasNov3/genbank

    # mirror the NCBI assembly directory into the current directory
    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Dasypus_novemcinctus/Dasnov3.0/ ./

    # measure sequence to be used here
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz

    #   3631505655 bases (331640683 N's 3299864972 real 3299864972 upper
    #   0 lower) in 46558 sequences in 1 files
    #   Total size: mean 77999.6 sd 423977.1
    #   min 200 (gi|367146187|gb|AAGV03313855.1|)
    #   max 14349515 (gi|371469823|gb|JH573670.1|) median 3708

#############################################################################
# fixup names for UCSC standards (DONE - 2012-04-11 - Hiram)
    # the renamed AGP and FASTA files are written into this ucsc/ directory
    mkdir /hive/data/genomes/dasNov3/ucsc
    cd /hive/data/genomes/dasNov3/ucsc


    ########################  Unplaced scaffolds
    # verify we don't have any .acc numbers different from .1
    # (the sed removes the JH/AAGV accession body from column 1 so that
    #  only the version suffix is left to be counted by uniq -c)
    zcat \
    ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	| cut -f1 | egrep "^JH|^AAGV" \
	| sed -e 's/^JH[0-9][0-9]*//; s/^AAGV[0-9][0-9]*//' | sort | uniq -c
    #    583384 .1

    # in that case, since they are all .1, we can remove them
    # this is like the unplaced.pl script in other assemblies except it
    #	does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

# unplaced.pl - convert the NCBI unplaced scaffold files to UCSC naming.
#
# Reads the gzipped AGP and FASTA files from ../genbank/ and writes
# unplaced.agp.gz and unplaced.fa.gz into the current directory.  Only the
# scaffold names are changed: the ".1" accession version suffix is removed
# and, in the FASTA headers, everything up to and including "gb|" and
# everything from the following "|" on is dropped.  No chrUn_ prefix is
# added.  Dies when a pipe can not be opened, or when zcat or gzip report
# a failure as the pipe is closed.

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# AGP: comment lines are copied unchanged; on all other lines the version
# suffix is removed from the object name in column 1 only, so component
# accessions in later columns keep their version.
open (my $agpIn, '-|', 'zcat', $agpFile) or die "can not read $agpFile: $!";
open (my $agpOut, '|-', 'gzip -c > unplaced.agp.gz')
    or die "can not write to unplaced.agp.gz: $!";
while (my $line = <$agpIn>) {
    $line =~ s/^([^\t]+)\.1\t/$1\t/ if ($line !~ m/^#/);
    # plain print: the data must never be interpreted as a printf format
    print $agpOut $line;
}
# errors from the child processes only become visible at close time
close ($agpIn) or die "zcat $agpFile failed, exit status $?";
close ($agpOut) or die "gzip to unplaced.agp.gz failed, exit status $?";

# FASTA: header lines such as >gi|367146187|gb|AAGV03313855.1| become
# >AAGV03313855, sequence lines are copied unchanged.
open (my $faIn, '-|', 'zcat', $fastaFile) or die "can not read $fastaFile: $!";
open (my $faOut, '|-', 'gzip -c > unplaced.fa.gz')
    or die "can not write to unplaced.fa.gz: $!";
while (my $line = <$faIn>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;    # also removes the leading '>'
        $line =~ s/\|.*//;
        $line =~ s/\.1$//;      # only a trailing version suffix
        print $faOut ">$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat $fastaFile failed, exit status $?";
close ($faOut) or die "gzip to unplaced.fa.gz failed, exit status $?";
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    #   real    16m41.592s
    # totals must be the same as the faSize of the original genbank file,
    # only the min/max sequence names are now the short UCSC names
    faSize unplaced.fa.gz
    #   3631505655 bases (331640683 N's 3299864972 real 3299864972 upper
    #   0 lower) in 46558 sequences in 1 files
    #   Total size: mean 77999.6 sd 423977.1 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708

    # make sure none of the names got to be over 31 characters long:
    # (output is a count of AGP lines for each name length, longest first)
    zcat unplaced.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head
    #     23501 12
    #   559883 8

#############################################################################
#  Initial database build (DONE - 2012-04-11 - Hiram)

    cd /hive/data/genomes/dasNov3
    # fastaFiles and agpFiles must name the renamed files that unplaced.pl
    # wrote into the ucsc/ directory; the genbank/ directory holds only the
    # original NCBI files with their gi|...|gb|... sequence names
    cat << '_EOF_' > dasNov3.config.ra
# Config parameters for makeGenomeDb.pl:
db dasNov3
clade mammal
genomeCladePriority 56
scientificName Dasypus novemcinctus
commonName Armadillo
assemblyDate Dec. 2011
assemblyLabel Baylor College of Medicine Dasnov3.0 (GCA_000208655.2)
assemblyShortLabel Armadillo
ncbiAssemblyName Dasnov3.0
ncbiAssemblyId 326198
orderKey 3459
mitoAcc NC_001821
fastaFiles /hive/data/genomes/dasNov3/ucsc/unplaced.fa.gz
agpFiles /hive/data/genomes/dasNov3/ucsc/unplaced.agp.gz
dbDbSpeciesDir armadillo
taxId 9361
'_EOF_'
    # << happy emacs

    # verify sequence and agp are OK
    # (first pass stops after the agp step, output is kept in agp.log)
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp dasNov3.config.ra > agp.log 2>&1
    #	real    4m40.661s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    # second pass picks up at the db step, output is kept in db.log
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev dasNov3.config.ra > db.log 2>&1
    #   real    30m16.480s

##########################################################################
# running repeat masker (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/repeatMasker
    cd /hive/data/genomes/dasNov3/bed/repeatMasker
    # started in the background, all output is collected in do.log
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=encodek dasNov3 > do.log 2>&1 &
    #   real    1214m38.968s

    # masking summary, to be looked at after the background job has finished
    # (46559 sequences is one more than the 46558 in unplaced.fa.gz;
    #  presumably the chrM added from mitoAcc - confirm)
    cat faSize.rmsk.txt
    #   3631522711 bases (331640683 N's 3299882028 real 1674428220 upper
    #   1625453808 lower) in 46559 sequences in 1 files
    #   Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    #   %44.76 masked total, %49.26 masked real

    # record the RepeatMasker and repeat library versions found in the log
    egrep -i "versi|relea" do.log
#    April 26 2011 (open-3-3-0) version of RepeatMasker
# CC   RELEASE 20110920; 
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps dasNov3 rmsk
    #   1635908112 bases of 3631522711 (45.047%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/simpleRepeat
    cd /hive/data/genomes/dasNov3/bed/simpleRepeat
    # started in the background, all output is collected in do.log
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
	dasNov3 > do.log 2>&1 &
    #   real    18m3.412s
    cat fb.simpleRepeat 
    #	45167829 bases of 3299882059 (1.369%) in intersection

    # add to rmsk after it is done:
    # (dasNov3.rmsk.2bit plus the trfMask.bed intervals is written to
    #  dasNov3.2bit)
    cd /hive/data/genomes/dasNov3
    twoBitMask dasNov3.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed dasNov3.2bit
    #	you can safely ignore the warning about fields >= 13

    # measure the combined masking, compare with faSize.rmsk.txt
    twoBitToFa dasNov3.2bit stdout | faSize stdin > faSize.dasNov3.2bit.txt
    cat faSize.dasNov3.2bit.txt
    #   3631522711 bases (331640683 N's 3299882028 real 1673564758 upper
    #   1626317270 lower) in 46559 sequences in 1 files
    #   Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    #   %44.78 masked total, %49.28 masked real

    # replace the existing /gbdb symlink so it points at the 2bit file
    # that was just written here
    rm /gbdb/dasNov3/dasNov3.2bit
    ln -s `pwd`/dasNov3.2bit /gbdb/dasNov3/dasNov3.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/gap
    cd /hive/data/genomes/dasNov3/bed/gap
    # findMotif is run here only for the "#GAP " lines found in its verbose
    # output, they are collected into allGaps.bed below
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../dasNov3.unmasked.2bit > findMotif.txt 2>&1
    #	real    1m10.266s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    # notGap.bed: everything that is not covered by the gap table
    time featureBits dasNov3 -not gap -bed=notGap.bed
    #   3299882059 bases of 3299882059 (100.000%) in intersection
    #   real    2m31.368s
    # new.gaps.bed: gaps found in the sequence that are not in the gap table
    time featureBits dasNov3 allGaps.bed notGap.bed -bed=new.gaps.bed
    #   31 bases of 3299882059 (0.000%) in intersection
    #   real    67m12.512s
    # 1 base gaps, one 2 base gap, not worth the bother

    # count the gap table rows by their bridge value
    hgsql -N -e "select bridge from gap;" dasNov3 | sort | uniq -c
    #   268413 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-04-10 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/windowMasker
    cd /hive/data/genomes/dasNov3/bed/windowMasker
    # started in the background, all output is collected in do.log
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev dasNov3 > do.log 2>&1 &
    #   real    280m54.607s

    # Masking statistics
    # first the plain window masker result, then the one with sdust added
    twoBitToFa dasNov3.wmsk.2bit stdout | faSize stdin
    #   3631522711 bases (331640683 N's 3299882028 real 1932049508 upper
    #   1367832520 lower) in 46559 sequences in 1 files
    #   Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    #   %37.67 masked total, %41.45 masked real

    twoBitToFa dasNov3.wmsk.sdust.2bit stdout | faSize stdin
    #   3631522711 bases (331640683 N's 3299882028 real 1910015235 upper
    #   1389866793 lower) in 46559 sequences in 1 files
    #   Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    #   %38.27 masked total, %42.12 masked real

    # first load of the track, this one still includes masking over gaps
    hgLoadBed dasNov3 windowmaskerSdust windowmasker.sdust.bed.gz
    #   Read 19264039 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps dasNov3 windowmaskerSdust
    #   1721507449 bases of 3631522711 (47.405%) in intersection

    #	eliminate the gaps from the masking
    featureBits dasNov3 -not gap -bed=notGap.bed
    #   3299882059 bases of 3299882059 (100.000%) in intersection

    # intersect the track with the non-gap regions to get the clean mask
    time nice -n +19 featureBits dasNov3 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #   1389866797 bases of 3299882059 (42.119%) in intersection
    #   real    36m17.629s

    #	reload track to get it clean
    hgLoadBed dasNov3 windowmaskerSdust cleanWMask.bed.gz
    #   Read 19267435 elements of size 4 from cleanWMask.bed.gz
    featureBits -countGaps dasNov3 windowmaskerSdust
    #   1389866797 bases of 3631522711 (38.272%) in intersection
    # apply the clean mask to the unmasked sequence and measure the result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../dasNov3.unmasked.2bit stdin \
	    -type=.bed dasNov3.cleanWMSdust.2bit
    twoBitToFa dasNov3.cleanWMSdust.2bit stdout | faSize stdin \
        > dasNov3.cleanWMSdust.faSize.txt
    cat dasNov3.cleanWMSdust.faSize.txt
    #   3631522711 bases (331640683 N's 3299882028 real 1910015235 upper
    #   1389866793 lower) in 46559 sequences in 1 files
    #   Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    #   %38.27 masked total, %42.12 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps dasNov3 rmsk windowmaskerSdust
    #   1019390239 bases of 3631522711 (28.071%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram)
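    # Not masking with WM here since RM masked well (%49.26 of the real
    #	bases).  The commands below are left commented out for reference:
    #	they would only be used when rmsk masks very little of the genome,
    #	to put the clean WM mask on the genome sequence instead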
#    cd /hive/data/genomes/dasNov3
#    twoBitMask -add bed/windowMasker/dasNov3.cleanWMSdust.2bit \
#	bed/simpleRepeat/trfMask.bed dasNov3.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
#    twoBitToFa dasNov3.2bit stdout | faSize stdin > faSize.dasNov3.txt
#    cat faSize.dasNov3.txt
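    #   (the faSize figures below are left over from another assembly's
    #   build notes and are not dasNov3 measurements: dasNov3 has
    #   3631522711 bases in 46559 sequences and no chrLG sequence names)
    #   927696114 bases (111611440 N's 816084674 real 607935500 upper
    #   208149174 lower) in 5678 sequences in 1 files
    #   Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1)
    #   max 51042256 (chrLG7) median 2009
    #   %22.44 masked total, %25.51 masked real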

    #	create symlink to gbdb
#    ssh hgwdev
#    rm /gbdb/dasNov3/dasNov3.2bit
#    ln -s `pwd`/dasNov3.2bit /gbdb/dasNov3/dasNov3.2bit

########################################################################
# cpgIslands - (DONE - 2012-04-24 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/cpgIslands
    cd /hive/data/genomes/dasNov3/bed/cpgIslands
    # run in the foreground, all output is collected in do.log
    time doCpgIslands.pl dasNov3 > do.log 2>&1
    #   real    98m44.307s

    cat fb.dasNov3.cpgIslandExt.txt
    #   59319215 bases of 3299882059 (1.798%) in intersection

#########################################################################
# genscan - (DONE - 2012-04-26 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/genscan
    cd /hive/data/genomes/dasNov3/bed/genscan
    # run in the foreground, all output is collected in do.log
    time doGenscan.pl dasNov3 > do.log 2>&1
    #   Elapsed time: 161m31s

    # coverage of the genscan and genscanSubopt results
    cat fb.dasNov3.genscan.txt
    #   71941727 bases of 3299882059 (2.180%) in intersection
    cat fb.dasNov3.genscanSubopt.txt
    #   85113391 bases of 3299882059 (2.579%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=1200, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 3299882028 / 2897316137 \) \* 1024
    #	( 3299882028 / 2897316137 ) * 1024 = 1166.279079

    # round up to 1200

    cd /hive/data/genomes/dasNov3
    # the two /dev/null arguments are in the query and output positions;
    # the ooc file named by -makeOoc is what this run writes
    time blat dasNov3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/dasNov3.11.ooc -repMatch=1200
    #   Wrote 34644 overused 11-mers to jkStuff/dasNov3.11.ooc
    #	real     2m17.323s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" dasNov3 | sort | uniq -c
    #   268413  yes
    # the commented out lift file commands and the chrX sample output below
    # are kept for reference only, they were not needed for this assembly
#    cd /hive/data/genomes/dasNov3/jkStuff
#    gapToLift dasNov3 dasNov3.nonBridged.lift -bedFile=dasNov3.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' dasNov3.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01


#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism             mrnaCnt estCnt  refSeqCnt
# Dasypus novemcinctus    23      0       0
    # to decide which "native" mrna or ests you want to specify in genbank.conf
    # (the native refseq and native est loads are set to "no" below, which
    #  is consistent with the zero estCnt and refSeqCnt shown above)

    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add dasNov3 just after ce2
# dasNov3 (armadillo)
dasNov3.serverGenome = /hive/data/genomes/dasNov3/dasNov3.2bit
dasNov3.clusterGenome = /hive/data/genomes/dasNov3/dasNov3.2bit
dasNov3.ooc = /hive/data/genomes/dasNov3/jkStuff/dasNov3.11.ooc
dasNov3.lift = no
dasNov3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
dasNov3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
dasNov3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
dasNov3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
dasNov3.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
dasNov3.refseq.mrna.native.load = no
dasNov3.refseq.mrna.xeno.load = yes
dasNov3.genbank.mrna.xeno.load = yes
dasNov3.genbank.est.native.load = no
dasNov3.downloadDir = dasNov3
dasNov3.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding dasNov3 armadillo" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S dasNov3           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial dasNov3 &
    #	var/build/logs/2012.05.03-16:02:26.dasNov3.initalign.log
    #   real    1567m32.727s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad dasNov3 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.08-14:17:49.dbload.log
    #   real    39m52.260s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    # NOTE(review): the 2012-02-09 date above is earlier than the 2012-04-11
    #	sequence fetch and the 2012-05-08 database load logged above;
    #	it is probably meant to be 2012-05-09 - confirm
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add dasNov3 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added dasNov3." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to OPN5 gene displays  (DONE - 2012-07-20 - Hiram)
    # the update is made in the dbDb table of the hgcentraltest database
    hgsql -e \
'update dbDb set defaultPos="JH564832:1503619-1555676" where name="dasNov3";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-20 - Hiram)
    mkdir /hive/data/genomes/dasNov3/pushQ
    cd /hive/data/genomes/dasNov3/pushQ
    # Mark says don't let the transMap track get there
    # (any output line mentioning transMap is filtered out by the grep -v,
    #  messages on stderr are kept in stderr.txt for checking)
    time makePushQSql.pl dasNov3 2> stderr.txt | grep -v transMap > dasNov3.sql

    # copy the sql to hgwbeta and load it into the qapushq database there
    scp -p dasNov3.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < dasNov3.sql

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/ucscToINSDC
    cd /hive/data/genomes/dasNov3/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    # (the argument given to translateNames.sh is the mitochondrial
    #  accession with its version, see mitoAcc in dasNov3.config.ra)
    ./translateNames.sh NC_001821.1
    ./verifyAll.sh
    ./join.sh

    # the sed changes the first 21 on each line of the table definition to
    # 12, which is the longest sequence name length measured for this
    # assembly; presumably this is the size of the name index - confirm
    sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
      | hgLoadSqlTab dasNov3 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords dasNov3 ucscToINSDC
    # the table should cover every base of the assembly, gaps included
    featureBits -countGaps dasNov3 ucscToINSDC
    # 3631522711 bases of 3631522711 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# CPG Islands Unmasked track (DONE - 2014-04-24 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/dasNov3/bed/cpgIslandsUnmasked
    # same procedure as the cpgIslandExt track, but run on the unmasked
    # 2bit sequence and loaded into the table cpgIslandExtUnmasked
    time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
     -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
        -workhorse=hgwdev \
          -maskedSeq=/hive/data/genomes/dasNov3/dasNov3.unmasked.2bit \
            dasNov3 > do.log 2>&1
    # real    24m14.038s

    cat fb.dasNov3.cpgIslandExtUnmasked.txt
    # 61507517 bases of 2192103426 (2.806%) in intersection
    # NOTE(review): the 2192103426 total above is not the 3299882059 non-gap
    #	bases reported by every other featureBits result for dasNov3 in
    #	this file; this line may be from another assembly - confirm

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
