# for emacs: -*- mode: sh; -*-

# This file describes browser build for the latCha1
#	Coelacanth - Latimeria chalumnae - Aug 2011

#       photoCreditURL http://en.wikipedia.org/wiki/File:Coelacanth.png
#       photoCreditName Robbie Cada
#       ncbiGenomeId 3262
#       ncbiAssemblyId 303548
#       ncbiAssemblyName Broad LatCha1
#       ncbiBioProject 56111
#       genBankAccessionID GCA_000225785.1

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AFYH00
#	77.5X coverage WGS

#	http://www.ncbi.nlm.nih.gov/bioproject/56111

#############################################################################
# Fetch sequence from genbank (DONE - 2012-01-06 - Hiram)

    mkdir -p /hive/data/genomes/latCha1/genbank
    cd /hive/data/genomes/latCha1/genbank

    time wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Latimeria_chalumnae/LatCha1/*"
    #	Downloaded: 13 files, 807M in 11m 36s (1.16 MB/s)
    #	real    11m41.968s

    # measure sequence to be used here
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
    #	2860575514 bases (676999153 N's 2183576361 real 2183576361 upper 0
    #	lower) in 22818 sequences in 1 files
    #	Total size: mean 125364.9 sd 395789.0
    #	min 1000 (gi|345908326|gb|AFYH01291818.1|)
    #	max 10736886 (gi|346342275|gb|JH126562.1|) median 5475


#############################################################################
# process into UCSC naming scheme (DONE - 2012-01-11 - Hiram)
    mkdir /hive/data/genomes/latCha1/ucsc
    cd /hive/data/genomes/latCha1/ucsc

    # verify we don't have any .acc numbers different from .1
    zcat ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	 | cut -f1 | egrep "^JH|AFYH" \
	| sed -e 's/^JH[0-9][0-9]*//; s/^AFYH[0-9][0-9]*//' | sort | uniq -c
    #	560838 .1

    # this is like the unplaced.pl script in other assemblies except it
    #	does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;    
        printf UC "%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    time ./unplaced.pl
    # real    1m4.707s

    time gzip *.fa *.agp
    #	real    11m4.518s
# -rw-rw-r-- 1   27747518 Jan 11 10:57 unplaced.agp
# -rw-rw-r-- 1 2901727727 Jan 11 10:58 unplaced.fa

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa.gz
# 2860575514 bases (676999153 N's 2183576361 real 2183576361 upper 0 lower) in
# 22818 sequences in 1 files
# Total size: mean 125364.9 sd 395789.0 min 1000 (AFYH01291818)
#	max 10736886 (JH126562) median 5475

#############################################################################
#   Initial browser build (DONE - 2012-01-11 - Hiram)
    cd /hive/data/genomes/latCha1
    cat << '_EOF_' > latCha1.config.ra
# Config parameters for makeGenomeDb.pl:
db latCha1
clade vertebrate
genomeCladePriority 82
scientificName Latimeria chalumnae
commonName Coelacanth
assemblyDate Aug. 2011
assemblyLabel Broad LatCha1 (GCA_000225785.1)
assemblyShortLabel LatCha1
orderKey 447
mitoAcc NC_001804
fastaFiles /hive/data/genomes/latCha1/ucsc/*.fa.gz
agpFiles /hive/data/genomes/latCha1/ucsc/*.agp.gz
dbDbSpeciesDir latCha
taxId   7897
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp latCha1.config.ra > agp.log 2>&1 
    #	real    5m52.756s
    # check the end of agp.log to verify it is OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \
	-continue=db latCha1.config.ra > db.log 2>&1 
    #	real    16m19.040s

#############################################################################
# running repeat masker (DONE - 2012-01-11 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/repeatMasker
    cd /hive/data/genomes/latCha1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk latCha1 > do.log 2>&1 &
    #	real    250m27.510s
    cat faSize.rmsk.txt
    #	2860591921 bases (676999153 N's 2183592768 real 1981827417 upper
    #	201765351 lower) in 22819 sequences in 1 files
    #	Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818)
    #	max 10736886 (JH126562) median 5475
    #	%7.05 masked total, %9.24 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps latCha1 rmsk
    #	201917583 bases of 2860591921 (7.059%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-01-11 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/simpleRepeat
    cd /hive/data/genomes/latCha1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	latCha1 > do.log 2>&1 &
    #	real    43m42.360s

    cat fb.simpleRepeat 
    #	33730886 bases of 2183592768 (1.545%) in intersection

    # not going to add to rmsk here, using the window masker instead since
    #	it masks more sequence
    cd /hive/data/genomes/latCha1
    twoBitMask latCha1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed latCha1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa latCha1.2bit stdout | faSize stdin > faSize.latCha1.2bit.txt
    cat faSize.latCha1.2bit.txt
    #	2608572064 bases (131440969 N's 2477131095 real 1320629270 upper
    #	1156501825 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413)
    #	max 72162052 (JH378105) median 1553

    rm /gbdb/latCha1/latCha1.2bit
    ln -s `pwd`/latCha1.2bit /gbdb/latCha1/latCha1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-01-11 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/gap
    cd /hive/data/genomes/latCha1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../latCha1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m27.822s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits -countGaps latCha1 -not gap -bed=notGap.bed
    #	2183592768 bases of 2860591921 (76.334%) in intersection
    #	real    0m22.615s

    time featureBits -countGaps latCha1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 2860591921 (0.000%) in intersection
    #	real    28m7.467s

    # no unannotated gaps, no more to run here
    # there are no non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" latCha1 | sort | uniq -c
    #	269010 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-01-11 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/windowMasker
    cd /hive/data/genomes/latCha1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev latCha1 > do.log 2>&1 &
    #	real    275m14.802s

    # Masking statistics
    twoBitToFa latCha1.wmsk.2bit stdout | faSize stdin
    #	2860591921 bases (676999153 N's 2183592768 real 1197030078 upper
    #	986562690 lower) in 22819 sequences in 1 files
    #	Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818)
    #	max 10736886 (JH126562) median 5475
    #	%34.49 masked total, %45.18 masked real

    twoBitToFa latCha1.wmsk.sdust.2bit stdout | faSize stdin
    #	2860591921 bases (676999153 N's 2183592768 real 1186196719 upper
    #	997396049 lower) in 22819 sequences in 1 files
    #	Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818)
    #	max 10736886 (JH126562) median 5475
    #	%34.87 masked total, %45.68 masked real

    hgLoadBed latCha1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 14478353 elements of size 3

    featureBits -countGaps latCha1 windowmaskerSdust
    #	1674395202 bases of 2860591921 (58.533%) in intersection

    #	eliminate the gaps from the masking
    featureBits latCha1 -not gap -bed=notGap.bed
    #	2183592768 bases of 2183592768 (100.000%) in intersection
    time nice -n +19 featureBits latCha1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	997396049 bases of 2183592768 (45.677%) in intersection
    #	real    16m48.600s

    #	reload track to get it clean
    hgLoadBed latCha1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 14630157 elements of size 4
    time featureBits -countGaps latCha1 windowmaskerSdust
    #	997396049 bases of 2860591921 (34.867%) in intersection
    #	real    2m47.260s

    #	mask with this clean result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../latCha1.unmasked.2bit stdin \
	    -type=.bed latCha1.cleanWMSdust.2bit
    twoBitToFa latCha1.cleanWMSdust.2bit stdout | faSize stdin \
        > latCha1.cleanWMSdust.faSize.txt
    cat latCha1.cleanWMSdust.faSize.txt
    #	2860591921 bases (676999153 N's 2183592768 real 1186196719 upper
    #	997396049 lower) in 22819 sequences in 1 files
    #	Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818)
    #	max 10736886 (JH126562) median 5475
    #	%34.87 masked total, %45.68 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps latCha1 rmsk windowmaskerSdust
    #	177085590 bases of 2860591921 (6.191%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-12 - Hiram)
    cd /hive/data/genomes/latCha1
    twoBitMask -add bed/windowMasker/latCha1.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed latCha1.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa latCha1.2bit stdout | faSize stdin > faSize.latCha1.txt
    cat faSize.latCha1.txt
    #	2860591921 bases (676999153 N's 2183592768 real 1186074582 upper
    #	997518186 lower) in 22819 sequences in 1 files
    #	Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818)
    #	max 10736886 (JH126562) median 5475
    #	%34.87 masked total, %45.68 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/latCha1/latCha1.2bit
    ln -s `pwd`/latCha1.2bit /gbdb/latCha1/latCha1.2bit

#########################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/cpgIslands
    cd /hive/data/genomes/latCha1/bed/cpgIslands
    time doCpgIslands.pl latCha1 > do.log 2>&1
    #   Elapsed time: 75m3s

    cat fb.latCha1.cpgIslandExt.txt
    #   3128859 bases of 2183592768 (0.143%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/genscan
    cd /hive/data/genomes/latCha1/bed/genscan
    time doGenscan.pl latCha1 > do.log 2>&1
    #   Elapsed time: 107m4s

    cat fb.latCha1.genscan.txt
    #   34776149 bases of 2183592768 (1.593%) in intersection
    cat fb.latCha1.genscanSubopt.txt
    #   39383851 bases of 2183592768 (1.804%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2183592768 / 2897316137 \) \* 1024
    #	( 2183592768 / 2897316137 ) * 1024 = 771.748366

    # round up to 800

    cd /hive/data/genomes/latCha1
    time blat latCha1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/latCha1.11.ooc -repMatch=800
    #   Wrote 28363 overused 11-mers to jkStuff/latCha1.11.ooc
    #   real    0m39.722s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" latCha1 | sort | uniq -c
    #   269010 yes
#    cd /hive/data/genomes/latCha1/jkStuff
#    gapToLift latCha1 latCha1.nonBridged.lift -bedFile=latCha1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' latCha1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism    mrnaCnt estCnt  refSeqCnt
# Latimeria chalumnae	47	0	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf
# latCha1 (Coelacanth)
latCha1.serverGenome = /hive/data/genomes/latCha1/latCha1.2bit
latCha1.clusterGenome = /hive/data/genomes/latCha1/latCha1.2bit
latCha1.ooc = /hive/data/genomes/latCha1/jkStuff/latCha1.11.ooc
latCha1.lift = no
latCha1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
latCha1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
latCha1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
latCha1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
latCha1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
latCha1.refseq.mrna.native.load = no
latCha1.refseq.mrna.xeno.load = yes
latCha1.genbank.mrna.xeno.load = yes
latCha1.genbank.est.native.load = no
latCha1.downloadDir = latCha1
latCha1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding latCha1 Coelacanth" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for latChaNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S latCha1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial latCha1 &
    #	var/build/logs/2012.05.08-11:55:55.latCha1.initalign.log
    #   real    2310m21.398s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad latCha1 &
    #	logFile: var/dbload/hgwdev/logs/2012.05.09-16:06:10.dbload.log
    #   real    85m52.881s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add latCha1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added latCha1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# photograph (DONE - 2012-06-2012 - Hiram)
    # note at http://en.wikipedia.org/wiki/File:Coelacanth.png
    #   This work has been released into the public domain by
    #   its author, Robbie Cada. This applies worldwide.
    mkdir /hive/data/genomes/latCha1/photograph
    wget --timestamping \
http://upload.wikimedia.org/wikipedia/commons/4/4b/Coelacanth.png
    convert -geometry "400x200" Coelacanth.png  "Latimeria_chalumnae.png"

    # add Latimeria_chalumnae.png to the source tree src/hg/htdocs/images/

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-23 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH126819:1630199-1640227" where name="latCha1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-23 - Hiram)
    mkdir /hive/data/genomes/latCha1/pushQ
    cd /hive/data/genomes/latCha1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl latCha1 2> stderr.txt | grep -v transMap > latCha1.sql
    #   real    4m0.069s

    scp -p latCha1.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < latCha1.sql

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/latCha1/bed/ucscToINSDC
    cd /hive/data/genomes/latCha1/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_001804.1
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab latCha1 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords latCha1 ucscToINSDC
    featureBits -countGaps latCha1 ucscToINSDC
    # 2860591921 bases of 2860591921 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
