# for emacs: -*- mode: sh; -*-

# This file describes browser build for the saiBol1
#	Squirrel monkey - Saimiri boliviensis - Oct 2011

# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AGCE01
#	80X coverage WGS

#	http://www.ncbi.nlm.nih.gov/bioproject/67945
#       http://www.ncbi.nlm.nih.gov/genome/6907
#       http://www.ncbi.nlm.nih.gov/genome/assembly/420378

#############################################################################
# Fetch sequence from genbank (DONE - 2012-01-06 - Hiram)

    mkdir -p /hive/data/genomes/saiBol1/genbank
    cd /hive/data/genomes/saiBol1/genbank

    time wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
        --no-remove-listing -np \
"ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Saimiri_boliviensis/SaiBol1.0/*"
    #	Downloaded: 13 files, 1.0G in 13m 58s (1.23 MB/s)
    #	real    14m3.742s

    # measure sequence to be used here
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
    #	2608572064 bases (131440969 N's 2477131095 real 2477131095 upper 0
    #	lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6 min 1003
    #	(gi|356714203|gb|AGCE01151413.1|) max 72162052
    #	(gi|357434258|gb|JH378105.1|) median 1553


#############################################################################
# process into UCSC naming scheme (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/saiBol1/ucsc
    cd /hive/data/genomes/saiBol1/ucsc

    # verify we don't have any .acc numbers different from .1
    zcat ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
	 | cut -f1 | egrep "^JH|AGCE" \
	| sed -e 's/^JH[0-9][0-9]*//; s/^AGCE[0-9][0-9]*//' | sort | uniq -c
    #	300141 .1

    # this is like the unplaced.pl script in other assemblies except it
    #	does not add chrUn_ to the names since they are all just scaffolds

    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        $line =~ s/\.1//;    
        printf UC "%s", $line;
    }
}
close (FH);
close (UC);

open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        printf UC ">$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl

    time ./unplaced.pl
    # real    1m21.710s

    time gzip *.fa *.agp
    #	real    13m29.352s
# -rw-rw-r-- 1   15919402 Jan  6 15:44 unplaced.agp
# -rw-rw-r-- 1 2645873375 Jan  6 15:45 unplaced.fa

    # verify nothing lost in the translation, should be the same as above
    #	except for the name translations
    faSize *.fa.gz
# 2608572064 bases (131440969 N's 2477131095 real 2477131095 upper 0 lower) in
# 2685 sequences in 1 files
# Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553

#############################################################################
#   Initial browser build (DONE - 2012-01-06 - Hiram)
    cd /hive/data/genomes/saiBol1
    cat << '_EOF_' > saiBol1.config.ra
# Config parameters for makeGenomeDb.pl:
db saiBol1
clade mammal
genomeCladePriority 16
scientificName Saimiri boliviensis
commonName Squirrel monkey
assemblyDate Oct. 2011
assemblyLabel Broad SaiBol1.0 (GCA_000235385.1)
assemblyShortLabel SaiBol1.0
orderKey 40
mitoAcc none
fastaFiles /hive/data/genomes/saiBol1/ucsc/*.fa.gz
agpFiles /hive/data/genomes/saiBol1/ucsc/*.agp.gz
dbDbSpeciesDir saiBol
taxId   39432
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp saiBol1.config.ra > agp.log 2>&1 
    #	real    3m14.814s
    # check the end of agp.log to verify it is OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \
	-continue=db saiBol1.config.ra > db.log 2>&1 
    #	real    21m32.613s

#############################################################################
# running repeat masker (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/repeatMasker
    cd /hive/data/genomes/saiBol1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	-smallClusterHub=memk saiBol1 > do.log 2>&1 &
    #	real    751m55.197s
    cat faSize.rmsk.txt
    #	2608572064 bases (131440969 N's 2477131095 real 1321507940 upper
    #	1155623155 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413)
    #	max 72162052 (JH378105) median 1553
    #	%44.30 masked total, %46.65 masked real

    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    featureBits -countGaps saiBol1 rmsk
    #	1157555619 bases of 2608572064 (44.375%) in intersection
    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    #	separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/simpleRepeat
    cd /hive/data/genomes/saiBol1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \
	saiBol1 > do.log 2>&1 &
    #	real    20m15.294s

    cat fb.simpleRepeat 
    #	38276602 bases of 2477131095 (1.545%) in intersection

    # not going to add to rmsk here, using the window masker instead since
    #	it masks more sequence
    cd /hive/data/genomes/saiBol1
    twoBitMask saiBol1.rmsk.2bit \
	-add bed/simpleRepeat/trfMask.bed saiBol1.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa saiBol1.2bit stdout | faSize stdin > faSize.saiBol1.2bit.txt
    cat faSize.saiBol1.2bit.txt
    #	2608572064 bases (131440969 N's 2477131095 real 1320629270 upper
    #	1156501825 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413)
    #	max 72162052 (JH378105) median 1553

    rm /gbdb/saiBol1/saiBol1.2bit
    ln -s `pwd`/saiBol1.2bit /gbdb/saiBol1/saiBol1.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#	(DONE - 2012-01-04 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/gap
    cd /hive/data/genomes/saiBol1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../saiBol1.unmasked.2bit > findMotif.txt 2>&1
    #	real    0m46.931s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits -countGaps saiBol1 -not gap -bed=notGap.bed
    #	2969184009 bases of 3309577922 (89.715%) in intersection
    #	real    0m26.597s

    time featureBits -countGaps saiBol1 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 2608572064 (0.000%) in intersection
    #	real    8m30.686s

    # no unannotated gaps, no more to run here

##########################################################################
## WINDOWMASKER (DONE - 2012-01-06 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/windowMasker
    cd /hive/data/genomes/saiBol1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev saiBol1 > do.log 2>&1 &
    #	about 3 hours

    # Masking statistics
    twoBitToFa saiBol1.wmsk.2bit stdout | faSize stdin
    #	2608572064 bases (131440969 N's 2477131095 real 1630475943 upper
    #	846655152 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6
    #	min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553
    #	%32.46 masked total, %34.18 masked real

    twoBitToFa saiBol1.wmsk.sdust.2bit stdout | faSize stdin
    #	2608572064 bases (131440969 N's 2477131095 real 1617331205 upper
    #	859799890 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6
    #	min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553
    #	%32.96 masked total, %34.71 masked real

    hgLoadBed saiBol1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 14414165 elements of size 3

    featureBits -countGaps saiBol1 windowmaskerSdust
    #	991240859 bases of 2608572064 (37.999%) in intersection

    #	eliminate the gaps from the masking
    featureBits saiBol1 -not gap -bed=notGap.bed
    #	2477131095 bases of 2477131095 (100.000%) in intersection
    time nice -n +19 featureBits saiBol1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	859799890 bases of 2477131095 (34.710%) in intersection
    #	real    5m21.813s

    #	reload track to get it clean
    hgLoadBed saiBol1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 14489638 elements of size 4
    time featureBits -countGaps saiBol1 windowmaskerSdust
    #	859799890 bases of 2608572064 (32.961%) in intersection
    #	real    1m24.055s

    #	mask with this clean result
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../saiBol1.unmasked.2bit stdin \
	    -type=.bed saiBol1.cleanWMSdust.2bit
    twoBitToFa saiBol1.cleanWMSdust.2bit stdout | faSize stdin \
        > saiBol1.cleanWMSdust.faSize.txt
    cat saiBol1.cleanWMSdust.faSize.txt
    #	2608572064 bases (131440969 N's 2477131095 real 1617331205 upper
    #	859799890 lower) in 2685 sequences in 1 files
    #	Total size: mean 971535.2 sd 4827933.6
    #	min 1003 (AGCE01151413) max 72162052 (JH378105) median 1553
    #	%32.96 masked total, %34.71 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps saiBol1 rmsk windowmaskerSdust
    #	627944117 bases of 2608572064 (24.072%) in intersection

#########################################################################
# cpgIslands - (DONE - 2011-04-20 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/cpgIslands
    cd /hive/data/genomes/saiBol1/bed/cpgIslands
    time doCpgIslands.pl saiBol1 > do.log 2>&1
    #   Elapsed time: 11m10s

    cat fb.saiBol1.cpgIslandExt.txt 
    #   17439071 bases of 2477131095 (0.704%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/genscan
    cd /hive/data/genomes/saiBol1/bed/genscan
    # while running this procedure, constructed the script to do this
    #   automatically, the steps were run manually for this instance.
    doGenscan.pl hetGla2
    # one job failed during the genscan kluster run.
    # to recover, edit the runGsBig.csh script and lower the size of chunks
    # for gsBig, in this case from 2400000 to 2000000:

/cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/hive/data/staging/data/genscan/genscan -par=/hive/data/staging/data/genscan/HumanIso.smat -tmp=/tmp -window=2000000

    # then run the job on hgwdev that uses this script:
    ./runGsBig.csh JH378249 000 gtf/000/JH378249.gtf \
        pep/000/JH378249.pep subopt/000/JH378249.bed

    # may need to adjust the number even smaller, or in the worst case,
    #   actually partition the bit in question.

    # when all finished:
    cat fb.saiBol1.genscan.txt
    #   51449399 bases of 2477131095 (2.077%) in intersection
    cat fb.saiBol1.genscanSubopt.txt
    #   50823007 bases of 2477131095 (2.052%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=900, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 2477131095 / 2897316137 \) \* 1024
    #	( 2477131095 / 2897316137 ) * 1024 = 875.493775

    # round up to 900

    cd /hive/data/genomes/saiBol1
    time blat saiBol1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/saiBol1.11.ooc -repMatch=900
    #   Wrote 25485 overused 11-mers to jkStuff/saiBol1.11.ooc
    #   real    1m7.834s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" saiBol1 | sort | uniq -c
    #   148728 yes
#    cd /hive/data/genomes/saiBol1/jkStuff
#    gapToLift saiBol1 saiBol1.nonBridged.lift -bedFile=saiBol1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' saiBol1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Saimiri boliviensis	52	0	0
# Saimiri boliviensis boliviensis	8	0	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add saiBol1
# saiBol1 (squirrel monkey)
saiBol1.serverGenome = /hive/data/genomes/saiBol1/saiBol1.2bit
saiBol1.clusterGenome = /hive/data/genomes/saiBol1/saiBol1.2bit
saiBol1.ooc = /hive/data/genomes/saiBol1/jkStuff/saiBol1.11.ooc
saiBol1.lift = no
saiBol1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
saiBol1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
saiBol1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
saiBol1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
saiBol1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
saiBol1.refseq.mrna.native.load = no
saiBol1.refseq.mrna.xeno.load = yes
saiBol1.genbank.mrna.xeno.load = yes
saiBol1.genbank.est.native.load = no
saiBol1.downloadDir = saiBol1
saiBol1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding saiBol1 squirrel monkey" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for saiBolNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S saiBol1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial saiBol1 &
    #	var/build/logs/2012.05.08-14:42:12.saiBol1.initalign.log
    #   real    3156m37.952s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad saiBol1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.05.10-19:23:54.dbload.log
    #   real    6m6.352s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add saiBol1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added saiBol1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH378122:1872708-1880068" where name="saiBol1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/saiBol1/pushQ
    cd /hive/data/genomes/saiBol1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl saiBol1 2> stderr.txt | grep -v transMap > saiBol1.sql
    #   real    3m42.169s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/saiBol1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/saiBol1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/saiBol1/bbi/quality.bw
# WARNING: saiBol1 does not have seq
# WARNING: saiBol1 does not have extFile
# WARNING: saiBol1 does not have estOrientInfo

    scp -p saiBol1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/saiBol1.sql"

##############################################################################
# create ucscToRefSeq name mapping (DONE - 2017-12-14 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/ucscToRefSeq
    cd /hive/data/genomes/saiBol1/bed/ucscToRefSeq

    # run up idKeys for refseq release
    ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Saimiri_boliviensis/all_assembly_versions/GCF_000235385.1_SaiBol1.0/GCF_000235385.1_SaiBol1.0_genomic.fna.gz .
    faToTwoBit *.fna.gz refseq.saiBol1.2bit
    doIdKeys.pl -buildDir=`pwd` -twoBit=`pwd`/refseq.saiBol1.2bit refseqSaiBol1
    # real    2m49.837s

    join -t$'\t' ../idKeys/saiBol1.idKeys.txt refseqSaiBol1.idKeys.txt \
      | cut -f2- | sort > ucsc.refseq.tab

    join -t$'\t' <(sort ../../chrom.sizes) ucsc.refseq.tab \
        | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
          | sort -k1,1 -k2,2n > ucscToRefSeq.bed

    # maximum size of UCSC chrom name for SQL index
    export SZ=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1`
    # SZ: 12

    sed -e "s/21/$SZ/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab saiBol1 ucscToRefSeq stdin ucscToRefSeq.bed

    checkTableCoords saiBol1 ucscToRefSeq

    featureBits -countGaps saiBol1 ucscToRefSeq
    # 2608572064 bases of 2608572064 (100.000%) in intersection

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/saiBol1/bed/ucscToINSDC
    cd /hive/data/genomes/saiBol1/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh
    ./verifyAll.sh
    ./join.sh

    sed -e "s/21/12/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab saiBol1 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords saiBol1 ucscToINSDC
    featureBits -countGaps saiBol1 ucscToINSDC
    # 2608572064 bases of 2608572064 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# add chromAlias table (DONE - 2017-12-14 - Hiram)

    mkdir /hive/data/genomes/saiBol1/bed/chromAlias
    cd /hive/data/genomes/saiBol1/bed/chromAlias

    hgsql -N -e 'select chrom,name from ucscToRefSeq;' saiBol1 \
        | sort > ucsc.refseq.tab
    hgsql -N -e 'select chrom,name from ucscToINSDC;' saiBol1 \
        |sort > ucsc.genbank.tab

    join -t$'\t' ../idKeys/saiBol1.idKeys.txt \
	../../ensembl/ensemblSaiBol1.idKeys.txt \
	| cut -f2,3 | sort > ucsc.ensembl.tab

    ~/kent/src/hg/utils/automation/chromAlias.pl
    sort -o saiBol1.chromAlias.tab saiBol1.chromAlias.tab

for t in refseq genbank ensembl
do
  c0=`cat ucsc.$t.tab | wc -l`
  c1=`grep $t saiBol1.chromAlias.tab | wc -l`
  ok="OK"
  if [ "$c0" -ne "$c1" ]; then
     ok="ERROR"
  fi
  printf "# checking $t: $c0 =? $c1 $ok\n"
done
# checking refseq: 2685 =? 2685 OK
# checking genbank: 2685 =? 2685 OK
# checking ensembl: 2685 =? 2685 OK

    hgLoadSqlTab saiBol1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
        saiBol1.chromAlias.tab

##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
