# for emacs: -*- mode: sh; -*-

# Gorilla gorilla gorilla - Sanger release 2

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2009-10-14 - Hiram)
    mkdir /hive/data/genomes/gorGor2
    cd /hive/data/genomes/gorGor2
    mkdir sanger
    cd sanger
for F in README ggo_r4-split.fa.gz ggo_r4.agp.gz ggo_r4.fa.gz
do
    wget --timestamping "ftp://ftp.sanger.ac.uk/pub/rd/gorilla/release2/${F}"
done

    #	remove blank lines from the fasta file, and fixup MT name:
    mv ggo_r4-split.fa.gz ggo_r4-split_blankLines.fa.gz
    zcat ggo_r4-split_blankLines.fa.gz | grep -v "^$" | gzip > ucsc.split.fa.gz
    zcat ggo_r4.agp.gz | sed -e "s/^MT/chrM/" | gzip > ucsc.agp.gz

#########################################################################
#  running makeGenomeDb.pl (DONE - 2009-10-28 - Hiram)
    cat << '_EOF_' > gorGor2.config.ra
# Config parameters for makeGenomeDb.pl:
db gorGor2
clade mammal
genomeCladePriority 12
scientificName  Gorilla gorilla gorilla
commonName Gorilla
assemblyDate Aug. 2009
assemblyLabel Sanger Institute Aug 2009 (NCBI project 31265)
orderKey 26
# chrM sequence is included in the release and AGP file
mitoAcc none 
fastaFiles /hive/data/genomes/gorGor2/sanger/ucsc.split.fa.gz
agpFiles /hive/data/genomes/gorGor2/sanger/ucsc.agp.gz
# qualFiles none
dbDbSpeciesDir gorilla
taxId 9595
'_EOF_'
    # << happy emacs

    #	run stepwise to see where problems arise:
    makeGenomeDb.pl -stop=seq gorGor2.config.ra > seq.log 2>&1
    makeGenomeDb.pl -continue=agp -stop=agp gorGor2.config.ra > agp.log 2>&1
    makeGenomeDb.pl -continue=db -stop=db gorGor2.config.ra > db.log 2>&1
# failed due to chrom names going to 25 characters and the index doesn't work.
# this bug is fixed in later versions of makeGenomeDb.pl
    cd /hive/data/genomes/gorGor2/bed/chromInfo
    sed -e "s/16/21/" $HOME/kent/src/hg/lib/chromInfo.sql > chromInfo.sql
    hgLoadSqlTab gorGor2 chromInfo chromInfo.sql chromInfo.tab
    makeGenomeDb.pl -continue=db -stop=db gorGor2.config.ra > db.log 2>&1
    cd /hive/data/genomes/gorGor2
    hgGoldGapGl -noGl gorGor2 gorGor2.agp
    mkdir -p /gbdb/gorGor2/wib
    rm -f /gbdb/gorGor2/wib/gc5Base.wib
    ln -s /cluster/data/gorGor2/bed/gc5Base/gc5Base.wib /gbdb/gorGor2/wib
    hgLoadWiggle -pathPrefix=/gbdb/gorGor2/wib \
      gorGor2 gc5Base /cluster/data/gorGor2/bed/gc5Base/gc5Base.wig
    rm -f wiggle.tab
    makeGenomeDb.pl -continue=dbDb -stop=dbDb gorGor2.config.ra > dbDb.log 2>&1
    makeGenomeDb.pl -continue=trackDb -stop=trackDb gorGor2.config.ra \
	> trackDb.log 2>&1

##########################################################################
# running repeat masker (DONE - 2009-11-02,03 - Hiram)
    mkdir /hive/data/genomes/gorGor2/bed/repeatMasker
    cd /hive/data/genomes/gorGor2/bed/repeatMasker
    doRepeatMasker.pl -buildDir=`pwd` -noSplit -bigClusterHub=swarm \
	-workhorse=hgwdev gorGor2 > do.log 2>&1
    cat faSize.rmsk.txt
# 3035791622 bases (206104414 N's 2829687208 real 1467628472 upper
#	1362058736 lower) in 58049 sequences in 1 files
# %44.87 masked total, %48.13 masked real

##########################################################################
# running simple repeat (DONE - 2009-11-02-2010-02-20 - Hiram)
    mkdir /hive/data/genomes/gorGor2/bed/simpleRepeats
    cd /hive/data/genomes/gorGor2/bed/simpleRepeats
    time doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \
	-workhorse=hgwdev gorGor2 > do.log 2>&1 &
    #	there was one very odd bit of sequence in this business
    #	namely: chr4_48641019_10247101
    #	trf could not complete this one, even after days of trying
    #	broke it up into 130,000 sized chunks and even that took 56 minutes
    #	on swarm.  Putting that result together with all of the other
    #	bits in that bunch 037.lst, and continuing:
    time doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \
	-continue=filter -workhorse=hgwdev gorGor2 > filter.log 2>&1 &
#	real    1m8.940s
    cat fb.simpleRepeat 
#	199677936 bases of 2829709756 (7.056%) in intersection
    #	interesting high number there.  I wonder if it is from this odd
    #	sequence ?  Evidently not just this one, there appear to be others
    #	where regions of more than 100,000 were marked as repeats
    #	eg: chr20_26077633_1186791 marked 500,000 chunk as a repeat
    cd /hive/data/genomes/gorGor2
    twoBitMask gorGor2.rmsk.2bit \
	-add bed/simpleRepeats/trfMask.bed gorGor2.2bit
    #	you can safely ignore the warning about fields >= 13

    twoBitToFa gorGor2.2bit stdout | faSize stdin > faSize.gorGor2.2bit.txt
    cat faSize.gorGor2.2bit.txt
# 3035791622 bases (206104414 N's 2829687208 real 1466160275 upper
#	1363526933 lower) in 58049 sequences in 1 files
#	%44.92 masked total, %48.19 masked real

    rm /gbdb/gorGor2/gorGor2.2bit
    ln -s `pwd`/gorGor2.2bit /gbdb/gorGor2/gorGor2.2bit

########################################################################
# Marking *all* gaps - they are not all in the AGP file
#	(DONE - 2010-02-20 - Hiram)
    #	verify all gaps are defined in AGP
    mkdir /hive/data/genomes/gorGor2/bed/allGaps
    cd /hive/data/genomes/gorGor2/bed/allGaps
    time findMotif -strand=+ -motif="gattacagattaca" -verbose=4 \
	../../gorGor2.2bit > findMotif.txt 2>&1 &
    #	less than 1 minute
    grep "^#GAP" findMotif.txt | sed -e "s/#GAP //" \
	| sort -k1,1 -k2,2n > allGaps.bed
    awk '{print $3-$2}' allGaps.bed | ave stdin
    #	total 206104414.000000
    featureBits -countGaps gorGor2 gap
    #	206081866 bases of 3035791622 (6.788%) in intersection
    #	there appear to be more N's than defined in gaps, let's add them
    #	to the gap table

    time featureBits gorGor2 -not gap -bed=notGap.bed
    #	0m28.556s
    time featureBits gorGor2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	real    246m8.539s
    #	what is the last index in the existing gap table:
    hgsql -N -e "select ix from gap;" gorGor2 | sort -n | tail -1
    #	3998
    #	each scaffold has its parts renumbered from 1, thus, there
    #	are many more gaps than 3998.  All 3998 means is that is the
    #	highest number of gaps on chr4_48641019_10247101
    cat << '_EOF_' > mkGap.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" gorGor2 | sort -n | tail -1`;
chomp $ix;

open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
while (my $line = <FH>) {
    my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
    ++$ix;
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
        $chromEnd, $ix, $chromEnd-$chromStart;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.gap
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
	-noLoad gorGor2 otherGap other.gap
    #	Loaded 19438
    #	adding this many:
    wc -l bed.tab
    #	19438
    #	starting with this many
    hgsql -e "select count(*) from gap;" gorGor2
    #	3998
    hgsql gorGor2 -e 'load data local infile "bed.tab" into table gap;'
    #	original number of gaps:
    grep fragment ../../gorGor2.agp | wc -l
    #	407798
    # +  19438
    # = 427236
    #	result count:
    hgsql -e "select count(*) from gap;" gorGor2
    #	427236
    #	OK.

##########################################################################
# MAKE 11.OOC FILES FOR BLAT (DONE - 2010-02-20 - Hiram)
    # numerator is gorGor2 gapless bases "real" as reported by faSize 
    # denominator is hg17 gapless bases as reported by featureBits,
    # 1024 is threshold used for human -repMatch:
    calc \( 2829687208 / 2897310462 \) \* 1024
    #	( 2829687208 / 2897310462 ) * 1024 = 1000.099830
    # ==> use -repMatch=1000 according to size scaled down from 1024 for human.
    cd /hive/data/genomes/gorGor2
    blat gorGor2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/gorGor2.11.ooc -repMatch=1000
    #	Wrote 29650 overused 11-mers to jkStuff/gorGor2.11.ooc

    mkdir /hive/data/staging/data/gorGor2
    cp -p gorGor2.2bit chrom.sizes jkStuff/gorGor2.11.ooc \
	/hive/data/staging/data/gorGor2

    #	check if there are any non-bridged gaps to contend with:
    time gapToLift -bedFile=jkStuff/nonBridgedGaps.bed gorGor2 \
	jkStuff/gorGor2.nonBridged.lft
    #	real    0m28.560s
    wc -l chrom.sizes
    #	58049
    wc -l jkStuff/nonBridgedGaps.bed
    #	58049
    #	there are no non-bridged gaps, this bed file is the whole genome
    awk '{print $3}' jkStuff/nonBridgedGaps.bed | ave stdin
    #	total 3035791622.000000
    ave -col=2 chrom.sizes
    #	total 3035791622.000000
##########################################################################
# GENBANK AUTO UPDATE (DONE - 2009-09-25 - Hiram)
    #	add the following to genbank/etc/genbank.conf just before
    #	the chimp panTro1
# Gorilla
gorGor2.serverGenome = /hive/data/genomes/gorGor2/gorGor2.2bit
gorGor2.clusterGenome = /scratch/data/gorGor2/gorGor2.2bit
gorGor2.ooc = /hive/data/genomes/gorGor2/jkStuff/gorGor2.11.ooc
gorGor2.lift = no
gorGor2.perChromTables = no
gorGor2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
gorGor2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
gorGor2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
gorGor2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
gorGor2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
gorGor2.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
gorGor2.downloadDir = gorGor2
gorGor2.refseq.mrna.native.load  = yes
gorGor2.refseq.mrna.xeno.load = yes
gorGor2.refseq.mrna.xeno.loadDesc  = yes

    # Edit src/lib/gbGenome.c to add new species.  With this line:
    #	static char *gorGorNames[] = {"Gorilla gorilla", "Gorilla gorilla gorilla", "Gorilla gorilla uellensis", "Gorilla gorilla diehli", "Gorilla gorilla graueri", "Gorilla berengi", NULL};

    cvs ci -m "adding names for gorilla species" src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time nice -n +19 bin/gbAlignStep -initial gorGor2 &
    #	logFile: var/build/logs/2010.02.22-08:38:02.gorGor2.initalign.log
    #	real    421m51.236s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad gorGor2
    #	logFile: var/dbload/hgwdev/logs/2010.02.22-18:08:15.dbload.log
    #	real    15m40.078s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add gorGor2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added gorGor2 - Gorilla Gorilla" etc/align.dbs etc/hgwdev.dbs
    make etc-update
    #	done - 2010-02-22 - Hiram

############################################################################
#  BLATSERVERS ENTRY (DONE - 2009-08-06 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("gorGor2", "blat12", "17808", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("gorGor2", "blat12", "17809", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
# Set default position to location of RHO as found by blat from hg18 sequence
#		(DONE - 2010-02-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="chr3_129946504_1170306:812299-819972" where name="gorGor2";' \
        hgcentraltest
#########################################################################
