#########################################################################
# Sorex araneus -- Common Shrew
# Broad sorAra1 2X release

#	http://www.ncbi.nlm.nih.gov/genome/91

#########################################################################
# Download sequence (2007-03-05 kate)

    # work on the storage server; the assembly lives on store12 and is
    # reached through the /cluster/data/sorAra1 symlink created here
    ssh kkstore05
    mkdir -p /cluster/store12/sorAra1
    ln -s /cluster/store12/sorAra1 /cluster/data/sorAra1 
    cd /cluster/data/sorAra1
    mkdir downloads
    cd downloads
    # recursive fetch of the Broad release; -nd drops the remote directory
    # hierarchy so every file lands directly in downloads/
    wget -r -nd ftp://ftp.broad.mit.edu/pub/assemblies/mammals/commonShrew/sorAra1

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh hgwdev
    cd /cluster/data/sorAra1
    cat << '_EOF_' >sorAra1.config.ra
# Config parameters for makeGenomeDb.pl:
db sorAra1
clade mammal
genomeCladePriority 36
scientificName Sorex araneus 
commonName Shrew
assemblyDate June 2006
assemblyLabel Broad Institute sorAra1 (Draft_v2)
orderKey 100
# search Entrez nucleotide for 'sorex araneus mitochondrion complete genome, use GI #'
# not found (only sorex unguiculatus)
mitoAcc none
fastaFiles /cluster/data/sorAra1/downloads/assembly.bases.gz
agpFiles /cluster/data/sorAra1/downloads/assembly.agp
qualFiles /cluster/data/sorAra1/downloads/scaffold.lifted.qac
dbDbSpeciesDir shrew
'_EOF_'

    makeGenomeDb.pl -verbose=2 -stop=seq sorAra1.config.ra >& makeGenomeDb.out &

    # Need dbDb entry for name lookup
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("sorAra1", "June 2006", "/gbdb/sorAra1", "Shrew", \
        "", 0, 100, "Shrew", \
        "Sorex araneus", "", 0, 0, \
        "Broad Institute sorAra (Draft_v2)")' -h localhost hgcentraltest

################################################
## WINDOWMASKER (2007-03-05 kate)
    ssh kkstore05
    cd /cluster/data/sorAra1/bed/
    # note: only runs on kolossus, due to network connection to NCBI
    # requested Andy hardcode this.
    nice /cluster/bin/scripts/doWindowMasker.pl sorAra1 \
        -workhorse=kolossus >& wmRun.log &

    ln -s WindowMasker.2007-03-05 WMRun
    mv wmRun.log WMRun
    cd WMRun

    # convert the lower-case n's left by WM to upper-case N
    # (request to Andy to fix this)
    twoBitToFa sorAra1.wmsk.sdust.2bit stdout | tr n N | \
            faToTwoBit stdin /cluster/data/sorAra1/sorAra1.2bit

    # stats on masking
    cd /cluster/data/sorAra1
    twoBitToFa sorAra1.2bit stdout | nice faSize stdin >& faSize.log &
# 2936119008 bases (1103254311 N's 1832864697 real 1116732269 upper 716132428 lower) in 262057 sequences in 1 files
# Total size: mean 11204.1 sd 25927.2 min 203 (scaffold_107701) max 791662 (scaffold_254556) median 4789

    # 39.1% masked
    # Mouse and rat are ~43%, from RM.  TRF gives them an extra 2%, skip here

    mkdir -p /san/sanvol1/scratch/sorAra1
    cp -p sorAra1.2bit chrom.sizes /san/sanvol1/scratch/sorAra1

    #	load this table after creating a db (DONE - 2008-10-21 - Hiram)
    cd /hive/data/genomes/sorAra1/bed/WindowMasker.2007-03-05
    time hgLoadBed sorAra1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 11038904 elements of size 3
    #	real    3m20.424s

################################################
# DOWNLOADS (2007-06-05 kate)

    ssh kkstore05
    cd /cluster/data/sorAra1
    mkdir bigZips
    cd bigZips
    # dump the masked 2bit back out as fasta and take a copy of the AGP
    # under the sorAra1.* names used for distribution
    nice twoBitToFa ../sorAra1.2bit sorAra1.fa
    cp ../downloads/assembly.agp sorAra1.agp
    nice gzip sorAra1.fa sorAra1.agp
    # checksums for the compressed files
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    # csh variables: d = goldenPath root under the apache htdocs tree,
    # bd = the sorAra1 build directory
    set d = /usr/local/apache/htdocs/goldenPath
    set bd = /cluster/data/sorAra1
    # start from the felCat3 README as a template
    cp $d/felCat3/bigZips/README.txt $bd/bigZips
    # EDIT the copied README.txt for this assembly
    mkdir -p $d/sorAra1/bigZips
    # publish by symlinking the build-directory files into the htdocs tree
    ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/sorAra1/bigZips
   
##############################################################################
# creating DB to make it easier to work with this (DONE - 2008-10-20 - Hiram)
    cd /hive/data/genomes/sorAra1/downloads
    qaToQac assembly.quals.gz stdout \
    | qacAgpLift assembly.agp stdin sorAra1.quals.qac
    cd /hive/data/genomes/sorAra1
    # edit sorAra1.config.ra to set:
    #	qualFiles /cluster/data/sorAra1/downloads/sorAra1.quals.qac
    makeGenomeDb.pl -workhorse=hgwdev -continue=agp -stop=agp \
	sorAra1.config.ra > makeAgp.log 2>&1
    time makeGenomeDb.pl -workhorse=hgwdev -continue=db -stop=db \
	sorAra1.config.ra > makeDb.log 2>&1
    #	real    27m48.147s
    time makeGenomeDb.pl -workhorse=hgwdev -continue=dbDb -stop=dbDb \
	sorAra1.config.ra > makeDbDb.log 2>&1
    #	real    0m1.294s

##############################################################################
## Repeat Masker (DONE - 2008-10-20 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/sorAra1/bed/repeatMasker
    cd /hive/data/genomes/sorAra1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` sorAra1 > do.log 2>&1 &
    #	real    258m6.902s

    twoBitToFa sorAra1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2936119008 bases (1103254311 N's 1832864697 real 1411267433 upper 421597264
# lower) in 262057 sequences in 1 files
# %14.36 masked total, %23.00 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-21 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/sorAra1/bed/simpleRepeat
    cd /hive/data/genomes/sorAra1/bed/simpleRepeat
    # 
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/sorAra1/bed/simpleRepeat sorAra1 > do.log 2>&1 &
    #	real    140m0.001s
    cat fb.simpleRepeat
    #	32481599 bases of 1832864697 (1.772%) in intersection

    #	after RM run is done, add the TRF mask on top of the RepeatMasker
    #	result.  The output goes to a separate file, sorAra1.rmTrf.2bit.
    #	Do NOT remove sorAra1.2bit here: it is the WindowMasker masked
    #	sequence that is being left in place (see note below), and the
    #	blat -makeOoc and staging steps read it.
    cd /hive/data/genomes/sorAra1
    twoBitMask sorAra1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed \
	sorAra1.rmTrf.2bit
    #	can safely ignore warning about >=13 fields in bed file

    #	measure how much of the RM+TRF masked sequence is lower case
    twoBitToFa sorAra1.rmTrf.2bit stdout \
	| faSize stdin > sorAra1.rmTrf.faSize.txt
# 2936119008 bases (1103254311 N's 1832864697 real 1410119486 upper 422745211
# lower) in 262057 sequences in 1 files
# %14.40 masked total, %23.06 masked real
    #	leaving the windowmasker masked sequence in place, do not
    #	overwrite with this one
    #	link to gbdb
    #	ln -s `pwd`/sorAra1.2bit /gbdb/sorAra1

#########################################################################
# prepare for kluster runs (DONE - 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	sorAra1: 1832864697
    # thus: 1024 * 1832864697/2881421696 = 651
    #	rounding up to 700 for a bit more conservative masking
    cd /hive/data/genomes/sorAra1
    time blat sorAra1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=sorAra1.11.ooc -repMatch=700
    #	Wrote 26119 overused 11-mers to sorAra1.11.ooc
    #	real    1m57.876s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/sorAra1
    cp -p sorAra1.2bit chrom.sizes sorAra1.11.ooc \
	/hive/data/staging/data/sorAra1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute sorAra1 (NCBI project 20325, ABRP01000000)" where name="sorAra1";' hgcentraltest

############################################################################
#  sorAra1 - Shrew - Ensembl Genes version 51  (DONE - 2008-12-02 - hiram)
    ssh kkr14u06
    cd /hive/data/genomes/sorAra1
    cat << '_EOF_' > sorAra1.ensGene.ra
# required db variable
db sorAra1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 sorAra1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/sorAra1/bed/ensGene.51
    featureBits sorAra1 ensGene
    # 19400431 bases of 1832864697 (1.058%) in intersection

    # *** All done!  (through the 'makeDoc' step)
    # *** Steps were performed in /hive/data/genomes/sorAra1/bed/ensGene.51

############################################################################
# cpgIslands - (DONE - 2011-04-23 - Hiram)
    mkdir /hive/data/genomes/sorAra1/bed/cpgIslands
    cd /hive/data/genomes/sorAra1/bed/cpgIslands
    time doCpgIslands.pl sorAra1 > do.log 2>&1
    #   real    823m38.651s
    # fixing broken command in script:
    time ./doLoadCpg.csh
    #   real    2m45.268s
    time doCpgIslands.pl -continue=cleanup sorAra1 > cleanup.log 2>&1
    #   real    117m41.101s

    cat fb.sorAra1.cpgIslandExt.txt
    #   19811608 bases of 1832864697 (1.081%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/sorAra1/bed/genscan
    cd /hive/data/genomes/sorAra1/bed/genscan
    time doGenscan.pl sorAra1 > do.log 2>&1
    # recovering from power failure, kluster run had just finished
    # and it had just started on makeBed:
    time ./doMakeBed.csh
    #   real    400m44.222s
    # continuing:
    time doGenscan.pl -continue=load sorAra1 > load.log 2>&1
    #   real    27m31.386s

    cat fb.sorAra1.genscan.txt
    #   27964827 bases of 1832864697 (1.526%) in intersection
    cat fb.sorAra1.genscanSubopt.txt
    #   28734625 bases of 1832864697 (1.568%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=700, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1832864697 / 2897316137 \) \* 1024
    #	( 1832864697 / 2897316137 ) * 1024 = 647.790355

    # round up to 700

    cd /hive/data/genomes/sorAra1
    time blat sorAra1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/sorAra1.11.ooc -repMatch=700
    #   Wrote 26119 overused 11-mers to jkStuff/sorAra1.11.ooc
    #   real    0m45.014s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" sorAra1 | sort | uniq -c
    #   403896 yes
#    cd /hive/data/genomes/sorAra1/jkStuff
#    gapToLift sorAra1 sorAra1.nonBridged.lift -bedFile=sorAra1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' sorAra1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
# Sorex araneus	2	0	0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add sorAra1
# sorAra1 (common shrew)
sorAra1.serverGenome = /hive/data/genomes/sorAra1/sorAra1.2bit
sorAra1.clusterGenome = /hive/data/genomes/sorAra1/sorAra1.2bit
sorAra1.ooc = /hive/data/genomes/sorAra1/jkStuff/sorAra1.11.ooc
sorAra1.lift = no
sorAra1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
sorAra1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
sorAra1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
sorAra1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
sorAra1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
sorAra1.refseq.mrna.native.load = no
sorAra1.refseq.mrna.xeno.load = yes
sorAra1.genbank.mrna.xeno.load = yes
sorAra1.genbank.est.native.load = no
sorAra1.downloadDir = sorAra1
sorAra1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding sorAra1 common shrew" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for sorAraNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S sorAra1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial sorAra1 &
    #   var/build/logs/2012.06.08-09:57:32.sorAra1.initalign.log
    #   real    2505m42.331s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sorAra1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.12-13:15:38.dbload.log
    #   real    19m9.715s

    # enable daily alignment and update of hgwdev (DONE - 2012-06-12 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add sorAra1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added sorAra1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-26 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_163319:8348-15207" where name="sorAra1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-26 - Hiram)
    mkdir /hive/data/genomes/sorAra1/pushQ
    cd /hive/data/genomes/sorAra1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl sorAra1 2> stderr.txt | grep -v transMap > sorAra1.sql
    #   real    3m42.169s
    # check the stderr.txt for bad stuff, these kinds of warnings are OK:
# WARNING: hgwdev does not have /gbdb/sorAra1/wib/gc5Base.wib
# WARNING: hgwdev does not have /gbdb/sorAra1/wib/quality.wib
# WARNING: hgwdev does not have /gbdb/sorAra1/bbi/quality.bw
# WARNING: sorAra1 does not have seq
# WARNING: sorAra1 does not have extFile
# WARNING: sorAra1 does not have estOrientInfo

    scp -p sorAra1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/sorAra1.sql"

############################################################################
# construct liftOver to sorAra2 (DONE - 2013-06-10 - Hiram)
    screen -S sorAra2	# manage this longish running job in a screen
    mkdir /hive/data/genomes/sorAra1/bed/blat.sorAra2.2013-06-10
    cd /hive/data/genomes/sorAra1/bed/blat.sorAra2.2013-06-10
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/sorAra1/jkStuff/sorAra1.11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev sorAra1 sorAra2
    #   real    0m1.838s
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/hive/data/genomes/sorAra1/jkStuff/sorAra1.11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev sorAra1 sorAra2 > do.log 2>&1
    #	real    1146m9.559s

    # verify this file exists:
    #	/gbdb/sorAra1/liftOver/sorAra1ToSorAra2.over.chain.gz
    # and try out the conversion on genome-test from sorAra1 to sorAra2

############################################################################
