# for emacs: -*- mode: sh; -*-

# Meloidogyne incognita
# Sanger and Genoscope
# http://www.nematode.net/Species.Summaries/Meloidogyne.incognita/index.php
#	Genbank: CABB00000000
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CABB01

##############################################################################
## Fetch sequence (DONE - 2010-09-20 - Hiram)

    mkdir -p /hive/data/genomes/melInc1/ws210
    cd /hive/data/genomes/melInc1/ws210

    wget --no-parent --timestamping -m -nH --cut-dirs=5 \
	ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/m_incognita/

    hgFakeAgp -minContigGap=1 \
	sequences/dna/b_malayi.WS210.dna.fa.gz ucsc.fake.melInc1.agp

##############################################################################
## Initial browser build (DONE - 2010-09-20 - Hiram)
    cd /hive/data/genomes/melInc1

    cat << '_EOF_' > melInc1.config.ra
# Config parameters for makeGenomeDb.pl:
db melInc1
clade worm
genomeCladePriority 84
scientificName Meloidogyne incognita
commonName M. incognita
assemblyDate Feb. 2008
assemblyLabel Sanger and Genoscope Genbank: CABB00000000
assemblyShortLabel M. incognita WS210
orderKey 897
mitoAcc none
fastaFiles /hive/data/genomes/melInc1/ws210/sequences/dna/m_incognita.WS210.dna.fa.gz
agpFiles /hive/data/genomes/melInc1/ws210/ucsc.fake.melInc1.agp
# qualFiles none
dbDbSpeciesDir worm
taxId 6306
'_EOF_'
    # << happy emacs

    makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-stop=agp melInc1.config.ra > agp.log 2>&1
    makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-continue=db melInc1.config.ra > db.log 2>&1

##############################################################################
# REPEATMASKER (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/melInc1/bed/repeatMasker
    cd /hive/data/genomes/melInc1/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \
	-buildDir=`pwd` melInc1 > do.log 2>&1 &
    #	real    53m51.544s

    #	from the do.log:
    #	RepeatMasker version development-$Id:
    #	RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp
    #	CC   RELEASE 20090604;                                            *

    cat faSize.rmsk.txt
    #	82095019 bases (0 N's 82095019 real 73879883 upper 8215136 lower)
    #	in 9538 sequences in 1 files
    #	%10.01 masked total, %10.01 masked real

#########################################################################
# SIMPLE REPEATS (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/melInc1/bed/simpleRepeat
    cd /hive/data/genomes/melInc1/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \
	-smallClusterHub=memk -buildDir=`pwd` melInc1 > do.log 2>&1 &
    #	real    14m29.878s

    cat fb.simpleRepeat 
    #	3947412 bases of 82095019 (4.808%) in intersection

#########################################################################
# run window masker (DONE - 2010-09-21 - Hiram)
    mkdir /hive/data/genomes/melInc1/bed/windowMasker
    cd /hive/data/genomes/melInc1/bed/windowMasker
    time doWindowMasker.pl -verbose=2 -workhorse=kolossus \
	-buildDir=`pwd` melInc1 > do.log 2>&1 &
    #	about 14 minutes

    twoBitToFa melInc1.wmsk.sdust.2bit stdout | faSize stdin
    #	82095019 bases (0 N's 82095019 real 44822089 upper 37272930 lower)
    #	in 9538 sequences in 1 files
    #	%45.40 masked total, %45.40 masked real

    hgLoadBed melInc1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 775123 elements of size 3
    featureBits melInc1 windowmaskerSdust
    #	37272930 bases of 82095019 (45.402%) in intersection

    #	this is the mask for the genome
    cd /hive/data/genomes/melInc1
    zcat bed/windowMasker/windowmasker.sdust.bed.gz \
	| twoBitMask melInc1.unmasked.2bit stdin \
	    -type=.bed melInc1.2bit
    twoBitToFa melInc1.2bit stdout | faSize stdin
    #	82095019 bases (0 N's 82095019 real 44822089 upper 37272930 lower)
    #	in 9538 sequences in 1 files
    #	%45.40 masked total, %45.40 masked real

    ln -s `pwd`/melInc1.2bit /gbdb/melInc1/melInc1.2bit

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-21 - Hiram)
    # numerator is melInc1 gapless bases "real" as reported by faSize 
    # denominator is hg19 gapless bases as reported by featureBits,
    #	featureBits hg19 gap
    # 1024 is threshold used for human -repMatch:
    calc \( 82095019 / 2897310462 \) \* 1024
    #	( 82095019 / 2897310462 ) * 1024 = 29.014944
    # 29 is way too small, use 100 to keep the number of overused
    #	11-mers to a smaller number

    cd /hive/data/genomes/melInc1
    blat melInc1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=melInc1.11.ooc -repMatch=100
    #	Wrote 8613 overused 11-mers to melInc1.11.ooc

    mkdir /hive/data/staging/data/melInc1
    cp -p melInc1.2bit chrom.sizes melInc1.11.ooc \
	/hive/data/staging/data/melInc1

#########################################################################
# LASTZ SWAP ce9 (DONE - 2010-09-23 - Hiram)
    # original alignment
    cd /hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22
    cat fb.ce9.chainMelInc1Link.txt 
    #	3310201 bases of 100286004 (3.301%) in intersection

    #	running swap
    mkdir /hive/data/genomes/melInc1/bed/blastz.ce9.swap
    cd /hive/data/genomes/melInc1/bed/blastz.ce9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22/DEF \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk \
	-workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 &
    #	real    0m36.095s

    cat fb.melInc1.chainCe9Link.txt
    #	4240790 bases of 82095019 (5.166%) in intersection

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add melInc1 just before caeJap2

# melInc1 (Meloidogyne incognita)
melInc1.serverGenome = /hive/data/genomes/melInc1/melInc1.2bit
melInc1.clusterGenome = /scratch/data/melInc1/melInc1.2bit
melInc1.ooc = /scratch/data/melInc1/melInc1.11.ooc
melInc1.lift = no
melInc1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
melInc1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
melInc1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
melInc1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
melInc1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
melInc1.refseq.mrna.native.load = yes
melInc1.refseq.mrna.xeno.load  = yes
melInc1.refseq.mrna.xeno.loadDesc = yes
melInc1.genbank.mrna.xeno.load = yes
melInc1.genbank.est.native.load = yes
melInc1.genbank.est.native.loadDesc = no
melInc1.downloadDir = melInc1
melInc1.perChromTables = no

    git commit -m "Added melInc1 M. incognita WS210" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial melInc1 &
    #	logFile: var/build/logs/2010.09.27-13:31:08.melInc1.initalign.log
    #	real    388m50.914s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad melInc1
    #	logFile: var/dbload/hgwdev/logs/2010.09.28-10:56:39.dbload.log
    #	real    28m22.985s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add melInc1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added melInc1" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("melInc1", "blat5", "17790", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("melInc1", "blat5", "17791", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# reset default position to ZC101.2e protein blat result
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="Minc_Contig26:7019-50820"
	where name="melInc1";' hgcentraltest

############################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-08 - Hiram)
    cd /hive/data/genomes/melInc1
    mkdir blastDb
    twoBitToFa melInc1.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	9538 pieces of 9538 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/melInc1/bed/tblastn.ce9SG
    cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG
    echo  /hive/data/genomes/melInc1/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 9538 query.lst

   # we want around 200000 jobs
   calc `wc /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)

    #	28103/(200000/9538) = 1340.232070

   mkdir sgfa
   split -l 1300 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
DB=melInc1
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
    para try, check, push, check etc.
# Completed: 209836 of 209836 jobs
# CPU time in finished jobs:    4293845s   71564.08m  1192.73h   49.70d  0.136 y
# IO & Wait Time:                968639s   16143.98m   269.07h   11.21d  0.031 y
# Average job time:                  25s       0.42m     0.01h    0.00d
# Longest finished job:              76s       1.27m     0.02h    0.00d
# Submission to last job:         66006s    1100.10m    18.34h    0.76d

    # do the cluster run for chaining
    ssh swarm
    mkdir /hive/data/genomes/melInc1/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/melInc1/bed/tblastn.ce9SG/blastOut/c.$b.psl
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /hive/data/genomes/melInc1/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG/chainRun
    para create jobList
    para -maxJob=30 push
    para try, check, push, check etc.
# Completed: 22 of 22 jobs
# CPU time in finished jobs:        139s       2.32m     0.04h    0.00d  0.000 y
# IO & Wait Time:                 11262s     187.70m     3.13h    0.13d  0.000 y
# Average job time:                 518s       8.64m     0.14h    0.01d
# Longest finished job:             537s       8.95m     0.15h    0.01d
# Submission to last job:           562s       9.37m     0.16h    0.01d

    cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> ../blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 15615 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/melInc1/bed/tblastn.ce9SG
    hgLoadPsl melInc1 blastCe9SG.psl

    # check coverage
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits cb3 blastCe9SG
    #	18490367 bases of 108433446 (17.052%) in intersection
    featureBits ce9 sangerGene
    #	28689552 bases of 100286004 (28.608%) in intersection

    rm -rf blastOut

#########################################################################
# verify all.joiner has everything (DONE - 2010-10-21 - Hiram)
    # edit all.joiner until all these commands are successful
    cd $HOME/kent/src/hg/makeDb/schema
    joinerCheck -tableCoverage -database=melInc1 ./all.joiner
    joinerCheck -keys -database=melInc1 ./all.joiner
    joinerCheck -times -database=melInc1 ./all.joiner
    joinerCheck -all -database=melInc1 ./all.joiner
    #	the -all command will complain about other databases on hgwdev
    #	that are not specified in all.joiner.  There are a lot of test
    #	databases on hgwdev

#########################################################################
# construct downloads files (DONE - 2010-10-21 - Hiram)
    cd /hive/data/genomes/melInc1
    makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 melInc1 \
	 > downloads.log 2>&1

#########################################################################
## Creating pushQ (DONE - 2010-10-21 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/melInc1/pushQ
    cd /hive/data/genomes/melInc1/pushQ
    makePushQSql.pl melInc1 > melInc1.sql 2> errorLog.out

    ## check the errorLog.out for anything that needs to be fixed
    ## copy melInc1.sql to hgwbeta:/tmp
    ## then on hgwbeta:
    hgsql qapushq < melInc1.sql

#######################################################################
# LIFTOVER TO melInc2 (DONE - 2015-07-07 - Hiram )
    mkdir /hive/data/genomes/melInc1/bed/blat.melInc2.2015-07-07
    cd /hive/data/genomes/melInc1/bed/blat.melInc2.2015-07-07
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl \
	-buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/melInc1/melInc1.11.ooc -debug melInc1 melInc2
    # Real run:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/melInc1/melInc1.11.ooc melInc1 melInc2) \
		> do.log 2>&1
    #	real    5m40.388s
    # verify it works on genome-test

#############################################################################
