# for emacs: -*- mode: sh; -*-

# Meloidogyne hapla
# The Center for the Biology of Nematode Parasitism, N.C. State
#	http://www.pngg.org/cbnp/index.php
# "Sequence and genetic map of Meloidogyne hapla:
#	A compact nematode genome for plant parasitism"
#	http://www.pnas.org/content/105/39/14802.full
#	PNAS  September 30, 2008   vol. 105  no. 39  14802-14807 
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABLG01

##############################################################################
## Fetch sequence (DONE - 2010-09-20 - Hiram)

    mkdir -p /hive/data/genomes/melHap1/ws210
    cd /hive/data/genomes/melHap1/ws210

    wget --no-parent --timestamping -m -nH --cut-dirs=5 \
	ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/m_hapla/

    hgFakeAgp -minContigGap=1 \
	sequences/dna/b_malayi.WS210.dna.fa.gz ucsc.fake.melHap1.agp

##############################################################################
## Initial browser build (DONE - 2010-09-20 - Hiram)
    cd /hive/data/genomes/melHap1

    cat << '_EOF_' > melHap1.config.ra
# Config parameters for makeGenomeDb.pl:
db melHap1
clade worm
genomeCladePriority 82
scientificName Meloidogyne hapla
commonName M. hapla
assemblyDate Sep. 2008
assemblyLabel The Center for the Biology of Nematode Parasitism, N.C. State
assemblyShortLabel M. hapla VW9 WS210
orderKey 895
mitoAcc none
fastaFiles /hive/data/genomes/melHap1/ws210/sequences/dna/m_hapla.WS210.dna.fa.gz
agpFiles /hive/data/genomes/melHap1/ws210/ucsc.fake.melHap1.agp
# qualFiles none
dbDbSpeciesDir worm
taxId 6305
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	melHap1.config.ra > makeGenomeDb.log 2>&1
    #	real    0m42.028s

##############################################################################
# REPEATMASKER (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/melHap1/bed/repeatMasker
    cd /hive/data/genomes/melHap1/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \
	-buildDir=`pwd` melHap1 > do.log 2>&1 &
    #	real    23m56.249s

    #	from the do.log:
    #	RepeatMasker version development-$Id:
    #	RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp
    #	CC   RELEASE 20090604;                                            *

    cat faSize.rmsk.txt
    #	53017507 bases (0 N's 53017507 real 43261440 upper 9756067 lower)
    #	in 3452 sequences in 1 files
    #	%18.40 masked total, %18.40 masked real

#########################################################################
# SIMPLE REPEATS (DONE - 2010-09-21 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/melHap1/bed/simpleRepeat
    cd /hive/data/genomes/melHap1/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \
	-smallClusterHub=memk -buildDir=`pwd` melHap1 > do.log 2>&1 &
    #	real    11m40.808s

    cat fb.simpleRepeat 
    #	2456208 bases of 53017507 (4.633%) in intersection

#########################################################################
# run window masker (DONE - 2010-09-21 - Hiram)
    mkdir /hive/data/genomes/melHap1/bed/windowMasker
    cd /hive/data/genomes/melHap1/bed/windowMasker
    time doWindowMasker.pl -verbose=2 -workhorse=kolossus \
	-buildDir=`pwd` melHap1 > do.log 2>&1 &
    #	real    1m59.029s

    twoBitToFa melHap1.wmsk.sdust.2bit stdout | faSize stdin
    #	53017507 bases (0 N's 53017507 real 24940979 upper 28076528 lower)
    #	in 3452 sequences in 1 files
    #	%52.96 masked total, %52.96 masked real

    hgLoadBed melHap1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 477268 elements of size 3
    featureBits melHap1 windowmaskerSdust
    #	28076528 bases of 53017507 (52.957%) in intersection

    #	this is the mask for the genome
    cd /hive/data/genomes/melHap1
    zcat bed/windowMasker/windowmasker.sdust.bed.gz \
	| twoBitMask melHap1.unmasked.2bit stdin \
	    -type=.bed melHap1.2bit
    twoBitToFa melHap1.2bit stdout | faSize stdin
    #	53017507 bases (0 N's 53017507 real 24940979 upper 28076528 lower)
    #	in 3452 sequences in 1 files
    #	%52.96 masked total, %52.96 masked real

    ln -s `pwd`/melHap1.2bit /gbdb/melHap1/melHap1.2bit

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-21 - Hiram)
    # numerator is melHap1 gapless bases "real" as reported by faSize 
    # denominator is hg19 gapless bases as reported by featureBits,
    #	featureBits hg19 gap
    # 1024 is threshold used for human -repMatch:
    calc \( 53017507 / 2897310462 \) \* 1024
    #	( 53017507 / 2897310462 ) * 1024 = 18.738043
    # 18 is way too small, use 100 to keep the number of overused
    #	11-mers to a smaller number

    cd /hive/data/genomes/melHap1
    blat melHap1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=melHap1.11.ooc -repMatch=50
    #	Wrote 6149 overused 11-mers to melHap1.11.ooc

    mkdir /hive/data/staging/data/melHap1
    cp -p melHap1.2bit chrom.sizes melHap1.11.ooc \
	/hive/data/staging/data/melHap1

#########################################################################
# SWAP LASTZ ce9 (DONE - 2010-09-23 - Hiram)
    # original alignment
    cd /hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22
    cat fb.ce9.chainMelHap1Link.txt 
    #	3933301 bases of 100286004 (3.922%) in intersection

    #	running swap
    mkdir /hive/data/genomes/melHap1/bed/blastz.ce9.swap
    cd /hive/data/genomes/melHap1/bed/blastz.ce9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22/DEF \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk \
	-workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 &
    #	real    0m23.154s

    cat fb.melHap1.chainCe9Link.txt
    #	3631046 bases of 53017507 (6.849%) in intersection

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add melHap1 just before caeJap2

# melHap1 (Meloidogyne hapla)
melHap1.serverGenome = /hive/data/genomes/melHap1/melHap1.2bit
melHap1.clusterGenome = /scratch/data/melHap1/melHap1.2bit
melHap1.ooc = /scratch/data/melHap1/melHap1.11.ooc
melHap1.lift = no
melHap1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
melHap1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
melHap1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
melHap1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
melHap1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter} 
melHap1.refseq.mrna.native.load = yes
melHap1.refseq.mrna.xeno.load  = yes
melHap1.refseq.mrna.xeno.loadDesc = yes
melHap1.genbank.mrna.xeno.load = yes
melHap1.genbank.est.native.load = yes
melHap1.genbank.est.native.loadDesc = no
melHap1.downloadDir = melHap1
melHap1.perChromTables = no

    git commit -m "Added melHap1 M. hapla WS210" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial melHap1 &
    #	logFile: var/build/logs/2010.09.27-13:30:40.melHap1.initalign.log
    #	real    119m17.816s
    # this had a failure during the fetch:
    # Exception: process signaled 9: "gbAlignGet
    # -workdir=work/initial.melHap1/align -orgCats=xeno,native -polyASizes
    # -fasize=4194304 -noMigrate genbank.179.0 full mrna melHap1"

    #	remove the work/initial.melHap1 directory and start over:
    time nice -n +19 bin/gbAlignStep -initial melHap1 &
    #	logFile: var/build/logs/2010.09.28-10:10:02.melHap1.initalign.log
    #	real    201m42.832s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad melHap1
    #	logFile: var/dbload/hgwdev/logs/2010.09.28-14:33:02.dbload.log
    #	real    20m26.836s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add melHap1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added melHap1" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("melHap1", "blat5", "17788", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("melHap1", "blat5", "17789", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# reset default position to ZC101.2e protein blat result
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="MhA1_Contig210:36435-59955"
	where name="melHap1";' hgcentraltest

############################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-08 - Hiram)
    cd /hive/data/genomes/melHap1
    mkdir blastDb
    twoBitToFa melHap1.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	3452 pieces of 3452 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/melHap1/bed/tblastn.ce9SG
    cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG
    echo  /hive/data/genomes/melHap1/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 3452 query.lst

   # we want around 50000 jobs
   calc `wc /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\)

    #	28103/(50000/3452) = 1940.231120

   mkdir -p sgfa
   split -l 1900 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir -p blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
DB=melHap1
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
    para try, check, push, check etc.
# Completed: 51780 of 51780 jobs
# CPU time in finished jobs:    1643951s   27399.18m   456.65h   19.03d  0.052 y
# IO & Wait Time:                283698s    4728.31m    78.81h    3.28d  0.009 y
# Average job time:                  37s       0.62m     0.01h    0.00d
# Longest finished job:             127s       2.12m     0.04h    0.00d
# Submission to last job:         65982s    1099.70m    18.33h    0.76d

    # do the cluster run for chaining
    ssh swarm
    mkdir /hive/data/genomes/melHap1/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastOut/c.$b.psl
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG/chainRun
    para create jobList
    para -maxJob=30 push
    para try, check, push, check etc.
# Completed: 15 of 15 jobs
# CPU time in finished jobs:         47s       0.79m     0.01h    0.00d  0.000 y
# IO & Wait Time:                   966s      16.09m     0.27h    0.01d  0.000 y
# Average job time:                  68s       1.13m     0.02h    0.00d
# Longest finished job:              90s       1.50m     0.03h    0.00d
# Submission to last job:           257s       4.28m     0.07h    0.00d

    cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> /hive/data/genomes/melHap1/bed/tblastn.ce9SG/blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 14315 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/melHap1/bed/tblastn.ce9SG
    hgLoadPsl melHap1 blastCe9SG.psl

    # check coverage
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits cb3 blastCe9SG
    #	18490367 bases of 108433446 (17.052%) in intersection
    featureBits ce9 sangerGene
    #	28689552 bases of 100286004 (28.608%) in intersection

    rm -rf blastOut

#########################################################################
# verify all.joiner has everything (DONE - 2010-10-25 - Hiram)
    # edit all.joiner until all these commands are successful
    cd $HOME/kent/src/hg/makeDb/schema
    joinerCheck -tableCoverage -database=melHap1 ./all.joiner
    joinerCheck -keys -database=melHap1 ./all.joiner
    joinerCheck -times -database=melHap1 ./all.joiner
    joinerCheck -all -database=melHap1 ./all.joiner
    #	the -all command will complain about other databases on hgwdev
    #	that are not specified in all.joiner.  There are a lot of test
    #	databases on hgwdev

#########################################################################
# construct downloads files (DONE - 2010-10-25 - Hiram)
    cd /hive/data/genomes/melHap1
    makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 melHap1 \
	 > downloads.log 2>&1

#########################################################################
## Creating pushQ (DONE - 2010-10-25 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/melHap1/pushQ
    cd /hive/data/genomes/melHap1/pushQ
    makePushQSql.pl melHap1 > melHap1.sql 2> errorLog.out

    ## check the errorLog.out for anything that needs to be fixed
    ## copy melHap1.sql to hgwbeta:/tmp
    ## then on hgwbeta:
    hgsql qapushq < melHap1.sql

#######################################################################
# improve common name (DONE - 2015-07-24 - Hiram)

    hgsql -e 'update dbDb set organism="Root knot" where name="melHap1";' hgcentraltest

#############################################################################
