# for emacs: -*- mode: sh; -*-

# Pristionchus pacificus
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABKE01

##############################################################################
## Fetch sequence (DONE - 2010-09-28 - Hiram)

    mkdir /cluster/data/priPac2
    cd /cluster/data/priPac2
    mkdir wgs
    cd wgs
    wget --timestamping -O wgs.ABKE.1.qscore.gz \
	ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABKE.1.qscore.gz

    wget --timestamping -O wgs.ABKE.1.fsa_nt.gz \
	ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABKE.1.fsa_nt.gz

    wget --timestamping -O wgs.ABKE.1.gbff.gz \
	ftp://ftp.ncbi.nlm.nih.gov/genbank/wgs/wgs.ABKE.1.gbff.gz

    zcat wgs.ABKE.1.qscore.gz | sed -e 's#^>gb|#>#; s#.1|.*##' \
    | gzip -c > ABKE010.ucsc.qa.gz

    zcat wgs.ABKE.1.fsa_nt.gz | sed -e 's#^>gi.*ABKE#>ABKE#; s#.1|.*##' \
    | gzip -c > ABKE010.ucsc.fa.gz

    zcat wgs.ABKE.1.fsa_nt.gz | grep "^>" | sed -e 's#^>gi.*ABKE#ABKE#; s#| Pris.*PS312##; s#, whole.*##' > accToContig.list

    hgFakeAgp -minContigGap=1 ABKE010.ucsc.fa.gz ABKE010.ucsc.agp

    cat << '_EOF_' > replaceNames.pl
#!/bin/env perl

use strict;
use warnings;

my %ableToContig;
open (FH, "<accToContig.list") or die "can not read accToContig.list";
while (my $line = <FH>) {
    chomp $line;
    my ($ableName, $contigName) = split('\s+', $line);
    $ableName =~ s/.1$//;
    $ableToContig{$ableName} = $contigName;
}
close (FH);

open (FH, "<ABKE010.ucsc.agp") or die "can not read ABKE010.ucsc.agp";
while (my $line = <FH>) {
    chomp $line;
    my ($acc, $start, $end, $id, $type, $ctg, $ctgStart, $ctgEnd, $strand) =
        split('\s+', $line);
    if ($type eq "N") {
	printf "%s\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
	    $acc, $start, $end, $id, $type, $ctg, $ctgStart, $ctgEnd;
    } else {
	$acc =~ s/.1$//;
	printf "%s\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
	    $acc, $start, $end, $id, $type, $ableToContig{$acc}, $ctgStart, $ctgEnd, $strand;
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x replaceNames.pl

    ./replaceNames.pl > contig.ucsc.agp

    qaToQac ABKE010.ucsc.qa.gz ABKE010.ucsc.qac

##############################################################################
## Initial browser build (DONE - 2010-09-29 - Hiram)
    cd /hive/data/genomes/priPac2

    cat << '_EOF_' > priPac2.config.ra
# Config parameters for makeGenomeDb.pl:
db priPac2
clade worm
scientificName Pristionchus pacificus
commonName P. pacificus
assemblyDate Dec. 2008
assemblyLabel Washington University School of Medicine GSC P. pacificus 5.0.1
assemblyShortLabel WUGSC 5.0.1
orderKey 884
mitoAcc none
fastaFiles /hive/data/genomes/priPac2/wgs/ABKE010.ucsc.fa.gz
agpFiles /hive/data/genomes/priPac2/wgs/contig.ucsc.agp
qualFiles /hive/data/genomes/priPac2/wgs/ABKE010.ucsc.qac
dbDbSpeciesDir worm
taxId 54126
'_EOF_'
    # << happy emacs

    time makeGenomeDb.pl -stop=agp -workhorse=hgwdev -verbose=2 \
	priPac2.config.ra > agp.log 2>&1
    #	real    0m6.409s
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -verbose=2 \
	priPac2.config.ra > makeGenomeDb.log 2>&1
    #	real    2m21.252s

##############################################################################
# REPEATMASKER (DONE - 2010-09-29 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/priPac2/bed/repeatMasker
    cd /hive/data/genomes/priPac2/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \
	-buildDir=`pwd` priPac2 > do.log 2>&1 &
    #	real    68m59.569s

    #	from the do.log:
    #	June 30 2010 (open-3-2-9) version of RepeatMasker
    #	CC   RELEASE 20090604;
    #	RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp $

    cat faSize.rmsk.txt
    #	133635077 bases (304 N's 133634773 real 131467395 upper
    #	2167378 lower) in 13962 sequences in 1 files
    #	%1.62 masked total, %1.62 masked real

#########################################################################
# SIMPLE REPEATS (DONE - 2010-09-29 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/priPac2/bed/simpleRepeat
    cd /hive/data/genomes/priPac2/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \
	-smallClusterHub=memk -buildDir=`pwd` priPac2 > do.log 2>&1 &
    #	real    11m23.587s

    cat fb.simpleRepeat 
    #	2706905 bases of 133634773 (2.026%) in intersection

#########################################################################
# run window masker (DONE - 2010-09-29 - Hiram)
    mkdir /hive/data/genomes/priPac2/bed/windowMasker
    cd /hive/data/genomes/priPac2/bed/windowMasker
    time doWindowMasker.pl -verbose=2 -workhorse=kolossus \
	-buildDir=`pwd` priPac2 > do.log 2>&1 &
    #	real    3m42.686s

    #	real    4m14.628s

    twoBitToFa priPac2.wmsk.sdust.2bit stdout | faSize stdin
    #	133635077 bases (304 N's 133634773 real 103391560 upper
    #	30243213 lower) in 13962 sequences in 1 files
    #	%22.63 masked total, %22.63 masked real

    #	load this initial data to get ready to clean it
    hgLoadBed priPac2 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 799331 elements of size 3
    featureBits priPac2 windowmaskerSdust
    #	30243416 bases of 133634773 (22.631%) in intersection

    #	eliminate the gaps from the masking
    time featureBits priPac2 -not gap -bed=notGap.bed
    #	133634773 bases of 133634773 (100.000%) in intersection
    #	real    0m1.895s
    time nice -n +19 featureBits priPac2 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	real    0m33.349s
    #	30243213 bases of 133634773 (22.631%) in intersection

    #	reload track to get it clean
    hgLoadBed priPac2 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 799488  elements of size 3
    time featureBits priPac2 windowmaskerSdust > fb.priPac2.cleanWMask.txt 2>&1
    #	real    0m9.075s
    cat fb.priPac2.cleanWMask.txt 
    #	30243213 bases of 133634773 (22.631%) in intersection

    cd /hive/data/genomes/priPac2
    #	mask the sequence with this clean mask
    zcat bed/windowMasker/cleanWMask.bed.gz \
	| twoBitMask priPac2.unmasked.2bit stdin \
	    -type=.bed priPac2.2bit
    twoBitToFa priPac2.2bit stdout | faSize stdin  > priPac2.faSize.txt
    cat priPac2.faSize.txt
    #	133635077 bases (304 N's 133634773 real 103391560 upper
    #	30243213 lower) in 13962 sequences in 1 files
    #	%22.63 masked total, %22.63 masked real

    ln -s `pwd`/priPac2.2bit /gbdb/priPac2/priPac2.2bit

#########################################################################
# Check for *all* gaps - if they are not all in the AGP file
#       (DONE - 2010-10-20 - Hiram)
    mkdir /hive/data/genomes/priPac2/bed/allGaps
    cd /hive/data/genomes/priPac2/bed/allGaps

    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
        -strand=+ ../../priPac2.2bit > findMotif.txt 2>&1
    #   real    0m2.078s

    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits priPac2 -not gap -bed=notGap.bed
    #  133634773 bases of 133634773 (100.000%) in intersection

    featureBits priPac2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 133634773 (0.000%) in intersection
    ### - there are no extra gaps
    #	(see felCat4.txt for remainder of procedure if there are gaps)

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-29 - Hiram)
    # numerator is priPac2 gapless bases "real" as reported by faSize 
    # denominator is hg19 gapless bases as reported by featureBits,
    #	featureBits hg19 gap
    # 1024 is threshold used for human -repMatch:
    calc \( 133634773 / 2897310462 \) \* 1024
    #	( 133634773 / 2897310462 ) * 1024 = 47.230702
    # 47 is way too small, use 100 to keep the number of overused
    #	11-mers to a smaller number

    cd /hive/data/genomes/priPac2
    blat priPac2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=priPac2.11.ooc -repMatch=100
    #	Wrote 3164 overused 11-mers to priPac2.11.ooc

    mkdir /hive/data/staging/data/priPac2
    cp -p priPac2.2bit chrom.sizes priPac2.11.ooc \
	/hive/data/staging/data/priPac2

##############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("priPac2", "blat5", "17784", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("priPac2", "blat5", "17785", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# reset default position to ZC101.2e protein blat result
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="ABKE01000932:1-29163"
	where name="priPac2";' hgcentraltest

############################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-07 - Hiram)
    cd /hive/data/genomes/priPac2
    mkdir blastDb
    twoBitToFa priPac2.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	13962 pieces of 13962 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/priPac2/bed/tblastn.ce9SG
    cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG
    echo  /hive/data/genomes/priPac2/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 13962 query.lst

   # we want around 400000 jobs
    calc `wc -l /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl  | cut -d" " -f1`/\(400000/`wc -l query.lst | cut -d" " -f1`\)
    #	28103/(400000/13962) = 980.935215

   mkdir sgfa
   split -l 900 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
DB=priPac2
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
    para try, check, push, check etc.
# Completed: 446784 of 446784 jobs
# CPU time in finished jobs:    6100754s  101679.24m  1694.65h   70.61d  0.193 y
# IO & Wait Time:               1723436s   28723.93m   478.73h   19.95d  0.055 y
# Average job time:                  18s       0.29m     0.00h    0.00d
# Longest finished job:              52s       0.87m     0.01h    0.00d
# Submission to last job:         18781s     313.02m     5.22h    0.22d

    # do the cluster run for chaining
    ssh swarm
    mkdir /hive/data/genomes/priPac2/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/priPac2/bed/tblastn.ce9SG/blastOut/c.$b.psl
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /hive/data/genomes/priPac2/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG/chainRun
    para create jobList
    para try, check, push, check etc.
# Completed: 32 of 32 jobs
# CPU time in finished jobs:        305s       5.09m     0.08h    0.00d  0.000 y
# IO & Wait Time:                 25350s     422.50m     7.04h    0.29d  0.001 y
# Average job time:                 802s      13.36m     0.22h    0.01d
# Longest finished job:             833s      13.88m     0.23h    0.01d
# Submission to last job:          1149s      19.15m     0.32h    0.01d

    cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> ../blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 20932 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/priPac2/bed/tblastn.ce9SG
    hgLoadPsl priPac2 blastCe9SG.psl

    # check coverage
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits cb3 blastCe9SG
    #	18490367 bases of 108433446 (17.052%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits ce9 sangerGene
    #	28689552 bases of 100286004 (28.608%) in intersection

    rm -rf blastOut

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2010-10-20 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add priPac2 just before priPac1

# priPac2 (P. pacificus)
priPac2.serverGenome = /hive/data/genomes/priPac2/priPac2.2bit
priPac2.clusterGenome = /scratch/data/priPac2/priPac2.2bit
priPac2.ooc = /scratch/data/priPac2/priPac2.11.ooc
priPac2.lift = no
priPac2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
priPac2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
priPac2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
priPac2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
priPac2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
priPac2.refseq.mrna.native.load = yes
priPac2.refseq.mrna.xeno.load  = yes
priPac2.refseq.mrna.xeno.loadDesc = yes
priPac2.genbank.mrna.xeno.load = yes
priPac2.genbank.est.native.load = yes
priPac2.genbank.est.native.loadDesc = no
priPac2.downloadDir = priPac2
priPac2.perChromTables = no

    git commit -m "Added priPac2." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial priPac2 &
    #	logFile: var/build/logs/2010.10.20-11:29:48.priPac2.initalign.log
    #	real    179m55.957s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad priPac2
    #	logFile:  var/dbload/hgwdev/logs/2010.10.20-15:03:44.dbload.log
    #	real    24m11.438s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add priPac2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added priPac2 - P. pacificus" etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################
# WS210 WormBase genes loading (DONE - 2010-10-14 - Hiram)
    mkdir /hive/data/genomes/priPac2/bed/ws210Gene
    cd /hive/data/genomes/priPac2/bed/ws210Gene
    ln -s  /hive/data/genomes/priPac1/bed/blat.priPac2.2010-10-14/priPac1ToPriPac2.over.chain.gz .
    ln -s /hive/data/genomes/priPac1/bed/ws210Gene/priPac1.ws210Gene.gp .
    export DB=priPac2
    liftOver -genePred priPac1.ws210Gene.gp priPac1ToPriPac2.over.chain.gz \
        ${DB}.ws210Gene.gp ws210gene.unmapped.gp

    genePredCheck -db=${DB} ${DB}.ws210Gene.gp
    #	checked: 24319 failed: 0

    hgLoadGenePred ${DB} ws210Gene ${DB}.ws210Gene.gp

#########################################################################
# set this as the defaultDb (DONE - 2010-10-20 - Hiram)
    hgsql -e 'update defaultDb set name="priPac2" where name="priPac1";' \
	hgcentraltest

#########################################################################
# verify all.joiner has everything (DONE - 2010-10-20 - Hiram)
    # edit all.joiner until all these commands are successful
    cd $HOME/kent/src/hg/makeDb/schema
    joinerCheck -tableCoverage -database=priPac2 ./all.joiner
    joinerCheck -keys -database=priPac2 ./all.joiner
    joinerCheck -times -database=priPac2 ./all.joiner
    joinerCheck -all -database=priPac2 ./all.joiner
    #	the -all command will complain about other databases on hgwdev
    #	that are not specified in all.joiner.  There are a lot of test
    #	databases on hgwdev

#########################################################################
# construct downloads files (DONE - 2010-10-20 - Hiram)
    cd /hive/data/genomes/priPac2
    makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 priPac2 \
	 > downloads.log 2>&1

#########################################################################
## Creating pushQ (DONE - 2010-10-20 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/priPac2/pushQ
    cd /hive/data/genomes/priPac2/pushQ
    makePushQSql.pl priPac2 > priPac2.sql 2> errorLog.out

    ## check the errorLog.out for anything that needs to be fixed
    ## copy priPac2.sql to hgwbeta:/tmp
    ## then on hgwbeta:
    hgsql qapushq < priPac2.sql

#######################################################################
# LIFTOVER TO priPac3 (DONE - 2015-07-07 - Hiram )
    mkdir /hive/data/genomes/priPac2/bed/blat.priPac3.2015-07-07
    cd /hive/data/genomes/priPac2/bed/blat.priPac3.2015-07-07
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl \
	-buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/priPac2/priPac2.11.ooc -debug priPac2 priPac3
    # Real run:
    time (doSameSpeciesLiftOver.pl -buildDir=`pwd` \
	-bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
	-ooc=/hive/data/genomes/priPac2/priPac2.11.ooc priPac2 priPac3) \
		> do.log 2>&1
    #	real    4m3.210s
    # verify it works on genome-test

#############################################################################
