# for emacs: -*- mode: sh; -*-

# This file describes how we made the elephant browser database on
#  Broad Institute loxAfr3 (NCBI project 12569, AAGU03000000)

#	"$Id: loxAfr3.txt,v 1.5 2009/09/20 17:16:45 markd Exp $";

#	http://www.ncbi.nlm.nih.gov/bioproject/70973 - ?
#	http://www.ncbi.nlm.nih.gov/bioproject/12569
#	http://www.ncbi.nlm.nih.gov/genome/224
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAGU00

#############################################################################
# Download sequence (DONE - 2009-07-15 - Hiram)
    mkdir -p /hive/data/genomes/loxAfr3/broad
    /hive/data/genomes/loxAfr3/broad
    wget --timestamping \
"ftp://ftp.broadinstitute.org/pub/assemblies/mammals/elephant/loxAfr3/*"

    #	lift quality scores to scaffold coordinates
    qaToQac assembly.quals.gz assembly.quals.qac
    qacAgpLift assembly.agp assembly.quals.qac loxAfr3.quals.qac

#############################################################################
# Elephant loxAfr3 browser initialization (DONE - 2009-07-15 - Hiram)
    cd /hive/data/genomes/loxAfr3
    cat << '_EOF_' > loxAft3.config.ra
# Config parameters for makeGenomeDb.pl:
db loxAfr3
clade mammal
genomeCladePriority 35
scientificName Loxodonta africana
commonName Elephant
assemblyDate Jul. 2009
assemblyLabel Broad Institute loxAfr3 (NCBI project 12569, AAGU03000000)
orderKey 340
mitoAcc NC_000934
fastaFiles /hive/data/genomes/loxAfr3/broad/assembly.bases.gz
agpFiles /hive/data/genomes/loxAfr3/broad/assembly.agp
qualFiles /hive/data/genomes/loxAfr3/broad/loxAfr3.quals.qac
dbDbSpeciesDir elephant
taxId 9785
'_EOF_'
    # << happy emacs
    #	run stepwise to verify each step
    makeGenomeDb.pl -stop=seq loxAfr3.config.ra > seq.log 2>&1
    makeGenomeDb.pl -continue=agp -stop=agp loxAfr3.config.ra > agp.log 2>&1
    makeGenomeDb.pl -continue=db -stop=db loxAfr3.config.ra > db.log 2>&1
    makeGenomeDb.pl -continue=dbDb -stop=dbDb loxAfr3.config.ra > dbDb.log 2>&1
    makeGenomeDb.pl -continue=trackDb loxAfr3.config.ra > trackDb.log 2>&1


#############################################################################
# loxAfr3 repeatMasker (DONE - 2009-07-15 - Hiram)
    mkdir /hive/data/genomes/loxAfr3/bed/repeatMasker
    cd /hive/data/genomes/loxAfr3/bed/repeatMasker
    doRepeatMasker.pl -buildDir=`pwd` loxAfr3 > do.log 2>&1
    #	about 6 hours
    cat faSize.rmsk.txt
    #	3196760833 bases (78195493 N's 3118565340 real 1633809371 upper
    #	1484755969 lower) in 2353 sequences in 1 files

#############################################################################
# loxAfr3 simpleRepeat (DONE - 2009-07-15 - Hiram)
    mkdir /hive/data/genomes/loxAfr3/bed/simpleRepeat
    cd /hive/data/genomes/loxAfr3/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir=`pwd` loxAfr3 > do.log 2>&1
    #	real    23m42.536s
    cat fb.simpleRepeat
    #	27746420 bases of 3118565340 (0.890%) in intersection

    #	add to RM after done above:
    twoBitMask bed/repeatMasker/loxAfr3.clean.2bit \
	-add bed/simpleRepeat/trfMask.bed loxAfr3.2bit
    twoBitToFa loxAfr3.2bit stdout | faSize stdin > faSize.loxAfr3.2bit.txt

##########################################################################
## WINDOWMASKER (DONE- 2013-06-21 - Hiram)
    mkdir /hive/data/genomes/loxAfr3/bed/windowMasker
    cd /hive/data/genomes/loxAfr3/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
	-dbHost=hgwdev loxAfr3 > do.log 2>&1 &
    # real    260m2.629s

    # Masking statistics
    faSize.loxAfr3.cleanWMSdust.txt
# 3196760833 bases (78195493 N's 3118565340 real 1683497170
#    upper 1435068170 lower) in 2353 sequences in 1 files
# Total size: mean 1358589.4 sd 8404246.1 min 3026 (scaffold_2351)
#    max 129759341 (scaffold_0) median 10119
# %44.89 masked total, %46.02 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps loxAfr3 rmsk windowmaskerSdust \
       > fb.loxAfr3.rmsk.windowmaskerSdust.txt 2>&1
    #   1000702369 bases of 3196760833 (31.304%) in intersection

#############################################################################
# create ooc file and populate /scratch/data (DONE - 2009-07-16 - Hiram)
    #	repMatch = 1024 * sizeof(loxAfr3)/sizeof(hg19)
    #	1102 = 1024 * (3118565340/2897310462)
    time blat loxAfr3.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/loxAfr3.11.ooc \
	-repMatch=1100
    #	Wrote 41026 overused 11-mers to jkStuff/loxAfr3.11.ooc

    mkdir /hive/data/staging/data/loxAfr3
    cp -p loxAfr3.2bit /hive/data/staging/data/loxAfr3
    cp -p jkStuff/loxAfr3.11.ooc /hive/data/staging/data/loxAfr3
    cp -p chrom.sizes /hive/data/staging/data/loxAfr3

    #	request push to kluster nodes

##########################################################################
## GENBANK alignments (DONE - 2007-08-03 - Hiram)
    cd $HOME/kent/src/hg/makeDb/genbank/etc
    cvs up
    # edit genbank.conf and add the following entry just above loxAfr1:
# loxAfr3 (elephant)
loxAfr3.serverGenome = /hive/data/genomes/loxAfr3/loxAfr3.2bit
loxAfr3.clusterGenome = /scratch/data/loxAfr3/loxAfr3.2bit
loxAfr3.ooc = /scratch/data/loxAfr3/loxAfr3.11.ooc
loxAfr3.lift = no
loxAfr3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
loxAfr3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
loxAfr3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
loxAfr3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
loxAfr3.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
loxAfr3.refseq.mrna.native.load = yes
loxAfr3.refseq.mrna.xeno.load = yes
loxAfr3.genbank.mrna.xeno.load = yes
loxAfr3.genbank.est.native.load = no
loxAfr3.downloadDir = loxAfr3
loxAfr3.perChromTables = no

    # after commiting that edit, install thusly:
    cd $HOME/kent/src/hg/makeDb/genbank
    make etc-update

    ssh genbank
    screen	# use a screen to manage this long lived job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial loxAfr3 &
    ##	logFile: var/build/logs/2009.07.21-10:14:29.loxAfr3.initalign.log
    #	real    174m1.258s

    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad loxAfr3
    #	var/dbload/hgwdev/logs/2009.07.21-13:22:25.dbload.log
    #	real    14m0.774s

    featureBits loxAfr3 xenoMrna
    #	65367813 bases of 3118565340 (2.096%) in intersection
    featureBits loxAfr3 xenoRefGene
    #	50364043 bases of 3118565340 (1.615%) in intersection
    featureBits loxAfr3 all_mrna
    #	15550 bases of 3118565340 (0.000%) in intersection

    # enable daily alignment and update of hgwdev (DONE - 2009-07-21 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add mm9 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added loxAfr3 - Loxodonta africana" etc/align.dbs etc/hgwdev.dbs
    make etc-update

##########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-07-23 braney )
    # bash  if not using bash shell already

    cd /cluster/data/loxAfr3
    mkdir /cluster/data/loxAfr3/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  loxAfr3.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  loxAfr3.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 3934

    mkdir -p /cluster/data/loxAfr3/bed/tblastn.hg18KG
    cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 3934 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)

# 36727/(250000/3934) = 577.936072

   mkdir -p kgfa
   split -l 578 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   wc kg.lst
#  64  64 832 kg.lst

   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/loxAfr3/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    exit

    ssh swarm
    cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
    gensub2 query.lst kg.lst blastGsub blastSpec
    para create blastSpec
#    para try, check, push, check etc.

    para time
# Completed: 251776 of 251776 jobs
# CPU time in finished jobs:   18571030s  309517.16m  5158.62h  214.94d  0.589 y
# IO & Wait Time:               1488748s   24812.47m   413.54h   17.23d  0.047 y
# Average job time:                  80s       1.33m     0.02h    0.00d
# Longest finished job:             287s       4.78m     0.08h    0.00d
# Submission to last job:         21050s     350.83m     5.85h    0.24d

    ssh swarm
    cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 64 of 64 jobs
# CPU time in finished jobs:      29478s     491.30m     8.19h    0.34d  0.001 y
# IO & Wait Time:                 19822s     330.37m     5.51h    0.23d  0.001 y
# Average job time:                 770s      12.84m     0.21h    0.01d
# Longest finished job:            2868s      47.80m     0.80h    0.03d
# Submission to last job:          2882s      48.03m     0.80h    0.03d

    cd /cluster/data/loxAfr3/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 43305 failed: 0 errors: 0

    # load table
    ssh hgwdev
    cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
    hgLoadPsl loxAfr3 blastHg18KG.psl

    # check coverage
    featureBits loxAfr3 blastHg18KG
# 42480830 bases of 3118565340 (1.362%) in intersection

    featureBits loxAfr3 blastHg18KG xenoRefGene  -enrichment
# blastHg18KG 1.362%, xenoRefGene 1.615%, both 0.695%, cover 51.03%, enrich 31.60x
    rm -rf blastOut
#end tblastn

#########################################################################
# lastz swap from Mouse Mm9 (DONE - 2009-08-27 - Hiram)
    # original alignment
    cd /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
    cat fb.mm9.chainLoxAfr3Link.txt
    #	684326090 bases of 2620346127 (26.116%) in intersection

    mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
    cd /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12/DEF \
	-swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet > swap.log 2>&1 &
    #	real    123m9.342s
    cat fb.loxAfr3.chainMm9Link.txt
    #	673856452 bases of 3118565340 (21.608%) in intersection

#########################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

#########################################################################
# lastz swap from Opossum MonDom5 (DONE - 2011-11-10 - Chin)
    # original alignment
    cd /hive/data/genomes/monDom5/bed/lastzLoxAfr3.2011-11-10
    cat fb.monDom5.chainLoxAfr3Link.txt
    #   228949705 bases of 3501660299 (6.538%) in intersection

    mkdir /hive/data/genomes/loxAfr3/bed/blastz.monDom5.swap
    cd /hive/data/genomes/loxAfr3/bed/blastz.monDom5.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/monDom5/bed/lastzLoxAfr3.2011-11-10/DEF \
        -swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -syntenicNet > swap.log 2>&1 &
    #   real    123m9.342s

    cat fb.loxAfr3.chainMonDom5Link.txt
    #   230673812 bases of 3118565340 (7.397%) in intersection

    cd /hive/data/genomes/loxAfr3/bed
    ln -s blastz.monDom5.swap lastz.monDom5

############################################################################
# create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
    mkdir /hive/data/genomes/loxAfr3/bed/ucscToINSDC
    cd /hive/data/genomes/loxAfr3/bed/ucscToINSDC

    # copying these scripts from the previous load and improving them
    # with each instance
    ./translateNames.sh NC_001913.1
    # used the join.sh from danRer7, same difficulty here
    # join.sh much abused here, and result edited for scaffold_750
    ./join.sh
    # special verify here:
    ./verifyAll.sh

    sed -e "s/21/13/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | hgLoadSqlTab oryCun2 ucscToINSDC stdin ucscToINSDC.tab
    checkTableCoords oryCun2 ucscToINSDC
    featureBits -countGaps oryCun2 ucscToINSDC
    # 3196760833 bases of 3196760833 (100.000%) in intersection

    # verify the track link to INSDC functions

##############################################################################
# genscan - (DONE - 2013-11-12 - Hiram)
    mkdir /hive/data/genomes/loxAfr3/bed/genscan
    cd /hive/data/genomes/loxAfr3/bed/genscan
    time doGenscan.pl loxAfr3 > do.log 2>&1
    #  real    68m7.619s

    cat fb.loxAfr3.genscan.txt
    #   61173957 bases of 3118565340 (1.962%) in intersection

    cat fb.loxAfr3.genscanSubopt.txt
    #   72799698 bases of 3118565340 (2.334%) in intersection

#########################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################

##############################################################################
# GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/loxAfr3/bed/geneid
    cd /hive/data/genomes/loxAfr3/bed/geneid
    wget --timestamping \
http://genome.crg.es/genepredictions/L.africana/loxAfr3/geneid_v1.4/00README
    wget --timestamping \
http://genome.crg.es/genepredictions/L.africana/loxAfr3/geneid_v1.4/loxAfr3.geneid.gtf

    ldHgGene -gtf -genePredExt loxAfr3 geneid loxAfr3.geneid.gtf
    # Read 46786 transcripts in 318266 lines in 1 files
    #  46786 groups 2351 seqs 1 sources 3 feature types
    # 46786 gene predictions

    featureBits -enrichment loxAfr3 augustusGene:CDS geneid
# augustusGene:CDS 1.066%, geneid 1.385%, both 0.847%, cover 79.49%,
#     enrich 57.38x

##########################################################################
