# for emacs: -*- mode: sh; -*-

# Caenorhabditis remanei
#	Washington University School of Medicine GSC and Sanger Institute
#

###########################################################################
## Download sequence (DONE - 2011-05-25 - Hiram)
    mkdir /hive/data/genomes/caeRem4
    cd /hive/data/genomes/caeRem4
    mkdir ws220
    cd ws220
    wget --no-parent --timestamping -m -nH --cut-dirs=5 \
	ftp://ftp.sanger.ac.uk/pub/wormbase/WS220/genomes/c_remanei/

    # no AGP, make one from the fasta file
    hgFakeAgp -minContigGap=1 sequences/dna/c_remanei.WS220.dna.fa.gz \
	caeRem4.ws220.fake.agp

    faSize sequences/dna/c_remanei.WS220.dna.fa.gz
# 145442736 bases (7036533 N's 138406203 real 0 upper 138406203 lower)
#	in 3670 sequences in 1 files
# %95.16 masked total, %100.00 masked real

###########################################################################
## Initial sequence (DONE - 2011-05-25 - Hiram)
    cd /hive/data/genomes/caeRem4
    cat << '_EOF_' > caeRem4.config.ra
# Config parameters for makeGenomeDb.pl:
db caeRem4
# clade worm
# genomeCladePriority 10
scientificName Caenorhabditis remanei
commonName C. remanei
assemblyDate Jul. 2007
assemblyLabel Washington University School of Medicine GSC C. remanei 15.0.1
assemblyShortLabel WS220
orderKey 877
mitoAcc none
fastaFiles /hive/data/genomes/caeRem4/ws220/sequences/dna/c_remanei.WS220.dna.fa.gz
agpFiles /hive/data/genomes/caeRem4/ws220/caeRem4.ws220.fake.agp
# qualFiles none
taxId 31234
'_EOF_'
    # << happy emacs

    mkdir jkStuff
    #	run just to AGP to make sure things are sane first
    time nice -n +19 makeGenomeDb.pl caeRem4.config.ra -stop=agp \
      > jkStuff/makeGenomeDb.agp.log 2>&1
    #	real    0m22.470s
    #	check that log to verify it has no errors
    #	now, continuing to make the Db and all
    time nice -n +19 makeGenomeDb.pl caeRem4.config.ra -continue=db \
      > jkStuff/makeGenomeDb.db.log 2>&1
    #	real    1m22.079s

    #	take the trackDb files produced by this step and check them into
    #	the source tree; fix up the description, gap and gold html page
    #	descriptions

    # the gold table rows were loaded with type "D"; they should have been
    # type "W", so correct them in place:

    hgsql -e 'update gold set type="W" where type="D";' caeRem4

###########################################################################
## RepeatMasker (DONE - 2011-05-25 - Hiram)
    mkdir /hive/data/genomes/caeRem4/bed/repeatMasker
    cd /hive/data/genomes/caeRem4/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=swarm \
	-buildDir=`pwd` caeRem4 > do.log 2>&1 &
    #	real    10m0.784s

    #	from the do.log:
# RepeatMasker version development-$Id: RepeatMasker,v
#	1.25 2010/09/08 21:32:26 angie Exp $
#	CC   RELEASE 20090604; 

    cat faSize.rmsk.txt
# 145442736 bases (7036533 N's 138406203 real 137059054 upper 1347149 lower)
#	in 3670 sequences in 1 files
# %0.93 masked total, %0.97 masked real

###########################################################################
## Simple Repeats (DONE - 2011-05-25 - Hiram)
    mkdir /cluster/data/caeRem4/bed/simpleRepeat
    cd /cluster/data/caeRem4/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -smallClusterHub=memk \
	-workhorse=hgwdev -buildDir=`pwd` caeRem4 > do.log 2>&1 &
    #	real    12m41.640s
    cat fb.simpleRepeat 
    #	5219501 bases of 138406203 (3.771%) in intersection

###########################################################################
## WindowMasker (DONE - 2011-05-25 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/caeRem4/bed/windowMasker
    cd /hive/data/genomes/caeRem4/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -verbose=2 -buildDir=`pwd` \
	-workhorse=hgwdev caeRem4 > do.log 2>&1 &
    #	real    5m22.429s
    twoBitToFa caeRem4.wmsk.sdust.2bit stdout | faSize stdin
# 145442736 bases (7036533 N's 138406203 real 96272057 upper 42134146 lower)
#	in 3670 sequences in 1 files
# %28.97 masked total, %30.44 masked real

    #	load this initial data to get ready to clean it
    cd /hive/data/genomes/caeRem4/bed/windowMasker
    hgLoadBed caeRem4 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 1074076 elements of size 3
    featureBits -countGaps caeRem4 windowmaskerSdust
    #	49170450 bases of 145442736 (33.807%) in intersection

    #	eliminate the gaps from the masking
    featureBits caeRem4 -not gap -bed=notGap.bed
    #	138406203 bases of 138406203 (100.000%) in intersection
    time nice -n +19 featureBits caeRem4 windowmaskerSdust notGap.bed \
	-bed=stdout | gzip -c > cleanWMask.bed.gz
    #	42134146 bases of 138406203 (30.442%) in intersection
    #	reload track to get it clean
    hgLoadBed caeRem4 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 1073077 elements of size 4
    featureBits -countGaps caeRem4 windowmaskerSdust
    #	(output was not recorded here; expected, given the cleanWMask
    #	 result above: 42134146 bases of 145442736 (28.970%) in intersection)

    #	mask the sequence with this clean mask
    zcat cleanWMask.bed.gz \
	| twoBitMask ../../caeRem4.unmasked.2bit stdin \
	    -type=.bed caeRem4.cleanWMSdust.2bit
    twoBitToFa caeRem4.cleanWMSdust.2bit stdout | faSize stdin \
        > caeRem4.cleanWMSdust.faSize.txt
    cat caeRem4.cleanWMSdust.faSize.txt
# 145442736 bases (7036533 N's 138406203 real 96272057 upper 42134146 lower)
#	in 3670 sequences in 1 files
# %28.97 masked total, %30.44 masked real

########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2011-05-25 - Hiram)
    cd /hive/data/genomes/caeRem4
    twoBitMask -add bed/windowMasker/caeRem4.cleanWMSdust.2bit \
	bed/simpleRepeat/trfMask.bed caeRem4.2bit
    #	safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa caeRem4.2bit stdout | faSize stdin > faSize.caeRem4.txt
    cat faSize.caeRem4.txt
# 145442736 bases (7036533 N's 138406203 real 96220935 upper 42185268 lower)
#	in 3670 sequences in 1 files
# %29.00 masked total, %30.48 masked real

    #	create symlink to gbdb
    ssh hgwdev
    rm /gbdb/caeRem4/caeRem4.2bit
    ln -s `pwd`/caeRem4.2bit /gbdb/caeRem4/caeRem4.2bit

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2011-05-25 - Hiram)
    # numerator is caeRem4 gapless bases "real" as reported by faSize 
    # denominator is hg19 gapless bases "real" as reported by faSize
    # 1024 is threshold used for human -repMatch:
    calc \( 138406203 / 2897310462 \) \* 1024
    #	( 138406203 / 2897310462 ) * 1024 = 48.917075

    # Round up to use -repMatch=100 since 50 would result in too many
    # overused 11-mers being written to the ooc file
    cd /hive/data/genomes/caeRem4
    blat caeRem4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/caeRem4.11.ooc -repMatch=100
    #	Wrote 10105 overused 11-mers to jkStuff/caeRem4.11.ooc
    # there are no non-bridged gaps here to make a lift file from
    # cd jkStuff
    # gapToLift -verbose=2 caeRem4 caeRem4.nonBridged.lift -bedFile=caeRem4.nonBridged.bed

    mkdir /hive/data/staging/data/caeRem4
    cp -p chrom.sizes caeRem4.2bit jkStuff/caeRem4.11.ooc \
	/hive/data/staging/data/caeRem4

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2011-05-26 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add caeRem4 just before caeRem3

# caeRem4 (C. remanei)
caeRem4.serverGenome = /hive/data/genomes/caeRem4/caeRem4.2bit
caeRem4.clusterGenome = /scratch/data/caeRem4/caeRem4.2bit
caeRem4.ooc = /scratch/data/caeRem4/caeRem4.11.ooc
caeRem4.lift = no
caeRem4.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
caeRem4.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
caeRem4.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
caeRem4.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
caeRem4.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
caeRem4.refseq.mrna.native.load = no
caeRem4.refseq.mrna.xeno.load  = yes
caeRem4.refseq.mrna.xeno.loadDesc = yes
caeRem4.genbank.mrna.xeno.load = yes
caeRem4.genbank.est.native.load = yes
caeRem4.genbank.est.native.loadDesc = no
caeRem4.downloadDir = caeRem4
caeRem4.perChromTables = no

    git commit -m "Added caeRem4 C. remanei WS220" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial caeRem4 &
    #	logFile:  var/build/logs/2011.05.26-09:19:08.caeRem4.initalign.log
    #	real	320m18.548s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeRem4
    #	logFile: var/dbload/hgwdev/logs/2011.05.26-14:41:33.dbload.log
    #	real    21m49.269s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add caeRem4 to:
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added caeRem4 C. remanei WS220" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# lastz swap ce10 to caeRem4 (DONE - 2011-06-07 - Hiram)
    #	original alignment on ce10
    cd /hive/data/genomes/ce10/bed/lastzCaeRem4.2011-06-07
    cat fb.ce10.chainCaeRem4Link.txt 
    #	41841289 bases of 100286070 (41.722%) in intersection

    mkdir /hive/data/genomes/caeRem4/bed/blastz.ce10.swap
    cd /hive/data/genomes/caeRem4/bed/blastz.ce10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce10/bed/lastzCaeRem4.2011-06-07/DEF \
	-syntenicNet -workhorse=hgwdev -bigClusterHub=swarm \
	-smallClusterHub=encodek -swap > swap.log 2>&1 &
    #	real    3m52.350s

    cat fb.caeRem4.chainCe10Link.txt
    #	46320954 bases of 138406203 (33.467%) in intersection

############################################################################
# Constructing Downloads (DONE - 2011-06-10 - Hiram)
    cd /hive/data/genomes/caeRem4
    time makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 caeRem4 \
	 > downloads.log 2>&1
    #	real    1m15.214s
    # fixup the README files constructed in goldenPath/*/README.txt

    # add window masker bed file:
    cp -p bed/windowMasker/cleanWMask.bed.gz \
	goldenPath/bigZips/chromWMSdust.bed.gz

############################################################################
