# for emacs: -*- mode: sh; -*-

# Mouse lemur (Microcebus murinus)

#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABDC00
#	http://www.ncbi.nlm.nih.gov/bioproject/19967
#	http://www.ncbi.nlm.nih.gov/genome/777
#	http://www.ncbi.nlm.nih.gov/genome/assembly/200518/

#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-07-28 )
    ssh kkstore05
    mkdir /cluster/store12/micMur1
    ln -s /cluster/store12/micMur1 /cluster/data
    mkdir /cluster/data/micMur1/broad
    cd /cluster/data/micMur1/broad

    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/mouseLemur/MicMur1.0/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/mouseLemur/MicMur1.0/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/mouseLemur/MicMur1.0/assembly.quals.gz 
    md5sum ass* > assembly.md5sum

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin micMur1.qual.qac

   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 185042


#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh kkstore05
    cd /cluster/data/micMur1
cat << _EOF_ >micMur1.config.ra
# Config parameters for makeGenomeDb.pl:
db micMur1
clade mammal
genomeCladePriority 35
scientificName Microcebus murinus
commonName Mouse lemur
assemblyDate Jun. 2003
assemblyLabel Broad Institute micMur1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/micMur1/broad/assembly.bases.gz
agpFiles /cluster/data/micMur1/broad/assembly.agp
qualFiles /cluster/data/micMur1/broad/micMur1.qual.qac
dbDbSpeciesDir mouseLemur
_EOF_

# run this inside 'screen', and make sure you are on kkstore05
    makeGenomeDb.pl -verbose=2 micMur1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl-d' detaches from screen and returns to the previous shell
cut -f 2 chrom.sizes | ave stdin
# Q1 1105.000000
# median 2118.000000
# Q3 6087.000000
# average 15684.389144
# min 600.000000
# max 1949650.000000
# count 185042
# total 2902270736.000000
# standard deviation 61772.166578


#########################################################################
# REPEATMASKER (DONE braney 2008-08-03)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/micMur1/bed/repeatMasker
    cd /cluster/data/micMur1/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/micMur1/bed/repeatMasker \
        micMur1 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    ssh pk
    para time

# Completed: 6975 of 6975 jobs
# CPU time in finished jobs:   25455563s  424259.38m  7070.99h  294.62d  0.807 y
# IO & Wait Time:                255530s    4258.84m    70.98h    2.96d  0.008 y
# Average job time:                3686s      61.44m     1.02h    0.04d
# Longest finished job:            8546s     142.43m     2.37h    0.10d
# Submission to last job:        439829s    7330.48m   122.17h    5.09d

    doRepeatMasker.pl -continue=cat -buildDir=/cluster/data/micMur1/bed/repeatMasker \
        micMur1 > do2.log 2>&1 &

    time nice -n +19 featureBits micMur1 rmsk > fb.micMur1.rmsk.txt 2>&1 &
# 692542939 bases of 1852394361 (37.386%) in intersection

    # RepeatMasker and lib version from do.log:
    #    Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC   RELEASE 20080611;  


#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-08-03)
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/micMur1/bed/simpleRepeat
    cd /cluster/data/micMur1/bed/simpleRepeat
    # 
    doSimpleRepeat.pl -buildDir=/cluster/data/micMur1/bed/simpleRepeat \
	micMur1 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
    # Completed: 51 of 51 jobs
    # CPU time in finished jobs:      24985s     416.41m     6.94h    0.29d  0.001 y
    # IO & Wait Time:                   101s       1.69m     0.03h    0.00d  0.000 y
    # Average job time:                 492s       8.20m     0.14h    0.01d
    # Longest finished job:            3887s      64.78m     1.08h    0.04d
    # Submission to last job:          3911s      65.18m     1.09h    0.05d

    featureBits micMur1 simpleRepeat
    # 24006609 bases of 1852394361 (1.296%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/micMur1
    twoBitMask micMur1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed micMur1.2bit

    twoBitToFa micMur1.2bit stdout | faSize stdin
# 2902270736 bases (1049876375 N's 1852394361 real 1161230687 upper 691163674
# lower) in 185042 sequences in 1 files
# Total size: mean 15684.4 sd 61772.3 min 600 (scaffold_185041) max 1949650
# (scaffold_0) median 2118
# N count: mean 5673.7 sd 19304.2
# U count: mean 6275.5 sd 29644.7
# L count: mean 3735.2 sd 15428.8
# %23.81 masked total, %37.31 masked real

    twoBitToFa micMur1.rmsk.2bit stdout | faSize stdin
# 2902270736 bases (1049876375 N's 1852394361 real 1161791947 upper 690602414
# lower) in 185042 sequences in 1 files
# Total size: mean 15684.4 sd 61772.3 min 600 (scaffold_185041) max 1949650
# (scaffold_0) median 2118
# N count: mean 5673.7 sd 19304.2
# U count: mean 6278.5 sd 29656.4
# L count: mean 3732.1 sd 15417.4
# %23.80 masked total, %37.28 masked real

    # Link to it from /gbdb
    ssh hgwdev
    ln -s /cluster/data/micMur1/micMur1.2bit /gbdb/micMur1/micMur1.2bit

    # mkdir /san/sanvol1/scratch/micMur1
    cp /cluster/data/micMur1/micMur1.2bit /san/sanvol1/scratch/micMur1
    cp /cluster/data/micMur1/chrom.sizes /san/sanvol1/scratch/micMur1


############################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute micMur1 (NCBI project 19967, ABDC00000000)" where name="micMur1";' hgcentraltest

###########################################################################
# cpgIslands - (DONE - 2011-04-24 - Hiram)
    mkdir /hive/data/genomes/micMur1/bed/cpgIslands
    cd /hive/data/genomes/micMur1/bed/cpgIslands
    time doCpgIslands.pl micMur1 > do.log 2>&1
    #   real    314m39.323s

    cat fb.micMur1.cpgIslandExt.txt
    #   33763514 bases of 1852394361 (1.823%) in intersection

#########################################################################
# genscan - (DONE - 2011-04-26 - Hiram)
    mkdir /hive/data/genomes/micMur1/bed/genscan
    cd /hive/data/genomes/micMur1/bed/genscan
    time doGenscan.pl micMur1 > do.log 2>&1
    # recovering from power failure, kluster run had just finished
    # and it had just started on makeBed:
    time ./doMakeBed.csh
    #   real    256m38.291s
    # continuing:
    time doGenscan.pl -continue=load micMur1 > load.log 2>&1
    #   real    9m19.938s
    # failed out of inodes on hive:
    time doGenscan.pl -continue=cleanup micMur1 > cleanup.log 2>&1
    #   real    25m41.674s

    cat fb.micMur1.genscan.txt
    #   50561491 bases of 1852394361 (2.730%) in intersection
    cat fb.micMur1.genscanSubopt.txt
    #   55072205 bases of 1852394361 (2.973%) in intersection

#########################################################################
# windowMasker - (DONE - 2012-05-02 - Hiram)
    screen -S micMur1
    mkdir /hive/data/genomes/micMur1/bed/windowMasker
    cd /hive/data/genomes/micMur1/bed/windowMasker
    # trying out new version of the script that does all the usual steps
    #   that used to be performed manually after it was done
    time /cluster/home/hiram/kent/src/hg/utils/automation/doWindowMasker.pl \
        -workhorse=hgwdev -buildDir=`pwd` -dbHost=hgwdev micMur1 > do.log 2>&1
    #   Elapsed time: 769m16s

    sed -e 's/^/    #\t/' fb.micMur1.windowmaskerSdust.beforeClean.txt
    #   1606982741 bases of 2902270736 (55.370%) in intersection
    sed -e 's/^/    #\t/' fb.micMur1.windowmaskerSdust.clean.txt
    #   557106366 bases of 2902270736 (19.196%) in intersection
    sed -e 's/^/    #\t/' fb.micMur1.rmsk.windowmaskerSdust.txt
    #   333189865 bases of 2902270736 (11.480%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=700, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 1852394361 / 2897316137 \) \* 1024
    #	( 1852394361 / 2897316137 ) * 1024 = 654.692735

    # round up to 700

    cd /hive/data/genomes/micMur1
    time blat micMur1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/micMur1.11.ooc -repMatch=700
    #   Wrote 19902 overused 11-mers to jkStuff/micMur1.11.ooc
    #   real    0m47.067s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" micMur1 | sort | uniq -c
    #   484693 yes
#    cd /hive/data/genomes/micMur1/jkStuff
#    gapToLift micMur1 micMur1.nonBridged.lift -bedFile=micMur1.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' micMur1.nonBridged.bed | sort -nr | head
    #   123773608 chrX  95534   123869142       chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file:
    /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism       mrnaCnt estCnt  refSeqCnt
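# Melopsittacus undulatus	25	1	0
# NOTE(review): the example line above is for Melopsittacus undulatus
# (budgerigar), apparently carried over from another assembly's doc; the
# Microcebus murinus counts were not recorded here -- check organism.lst.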
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add micMur1 just after ce2
# micMur1 (Mouse lemur)
micMur1.serverGenome = /hive/data/genomes/micMur1/micMur1.2bit
micMur1.clusterGenome = /hive/data/genomes/micMur1/micMur1.2bit
micMur1.ooc = /hive/data/genomes/micMur1/jkStuff/micMur1.11.ooc
micMur1.lift = no
micMur1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
micMur1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
micMur1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
micMur1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
micMur1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
micMur1.refseq.mrna.native.load = no
micMur1.refseq.mrna.xeno.load = yes
micMur1.genbank.mrna.xeno.load = yes
micMur1.genbank.est.native.load = no
micMur1.downloadDir = micMur1
micMur1.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding micMur1 mouse lemur" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for micMurNames" \
        src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev			# used to do this on "genbank" machine
    screen -S micMur1           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial micMur1 &
    #   var/build/logs/2012.06.07-16:10:31.micMur1.initalign.log
    #   real    2142m37.090s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad micMur1 &
    #	logFile:  var/dbload/hgwdev/logs/2012.06.11-11:02:03.dbload.log
    #   real    40m11.656s

    # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add micMur1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added micMur1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to RHO gene displays  (DONE - 2012-07-24 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="scaffold_31:985166-987669" where name="micMur1";' \
	hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-24 - Hiram)
    mkdir /hive/data/genomes/micMur1/pushQ
    cd /hive/data/genomes/micMur1/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl micMur1 2> stderr.txt | grep -v transMap > micMur1.sql
    #   real    3m52.913s

    scp -p micMur1.sql hgwbeta:/tmp
    ssh hgwbeta "hgsql qapushq < /tmp/micMur1.sql"

##############################################################################
##############################################################################
# TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
##############################################################################
##############################################################################
# LIFTOVER TO micMur2 (DONE - 2015-05-05 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/micMur1/bed/blat.micMur2.2015-05-05
    cd /hive/data/genomes/micMur1/bed/blat.micMur2.2015-05-05
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/micMur1/jkStuff/micMur1.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         micMur1 micMur2) > do.log 2>&1
    # real    1443m2.941s

    # verify the convert link on the test browser is now active from micMur1 to
    # micMur2

##############################################################################
# LIFTOVER TO micMur3 (DONE - 2017-03-03 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/micMur1/bed/blat.micMur3.2017-03-03
    cd /hive/data/genomes/micMur1/bed/blat.micMur3.2017-03-03
    time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
	-ooc=/hive/data/genomes/micMur1/jkStuff/micMur1.11.ooc \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         micMur1 micMur3) > do.log 2>&1 &
    # real    3006m22.394s

    # verify the convert link on the test browser is now active from micMur1 to
    # micMur3

##############################################################################
