# for emacs: -*- mode: sh; -*-

# This is actually supposed to be proCap1.
# You will find the correct building of rock hyrax in the proCap1.txt doc.

# rock hyrax ( Procavia capensis )
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-07-11)
    # Create the build directory on store12 and expose it as
    # /cluster/data/proCav1 through a symlink.
    ssh kkstore05
    mkdir /cluster/store12/proCav1
    ln -s /cluster/store12/proCav1 /cluster/data
    mkdir /cluster/data/proCav1/broad
    cd /cluster/data/proCav1/broad

    # Fetch the assembly AGP, bases and quality files from the Broad FTP site.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/assembly.quals.gz 


# Feed the quality file through qaToQac and qacAgpLift (together with the
# AGP) to produce proCav1.qual.qac, the file named as qualFiles in
# proCav1.config.ra.
qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin proCav1.qual.qac

    # Fetch the assembler's BasicStats report.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rockHyrax/BasicStats.out

# --------------------------------------------------------------------------------
# Fri Jan 18 10:02:36 2008 run (pid=6704), using Tue Jan 15 10:51:03 EST 2008
# make
# BasicStats PRE=/wga/dev/WGAdata DATA=projects/LowCoverage/Procavia
# \
# RUN=run/work SUBDIR=assisted_4.12 QUAL_STATS=True
# \
# OUTFILE=BasicStats.out
# --------------------------------------------------------------------------------
# 
# Supercontigs having < 3 reads or < 1kb sequence are
# ignored.
# 1759 gaps <= -1000; 0 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below
# zero: 0.341%
# 0 gaps > 10kb, 0 gaps > 50kb, 0 gaps > 200kb, 0 gaps >
# 1Mb
# 79.79% of reads were used in the assembly (80.62% of
# bases, 79.78% of Q20 bases)
# 0% of reads were used multiply in the assembly
# 794933 contigs, having N50 length 3487
# total contig length: 2341371985, spanning 2908327961
# bases (with 19.5% in gaps)
# 242541 supercontigs, having N50 length 21281 (not
# including gaps)
# 99.2% of assembly in supers of size < 200000 (2883627646
# bases)
# Assembly base coverage: 2.04X.  Assembly Q20 coverage:
# 1.87X.
# 99.99% of bases have q >= 1
# 97.24% of bases have q >= 20
# 93.46% of bases have q >= 30
# 89.6% of bases have q >= 40
# 84.66% of bases have q >= 50

   # Count the runs of identical names in column 1 of the AGP; the
   # per-name counts are not needed, only the number of lines.
   cut -f 1 assembly.agp | uniq | wc -l
   # Number of scaffolds: 295008


#########################################################################
# Create .ra file and run makeGenomeDb.pl
    # Work from the assembly directory on kkstore05.
    ssh kkstore05
    cd /cluster/data/proCav1
# Write the configuration file read by makeGenomeDb.pl.  Everything between
# the _EOF_ markers is file content; the delimiter is unquoted, but the body
# contains nothing for the shell to expand.
cat << _EOF_ >proCav1.config.ra
# Config parameters for makeGenomeDb.pl:
db proCav1
clade mammal
genomeCladePriority 35
scientificName  Procavia capensis 
commonName Rock hyrax
assemblyDate Jul. 2008
assemblyLabel Broad Institute proCav1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/proCav1/broad/assembly.bases.gz
agpFiles /cluster/data/proCav1/broad/assembly.agp
qualFiles /cluster/data/proCav1/broad/proCav1.qual.qac
dbDbSpeciesDir rockHyrax
_EOF_

# Run this inside 'screen', and make sure you are on kkstore05.
# The job is backgrounded; stdout and stderr both go to makeGenomeDb.out.
    makeGenomeDb.pl -verbose=2 proCav1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl-d' detaches from screen and returns to the previous shell
# Summarize column 2 of chrom.sizes (the scaffold sizes) with ave:
cut -f 2 chrom.sizes | ave stdin
# Q1 1549.000000
# median 4249.000000
# Q3 11283.000000
# average 10119.277335
# min 1000.000000
# max 390036.000000
# count 295008
# total 2985267768.000000
# standard deviation 16728.467480


#########################################################################
# REPEATMASKER (DONE braney 2008-07-29)
    # Launch the RepeatMasker pipeline from kkstore05 in the background;
    # its stdout and stderr are captured in do.log.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/proCav1/bed/repeatMasker
    cd /cluster/data/proCav1/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/proCav1/bed/repeatMasker \
        proCav1 > do.log 2>&1 &

    # Note: can run simpleRepeats simultaneously
    #### When done with RM:
    # Record the batch timing reported by 'para time' on pk (pasted below).
    ssh pk
    para time

#    Completed: 6258 of 6258 jobs
#    CPU time in finished jobs:   20645642s  344094.03m  5734.90h  238.95d
#    0.655 y
#    IO & Wait Time:                120642s    2010.71m    33.51h    1.40d
#    0.004 y
#    Average job time:                3318s      55.31m     0.92h    0.04d
#    Longest finished job:            9661s     161.02m     2.68h    0.11d
#    Submission to last job:         66905s    1115.08m    18.58h    0.77d


    # Measure rmsk coverage at low priority in the background; the
    # featureBits report is written to fb.proCav1.rmsk.txt and copied below.
    time nice -n +19 featureBits proCav1 rmsk > fb.proCav1.rmsk.txt 2>&1 &
    # 1006823376 bases of 2298455021 (43.804%) in intersection
    # NOTE(review): the denominator above (2298455021) differs from the
    # 2407857472 bases reported by 'featureBits proCav1 simpleRepeat' and
    # by faSize ("real") later in this file -- confirm which run this
    # figure was copied from.

    # RepeatMasker and lib version from do.log:
    #    Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC   RELEASE 20080611;


#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-07-29)
    # Launch the simple-repeat (TRF) pipeline from kkstore05 in the
    # background; its stdout and stderr are captured in do.log.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/proCav1/bed/simpleRepeat
    cd /cluster/data/proCav1/bed/simpleRepeat
    #
    doSimpleRepeat.pl -buildDir=/cluster/data/proCav1/bed/simpleRepeat \
	proCav1 > do.log 2>&1 &

    #### When done
    # Record the batch timing reported by 'para time' on pk (pasted below).
    ssh pk
    para time
    # Completed: 51 of 51 jobs
    # CPU time in finished jobs:      24985s     416.41m     6.94h    0.29d
    # 0.001 y
    # IO & Wait Time:                   101s       1.69m     0.03h    0.00d
    # 0.000 y
    # Average job time:                 492s       8.20m     0.14h    0.01d
    # Longest finished job:            3887s      64.78m     1.08h    0.04d
    # Submission to last job:          3911s      65.18m     1.09h    0.05d

    # Coverage of the simpleRepeat track:
    featureBits proCav1 simpleRepeat
    # 24277312 bases of 2407857472 (1.008%) in intersection

    #	after RM run is done, add this mask:
    # Combine the RepeatMasker-masked 2bit with the TRF mask to produce
    # proCav1.2bit.  The two faSize reports below show the effect:
    # lower-case bases rise from 674727345 (proCav1.rmsk.2bit) to
    # 675150672 (proCav1.2bit).
    cd /cluster/data/proCav1
    twoBitMask proCav1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed proCav1.2bit

    # Size and masking statistics for the final, fully masked sequence:
    twoBitToFa proCav1.2bit stdout | faSize stdin
# 2985267768 bases (577410296 N's 2407857472 real 1732706800 upper 675150672
# lower) in 295008 sequences in 1 files
# Total size: mean 10119.3 sd 16728.5 min 1000 (scaffold_295007) max 390036
# (scaffold_1) median 4249
# N count: mean 1957.3 sd 3298.2
# U count: mean 5873.4 sd 10786.7
# L count: mean 2288.6 sd 3537.1
# %22.62 masked total, %28.04 masked real

    # The same statistics for the RepeatMasker-only sequence, for comparison:
    twoBitToFa proCav1.rmsk.2bit stdout | faSize stdin
# 2985267768 bases (577410296 N's 2407857472 real 1733130127 upper 674727345
# lower) in 295008 sequences in 1 files
# Total size: mean 10119.3 sd 16728.5 min 1000 (scaffold_295007) max 390036
# (scaffold_1) median 4249
# N count: mean 1957.3 sd 3298.2
# U count: mean 5874.9 sd 10789.4
# L count: mean 2287.1 sd 3534.9
# %22.60 masked total, %28.02 masked real

    # Link to it from /gbdb
    ln -s /cluster/data/proCav1/proCav1.2bit /gbdb/proCav1/proCav1.2bit

    # Copy the masked sequence and chrom.sizes to the san scratch area.
    # Create the target directory first: if it is missing, the first cp
    # would create a regular file named proCav1 and the second cp would
    # overwrite it with chrom.sizes.  -p makes this a no-op when the
    # directory already exists.
    mkdir -p /san/sanvol1/scratch/proCav1
    cp /cluster/data/proCav1/proCav1.2bit /san/sanvol1/scratch/proCav1
    cp /cluster/data/proCav1/chrom.sizes /san/sanvol1/scratch/proCav1


