# for emacs: -*- mode: sh; -*-

#	http://www.ncbi.nlm.nih.gov/bioproject/13937
#	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAQQ01
#	http://www.ncbi.nlm.nih.gov/nuccore/107785164
#	http://www.ncbi.nlm.nih.gov/genome/472

# squirrel (Spermophilus tridecemlineatus)
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-07-31)
    # Create the assembly directory on store12 and expose it under
    # /cluster/data through a symlink, then work in a broad/ subdirectory.
    ssh kkstore05
    mkdir /cluster/store12/speTri1
    ln -s /cluster/store12/speTri1 /cluster/data
    mkdir /cluster/data/speTri1/broad
    cd /cluster/data/speTri1/broad

    # Fetch the AGP, bases and quality files from the Broad FTP site.
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/assembly.agp \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/assembly.bases.gz \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/assembly.quals.gz 
    # Record checksums of every file matching ass* in assembly.md5sum.
    md5sum ass* > assembly.md5sum


# Build speTri1.qual.qac from the quality file and the AGP; this file is the
# one listed as qualFiles in speTri1.config.ra.
qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin speTri1.qual.qac

    # Fetch the Broad BasicStats report (the commented text that follows
    # appears to be its contents).
    wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/mammals/squirrel/speTri1/BasicStats.out

# --------------------------------------------------------------------------------
# Tue May 16 00:08:35 2006 run, based on Mon May  8 09:51:44 EDT 2006 make
# BasicStats PRE=/wga/dev1/WGAdata DATA=projects/Squirrel RUN=run/work
##  \
#            SUBDIR=assisted_3 QUAL_STATS=True
# --------------------------------------------------------------------------------
# 
# Supercontigs having < 3 reads or < 1kb sequence are ignored.
# 4229 gaps <= -1000; 25 gaps <= -10000; 0 gaps <= -100000
# fraction of gaps < -10kb or more than 4 deviations below zero: 0.384%
# 42487 gaps > 10kb, 47 gaps > 50kb, 0 gaps > 200kb, 0 gaps > 1Mb
# 82.4% of reads were used in the assembly (84.67% of bases, 85.56% of Q20
# bases)
# 0% of reads were used multiply in the assembly
# 758936 contigs, having N50 length 2799
# total contig length: 1873262772, spanning 3431133042 bases (with 45.4% in
# gaps)
# 122351 supercontigs, having N50 length 55167 (not including gaps)
# 79.3% of assembly in supers of size < 200000 (2722264171 bases)
# Assembly base coverage: 1.37X.  Assembly Q20 coverage:  1.18X.
# 99.87% of bases have q >= 1
# 94.72% of bases have q >= 20
# 87.83% of bases have q >= 30
# 79.33% of bases have q >= 40
# 68.22% of bases have q >= 50


   # Count the runs of identical names in column 1 of the AGP: uniq collapses
   # adjacent duplicate lines and wc -l counts the lines that remain (the
   # per-name counts printed by -c are not used).
   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 160889


#########################################################################
# Create .ra file and run makeGenomeDb.pl
    # Write the makeGenomeDb.pl configuration from the here-document below,
    # then run makeGenomeDb.pl on it in the background.
    ssh kkstore05
    cd /cluster/data/speTri1
cat << _EOF_ >speTri1.config.ra
# Config parameters for makeGenomeDb.pl:
db speTri1
clade mammal
genomeCladePriority 35
scientificName  Spermophilus tridecemlineatus
commonName Squirrel
assemblyDate Feb. 2008
assemblyLabel Broad Institute speTri1 
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/speTri1/broad/assembly.bases.gz
agpFiles /cluster/data/speTri1/broad/assembly.agp
qualFiles /cluster/data/speTri1/broad/speTri1.qual.qac
dbDbSpeciesDir squirrel
_EOF_

# Run this inside 'screen', and make sure you are on kkstore05.
# stdout and stderr both go to makeGenomeDb.out.
    makeGenomeDb.pl -verbose=2 speTri1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl-d' detaches from screen and returns to the previous shell.
# Summary statistics over column 2 of chrom.sizes:
cut -f 2 chrom.sizes | ave stdin
# Q1 1345.000000
# median 4617.000000
# Q3 12707.500000
# average 21684.320196
# min 200.000000
# max 996791.000000
# count 160889
# total 3488768592.000000
# standard deviation 47336.311846

#########################################################################
# REPEATMASKER (DONE braney 2008-08-03)
    # Run doRepeatMasker.pl in the background inside screen, with the build
    # directory set explicitly and all output logged to do.log.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/speTri1/bed/repeatMasker
    cd /cluster/data/speTri1/bed/repeatMasker
    doRepeatMasker.pl -buildDir=/cluster/data/speTri1/bed/repeatMasker \
        speTri1 > do.log 2>&1 &

    # Note: the simpleRepeats step can be run simultaneously.
    #### When done with RM:
    # Measure rmsk coverage at niceness +19; the featureBits output
    # (stdout and stderr) is saved in fb.speTri1.rmsk.txt.
    time nice -n +19 featureBits speTri1 rmsk > fb.speTri1.rmsk.txt 2>&1 &
# 514833923 bases of 1913367893 (26.907%) in intersection

    # RepeatMasker and lib version from do.log:
    #    Jun 13 2008 (open-3-2-5) version of RepeatMasker
    # CC   RELEASE 20080611;


#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-08-03)
    # Run doSimpleRepeat.pl in the background inside screen, with the build
    # directory set explicitly and all output logged to do.log.
    ssh kkstore05
    screen # use a screen to manage this job
    mkdir /cluster/data/speTri1/bed/simpleRepeat
    cd /cluster/data/speTri1/bed/simpleRepeat
    # 
    doSimpleRepeat.pl -buildDir=/cluster/data/speTri1/bed/simpleRepeat \
	speTri1 > do.log 2>&1 &

    #### When done

    ssh hgwdev
    featureBits speTri1 simpleRepeat
    # 150142628 bases of 1913367893 (7.847%) in intersection

    #	After the RM run is done, add the TRF mask to the RepeatMasker-masked
    #	speTri1.rmsk.2bit and write the combined result to speTri1.2bit:
    cd /cluster/data/speTri1
    twoBitMask speTri1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed speTri1.2bit

    # Size and masking summary of the combined (rmsk + TRF) sequence:
    twoBitToFa speTri1.2bit stdout | faSize stdin
# 3488768592 bases (1575400699 N's 1913367893 real 1398742896 upper 514624997
# lower) in 160889 sequences in 1 files
# Total size: mean 21684.3 sd 47336.5 min 200 (scaffold_63211) max 996791
# (scaffold_113960) median 4617
# N count: mean 9791.8 sd 22439.9
# U count: mean 8693.8 sd 19894.4
# L count: mean 3198.6 sd 8123.2
# %14.75 masked total, %26.90 masked real

    # Same summary for the RepeatMasker-only sequence, for comparison
    # (lower-case count 514213922 here versus 514624997 after adding TRF):
    twoBitToFa speTri1.rmsk.2bit stdout | faSize stdin
# 3488768592 bases (1575400699 N's 1913367893 real 1399153971 upper 514213922
# lower) in 160889 sequences in 1 files
# Total size: mean 21684.3 sd 47336.5 min 200 (scaffold_63211) max 996791
# (scaffold_113960) median 4617
# N count: mean 9791.8 sd 22439.9
# U count: mean 8696.4 sd 19900.2
# L count: mean 3196.1 sd 8117.5
# %14.74 masked total, %26.87 masked real

    # Link to it from /gbdb
    ln -s /cluster/data/speTri1/speTri1.2bit /gbdb/speTri1/speTri1.2bit

    # Copy the 2bit and chrom.sizes to the san scratch area.
    # NOTE(review): the mkdir below is commented out, but the cp commands
    # need /san/sanvol1/scratch/speTri1 to already exist as a directory --
    # confirm it was created beforehand.
    # mkdir /san/sanvol1/scratch/speTri1
    cp /cluster/data/speTri1/speTri1.2bit /san/sanvol1/scratch/speTri1
    cp /cluster/data/speTri1/chrom.sizes /san/sanvol1/scratch/speTri1

############################################################################
#  speTri1 - Squirrel - Ensembl Genes version 51  (DONE - 2008-12-03 - hiram)
    # Build the Ensembl version 51 gene track: write speTri1.ensGene.ra from
    # the here-document, run doEnsGeneUpdate.pl on it, then check ensGene
    # coverage with featureBits on hgwdev.
    # NOTE(review): the here-document is closed by a quoted '_EOF_' line,
    # which looks like csh/tcsh convention; bash/sh would expect a bare
    # _EOF_ terminator -- confirm which shell these commands are pasted into.
    ssh kkr14u07
    cd /hive/data/genomes/speTri1
    cat << '_EOF_' > speTri1.ensGene.ra
# required db variable
db speTri1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
# ignore genes that do not properly convert to a gene pred, and contig
#	names that are not in the UCSC assembly
skipInvalid yes
# ignore the single gene that has an invalid structure from Ensembl:
# 1071: ENSSTOT00000007455 no exonFrame on CDS exon 1
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 speTri1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/speTri1/bed/ensGene.51
    featureBits speTri1 ensGene
    # 21525994 bases of 1913367893 (1.125%) in intersection

#  *** All done!  (through the 'makeDoc' step)
#  *** Steps were performed in /hive/data/genomes/speTri1/bed/ensGene.51

############################################################################
