#!/bin/sh

# Strongylocentrotus purpuratus (California purple sea urchin)

# Baylor College of Medicine HGSC 
# Spur_v3.1 (June 15th, 2011)

# Project ID:   13728
# http://www.ncbi.nlm.nih.gov/bioproject/13728

# Assembly     Type         Number    N50(kb)   Bases+Gaps(Mb) Bases(Mb)
# Spur_v3.0   Scaffolds     31,238    404,330    936,069,451 816,170,552 
# Spur_v3.0   Contigs      174,512     13,472    816,170,552 816,170,552

# Spur_3.1 (June, 2011)
# Contamination removed version of Spur_3.0.
# 
# Spur_v3.0 (March, 2011)
# This release is an improved assembly using a variety of Illumina
# libraries with different mate-pair distances for scaffolding and gap
# filling.  

# (Referencing hetGla1 for up-to-date procedures.)

################################################################
# Downloads Sequence (DONE - 2011-12-20 - kord)

# Fetch the Spur_v3.1 assembly tarball from SpBase and unpack it.
mkdir -p /cluster/data/strPur4/downloads
cd /cluster/data/strPur4/downloads

# The URL is quoted because it contains '?', which the shell treats as a
# glob character.  -O names the saved file explicitly: without it wget
# derives the name from the URL's last component, query string included,
# and the tar command below would not find Spur_v3.1_assembly.tar.
wget -O Spur_v3.1_assembly.tar \
    'http://www.spbase.org/SpBase/download/downloadfile.php?file=Spur_v3.1_assembly.tar'
tar -xvf Spur_v3.1_assembly.tar
# Spur_v3.1_assembly/
# Spur_v3.1_assembly/Contigs/
# Spur_v3.1_assembly/Contigs/acc_ctg_num.tbl.gz
# Spur_v3.1_assembly/Contigs/Spur_3.1.contig.fa.gz
# Spur_v3.1_assembly/Contigs/Spur_3.1.AGP.gz
# Spur_v3.1_assembly/Contigs/Spur_3.1.contig.qual.gz
# Spur_v3.1_assembly/README.Spur_3.1.txt
# Spur_v3.1_assembly/LinearScaffolds/
# Spur_v3.1_assembly/LinearScaffolds/Spur_3.1.LinearScaffold.fa.gz
# Spur_v3.1_assembly/LinearScaffolds/Spur_3.1.LinearScaffold.qual.gz

# Size summary of the contig FASTA as downloaded.
faSize Spur_v3.1_assembly/Contigs/Spur_3.1.contig.fa.gz
# 815995273 bases (59015 N's 815936258 real 815936258 upper 0 lower) in 174772 sequences in 1 files
# Total size: mean 4668.9 sd 8818.7 min 51 (AAGJ04169930.1) max 237383 (AAGJ04011348.1) median 1448
# N count: mean 0.3 sd 2.0
# U count: mean 4668.6 sd 8818.3
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

cd ..
# Build ucsc.agp in one go: the AGP '#' header lines are copied through
# untouched, then the data rows follow with the first '.1' on each row
# (the contig accession version) removed.
{
    zcat downloads/Spur_v3.1_assembly/Contigs/Spur_3.1.AGP.gz | grep "^#"
    zcat downloads/Spur_v3.1_assembly/Contigs/Spur_3.1.AGP.gz \
        | grep -v "^#" | sed -e 's/\.1//'
} > ucsc.agp
gzip ucsc.agp

# Contig sequence: truncate every line at the first '.1', which strips the
# accession version and anything after it from the FASTA headers.
zcat downloads/Spur_v3.1_assembly/Contigs/Spur_3.1.contig.fa.gz \
    | sed -e 's/\.1.*//' > ucsc.fa
gzip ucsc.fa

# Contig quality scores get the same header treatment.
zcat downloads/Spur_v3.1_assembly/Contigs/Spur_3.1.contig.qual.gz \
    | sed -e 's/\.1.*//' > ucsc.qual.fa
gzip ucsc.qual.fa

# Confirm the renamed FASTA still has the same totals as the download.
faSize ucsc.fa.gz
# 815995273 bases (59015 N's 815936258 real 815936258 upper 0 lower) in 174772 sequences in 1 files
# Total size: mean 4668.9 sd 8818.7 min 51 (AAGJ04169930) max 237383 (AAGJ04011348) median 1448
# N count: mean 0.3 sd 2.0
# U count: mean 4668.6 sd 8818.3
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real

# Give the three files their final, assembly-tagged names.
for part in agp fa qual.fa; do
    mv ucsc.$part.gz ucsc.spur3_1.$part.gz
done

#########################################################################
# Initial  database build (DONE - 2011-12-20 - kord)

# List the existing dbDb order keys so an unused orderKey can be chosen for
# strPur4.  The output is paged through less, so this step is interactive.
hgsql -e "select orderKey,name from dbDb order by orderKey;" \
   hgcentraltest | less
# 804     ci2
# 805     ci1
# 814     cioSav2
# 816     cioSav1
# 818     strPur3
# 819     strPur2
# 820     strPur1
# 822     ce10
# 823     ce9

# Write the makeGenomeDb.pl configuration file.  The orderKey used (817)
# does not appear in the listing above and sorts between cioSav1 (816) and
# strPur3 (818).  The heredoc delimiter is unquoted, but the body contains
# no shell expansions, so it is written out literally.
cd /hive/data/genomes/strPur4
cat << _EOF_ > strPur4.config.ra
# Config parameters for makeGenomeDb.pl:
db strPur4
clade deuterostome
genomeCladePriority 20
scientificName Strongylocentrotus purpuratus
commonName S. purpuratus
assemblyDate Jun. 2011
assemblyLabel Baylor College of Medicine Human Genome Sequencing Center (BCM-HGSC) Spur_v3.1
assemblyShortLabel BCM-HGSC Spur_v3.1
orderKey 817
mitoAcc NC_001453
fastaFiles /hive/data/genomes/strPur4/ucsc.spur3_1.fa.gz
agpFiles /hive/data/genomes/strPur4/ucsc.spur3_1.agp.gz
#qualFiles /hive/data/genomes/strPur4/ucsc.spur3_1.qual.fa.gz
dbDbSpeciesDir urchin
taxId 7668
subsetLittleIds Y
_EOF_

# Run the database build in the background; stdout and stderr are both
# captured in makeGenomeDb.log.  Later steps need this job to have finished.
time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev strPur4.config.ra \
    > makeGenomeDb.log 2>&1 &

# real    6m34.244s
# user    0m0.078s
# sys     0m0.084s

# NOTES -- STUFF THAT YOU WILL HAVE TO DO --
# 
# Template trackDb.ra and .html's have been created, but they all need editing!
# 
# cd /cluster/data/strPur4/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/urchin/strPur4
# 
# Search for '***' notes in each file in and make corrections (sometimes the
# files used for a previous assembly might make a better template):
#   description.html /cluster/data/strPur4/html/{trackDb.ra,gap.html,gold.html}
# 
# Then copy these files to your ~/kent/src/hg/makeDb/trackDb/urchin/strPur4
#  - cd ~/kent/src/hg/makeDb/trackDb
#  - edit makefile to add strPur4 to DBS.
#  - git add urchin/strPur4/*.{ra,html}
#  - git commit -m "Added strPur4 to DBS." makefile
#  - git commit -m "Initial descriptions for strPur4." urchin/strPur4/*.{ra,html}
#  - git pull; git push
#  - Run make update DBS=strPur4 and make alpha when done.
#  - (optional) Clean up /cluster/data/strPur4/TemporaryTrackDbCheckout

# Expose the unmasked 2bit from the current directory under /gbdb (this
# link is replaced further down once masked sequence is available).
ln -s "$(pwd)/strPur4.unmasked.2bit" /gbdb/strPur4/strPur4.2bit

# Copy the generated trackDb template files into the kent source tree.
trackDbDest=/cluster/home/kord/kent/src/hg/makeDb/trackDb/urchin/strPur4
mkdir $trackDbDest
cp /cluster/data/strPur4/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/urchin/strPur4/* \
  $trackDbDest


#########################################################################
# RepeatMasker (DONE 2011-12-20 - kord)

    # Run the RepeatMasker pipeline from its own build directory.  The job
    # is backgrounded at lowest priority and logs everything to do.log.
    mkdir /hive/data/genomes/strPur4/bed/repeatMasker
    cd /hive/data/genomes/strPur4/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \
        -workhorse=hgwdev -bigClusterHub=swarm -noSplit strPur4 > do.log 2>&1 &

#     real    142m59.141s
#     user    0m0.243s
#     sys     0m0.363s

    # faSize summary found in the build directory after the run
    # (presumably written by doRepeatMasker.pl -- confirm).
    cat faSize.rmsk.txt
# 936580645 bases (120628737 N's 815951908 real 671935715 upper 144016193 lower)
# in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607
# (Scaffold5) median 1006
# N count: mean 3768.6 sd 15822.1
# U count: mean 20992.1 sd 87536.4
# L count: mean 4499.2 sd 17290.8
# %15.38 masked total, %17.65 masked real

    # Record which RepeatMasker version the log reports.
    grep -i versi do.log
# RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
# grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
#    April 26 2011 (open-3-3-0) version of RepeatMasker

    # NOTE(review): this command queries strPur2, not strPur4, and the total
    # it reports (907085737) is not the strPur4 size shown by faSize above
    # (936580645).  Confirm whether strPur4 was intended here.
    featureBits -countGaps strPur2 rmsk
# 115258247 bases of 907085737 (12.706%) in intersection

    # Why is it different from the faSize above?
    # Because rmsk masks out some N's as well as bases; the faSize count
    #   separates the N's from the bases and does not show lower-case N's.

#########################################################################
# simpleRepeats (DONE - 2011-12-21 - Kord)

    # Run the simple repeat (TRF) pipeline from its own build directory;
    # the job is backgrounded and logs to do.log.
    mkdir /hive/data/genomes/strPur4/bed/simpleRepeat
    cd /hive/data/genomes/strPur4/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \
       -dbHost=hgwdev -bigClusterHub=swarm -smallClusterHub=memk \
        strPur4 > do.log 2>&1 &

# real    93m24.992s
# user    0m0.085s
# sys     0m0.205s

#  *** All done!
#  *** Steps were performed in /hive/data/genomes/strPur4/bed/simpleRepeat
#  *** Please check the log file to see if trf had warnings that we
#      should pass on to Gary Benson (ideally with a small test case).
#  *** IMPORTANT: Developer, it is up to you to make use of trfMask.bed!
#      Here is a typical command sequence, to be run after RepeatMasker
#      (or WindowMasker etc.) has completed:
# 
#     cd /hive/data/genomes/strPur4
#     twoBitMask strPur4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed strPur4.2bit
# 
#  *** Again, it is up to you to incorporate trfMask.bed into the final 2bit!

    # Coverage of the simpleRepeat track as recorded in the build directory.
    cat fb.simpleRepeat 
# 63920123 bases of 816010923 (7.833%) in intersection

    # add to rmsk after it is done:
    # (strPur4.2bit = RepeatMasker-masked 2bit plus the TRF mask)
    cd /hive/data/genomes/strPur4
    twoBitMask strPur4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed strPur4.2bit
    #   safe to ignore warnings about >=13 fields
    twoBitToFa strPur4.2bit stdout | faSize stdin > strPur4.2bit.faSize.txt
    cat strPur4.2bit.faSize.txt

    # double check with featureBits
    # NOTE: the summary recorded below is in faSize format, i.e. it is the
    # output of the "cat strPur4.2bit.faSize.txt" above; the featureBits
    # output itself was not recorded.
    featureBits -countGaps strPur4 gap
# 936580645 bases (120628737 N's 815951908 real 670818208 upper 145133700 lower)
# in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607
# (Scaffold5) median 1006
# N count: mean 3768.6 sd 15822.1
# U count: mean 20957.2 sd 87408.0
# L count: mean 4534.2 sd 17420.4
# %15.50 masked total, %17.79 masked real

    # Swap the /gbdb link from the unmasked 2bit to the newly masked one.
    rm /gbdb/strPur4/strPur4.2bit
    ln -s `pwd`/strPur4.2bit /gbdb/strPur4/strPur4.2bit

#########################################################################
# SPBASE.ORG gene predictions (DONE - 2011-12-21 - Kord)

# Fetch and unpack the SpBase 3.1 GFF3 annotation bundle.
cd /cluster/data/strPur4/downloads
# Quote the URL (it contains the glob character '?') and name the output
# with -O; otherwise wget saves the file under the URL's last component,
# query string included, and the tar command below cannot find it.
wget -O SpBase3.1.gff3.tar \
    'http://www.spbase.org/SpBase/download/downloadfile.php?file=SpBase3.1.gff3.tar'
tar -xvf SpBase3.1.gff3.tar
# SpBase3.1.gff3/
# SpBase3.1.gff3/GLEAN-3.1.gff3-chado-UTR
# SpBase3.1.gff3/contigs-3.1.gff3
# SpBase3.1.gff3/EST-3.1.gff3
# SpBase3.1.gff3/region-3.1.gff3
# SpBase3.1.gff3/BAC-END.gff3~
# SpBase3.1.gff3/exact-match-3.0-3.1
# SpBase3.1.gff3/3.0-3.1-map-b
# SpBase3.1.gff3/qiang-3.1.gff3
# SpBase3.1.gff3/3.0-3.1-map
# SpBase3.1.gff3/BAC-END.gff3
# SpBase3.1.gff3/CVS/
# SpBase3.1.gff3/CVS/Repository
# SpBase3.1.gff3/CVS/Entries
# SpBase3.1.gff3/CVS/Root
# SpBase3.1.gff3/eBAC-3.1.gff3
# SpBase3.1.gff3/GLEAN-3.1.gff3-chado

mkdir ../gff
cd ../gff
# One sed pass over the GLEAN file: delete every double quote, delete every
# "gene_id " tag, and rewrite "exon" as "CDS".
sed -e 's/\"//g' -e 's/gene_id //g' -e 's/exon/CDS/g' \
    ../downloads/SpBase3.1.gff3/GLEAN-3.1.gff3-chado-UTR \
    > GLEAN-3.1.gff3-chado-UTR.ucsc.gff3

# Summarise sources, features, strands and frames in the result.
gffPeek GLEAN-3.1.gff3-chado-UTR.ucsc.gff3
# source:
#   GLEAN3        245885
# feature:
#   3UTR  706
#   CDS   186609
#   gene  29285
#   transcript    29285
# strand:
#   +     123018
#   -     122867
# frame:
#   .     245885

# Trim each line: keep only the text before the first ';', drop "ID=",
# keep only the text before the first ':', then strip a trailing "gn" or
# "tr" so gene and transcript records share one name.
cut -d';' -f1 GLEAN-3.1.gff3-chado-UTR.ucsc.gff3 | sed 's/ID=//g' \
    | cut -d':' -f1 | sed -e 's/gn$//g' -e 's/tr$//g' \
    > GLEAN-3.1.gff3-chado-UTR.ucsc.clean.gff3

# Make duplicated names unique (numberDuplicatesInGff.py lives in this
# directory).
./numberDuplicatesInGff.py -i GLEAN-3.1.gff3-chado-UTR.ucsc.clean.gff3 \
    -o GLEAN-3.1.gff3-chado-UTR.ucsc.clean.fixDups.gff3
# Corrected 163 duplicate names in 29285 genes.

# Drop the transcript and gene records before loading.  -E is required:
# in a basic regular expression "transcript|gene" is a literal string
# containing '|', so the original filter removed nothing.
# NOTE: the load statistics recorded for this step were produced before
# this fix; they still show all 245885 lines and 4 feature types.
grep -E -v "transcript|gene" GLEAN-3.1.gff3-chado-UTR.ucsc.clean.fixDups.gff3 \
    > GLEAN-3.1.gff3-chado-UTR_gene.gff3
ldHgGene strPur4 GLEAN_3_1_chado_UTR_gene GLEAN-3.1.gff3-chado-UTR_gene.gff3
# Reading GLEAN-3.1.gff3-chado-UTR.ucsc.clean.fixDups.gff3
# Read 29285 transcripts in 245885 lines in 1 files
#   29285 groups 4213 seqs 1 sources 4 feature types
# 29122 gene predictions

# create peptide predictions
# Dump the RepeatMasked 2bit to FASTA; both hgPepPred calls below read it
# through its /hive path.
rmskFa=/hive/data/genomes/strPur4/strPur4.rmsk.fa
twoBitToFa ../strPur4.rmsk.2bit ../strPur4.rmsk.fa
hgPepPred strPur4 generic GLEAN_3_1_chado_UTR_genePep $rmskFa
# Processing /hive/data/genomes/strPur4/strPur4.rmsk.fa

## Adding a second gene prediction set for comparison from spbase.org (spbase6)

# Same clean-up recipe as for the GLEAN set: exon -> CDS, keep the text
# before the first ';', drop "ID=", keep the text before the first ':',
# remove double quotes, and discard every line mentioning "transcript".
sed 's/exon/CDS/g' qiang-3.1.gff3 | cut -d';' -f1 | sed 's/ID=//g' \
    | cut -d':' -f1 | sed 's/\"//g' | grep -E -v transcript >qiang-3.1.cds.gff3

ldHgGene strPur4 qiang_3_1_gene qiang-3.1.cds.gff3
# Reading qiang-3.1.cds.gff3
# Read 50161 transcripts in 255110 lines in 1 files
#   50161 groups 3838 seqs 1 sources 2 feature types
#   29070 gene predictions

# NOTE(review): this table is named ...genePred while the GLEAN one above
# is ...genePep -- confirm which suffix was intended.
hgPepPred strPur4 generic qiang_3_1_genePred $rmskFa


#########################################################################
# Create cpgIslandExt (DONE - 2011-12-21 - Kord)
# Build the CpG island track in a dedicated directory; the pipeline runs in
# the background and logs to strPur4eMods.log.
cpgBuildDir=/cluster/data/strPur4/bed/cpgIslands
mkdir $cpgBuildDir
cd $cpgBuildDir
doCpgIslands.pl strPur4 -dbHost=hgwdev -workhorse=hgwdev -buildDir="$(pwd)" \
    > strPur4eMods.log 2>&1 &

# Coverage of the loaded cpgIslandExt table.
featureBits strPur4 cpgIslandExt
# 8293997 bases of 816010923 (1.016%) in intersection

##########################################################################
## WINDOWMASKER (DONE - 2011-12-21 - Kord)
    # Run the WindowMasker pipeline from its own build directory; the job
    # is backgrounded and logs to do.log.
    mkdir /hive/data/genomes/strPur4/bed/windowMasker
    cd /hive/data/genomes/strPur4/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev strPur4 > do.log 2>&1 &
# real    56m3.925s
# user    0m0.068s
# sys     0m0.148s

# Masking totals for the plain WindowMasker 2bit.
twoBitToFa strPur4.wmsk.2bit stdout | faSize stdin
# 936580645 bases (120628737 N's 815951908 real 513579450 upper 302372458 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %32.28 masked total, %37.06 masked real

# Masking totals for the WindowMasker + sdust 2bit.
twoBitToFa strPur4.wmsk.sdust.2bit stdout | faSize stdin
# 936580645 bases (120628737 N's 815951908 real 506302518 upper 309649390 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %33.06 masked total, %37.95 masked real

# Load the raw sdust mask as a track; at this point it still overlaps gaps
# (compare the 45.937% below with the 33.064% after the clean-up).
hgLoadBed strPur4  windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 5445810 elements of size 3

featureBits -countGaps strPur4 windowmaskerSdust
# 430238784 bases of 936580645 (45.937%) in intersection

#   eliminate the gaps from the masking
#   (notGap.bed = every base that is not in the gap table)
featureBits strPur4 -not gap -bed=notGap.bed
# 816010923 bases of 816010923 (100.000%) in intersection

#   intersect the mask with the non-gap regions to get the clean mask
time nice -n +19 featureBits strPur4 windowmaskerSdust notGap.bed \
    -bed=stdout | gzip -c > cleanWMask.bed.gz
# 309669062 bases of 816010923 (37.949%) in intersection
# 
# real    16m13.749s
# user    14m47.332s
# sys     0m56.948s

#   reload track to get it clean
hgLoadBed strPur4 windowmaskerSdust cleanWMask.bed.gz
# Loaded 5453517 elements of size 4

featureBits -countGaps strPur4 windowmaskerSdust
# 309669062 bases of 936580645 (33.064%) in intersection

#   mask the sequence with this clean mask
#   (starts from the unmasked 2bit; the BED arrives on stdin)
zcat cleanWMask.bed.gz \
    | twoBitMask ../../strPur4.unmasked.2bit stdin \
        -type=.bed strPur4.cleanWMSdust.2bit
twoBitToFa strPur4.cleanWMSdust.2bit stdout | faSize stdin \
    > strPur4.cleanWMSdust.faSize.txt
cat strPur4.cleanWMSdust.faSize.txt
# 936580645 bases (120628737 N's 815951908 real 506302518 upper 309649390 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %33.06 masked total, %37.95 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps strPur4 rmsk windowmaskerSdust
#     110294593 bases of 936580645 (11.776%) in intersection
#     110294593 bases of 936580645 (11.776%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2011-12-21 - Kord)

    # Build the final strPur4.2bit: the clean WindowMasker+sdust 2bit with
    # the TRF mask added on top.  This overwrites the earlier strPur4.2bit.
    cd /hive/data/genomes/strPur4
    twoBitMask -add bed/windowMasker/strPur4.cleanWMSdust.2bit \
        bed/simpleRepeat/trfMask.bed strPur4.2bit
    #   safe to ignore the warnings about BED file with >=13 fields
    twoBitToFa strPur4.2bit stdout | faSize stdin > faSize.strPur4.txt
    cat faSize.strPur4.txt
# 936580645 bases (120628737 N's 815951908 real 505914972 upper 310036936 lower) in 32009 sequences in 1 files
# Total size: mean 29259.9 sd 117676.2 min 116 (Scaffold17086) max 2423607 (Scaffold5) median 1006
# %33.10 masked total, %38.00 masked real

    #   create symlink to gbdb
    #   The link target is spelled out in full: after "ssh hgwdev" the
    #   working directory is no longer the genome directory, so `pwd`
    #   would produce the wrong target.
    ssh hgwdev
    rm /gbdb/strPur4/strPur4.2bit
    ln -s /hive/data/genomes/strPur4/strPur4.2bit /gbdb/strPur4/strPur4.2bit

###############################################################################
# Add Illumina alignments for S. droebachiensis (DONE - 2012-01-12 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb, then point table
# bamSdroeGdna at the /gbdb copy.
sdroeRun=ssaha2.sdro_d206a_KMK001.2011-12-22
mkdir -p /gbdb/strPur4/Sdroe/$sdroeRun
cd /gbdb/strPur4/Sdroe/$sdroeRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$sdroeRun/$bamFile
done

hgBbiDbLink strPur4 bamSdroeGdna /gbdb/strPur4/Sdroe/$sdroeRun/mapped.merge.sorted.bam

# Register the track group that holds the Strongylocentrotus tracks.
hgsql strPur4 -e "INSERT INTO grp VALUES ('strongylocentrotus', 'Strongylocentrotus spp. Assembly and Analysis', 6.6);"

# presumably meant to be run from the trackDb source directory -- confirm
make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. pallidus (DONE - 2012-01-12 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb, then point table
# bamSpallGdna at the /gbdb copy.
spallRun=ssaha2.spall_p20a_KMK002.2011-12-20
mkdir -p /gbdb/strPur4/Spall/$spallRun
cd /gbdb/strPur4/Spall/$spallRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$spallRun/$bamFile
done

hgBbiDbLink strPur4 bamSpallGdna /gbdb/strPur4/Spall/$spallRun/mapped.merge.sorted.bam

# presumably meant to be run from the trackDb source directory -- confirm
make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. intermedius ( DONE - 2012-01-18 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb, then point table
# bamSintGdna at the /gbdb copy.
sintRun=ssaha2.sint_int1_KMK013.2012-01-06
mkdir -p /gbdb/strPur4/Sint/$sintRun
cd /gbdb/strPur4/Sint/$sintRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$sintRun/$bamFile
done

hgBbiDbLink strPur4 bamSintGdna /gbdb/strPur4/Sint/$sintRun/mapped.merge.sorted.bam

# presumably meant to be run from the trackDb source directory -- confirm
make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. franciscanus ( DONE - 2012-03-16 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb.
mkdir -p /gbdb/strPur4/Sfran/ssaha2.sfra_sf4_KMK010.2012-02-18
cd /gbdb/strPur4/Sfran/ssaha2.sfra_sf4_KMK010.2012-02-18
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sfra_sf4_KMK010.2012-02-18/mapped.merge.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sfra_sf4_KMK010.2012-02-18/mapped.merge.sorted.bam.bai

# Point table bamSfranGdna at the /gbdb copy.  The table name and the path
# are separate arguments; they were previously run together without a
# space, leaving the command one argument short.
hgBbiDbLink strPur4 bamSfranGdna /gbdb/strPur4/Sfran/ssaha2.sfra_sf4_KMK010.2012-02-18/mapped.merge.sorted.bam

make DBS=strPur4 update

###############################################################################
# Add Illumina alignments for S. nudus ( DONE - 2012-03-16 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb, then point table
# bamSnudGdna at the /gbdb copy.
snudRun=ssaha2.snud_nu1_KMK011.2012-02-02
mkdir -p /gbdb/strPur4/Snud/$snudRun
cd /gbdb/strPur4/Snud/$snudRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$snudRun/$bamFile
done

hgBbiDbLink strPur4 bamSnudGdna /gbdb/strPur4/Snud/$snudRun/mapped.merge.sorted.bam

###############################################################################
# Add Illumina alignments for A. fragilis ( DONE - 2012-03-16 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb, then point table
# bamAfraGdna at the /gbdb copy.
afragRun=ssaha2.afra_af10_KMK012.2012-02-10
mkdir -p /gbdb/strPur4/Afrag/$afragRun
cd /gbdb/strPur4/Afrag/$afragRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$afragRun/$bamFile
done

hgBbiDbLink strPur4 bamAfraGdna /gbdb/strPur4/Afrag/$afragRun/mapped.merge.sorted.bam

###############################################################################
# Add Illumina alignments for P. depressus ( DONE - 2012-03-27 - Kord)

# Symlink the sorted BAM and its .bai index into /gbdb, then point table
# bamPdepGdna at the /gbdb copy.
pdepRun=ssaha2.pdep_pd5_KMK010.2012-03-21
mkdir -p /gbdb/strPur4/Pdep/$pdepRun
cd /gbdb/strPur4/Pdep/$pdepRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$pdepRun/$bamFile
done

hgBbiDbLink strPur4 bamPdepGdna /gbdb/strPur4/Pdep/$pdepRun/mapped.merge.sorted.bam

###############################################################################
# Add Illumina alignments for H. pulcherrimus ( DONE - 2012-03-27 - Kord)

# Symlink the sorted BAM and its .bai index for the Hpul sample into /gbdb,
# then point table bamHpulGdna at the /gbdb copy.
hpulRun=ssaha2.hpul_hp5_KMK16.2012-03-14
mkdir -p /gbdb/strPur4/Hpul/$hpulRun
cd /gbdb/strPur4/Hpul/$hpulRun
for bamFile in mapped.merge.sorted.bam mapped.merge.sorted.bam.bai; do
    ln -s /hive/data/genomes/strPur4/sam/$hpulRun/$bamFile
done

hgBbiDbLink strPur4 bamHpulGdna /gbdb/strPur4/Hpul/$hpulRun/mapped.merge.sorted.bam

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-01-18 - Kord)

    # Use -repMatch=300, scaled by genome size -- for human we use 1024.
    # Take the base count from the faSize measurement of the assembly;
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    # NOTE(review): 815995273 is the total contig base count reported by
    # faSize; the "real" (non-N) count on the same line is 815936258.  The
    # difference does not change the rounded result.
    calc \( 815995273 / 2897316137 \) \* 1024
    #     ( 815995273 / 2897316137 ) * 1024 = 288.397648
    # round up to 300

    # Write the list of over-represented 11-mers for blat.
    cd /hive/data/genomes/strPur4
    blat strPur4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/strPur4.11.ooc -repMatch=300
    # Wrote 36670 overused 11-mers to jkStuff/strPur4.11.ooc

    #   copy all of this stuff to the klusters:
    # there are no non-bridged gaps
    # (every row of the gap table has bridge = yes, so no lift file is made)
    hgsql -N -e "select bridge from gap;" strPur4 | sort | uniq -c
    # 142764 yes
    #   If there were:
    #   cd /hive/data/genomes/oreNil1/jkStuff
    #   gapToLift oreNil1 nonBridged.lift -bedFile=nonBridged.bed
    # Stage the ooc file, chrom.sizes and the masked 2bit for the cluster.
    cd /hive/data/genomes/strPur4
    mkdir /hive/data/staging/data/strPur4
    cp -p jkStuff/strPur4.11.ooc chrom.sizes \
        strPur4.2bit /hive/data/staging/data/strPur4
    # request rsync copy from cluster admin

##########################################################################
#  BLATSERVERS ENTRY (DONE -  2012-01-18 - Kord)
#       After getting a blat server assigned by the Blat Server Gods,
#   
# blatx
# trans port 17844
# untrans port 17845


    # Register the two blat servers for strPur4 in hgcentraltest: port 17844
    # is the translated server (isTrans=1) and port 17845 the untranslated
    # one (canPcr=1).  The SQL sits inside single quotes, where a newline is
    # kept as-is and a backslash is NOT a line continuation, so no trailing
    # backslashes are used inside the statement (they would be passed to
    # MySQL literally).
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr)
        VALUES ("strPur4", "blatx", "17844", "1", "0");
        INSERT INTO blatServers (db, host, port, isTrans, canPcr)
        VALUES ("strPur4", "blatx", "17845", "0", "1");' \
            hgcentraltest

# mysql> SELECT * FROM blatServers WHERE db="strPur4";
# +---------+-------+-------+---------+--------+
# | db      | host  | port  | isTrans | canPcr |
# +---------+-------+-------+---------+--------+
# | strPur4 | blatx | 17844 |       1 |      0 | 
# | strPur4 | blatx | 17845 |       0 |      1 | 
# +---------+-------+-------+---------+--------+
# 2 rows in set (0.00 sec)

    #   test it with some sequence


###############################################################################
# Add Illumina alignments for S. intermedius chrM ( DONE - 2012-02-02 - Kord)

# Symlink the chrM BAM (mapped.mq25.mated.sorted) and its .bai index into
# /gbdb.
mkdir -p /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.chrM.2012-01-31
# "cdp" was a typo for cd; without the cd the links below would be created
# in whatever directory happened to be current.
cd /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.chrM.2012-01-31
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sint_int1_KMK013.chrM.2012-01-31/mapped.mq25.mated.sorted.bam
ln -s /hive/data/genomes/strPur4/sam/ssaha2.sint_int1_KMK013.chrM.2012-01-31/mapped.mq25.mated.sorted.bam.bai

# Point table bamSintGdnaChrM at the /gbdb symlink, matching the other BAM
# sections in this file, rather than at the /hive file directly.
hgBbiDbLink strPur4 bamSintGdnaChrM /gbdb/strPur4/Sint/ssaha2.sint_int1_KMK013.chrM.2012-01-31/mapped.mq25.mated.sorted.bam


###############################################################################
# Add the mitochondria genes ( Done - 2012-02-02 - Kord)

# bring them over from strPur2

ssh hgwdev
# Reuse the mitochondrial gene GFF that was prepared for strPur2.
mitoGff=X12631.fix.gff
cd /hive/data/genomes/strPur4/gff
cp /hive/data/genomes/strPur2/gff/$mitoGff .

# load the GFF
ldHgGene strPur4 mitoGene $mitoGff
# Reading X12631.fix.gff
# Read 10 transcripts in 14 lines in 1 files
#   10 groups 1 seqs 1 sources 2 feature types
#   10 gene predictions

# Load the mitoGenePep table; hgPepPred reads strPur4.rmsk.fa from the
# parent (genome) directory.
cd ..
hgPepPred strPur4 generic mitoGenePep strPur4.rmsk.fa
# Processing strPur4.rmsk.fa

########################################################################
# codeml sites Pvalues

# The input wiggles are expected in .../download as codeml.<chrom>.wig;
# the encoded .wig/.wib pairs are written to .../load.
mkdir -p /hive/data/genomes/strPur4/codeml/2012-04-01/download
mkdir -p /hive/data/genomes/strPur4/codeml/2012-04-01/load
cd /hive/data/genomes/strPur4/codeml/2012-04-01/load

# Encode each input wiggle.  The chrom name is taken from the file name with
# parameter expansion instead of parsing ls output and splitting on dots,
# so names that themselves contain a '.' are handled correctly.
for wig in ../download/codeml.*.wig; do
    # an unmatched glob stays literal; skip it rather than run on nothing
    [ -e "$wig" ] || continue
    chrom=${wig##*/codeml.}
    chrom=${chrom%.wig}
    wigEncode "$wig" "codeml.${chrom}.wig" "codeml.${chrom}.wib"
done

hgLoadWiggle -pathPrefix=/gbdb/strPur4/wib/codemlSites     strPur4 codemlSites *.wig

# Refresh the .wib links under /gbdb.  The directory is created first so
# that a first-time run neither fails in rm nor lets ln create a file
# named codemlSites; -f keeps rm quiet when the directory is empty.
mkdir -p /gbdb/strPur4/wib/codemlSites
rm -f /gbdb/strPur4/wib/codemlSites/*
ln -s "$(pwd)"/*.wib /gbdb/strPur4/wib/codemlSites

########################################################################
# SpBase.org EST alignments (TODO - TBD - Kord)


########################################################################
# Create kluster run files (TODO - TBD - Kord)


#########################################################################
# RefSeq alignments  (TODO - TBD - Kord)


#########################################################################
# Genscan gene predictions (TODO - TBD - Kord)


#########################################################################
# LIFTOVER TO strPur2 (TODO - TBD - Kord )


#########################################################################
# LASTZ Drosophila droMel3 (TODO - TBD - Kord)


#########################################################################
# LASTZ Tunicate ci2 (TODO - TBD - Kord)


