# for emacs: -*- mode: sh; -*-


# Ciona Savigny from Broad/MIT  4/25/2003 data version
#       

# DOWNLOAD SEQUENCE (2004-05-11 kate)
# Download description:
#    http://www.broad.mit.edu/cgi-bin/annotation/ciona/download_license.cgi
# Assembly description:
#   http://www.broad.mit.edu/annotation/ciona/assembly.html
#
# 3 sets of sequences, with quality scores -- "paired scaffolds", "paired scaffold remainders"
# and "separate scaffolds".  Assembly details on the web site.

    ssh kksilo
    mkdir /cluster/store7/cioSav1
    ln -s /cluster/store7/cioSav1 /cluster/data
    cd /cluster/data/cioSav1
    wget ftp://ftp.broad.mit.edu/pub/annotation/ciona/assembly_4_25_2003/paired_scaffold_remainders.fa.gz
    wget ftp://ftp.broad.mit.edu/pub/annotation/ciona/assembly_4_25_2003/paired_scaffold_remainders.qual.gz
    wget ftp://ftp.broad.mit.edu/pub/annotation/ciona/assembly_4_25_2003/paired_scaffolds.fa.gz
    wget ftp://ftp.broad.mit.edu/pub/annotation/ciona/assembly_4_25_2003/paired_scaffolds.qual.gz
    wget ftp://ftp.broad.mit.edu/pub/annotation/ciona/assembly_4_25_2003/separate_scaffolds.fa.gz
    wget ftp://ftp.broad.mit.edu/pub/annotation/ciona/assembly_4_25_2003/separate_scaffolds.qual.gz

    gunzip paired_scaffolds.fa.gz
    mkdir scaffolds
    sed -e "s/paired_scaffold/ps/" paired_scaffolds.fa | faSplit byname paired_scaffolds.fa scaffolds/
    mkdir nib
    cd nib
    foreach i (../scaffolds/*.fa)
        faToNib -softMask $i `basename $i .fa`.nib
    end

### CREATE DATABASE
    # Create the database.
    ssh hgwdev
    echo 'create database cioSav1' | hgsql ''
# CREATING GRP TABLE FOR TRACK GROUPING 
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" \
      | hgsql cioSav1


#    echo 'insert into blatServers values("cioSav1", "blat10", "17780", "1"); \
#          insert into blatServers values("cioSav1", "blat10", "17781", "0");' \
#      | hgsql -h genome-testdb hgcentraltest

    mkdir /gbdb/cioSav1
    ln -s /cluster/data/cioSav1/nib /gbdb/cioSav1/nib
    hgsql cioSav1  < ~/kent/src/hg/lib/chromInfo.sql
    cd /cluster/data/cioSav1
    hgNibSeq -preMadeNib cioSav1 /gbdb/cioSav1/nib scaffolds/*.fa

    mkdir -p ~/kent/src/hg/makeDb/trackDb/squirt/cioSav1
    
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvsup
    # Edit that makefile to add cioSav1 in all the right places and do
    make 
    cvs commit -m "Added cioSav1." makefile

    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism,  \
          defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk, hgPbOk, sourceName)  \
          values("cioSav1", "Sept. 2001", \
          "/gbdb/cioSav1/nib", "C. savignyi", "ps_233", 1, \
          51, "C. savignyi", "Ciona Savignyi", \
          "/gbdb/cioSav1/html/description.html", 0, 0, \
          "Broad");'  | hgsql -h genome-testdb hgcentraltest
    echo 'insert into defaultDb (genome, name) \
          values ("C. savignyi", "cioSav1");' \
          | hgsql -h genome-testdb hgcentraltest


# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE 
    mkdir /cluster/data/cioSav1/html
    cd /cluster/data/cioSav1/html

    mkdir -p ~/kent/src/hg/makeDb/trackDb/squirt/cioSav1                                               
    # make a symbolic link from /gbdb/cioSav1/html to /cluster/data/cioSav1/html                          
    ln -s /cluster/data/cioSav1/html /gbdb/cioSav1/html                                                   
    # NEED TO ADD DESCRIPTION.HTML PAGE 

    # make gcPercent table
    mkdir -p ~/cioSav1/bed/gcPercent
    cd ~/cioSav1/bed/gcPercent
    hgsql cioSav1  < ~/kent/src/hg/lib/gcPercent.sql
    hgGcPercent cioSav1 /cluster/data/cioSav1/nib


# BLASTZ CI1
    ssh kksilo
    mkdir /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21
    ln -s blastz.ci1.2004-08-21 /cluster/data/cioSav1/bed/blastz.ci1
    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

SEQ1_DIR=/iscratch/i/cioSav1/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-ciona
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

SEQ2_DIR=/cluster/data/ci1/maskedFa
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-ciona
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/cioSav1/bed/blastz.ci1.2004-08-21

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/run
    para create j
    para try, check, push, check, ....
# Completed: 176616 of 176616 jobs
# CPU time in finished jobs:     327504s    5458.39m    90.97h    3.79d  0.010 y
# IO & Wait Time:                558756s    9312.61m   155.21h    6.47d  0.018 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest job:                      473s       7.88m     0.13h    0.01d
# Submission to last job:          1109s      18.48m     0.31h    0.01d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/run.1
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/run.1
    para create j
    para try, check, push, check, ....

# Completed: 446 of 446 jobs
# CPU time in finished jobs:       1629s      27.15m     0.45h    0.02d  0.000 y
# IO & Wait Time:                  3359s      55.98m     0.93h    0.04d  0.000 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest job:                       22s       0.37m     0.01h    0.00d
# Submission to last job:           322s       5.37m     0.09h    0.00d

    cd ..
    rm -r raw

    # Translate .lav to axt, with ci1 in scaffold coords for collaborators:
    ssh kksilo
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` | lavToAxt stdin /cluster/data/cioSav1/nib /cluster/data/ci1/nib stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN CI1 BLASTZ 
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -verbose=0 $1 /iscratch/i/cioSav1/nib /iscratch/i/ci1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
# Completed: 443 of 446 jobs
# Crashed: 3 jobs
# chain/ps_242.chain is empty
# chain/ps_280.chain is empty
# chain/ps_320.chain is empty
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest job:                        9s       0.15m     0.00h    0.00d
# Submission to last job:           139s       2.32m     0.04h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
#    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChain/chain
    hgLoadChain cioSav1 chainCi1 all.chain
#    foreach i (*.chain)
#        set c = $i:r
#        echo loading $c
#        hgLoadChain cioSav1 ${c}_chainCi1 $i
#    end


# NET Ci1 BLASTZ 
    ssh kksilo
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChain
    netClass -noAr noClass.net cioSav1 ci1 ci1.net 

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChain
    rm noClass.net
    netFilter -syn ci1.net > ci1Syn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/cioSav1/bed/blastz.ci1.2004-08-21/axtChain
    netFilter -minGap=10 ci1.net |  hgLoadNet cioSav1 netCi1 stdin
    netFilter -minGap=10 ci1Syn.net | hgLoadNet cioSav1 netSyntenyCi1 stdin

