# for emacs: -*- mode: sh; -*-


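# Ciona Intestinalis V2.0 from JGI
# NOTE(review): the database built below is cioSav2, which the dbDb insert in
# this file registers as "Ciona savignyi"; confirm which assembly and source
# this header should name.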

# DOWNLOAD SEQUENCE
# Set up the assembly directory tree on kkstore02, split the downloaded fasta
# into one file per sequence name, build nibs from those files, and cut the
# sequence into chunks (with a lift file) for the RepeatMasker run.
# NOTE(review): the command that fetched the assembly fasta into downloads/
# is not recorded in this file.

    ssh kkstore02
    # -p so the parent cioSav2 directory is created too if it is missing
    mkdir -p /cluster/store11/cioSav2/downloads
    ln -s  /cluster/store11/cioSav2 /cluster/data
    cd /cluster/data/cioSav2/downloads
    cd ..

    # chunks50k is where the faSplit gap call below writes; chunks500k is kept
    # because later sections of this file still refer to that name.
    mkdir chunks50k chunks500k jkStuff scaffolds nib

    cd scaffolds
    # The directory created above is "downloads" (plural); read from it.
    faSplit byname ../downloads/*fa .
    for i in *.fa
    do
        faToNib "$i" "../nib/${i%.fa}.nib"
    done

    cd ..

    # Chunk files are written with prefix chunks50k/x; -lift records the
    # chunk-to-source coordinates in jkStuff/chunks50k.lft.
    faSplit gap downloads/*fa 50000 chunks50k/x  -lift=jkStuff/chunks50k.lft -minGapSize=100

# REPEAT MASKING
# Writes a per-file RepeatMasker wrapper (jkStuff/RMCiona) and a job list
# (RMRun/RMJobs) holding one wrapper invocation per chunks50k/*.fa file.
    #- Make the run directory and job list:
    cd /cluster/data/cioSav2
    # Switch to tcsh for the here-document; the "exit" after chmod leaves it again.
    tcsh

    # RMCiona <dir> <file>:
    #   cd to <dir>, copy <file> into /tmp/cioSav2/<file>/, run RepeatMasker on
    #   it there, return to <dir>, copy back <file>.out (plus .align/.tbl/.cat
    #   when they exist), then remove the temp directories.
    #   The script runs under "csh -fe".
    cat << '_EOF_' > jkStuff/RMCiona
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/cioSav2/$2
/bin/cp $2 /tmp/cioSav2/$2/
cd /tmp/cioSav2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/cioSav2/$2/$2.out ./
if (-e /tmp/cioSav2/$2/$2.align) /bin/cp /tmp/cioSav2/$2/$2.align ./
if (-e /tmp/cioSav2/$2/$2.tbl) /bin/cp /tmp/cioSav2/$2/$2.tbl ./
if (-e /tmp/cioSav2/$2/$2.cat) /bin/cp /tmp/cioSav2/$2/$2.cat ./
/bin/rm -fr /tmp/cioSav2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/cioSav2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/cioSav2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMCiona
    exit
    mkdir RMRun
    # Bourne-style loop: emit one job line per chunk file. Each line carries the
    # wrapper path, the chunk directory, the chunk file name, and a
    # "{check out line+ <file>.out}" clause (presumably the batch system's
    # output-file check -- confirm).
    for i in chunks50k/*.fa
    do
	d=`dirname $i`
	f=`basename $i`
    echo "/cluster/data/cioSav2/jkStuff/RMCiona /cluster/data/cioSav2/$d $f {check out line+ /cluster/data/cioSav2/$d/$f.out} "
    done > RMRun/RMJobs

    #- Do the run
    # Run the RMJobs batch with para from host pk.
    ssh pk
    cd /cluster/data/cioSav2/RMRun
    para create RMJobs
    # The next line is shorthand for the manual sequence of para sub-commands;
    # it is not meant to be typed as a single command line.
    para try, para check, para check, para push, para check,...
# Completed: 4617 of 4617 jobs
# CPU time in finished jobs:     133300s    2221.66m    37.03h    1.54d  0.004 y
# IO & Wait Time:                703794s   11729.90m   195.50h    8.15d  0.022 y
# Average job time:                 181s       3.02m     0.05h    0.00d
# Longest finished job:             651s      10.85m     0.18h    0.01d
# Submission to last job:          2948s      49.13m     0.82h    0.03d

    # NOTE(review): no command recorded in this file lifts the per-chunk .out
    # files or loads the rmsk table that this featureBits call reads -- confirm
    # where that was done.
    featureBits cioSav2 rmsk
# 22881665 bases of 141233565 (16.201%) in intersection

# SIMPLE REPEATS (TRF)
# Chunk-based TRF run: one trfBig command per chunk fasta is appended to
# jobs.csh, the list is executed inside screen, the per-chunk beds are lifted
# into simpleRepeat.bed, and that file is loaded as the simpleRepeat table.
# NOTE(review): the loop reads chunks500k/, which the download section creates
# but never fills (faSplit writes to chunks50k/), and liftAll.lft is not
# produced by any command in this file. The trfBig-on-scaffolds run later in
# this file may be what was actually used -- confirm.
    ssh kkstore02
    mkdir -p /cluster/data/cioSav2/bed/simpleRepeat
    cd /cluster/data/cioSav2/bed/simpleRepeat
    mkdir trf
    # csh syntax (foreach / set / :t:r modifiers) from here on
    tcsh
    # start with an empty job list
    cp /dev/null jobs.csh
    foreach d (/cluster/data/cioSav2/chunks500k)
      foreach f ($d/*.fa)
        # $f:t:r = file name without directory and without extension
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh
      end
    end
    screen
    # Send stdout+stderr to jobs.log and run in the background, so that the
    # "tail -f jobs.log" check below has a file to follow. -e stops the job
    # list at the first failing command.
    csh -ef jobs.csh >& jobs.log &

    # check on this with
    tail -f jobs.log
    # number of jobs vs. number of beds written so far
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # When job is done do:
    liftUp simpleRepeat.bed /cluster/data/cioSav2/jkStuff/liftAll.lft error  trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed cioSav2 simpleRepeat /cluster/data/cioSav2/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits cioSav2 simpleRepeat
# 6047914 bases of 141233565 (4.282%) in intersection


# Create the database.
# NOTE(review): earlier sections of this file already run hgLoadBed and
# featureBits against cioSav2, so the file is not in strict execution order.
    ssh hgwdev
    echo 'create database cioSav2' | hgsql ''
# CREATING GRP TABLE FOR TRACK GROUPING 
    # grp is created as a copy of hg17.grp with NAME as primary key.
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" | hgsql cioSav2

    # Add two blatServers rows for cioSav2 (host blat15, ports 17780 and 17781)
    # to hgcentraltest on genome-testdb.
    echo 'insert into blatServers values("cioSav2", "blat15", "17780", "1","0"); \
          insert into blatServers values("cioSav2", "blat15", "17781", "0","1");' \
      | hgsql -h genome-testdb hgcentraltest

    # Add dbDb and defaultDb entries:
    # NOTE(review): only the dbDb insert is recorded here; no defaultDb insert
    # appears in this file. defaultPos is inserted as an empty string.
    echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk)  values("cioSav2", "July 2005", "/gbdb/cioSav2/nib", "C. savignyi", "", 1, \
          44, "C. savignyi", "Ciona savignyi", "/gbdb/cioSav2/html/description.html", 0);' \
    | hgsql -h genome-testdb hgcentraltest

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION
# Expose the nib directory under /gbdb, create the chromInfo table, and
# register the pre-built nibs with hgNibSeq; then build a gcPercent table.
    # Make symbolic links in /gbdb/cioSav2/nib to the real nibs.
    ssh hgwdev
    mkdir /gbdb/cioSav2
    # A single link to the whole nib directory, not one link per file.
    ln -s /cluster/data/cioSav2/nib /gbdb/cioSav2/nib
    # Load /gbdb/cioSav2/nib paths into database 
    cd /cluster/data/cioSav2
    # NOTE(review): this line reads from ~kent (home directory of user "kent")
    # while the gcPercent step below reads from ~/kent -- confirm which path
    # is intended.
    hgsql cioSav2  < ~kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib cioSav2 /gbdb/cioSav2/nib scaffolds/*.fa

    # make gcPercent table
    ssh hgwdev
    mkdir -p /cluster/data/cioSav2/bed/gcPercent
    cd /cluster/data/cioSav2/bed/gcPercent
    hgsql cioSav2  < ~/kent/src/hg/lib/gcPercent.sql
    hgGcPercent cioSav2 ../../nib
    # The table built above was removed again later, per the note below.
#dropped gcPercent table (DONE braney 2005-08-19)

### AUTO UPDATE GENBANK MRNA RUN  (DONE markd 2005-08-17)
# First genbank alignment/load for cioSav2. The lines between "Edit
# etc/genbank.conf" and "cvs ci" are genbank.conf content, not shell commands.
# A second run with a changed configuration is recorded later in this file.
    # genbank done with revised alignment procedure
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update etc
    
    # NOTE(review): the snippet's own header comment below says "ciona
    # intestinalis", but the dbDb insert in this file registers cioSav2 as
    # Ciona savignyi -- confirm.
    # Edit etc/genbank.conf and add these lines:
    # NOTE: braney created a ooc and this was added after the initial alignment
# cioSav2 (ciona intestinalis)
cioSav2.serverGenome = /cluster/data/cioSav2/cioSav2.2bit
cioSav2.clusterGenome = /iscratch/i/cioSav2/cioSav2.2bit
cioSav2.ooc = no
cioSav2.maxIntron = 20000
cioSav2.lift = no
cioSav2.refseq.mrna.native.load = yes
cioSav2.refseq.mrna.xeno.load = yes
cioSav2.refseq.mrna.xeno.pslCDnaFilter = -minCover=0.25 -coverNearTop=0.005 -minId=0.15 -idNearTop=0.005 -maxRepMatch=0.4 -bestOverlap
cioSav2.genbank.mrna.xeno.load = yes
cioSav2.genbank.est.xeno.load = no
cioSav2.downloadDir = cioSav2
cioSav2.perChromTables = no

    cvs ci etc/genbank.conf
    # Install to /cluster/data/genbank:
    make etc-update

    # Alignment step, started in the background on kkstore02.
    ssh kkstore02
    cd /cluster/data/genbank
    # do mrnas and ests in one run
    nice bin/gbAlignStep -initial cioSav2 &

    # Load results:
    # (run on hgwdev; the background alignment above must have finished first)
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad cioSav2

### gap and repeats tables
# Rebuild the gap table from simpleGap output and replace the rmsk indexes.
# All index definitions use a 13-character prefix of the sequence-name column.
    ssh hgwdev
    mkdir -p /cluster/data/cioSav2/bed/gapRmsk
    cd /cluster/data/cioSav2/bed/gapRmsk
    # simpleGap is given the nib directory and two output names; only gap.bed
    # is loaded below, repeats.bed is not used in this section.
    simpleGap /cluster/data/cioSav2/nib gap.bed repeats.bed
    # Drop any existing gap table, recreate it from gap.sql, and load gap.bed
    # into that table (-oldTable).
    echo "drop table gap;" | hgsql cioSav2
    hgsql cioSav2 < ~/kent/src/hg/lib/gap.sql   
    hgLoadBed -oldTable cioSav2 gap gap.bed
    echo "create index chrom on gap (chrom(13), bin) ;" | hgsql cioSav2
    echo "create index chrom_2 on gap  (chrom(13), chromStart);" | hgsql cioSav2
    echo "create index chrom_3 on gap  (chrom(13), chromEnd);" | hgsql cioSav2

    # rmsk: drop the bin, genoStart and genoEnd indexes and add indexes keyed
    # on (genoName, genoStart) and (genoName, genoEnd).
    echo "drop index bin on rmsk;" | hgsql cioSav2
    echo "drop index genoStart on rmsk;" | hgsql cioSav2
    echo "drop index genoEnd on rmsk;" | hgsql cioSav2
    echo "create index chrom_2 on rmsk  (genoName(13), genoStart);" | hgsql cioSav2
    echo "create index chrom_3 on rmsk  (genoName(13), genoEnd);" | hgsql cioSav2

    # Run trfBig directly on each per-scaffold fasta, concatenate the resulting
    # beds into simpleRepeat.bed, and load that file as the simpleRepeat table.
    ssh kkstore02
    mkdir -p /cluster/data/cioSav2/bed/simpleRepeat
    cd /cluster/data/cioSav2/bed/simpleRepeat
    # -p: trf/ already exists if the chunk-based TRF section earlier in this
    # file was run, and a plain mkdir would fail.
    # NOTE(review): any beds left in trf/ from that earlier run would also be
    # picked up by the cat below -- clear them first if they exist.
    mkdir -p trf
    for i in ../../scaffolds/*.fa
    do
        # trfBig's own output is discarded; the echo is the progress indicator.
        trfBig "$i" /dev/null -bedAt="trf/$(basename "$i" .fa).bed" > /dev/null 2>&1 ; echo "$i"
    done
    # Concatenate only the .bed files that trfBig wrote.
    cat trf/*.bed > simpleRepeat.bed

    ssh hgwdev
    hgLoadBed cioSav2 simpleRepeat /cluster/data/cioSav2/bed/simpleRepeat/simpleRepeat.bed \
        -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    # Build soft-masked sequence: filter the TRF beds, combine them with the
    # RepeatMasker output to mask the scaffolds, then regenerate the nib files
    # and a single 2bit file from the masked fasta.
    ssh kkstore02
    cd /cluster/data/cioSav2/bed/simpleRepeat
    mkdir -p trfMask
    cd trf
    for i in *.bed
    do
      # keep only rows whose 5th column is <= 12
      awk '$5 <= 12' "$i" > "../trfMask/$i"
    done

    cd ../trfMask
    # NOTE(review): jkStuff/chunks500k.lft is not produced by any command in
    # this file (the faSplit in the download section writes
    # jkStuff/chunks50k.lft) -- confirm which lift file was used here.
    liftUp ../../../all.trfMask.bed ../../../jkStuff/chunks500k.lft error *.bed

    # use RepeatMasker and simpleRepeat to build masked fa's
    cd /cluster/data/cioSav2
    mkdir maskedScaffolds
    # NOTE(review): the step that assembles all.out from the per-chunk
    # RepeatMasker .out files is not recorded in this file.
    cat scaffolds/*.fa |  maskOutFa -soft stdin all.out all.tmp.fa
    maskOutFa -softAdd all.tmp.fa all.trfMask.bed all.masked.fa

    # Rebuild the nib files, using the soft masking in the fa:
    mkdir -p /cluster/data/cioSav2/nib
    cd /cluster/data/cioSav2/maskedScaffolds
    # maskedScaffolds/ was only just created and is empty: split the masked
    # fasta into one file per sequence here so the loop and faToTwoBit below
    # have input (same faSplit usage as for scaffolds/ in the download section).
    faSplit byname ../all.masked.fa .

    for i in *.fa
    do
      faToNib -softMask "$i" "../nib/${i%.fa}.nib"
    done

    # Make one big 2bit file as well, and make a link to it in /gbdb/cioSav2
    faToTwoBit *.fa ../cioSav2.2bit
    ln -s /cluster/data/cioSav2/cioSav2.2bit /gbdb/cioSav2/

### SNAP GENE PREDICTIONS FROM COLIN DEWEY 
# Download a GFF of SNAP predictions and load it as table snapGene.
    ssh hgwdev
    mkdir /cluster/data/cioSav2/bed/snap
    cd /cluster/data/cioSav2/bed/snap
    # contact: Colin Dewey <cdewey@eecs.berkeley.edu>
    # NOTE(review): the file name says "CioInt_2" while the target database is
    # cioSav2 -- confirm these predictions belong to this assembly.
    wget "http://hanuman.math.berkeley.edu/~cdewey/tracks/SNAP.CioInt_2.gff.gz"
    gunzip SNAP.CioInt_2.gff.gz
    ldHgGene -gtf -frame -id -geneName cioSav2 snapGene SNAP.CioInt_2.gff
# NOTE(review): the two lines below look like featureBits output, but the
# command that produced them is not recorded here.
# 31491546 bases of 141233565 (22.297%) in intersection
# in ci1 28699953 bases of 113192845 (25.355%) in intersection

# MAKE DOWNLOADABLE SEQUENCE FILES 
# Zip the RepeatMasker output, plain and masked scaffold fasta, and the TRF
# bed; test the archives; copy them to the goldenPath bigZips directory and
# write md5 sums.
# This section uses csh syntax ("|&", foreach/end, "set gp = ..."); run it in
# csh/tcsh.
    ssh kkstore02
    cd /cluster/data/cioSav2
    #- Build the .zip files
    # zipAll.csh recreates ./zip from scratch (rm -rf zip) on every run.
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/Scaffold.out.zip all.out
zip -j zip/ScaffoldFa.zip scaffolds/*.fa
zip -j zip/ScaffoldFaMasked.zip maskedScaffolds/*.fa
cd bed/simpleRepeat
zip -j ../../zip/ScaffoldTrf.zip simpleRepeat.bed
cd ../..
'_EOF_'
    # << this line makes emacs coloring happy
### NOTES: NEED NEW GENBANK PROCESS FOR THIS
    # stdout and stderr of the zip run are both captured in zipAll.log
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    # The last line of each "unzip -t" report is printed per archive.
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/cioSav2/zip
    set gp = /usr/local/apache/htdocs/goldenPath/cioSav2
    mkdir -p $gp/bigZips
    # *.zip only, so the .zip.test reports written above are not copied
    cp -p *.zip $gp/bigZips
    # The per-chromosome zips are commented out; only bigZips is produced.
    # mkdir -p $gp/scaffolds
    # foreach f ( ../*/chr*.fa )
      # zip -j $gp/chromosomes/$f:t.zip $f
    # end

    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    # cd $gp/chromosomes
    # md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # (chromosomes/ is not created by the active commands above.)

# MAKE 11.OOC FILE FOR BLAT 
# Write the list of overused 11-mers for the assembly and copy it to
# /iscratch/i/cioSav2/, the location the later genbank.conf snippet in this
# file points cioSav2.ooc at.
# NOTE(review): no host is given for this section, and /iscratch/i/cioSav2/ is
# not created by any command in this file -- confirm both.
    mkdir /cluster/bluearc/cioSav2
    # Query and output are both /dev/null: the run exists only to write the
    # -makeOoc file.
    blat /cluster/data/cioSav2/cioSav2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/cioSav2/11.ooc -repMatch=100
# Wrote 10830 overused 11-mers to /cluster/bluearc/cioSav2/11.ooc
    cp -p /cluster/bluearc/cioSav2/*.ooc /iscratch/i/cioSav2/
    iSync

#######################################################################
# GC5BASE (DONE - 2005-08-19 - Hiram)
# Compute GC percent with -win=5 as wiggle text, encode it to .wig/.wib on
# kkstore02, then link the .wib under /gbdb and load the .wig as table gc5Base
# on hgwdev.
    ssh kkstore02
    mkdir /cluster/data/cioSav2/bed/gc5Base
    cd /cluster/data/cioSav2/bed/gc5Base
    # hgGcPercent writes to stdout and is piped straight into wigEncode, so no
    # intermediate text file is kept.
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 cioSav2 \
        /cluster/data/cioSav2 | wigEncode stdin gc5Base.wig gc5Base.wib

    ssh hgwdev
    cd /cluster/data/cioSav2/bed/gc5Base
    mkdir /gbdb/cioSav2/wib
    # link the .wib under /gbdb/cioSav2/wib before loading the table
    ln -s `pwd`/gc5Base.wib /gbdb/cioSav2/wib
    hgLoadWiggle cioSav2 gc5Base gc5Base.wig


#######################################################################
### AUTO UPDATE GENBANK MRNA RUN  (DONE markd 2005-08-31)
# Second genbank alignment/load. Compared with the earlier genbank snippet in
# this file, cioSav2.ooc now points at the 11.ooc file and the pslCDnaFilter
# settings refer to the shared lowCover.* values. The lines between "Edit
# etc/genbank.conf" and "cvs ci" are genbank.conf content, not shell commands.
    # redo genbank revised alignment procedure once again to
    # pickup local near best pslCDnaFilter

    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvs update etc
    
    # NOTE(review): the snippet's own header comment below says "ciona
    # intestinalis", but the dbDb insert in this file registers cioSav2 as
    # Ciona savignyi -- confirm.
    # Edit etc/genbank.conf and add these lines:
# cioSav2 (ciona intestinalis)
cioSav2.serverGenome = /cluster/data/cioSav2/cioSav2.2bit
cioSav2.clusterGenome = /iscratch/i/cioSav2/cioSav2.2bit
cioSav2.ooc = /iscratch/i/cioSav2/11.ooc
cioSav2.maxIntron = 20000
cioSav2.lift = no
cioSav2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
cioSav2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
cioSav2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
cioSav2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
cioSav2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
cioSav2.genbank.est.xeno.pslCDnaFilter    = ${lowCover.genbank.est.xeno.pslCDnaFilter}
cioSav2.refseq.mrna.native.load = yes
cioSav2.refseq.mrna.xeno.load = yes
cioSav2.genbank.mrna.xeno.load = yes
cioSav2.genbank.est.xeno.load = no
cioSav2.downloadDir = cioSav2
cioSav2.perChromTables = no

    cvs ci etc/genbank.conf
    # Install to /cluster/data/genbank:
    make etc-update

    # Alignment step, started in the background on kkstore02.
    ssh kkstore02
    cd /cluster/data/genbank
    # do mrnas and ests in one run
    nice bin/gbAlignStep -initial cioSav2 &

    # Load results:
    # (run on hgwdev; the background alignment above must have finished first)
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -drop -initialLoad cioSav2

    # Note: download sequences are made as part of the first
    # genbank update on hgdownload.
