# for emacs: -*- mode: sh; -*-


# This file describes processing of genomic sequence for the pufferfish
# Tetraodon nigroviridis (Green Spotted Pufferfish), obtained
# from www.genoscope.cns.fr
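# NOTE(review): the next line is an unfinished sentence, possibly carried over
# from a Fugu doc -- confirm, then complete or remove it:
# Fugu Rubripes (Japanese pufferfish), whole genome shotgun assembly dated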

# MASK TETRAODON SEQUENCE  (IN PROGRESS 2003-09-22 kate)
    # The Tetraodon sequence was downloaded from genoscope (V6, 99MB compressed).
    #
    # NOTE: the RepeatMasker (RM) cluster job was run with its input read from
    # the kksilo fileserver, instead of cluster local disk, iservers or bluearc,
    # and the fileserver handled it fine.
     
    # Unpack the assembly, split it into pieces (faSplit "about" mode with
    # a size of 500000) and build one RepeatMasker cluster job per piece.
    ssh kksilo
    cd /cluster/data/tetra
    ls tetraodon6.gz
    gunzip tetraodon6.gz
    mkdir -p split500K
    faSplit about tetraodon6 500000 split500K/tetra_
    mkdir -p RMRun/out
    cd split500K
    # Start from an empty job list: the loop below appends (>>), so a stale
    # RMJobs left by an earlier attempt would otherwise list every job twice.
    rm -f ../RMRun/RMJobs
    # One job line per piece:
    #   RMFugu <input dir> <piece> <output dir> {check out line+ <piece>.out}
    # The braces are quoted individually so csh passes them through literally
    # while $f between them is still expanded.
    foreach f (tetra_*.fa)
        echo /cluster/bin/scripts/RMFugu \
                /cluster/data/tetra/split500K $f /cluster/data/tetra/RMRun/out \
               '{'check out line+ /cluster/data/tetra/RMRun/out/$f.out'}' \
              >> ../RMRun/RMJobs
    end
    cd ../RMRun
    # sanity check: one job line per split piece
    wc -l RMJobs
       # 632 RMJobs
    ssh kk
    cd /cluster/data/tetra/RMRun
    head RMJobs
    para create RMJobs
    para try
    para check
    # para try only starts a small sample of the batch.  Once the sample
    # looks good, submit the remaining jobs and re-check until all are done;
    # the masking step later in this doc needs a .out file for every piece.
    para push
    para check


    # PROCESS TETRA FOR SIMPLE REPEATS 
    # Run trfBig over every split piece, writing one BED file per piece
    # into simpleRepeat/trf (via -bedAt=trf/<piece>.bed).
    ssh kksilo
    mkdir -p /cluster/data/tetra/simpleRepeat
    cd /cluster/data/tetra/simpleRepeat
    # -p, like the other mkdir calls in this doc, so that repeating this
    # step after a partial run does not fail with "File exists".
    mkdir -p trf
    # The here-document delimiter is quoted so that $f and $fout are written
    # to jobs.csh unexpanded.  NOTE: csh/tcsh compare the closing line with
    # the delimiter exactly as typed, quotes included, so the terminator
    # below really is 'EOF' -- do not "fix" it to a bare EOF.
cat << 'EOF' > jobs.csh
    foreach f (/cluster/data/tetra/split500K/tetra*.fa)
        set fout = $f:t:r.bed
        echo $fout
        /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp
    end
'EOF'
    # Run in the background; >&! sends stdout and stderr to jobs.log and
    # overwrites an existing log even when noclobber is set.
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    # number of BED files produced so far
    ls -1 trf | wc -l

    # FILTER SIMPLE REPEATS INTO MASK 
    # Build the masking subset of the trf output: copy each trf BED file
    # into trfMask/ under the same file name, keeping only the lines whose
    # 5th column (the repeat period) is at most 12.
    ssh kksilo
    cd /cluster/data/tetra/simpleRepeat
    mkdir -p trfMask
    foreach bed (trf/*.bed)
        echo "filtering $bed"
        # a bare awk pattern prints every matching line unchanged
        awk '$5 <= 12' $bed > trfMask/$bed:t
    end

    # MASK USING REPEATMASKER AND FILTERED TRF FILES
    # Mask each split piece in two passes: first from its RepeatMasker .out
    # file (-soft), then adding the filtered simple repeats from
    # simpleRepeat/trfMask on top (-softAdd).  Results go to masked/.
    ssh kksilo
    cd /cluster/data/tetra
    # -p, like the other mkdir calls in this doc, so that repeating this
    # step after a partial run does not fail with "File exists".
    mkdir -p masked
    # The here-document delimiter is quoted so that $p and $f are written to
    # mask.csh unexpanded.  NOTE: csh/tcsh compare the closing line with the
    # delimiter exactly as typed, quotes included, so the terminator below
    # really is 'EOF'.
    # Inside the script, the second maskOutFa call reads masked/$f and
    # writes its result back to the same file.
cat << 'EOF' > mask.csh
    foreach p (split500K/tetra*.fa)
        set f = $p:t
        echo "masking $f"
        maskOutFa $p RMRun/out/$f.out masked/$f -soft
        maskOutFa masked/$f simpleRepeat/trfMask/${f:r}.bed masked/$f -softAdd
    end
'EOF'
    # Run in the background with stdout and stderr captured in mask.log.
    csh mask.csh >&! mask.log &
    # follow progress, starting from the last 100 lines of the log
    tail -100f mask.log

    # COPY TO CLUSTER DATA AREA
    # Stage the masked pieces under /iscratch/i on kkr1u00, then run iSync.
    ssh kkr1u00
    cd /iscratch/i
    # NOTE(review): the directory is named tetra.2002 although this doc is
    # dated 2003 -- confirm the intended name.
    mkdir tetra.2002
    # "tetra" is a symlink to the dated directory, so the cp below lands in
    # /iscratch/i/tetra.2002.
    ln -s /iscratch/i/tetra.2002 tetra
    cp /cluster/data/tetra/masked/* /iscratch/i/tetra
    # presumably propagates /iscratch/i to the cluster nodes -- verify
    iSync
