# for emacs: -*- mode: sh; -*-

# Drosophila pseudoobscura -- Baylor "CAF1" via Eisen's 12-fly site
# (Identical to FlyBase dpse_r2.1_FB2008_02)

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT


#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
    ssh kkstore05
    # Create the assembly directory on store12 and expose it under
    # /cluster/data via a symlink; downloads get their own subdirectory.
    mkdir /cluster/store12/dp4
    ln -s /cluster/store12/dp4 /cluster/data/dp4
    mkdir /cluster/data/dp4/downloads
    cd /cluster/data/dp4/downloads
    # Fetch and unpack the CAF1 tarball, then work inside dpse/.
    wget http://rana.lbl.gov/drosophila/caf1/dpse_caf1.tar.gz
    tar xvzf dpse_caf1.tar.gz
    cd dpse
    # Sanity-check the download; the recorded faSize output follows
    # (note "0 lower": no lower-case bases in the sequence as delivered).
    faSize scaffolds.fa
#152738921 bases (6681752 N's 146057169 real 146057169 upper 0 lower) in 4896 sequences in 1 files
#Total size: mean 31196.7 sd 649312.0 min 101 (Unknown_singleton_1691) max 30794189 (Ch2) median 1734
#N count: mean 1364.7 sd 21011.7
#U count: mean 29831.9 sd 628747.5
#L count: mean 0.0 sd 0.0
    # Tweak their funny chromosome names (Ch*) to our pattern (chr*):
    # Both patterns are anchored at line start, so only FASTA header lines
    # beginning ">Ch" and AGP lines beginning "Ch" are rewritten.  The
    # originals are left untouched; results go to UCSC.fa and UCSC.agp.
    sed -e 's/^>Ch/>chr/' scaffolds.fa > UCSC.fa
    sed -e 's/^Ch/chr/' assembly.agp > UCSC.agp


#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/27/06 angie)
    ssh kkstore05
    cd /cluster/data/dp4
    # Write the makeGenomeDb.pl config.  fastaFiles/agpFiles point at the
    # renamed UCSC.fa / UCSC.agp produced in the download step.  The heredoc
    # delimiter is unquoted, but the body contains no $ or backquote, so it
    # is written out verbatim.
    cat > dp4.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db dp4
clade insect
scientificName Drosophila pseudoobscura
assemblyDate Feb. 2006
assemblyLabel Baylor CAF1
orderKey 57
mitoAcc none
fastaFiles /cluster/data/dp4/downloads/dpse/UCSC.fa
agpFiles /cluster/data/dp4/downloads/dpse/UCSC.agp
dbDbSpeciesDir drosophila
EOF

    # Stop at db step so we can use featureBits, but don't do dbDb and trackDb
    # because we're not building an actual browser for now.
    # The job runs in the background with stdout and stderr both captured in
    # makeGenomeDb.log (">&"); tail -f follows its progress.
    makeGenomeDb.pl dp4.config.ra -stop=db \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log
    # Because of the extremely long sequence names for unplaced sequences,
    # the chromInfo load command failed because the index string length was
    # too short.  So I temporarily modified my ~/kent/src/hg/lib/chromInfo.sql
    # to lengthen the index, dropped the dp4 database and ran -continue db.


#########################################################################
# REPEATMASKER (DONE 9/27/06 angie)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    # NOTE(review): this preview was run without -species drosophila, while
    # the real run below passes it -- confirm the previewed scripts were not
    # the ones actually executed.
    doRepeatMasker.pl dp4 -verbose 3 -debug
    # Run it for real and tail the log:
    # (background job; stdout and stderr both go to do.log in the dated
    # RepeatMasker build directory)
    doRepeatMasker.pl dp4 -species drosophila -verbose 3 \
      >& /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log &
    tail -f /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log
    # RepeatMasker and lib version from do.log:
#    March 20 2006 (open-3-1-5) version of RepeatMasker
#CC   RELEASE 20060315;                                            *
    # Compare coverage to previous assembly:
    # (chr2 only; dp4 masks 4.169% versus 3.839% in dp3)
    featureBits -chrom=chr2 dp4 rmsk
#1245499 bases of 29873196 (4.169%) in intersection
    featureBits -chrom=chr2 dp3 rmsk
#1140300 bases of 29702756 (3.839%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/27/06 angie)
    ssh kolossus
    # Work in a niced tcsh; the ">&" redirection below is tcsh syntax.
    nice tcsh
    mkdir /cluster/data/dp4/bed/simpleRepeat
    cd /cluster/data/dp4/bed/simpleRepeat
    # Stream the unmasked 2bit as FASTA into trfBig.  Its positional output
    # goes to /dev/null; only the BED written via -bedAt is kept.  The
    # pipeline runs in the background with stdout+stderr in trf.log.
    twoBitToFa ../../dp4.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log
    # ~50 minutes (longer than D. mel, must be because of the scaffolds)

    # Make a filtered version for sequence masking:
    # Keeps only rows whose 5th column is <= 12 (presumably the repeat
    # period -- confirm against simpleRepeat.sql).
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    ssh hgwdev
    hgLoadBed dp4 simpleRepeat \
      /cluster/data/dp4/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # Compare coverage to previous assembly:
    # (chr2 only; 1.776% in dp4 versus 1.741% in dp3)
    featureBits -chrom=chr2 dp4 simpleRepeat
#530609 bases of 29873196 (1.776%) in intersection
    featureBits -chrom=chr2 dp3 simpleRepeat
#517077 bases of 29702756 (1.741%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/27/06 angie)
    ssh kolossus
    cd /cluster/data/dp4
    # Start from the RepeatMasker-masked dp4.rmsk.2bit, add (-add) the
    # filtered TRF intervals from trfMask.bed, and write the final dp4.2bit.
    time twoBitMask dp4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dp4.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.201u 0.367s 0:01.78 31.4%     0+0k 0+0io 1pf+0w

    # Because this is a no-browser build (just masking for alignment)
    # I did not make the usual /gbdb/$db/$db.2bit link.


###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 10/3/06 angie)
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
    cd /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
    # Write the DEF parameter file read by doBlastzChainNet.pl: dp4 is the
    # target (SEQ1), droAna3 the query (SEQ2), and BASE is this directory.
    # NOTE(review): the closing delimiter is written with its quotes
    # ('_EOF_'), which is how csh/tcsh matches a quoted heredoc word; bash
    # would look for a bare _EOF_ -- confirm the shell before replaying.
    cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. ananassae

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes

# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna3/chrom.sizes

BASE=/cluster/data/dp4/bed/blastz.droAna3.2006-10-02
'_EOF_'
    # << this line keeps emacs coloring happy
    # Run the pipeline in the background (stdout+stderr to do.log) with
    # blastz output rooted on /panasas, and follow the log.
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dp4droAna3 >& do.log &
    tail -f do.log
    # Undated alias for the dated build directory; the link target is
    # relative, so it resolves inside /cluster/data/dp4/bed.
    ln -s blastz.droAna3.2006-10-02 /cluster/data/dp4/bed/blastz.droAna3


###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 10/3/06 angie)
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
    cd /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
    # Write the DEF parameter file read by doBlastzChainNet.pl: dp4 is the
    # target (SEQ1), droWil1 the query (SEQ2), and BASE is this directory.
    # The BLASTZ_* values match those used for the droAna3 run.
    # NOTE(review): the closing delimiter is written with its quotes
    # ('_EOF_'), which is how csh/tcsh matches a quoted heredoc word; bash
    # would look for a bare _EOF_ -- confirm the shell before replaying.
    cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. willistoni

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes

# QUERY - D. willistoni
SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droWil1/chrom.sizes

BASE=/cluster/data/dp4/bed/blastz.droWil1.2006-10-02
'_EOF_'
    # << this line keeps emacs coloring happy
    # Run the pipeline in the background (stdout+stderr to do.log) with
    # blastz output rooted on /panasas, and follow the log.
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dp4droWil1 >& do.log &
    tail -f do.log
    # Undated alias for the dated build directory; the link target is
    # relative, so it resolves inside /cluster/data/dp4/bed.
    ln -s blastz.droWil1.2006-10-02 /cluster/data/dp4/bed/blastz.droWil1


#########################################################################
# SWAP DM3 CHAIN/NET (DONE 6/30/08 angie)
    ssh kkstore05
    mkdir /cluster/data/dp4/bed/blastz.dm3.swap
    cd /cluster/data/dp4/bed/blastz.dm3.swap
    # No new alignment here: -swap reuses the DEF from the existing
    # dm3-vs-dp4 build (blastz.dp4.2006-12-04 under dm3).  The job runs in
    # the background with stdout+stderr in do.log.
    doBlastzChainNet.pl -swap \
      /cluster/data/dm3/bed/blastz.dp4.2006-12-04/DEF >& do.log &
    tail -f do.log
    # Conventional alias; the relative target resolves inside
    # /cluster/data/dp4/bed.
    ln -s blastz.dm3.swap /cluster/data/dp4/bed/blastz.dm3


#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 9/18/08 angie)
    # Use -repMatch=85 -- increased from dp3's 75 because this assembly
    # loses 9272 tiles w/75, as opposed to dp3's 6790 -- even if dp4 has
    # more repetitive reads than dp3, that seems like too big of an increase
    # (would lose sensitivity).
    ssh kolossus
    # Query and output are both /dev/null: this invocation only writes the
    # overused-11-mer file named by -makeOoc (7218 tiles at repMatch=85,
    # per the recorded output below).
    blat /hive/data/genomes/dp4/dp4.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/hive/data/genomes/dp4/11.ooc -repMatch=85
#Wrote 7218 overused 11-mers to /hive/data/genomes/dp4/11.ooc


#########################################################################
# LIFTOVER TO DP3 (DONE 9/18/08)
    # First pass with -debug (presumably a dry run that only sets up the
    # build directory, as with doRepeatMasker.pl above -- confirm); the
    # directory it reported is recorded on the next comment line.
    doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
      -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3 -debug
# *** Steps were performed in /cluster/data/dp4/bed/blat.dp3.2008-09-18
    cd /cluster/data/dp4/bed/blat.dp3.2008-09-18
#NOTE FOR NEXT TIME: save log file!  (oops)
    # Real run: same arguments without -debug.  Output was not redirected,
    # so no log file exists for this step.
    doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
      -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3


#########################################################################
# lastz run to D. persimilis droPer1 (DONE - 2014-06-26 - Hiram)
    screen -S dp4DroPer1      # use a screen to manage this longish running job
    mkdir /hive/data/genomes/dp4/bed/lastzDroPer1.2014-06-26
    cd /hive/data/genomes/dp4/bed/lastzDroPer1.2014-06-26

    # always set the BLASTZ program so we know what version was used
    # DEF below: dp4 is the target (SEQ1), droPer1 the query (SEQ2).
    # NOTE(review): the closing delimiter is written with its quotes
    # ('_EOF_'), csh/tcsh style, while the redirections further down
    # ("> do.log 2>&1") are Bourne style -- confirm how DEF was actually
    # created before replaying this section in a single shell.
    cat << '_EOF_' > DEF
# D. pseudoobscura vs D. persimilis
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_M=50

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#     A    C    G    T
#    91  -90  -25 -100
#   -90  100 -100  -25
#   -25 -100  100  -90
#  -100  -25  -90  91

# TARGET: D. pseudoobscura Dp4
SEQ1_DIR=/hive/data/genomes/dp4/dp4.2bit
SEQ1_LEN=/hive/data/genomes/dp4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: D. persimilis DroPer1
SEQ2_DIR=/hive/data/genomes/droPer1/droPer1.2bit
SEQ2_LEN=/hive/data/genomes/droPer1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/hive/data/genomes/dp4/bed/lastzDroPer1.2014-06-26
TMPDIR=/dev/shm
'_EOF_'
    # << emacs

    # Main alignment/chain/net run, in the foreground inside the screen
    # session; all output (including the timing) is captured in do.log.
    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > do.log 2>&1
    # broke down during featureBits on the chainLink table due to
    # inadequate index, fixed hgLoadChain and reloaded:
    # (uses the locally rebuilt hgLoadChain from hiram's source tree, with
    # -tIndex)
    time /cluster/home/hiram/kent/src/hg/makeDb/hgLoadChain/hgLoadChain \
       -tIndex dp4 chainDroPer1 dp4.droPer1.all.chain.gz
    # Loading 92008528 chains into dp4.chainDroPer1
    # real    76m24.273s

    # Redo by hand the featureBits measurement that had failed:
    time featureBits dp4 chainDroPer1Link > fb.dp4.chainDroPer1Link.txt 2>&1
    # real    34m12.908s

    cat fb.dp4.chainDroPer1Link.txt
    # 142529024 bases of 146066673 (97.578%) in intersection

    # Resume the pipeline at its download step now that the tables are loaded:
    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -continue=download -syntenicNet) > download.log 2>&1
    # real    9m38.094s

    # filter with doRecipBest.pl
    time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
        -dbHost=hgwdev -workhorse=hgwdev dp4 droPer1) > rbest.log 2>&1
    # real    130m22.825s
    # The doRecipBest.pl run above was started before the doBlastzChainNet.pl
    # download step had finished, so complete it by continuing from its own
    # download step:
    time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
        -continue=download -dbHost=hgwdev -workhorse=hgwdev dp4 droPer1) \
           > rbest.download.log 2>&1
    # real    0m4.233s

    # running the swap
    # (droPer1 becomes the target; reuses the DEF written above)
    mkdir /hive/data/genomes/droPer1/bed/blastz.dp4.swap
    cd /hive/data/genomes/droPer1/bed/blastz.dp4.swap
    time (doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/dp4/bed/lastzDroPer1.2014-06-26/DEF \
        -swap -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > swap.log 2>&1
    # real    170m37.103s

    cat fb.droPer1.chainDp4Link.txt
    # 169459603 bases of 175583556 (96.512%) in intersection

    # Reciprocal-best filtering for the swapped direction, run in the swap
    # directory:
    time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
        -dbHost=hgwdev -workhorse=hgwdev droPer1 dp4) > rbest.log 2>&1
    #  real    3m45.728s

#############################################################################
# self lastz run (DONE - 2014-07-07 - Hiram)
    screen -S dp4      # use a screen to manage this longish running job
    mkdir /hive/data/genomes/dp4/bed/lastzSelf.2014-07-07
    cd /hive/data/genomes/dp4/bed/lastzSelf.2014-07-07

    # always set the BLASTZ program so we know what version was used
    # DEF below: target (SEQ1) and query (SEQ2) are both dp4.2bit.
    # NOTE(review): the closing delimiter is written with its quotes
    # ('_EOF_'), csh/tcsh style, while the redirections further down
    # ("> do.log 2>&1") are Bourne style -- confirm how DEF was actually
    # created before replaying this section in a single shell.
    cat << '_EOF_' > DEF
# D. pseudoobscura vs D. pseudoobscura
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
BLASTZ_M=50

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#     A    C    G    T
#    91  -90  -25 -100
#   -90  100 -100  -25
#   -25 -100  100  -90
#  -100  -25  -90  91

# TARGET: D. pseudoobscura Dp4
SEQ1_DIR=/hive/data/genomes/dp4/dp4.2bit
SEQ1_LEN=/hive/data/genomes/dp4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: D. pseudoobscura Dp4
SEQ2_DIR=/hive/data/genomes/dp4/dp4.2bit
SEQ2_LEN=/hive/data/genomes/dp4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/hive/data/genomes/dp4/bed/lastzSelf.2014-07-07
TMPDIR=/dev/shm
'_EOF_'
    # << emacs

    # Same invocation as for a two-species run, including -syntenicNet;
    # all output (including the timing) is captured in do.log.
    time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
        -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
        -syntenicNet) > do.log 2>&1
    # Failed at the syntenicNet step, which should not be run for self
    # alignments (presumably -syntenicNet should be omitted next time).
    # real    1134m13.977s

    # Coverage of the self chains, measured by hand after the failure:
    time featureBits dp4 chainSelfLink > fb.dp4.chainSelfLink.txt 2>&1
    # real    54m55.931s

    cat fb.dp4.chainSelfLink.txt
    # 113982936 bases of 146066673 (78.035%) in intersection

#############################################################################
