# for emacs: -*- mode: sh; -*-


# Drosophila yakuba -- 
# 
# WUSTL's 
# http://www.genome.wustl.edu/???
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+



# DOWNLOAD SEQUENCE AND AGP (DONE 11/2/05 angie)
    ssh kkstore03
    mkdir /cluster/store6/droYak2
    cd /cluster/data
    ln -s /cluster/store6/droYak2 droYak2
    cd /cluster/data/droYak2
    mkdir downloads
    cd downloads
    wget ftp://genome.wustl.edu/private/lhillier/old/dyak2.tar.gz
    tar xvzf dyak2.tar.gz
    cd dyak2
    faSize *.fa
#168647858 bases (5982851 N's 162665007 real 162665007 upper 0 lower) in 20 sequences in 20 files
#Total size: mean 8432392.9 sd 10946150.8 min 31700 (chr4_random) max 28832112 (chr3R) median 3277457
#N count: mean 299142.5 sd 709171.6
#U count: mean 8133250.3 sd 10635266.7
#L count: mean 0.0 sd 0.0
    foreach c (2L 2R 2h 3L 3R 3h 4 U Uh X Xh Yh)
      mkdir ../../$c
      if (-e chr$c.fa) then
        mv chr$c.fa ../../$c
        mv chr$c.agp ../../$c
      endif
      if (-e chr${c}_random.fa) then
        mv chr${c}_random.fa ../../$c
        mv chr${c}_random.agp ../../$c
      endif
    end
    cd ../..
    # download mitochondrion sequence
    mkdir M
    cd M
    # go to http://www.ncbi.nih.gov/ and search Genome for 
    # "yakuba mitochondrion".  That shows the gi number:
    # 5834829
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834829&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Drosophila yakuba mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".
    cd ..
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    foreach c (?{,?})
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo checking consistency of $agp and $fa
          checkAgpAndFa $agp $fa | tail -1
        endif
      end
    end
    # see what kind of gap types we have:
    awk '$5 == "N" {print $7;}' */chr*.agp | uniq | sort | uniq
#contig
#fragment
    # biggest gap size is 3000:
    awk '$5 == "N" {print $6;}' */chr*.agp | sort -nr | head -1
    # rough hist of gap sizes from AGP:
    awk '$5 == "N" {print $6;}' */chr*.agp | textHistogram stdin -binSize=100
    # fancy hist of gap size from FASTA (10, 250, 1000, 3000 overrepresented):
    cat */chr*.fa | faGapSizes -niceSizes=10,50,100,250,300,700,1000,3000 stdin


# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 11/2/05 angie)
    ssh kkstore03
    cd /cluster/data/droYak2
    foreach c (?{,?})
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end
    # splitFaIntoContigs makes new dirs for _randoms.  Move their contents 
    # back into the main chrom dirs and get rid of the _random dirs.
    foreach d (*_random)
      set base = `echo $d | sed -e 's/_random$//'`
      mkdir -p $base/lift
      mv $d/lift/oOut.lst $base/lift/rOut.lst
      mv $d/lift/ordered.lft $base/lift/random.lft
      mv $d/lift/ordered.lst $base/lift/random.lst
      rmdir $d/lift
      mv $d/* $base
      rmdir $d
    end
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    set msize = `faSize M/chrM.fa | awk '{print $1;}'`
    echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 11/2/05 angie)
    # This used to hold scripts -- better to keep them inline here so 
    # they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/droYak2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/droYak2/bed


# CREATING DATABASE AND GRP TABLE (DONE 11/2/05 angie)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
    hgsql -e 'create database droYak2'
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from droYak1.grp" \
      droYak2


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 11/2/05 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    ssh kkstore03
    cd /cluster/data/droYak2
    faToTwoBit ?{,?}/chr*.fa droYak2.2bit
    mkdir bed/chromInfo
    twoBitInfo droYak2.2bit stdout \
    | awk '{print $1 "\t" $2 "\t/gbdb/droYak2/droYak2.2bit";}' \
      > bed/chromInfo/chromInfo.tab

    # Make symbolic links from /gbdb/droYak2/nib to the real nibs.
    ssh hgwdev
    mkdir /gbdb/droYak2
    ln -s /cluster/data/droYak2/droYak2.2bit /gbdb/droYak2/
    # Load /gbdb/droYak2/droYak2.2bit paths into database and save size info.
    cd /cluster/data/droYak2
    hgsql droYak2  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql droYak2 -e 'load data local infile \
      "/cluster/data/droYak2/bed/chromInfo/chromInfo.tab" \
      into table chromInfo;'
    echo "select chrom,size from chromInfo" | hgsql -N droYak2 > chrom.sizes
    # take a look at chrom.sizes size
    wc chrom.sizes
#     21      42     338 chrom.sizes


# REPEATMASKER (DONE 11/4/05 angie)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kkstore03
    cd /cluster/data/droYak2
    foreach c (?{,?})
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
          -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/droYak2
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/droYak2/$2
/bin/cp $2 /tmp/droYak2/$2/
cd /tmp/droYak2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/droYak2/$2/$2.out ./
if (-e /tmp/droYak2/$2/$2.tbl) /bin/cp /tmp/droYak2/$2/$2.tbl ./
if (-e /tmp/droYak2/$2/$2.cat) /bin/cp /tmp/droYak2/$2/$2.cat ./
/bin/rm -fr /tmp/droYak2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/droYak2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/droYak2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (?{,?})
      foreach d ($c/chr${c}{,_random}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/droYak2/jkStuff/RMDrosophila \
                 /cluster/data/droYak2/$d $f \
               '{'check out line+ /cluster/data/droYak2/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end
    # Do a dummy run to make sure that the libraries are unpacked 
    # before kicking off a cluster run:
    /cluster/bluearc/RepeatMasker/RepeatMasker -spec drosophila /dev/null

    #- Do the run
    ssh kk
    cd /cluster/data/droYak2/RMRun
    para make RMJobs
    para time
#Completed: 417 of 417 jobs
#Average job time:                6212s     103.53m     1.73h    0.07d
#Longest finished job:            8274s     137.90m     2.30h    0.10d
#Submission to last job:         15830s     263.83m     4.40h    0.18d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kkstore03
    cd /cluster/data/droYak2
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (?{,?})
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
        > /dev/null
      endif
      if (-e lift/random.lft && ! -z lift/random.lft) then
        liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
        > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/droYak2
    hgLoadOut droYak2 */chr*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 11/15/05 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on droYak2 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits droYak2 rmsk
#27434880 bases of 162681153 (16.864%) in intersection
    # compare to droYak1:
    featureBits droYak1 rmsk
#29021326 bases of 169423277 (17.129%) in intersection
    # Interesting that both total #bases and %repetitive dropped in 
    # the new assembly... same holds for just chr2L, so it's not just a 
    # chr*_random difference:
    featureBits -chrom=chr2L droYak2 rmsk
#1515320 bases of 22210025 (6.823%) in intersection
    featureBits -chrom=chr2L droYak1 rmsk
#1554767 bases of 22382034 (6.946%) in intersection
    # LaDeana says this is as expected.


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/15/05 angie)
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add droYak2 in all the right places and do
    make update

    # Add trackDb directories
    mkdir drosophila/droYak2
    cvs add drosophila/droYak2
    cvs commit drosophila/droYak2

    mkdir /gbdb/droYak2/html
    cvs commit makefile
    # go public on genome-test
    make alpha

    # Warning: genome and organism fields must correspond
    # with defaultDb values
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("droYak2", "Nov. 2005", "/gbdb/droYak2/nib", "D. yakuba", \
               "chr2L:809001-825000", 1, 53, "D. yakuba", \
                "Drosophila yakuba", "/gbdb/droYak2/html/description.html", \
                0, 0, "WUSTL version 2.0");'


# MAKE LIFTALL.LFT (DONE 11/2/05 angie)
    ssh kkstore03
    cd /cluster/data/droYak2
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft


# SIMPLE REPEATS (TRF) (DONE 11/3/05 angie)
    ssh kkstore03
    mkdir /cluster/data/droYak2/bed/simpleRepeat
    cd /cluster/data/droYak2/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/droYak2/?{,?}/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn \
      trf/*.bed > /dev/null

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed droYak2 simpleRepeat \
      /cluster/data/droYak2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/3/05 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kkstore03
    cd /cluster/data/droYak2/bed/simpleRepeat
    mkdir trfMask
    foreach f (trf/*.bed)
      echo -n "filtering $f... "
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords:
    mkdir trfMaskChrom
    foreach f (../../?{,?}/chr*.fa)
      set c = $f:t:r
      liftUp trfMaskChrom/$c.bed ../../jkStuff/liftAll.lft warn \
        trfMask/${c}_[0-9]*.bed > /dev/null
    end


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/15/05 angie)
    ssh kkstore03
    cd /cluster/data/droYak2
    # Soft-mask (lower-case) the contig and chr .fa's, 
    # then make hard-masked versions from the soft-masked.  
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    foreach c (?{,?})
      echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    #- Rebuild the 2bit, using the soft masking in the fa:
    faToTwoBit ?{,?}/chr*.fa droYak2.2bit


# GOLD AND GAP TRACKS (DONE 11/3/05 angie)
    ssh hgwdev
    cd /cluster/data/droYak2
    cp /dev/null chrom.lst
    foreach f (?{,?}/chr*.agp chrM)
      echo $f:t:r >> chrom.lst
    end
    hgGoldGapGl -noGl -chromLst=chrom.lst droYak2 /cluster/data/droYak2 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql droYak2
    # oops, that won't work until v4.1, so do this for the time being:
    hgsql droYak2 -e 'create table chrM_gap select * from chr2L_gap where 0=1'


# MAKE GCPERCENT (DONE 11/3/05 angie)
    ssh hgwdev
    mkdir /cluster/data/droYak2/bed/gc5Base
    cd /cluster/data/droYak2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 droYak2 \
       /cluster/data/droYak2 | wigEncode stdin gc5Base.wig gc5Base.wib
    mkdir /gbdb/droYak2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/droYak2/wib
    hgLoadWiggle -pathPrefix=/gbdb/droYak2/wib droYak2 gc5Base gc5Base.wig


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/15/05 angie)
    ssh kolossus
    cd /cluster/data/droYak2
    #- Build the .tar.gz files -- no genbank for now.
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
    # << this line makes emacs coloring happy
    csh -ef ./jkStuff/zipAll.csh |& tee zipAll.log
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    cd bigZips
    md5sum *.gz > md5sum.txt
    # Make a README.txt
    cd ..
    mkdir chromGz
    foreach f ( ?{,?}/chr*.fa )
      echo $f:t:r
      gzip -c $f > chromGz/$f:t.gz
    end
    cd chromGz
    md5sum *.gz > md5sum.txt
    # Make a README.txt

    #- Link the .gz and .txt files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    set gp = /usr/local/apache/htdocs/goldenPath/droYak2
    mkdir -p $gp/bigZips
    ln -s /cluster/data/droYak2/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips
    mkdir -p $gp/chromosomes
    ln -s /cluster/data/droYak2/chromGz/{chr*.gz,*.txt} $gp/chromosomes
    # Take a look at bigZips/* and chromosomes/*
    # Can't make refGene upstream sequence files - no refSeq for yakuba.
    mkdir $gp/database
    # Create README.txt files in database/ to explain the files.


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 11/15/05 angie)
    ssh kkr1u00
    # Chrom-level 2bit that has been repeat- and trf-masked:
    mkdir /iscratch/i/droYak2
    cp -p /cluster/data/droYak2/droYak2.2bit /iscratch/i/droYak2/
    iSync


# PRODUCING GENSCAN PREDICTIONS (DONE 11/15/05 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh hgwdev
    mkdir /cluster/data/droYak2/bed/genscan
    cd /cluster/data/droYak2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/droYak2/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    cp /dev/null genome.list
    foreach f ( `ls -1S /cluster/data/droYak2/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
#47
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para make jobList
    para time
#Completed: 47 of 47 jobs
#Average job time:                 109s       1.81m     0.03h    0.00d
#Longest finished job:             225s       3.75m     0.06h    0.00d
#Submission to last job:           396s       6.60m     0.11h    0.00d
    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".

    # Convert these to chromosome level files as so:
    ssh kkstore03
    cd /cluster/data/droYak2/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/droYak2/bed/genscan
    ldHgGene droYak2 genscan genscan.gtf
    hgPepPred droYak2 generic genscanPep genscan.pep
    hgLoadBed droYak2 genscanSubopt genscanSubopt.bed


# MAKE 11.OOC FILE FOR BLAT (DONE 11/15/05 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kkr1u00
    mkdir /cluster/bluearc/droYak2
    blat /cluster/data/droYak2/droYak2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/droYak2/11.ooc -repMatch=100
#Wrote 7237 overused 11-mers to /cluster/bluearc/droYak2/11.ooc
    cp -p /cluster/bluearc/droYak2/*.ooc /iscratch/i/droYak2/
    iSync


# AUTO UPDATE GENBANK MRNA RUN  (DONE 11/23/05 angie)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    make
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines:
# droYak2 (D. yakuba)
droYak2.serverGenome = /cluster/data/droYak2/droYak2.2bit
droYak2.clusterGenome = /iscratch/i/droYak2/droYak2.2bit
droYak2.ooc = /iscratch/i/droYak2/11.ooc
droYak2.lift = /cluster/data/droYak2/jkStuff/liftAll.lft
droYak2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
droYak2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
droYak2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
droYak2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
droYak2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
droYak2.refseq.mrna.native.load = no
droYak2.refseq.mrna.xeno.load = yes
droYak2.genbank.mrna.xeno.load = yes
droYak2.downloadDir = droYak2

    cvs ci etc/genbank.conf

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial droYak2 &

    # when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad droYak2 &
    featureBits droYak2 xenoRefGene
#27309563 bases of 162681153 (16.787%) in intersection
    featureBits droYak2 mrna
#376949 bases of 162681153 (0.232%) in intersection
    featureBits droYak2 xenoMrna
#23932293 bases of 162681153 (14.711%) in intersection
    featureBits droYak2 est
#1768531 bases of 162681153 (1.087%) in intersection


# SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 11/23/05 angie)
    mkdir /cluster/data/droYak2/bed/blastz.dm2.swap
    cd /cluster/data/droYak2/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droYak2/DEF \
      >& do.log &
    tail -f do.log
    # Add {chain,net}Dm2 to trackDb.ra if necessary.


# MAKE THIS THE DEFAULT ASSEMBLY (DONE 11/23/05 angie)
    # -- when there are enough tracks!
    hgsql -h genome-testdb hgcentraltest \
      -e 'UPDATE defaultDb set name = "droYak2" where genome = "D. yakuba"'

# MAKE Drosophila Proteins track (DONE braney 2005-12-07)
    ssh kkstore03
    mkdir -p /cluster/data/droYak2/blastDb
    cd /cluster/data/droYak2/blastDb
    awk "{print \$2}" ../*/chr*/*.lft > subChr.lst
    for i in `cat subChr.lst`; do ln -s ../*/*/$i.fa; done
    for i in *.fa; do /cluster/bluearc/blast2211x86_64/bin/formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log
    cd ..
    cat */chr*/*.lft > jkStuff/subChr.lft   

    ssh kk
    destDir=/cluster/panasas/home/store/droYak2/blastDb
    mkdir -p $destDir
    cp /cluster/data/droYak2/blastDb/* $destDir
    
    mkdir -p /cluster/data/droYak2/bed/tblastn.dm2FB
    cd /cluster/data/droYak2/bed/tblastn.dm2FB
    ls -1S $destDir/*.nsq | sed "s/\.nsq//" > target.lst
    mkdir fbfa
    # calculate a reasonable number of jobs 
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(80000/`wc target.lst | awk "{print \\\$1}"`\)
# 18929/(80000/417) = 98.667412

    split -l 99 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir -p /cluster/bluearc/droYak2/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/droYak2/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/droYak2/jkStuff/subChr.lft warn $f.2  
	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/droYak2/jkStuff/liftAll.lft warn $f.3  
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    para create blastSpec
    para push

# Completed: 80064 of 80064 jobs
# CPU time in finished jobs:    2324537s   38742.28m   645.70h   26.90d  0.074 y
# IO & Wait Time:                586197s    9769.96m   162.83h    6.78d  0.019 y
# Average job time:                  36s       0.61m     0.01h    0.00d
# Longest finished job:             348s       5.80m     0.10h    0.00d
# Submission to last job:         10775s     179.58m     2.99h    0.12d

    ssh kkstore03
    cd /cluster/data/droYak2/bed/tblastn.dm2FB
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | /cluster/bin/i386/simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push

# Completed: 192 of 192 jobs
# CPU time in finished jobs:      11624s     193.73m     3.23h    0.13d  0.000 y
# IO & Wait Time:                  3404s      56.74m     0.95h    0.04d  0.000 y
# Average job time:                  78s       1.30m     0.02h    0.00d
# Longest finished job:            1681s      28.02m     0.47h    0.02d
# Submission to last job:          1681s      28.02m     0.47h    0.02d

    cd /cluster/data/droYak2/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droYak2/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..
    wc blastDm2FB.psl
# 21136  443856 3623407 blastDm2FB.psl
    pslUniq blastDm2FB.psl stdout | wc                                                                                    
# 18837  395577 3306944
    cat fbfa/*fa | grep ">" | wc
# 82338   82338 1300520

    ssh hgwdev
    cd /cluster/data/droYak2/bed/tblastn.dm2FB
    hgLoadPsl droYak2 blastDm2FB.psl
    featureBits droYak2 blastDm2FB
# 21478785 bases of 162681153 (13.203%) in intersection
    exit

    # back to kkstore03
    rm -rf blastOut

# End tblastn


# CONTAMINATION (DONE 12/13/05 angie)
    # LaDeana emailed a list of 57 contaminated contigs on chrU -- 
    # make a track out of her coords.  She cleared them all to NNN's 
    # in a 2.1 release for FlyBase, but all coords are the same and 
    # we can just make a contamination track showing the affected 
    # chrU contigs, instead of redoing the whole db.
    mkdir /cluster/data/droYak2/bed/contamination
    cd /cluster/data/droYak2/bed/contamination
    # saved off LaDeana's emailed list to contamContigs.txt
    wc -l contamContigs.txt
#57 contamContigs.txt
    cp /dev/null contamination.bed
    foreach ctg (`cat contamContigs.txt`)
      fgrep -w $ctg ../../U/chrU.agp \
      | awk '{print $1 "\t" $2-1 "\t" $3 "\t" $6;}' >> contamination.bed
    end
    wc -l contamination.bed
#57 contamination.bed
    hgLoadBed droYak2 contamination contamination.bed


# SWAP CHAINS FROM DROSIM1, BUILD NETS ETC. (DONE 8/1/06 angie)
    mkdir /cluster/data/droYak2/bed/blastz.droSim1.swap
    cd /cluster/data/droYak2/bed/blastz.droSim1.swap
    doBlastzChainNet.pl -swap /cluster/data/droSim1/bed/blastz.droYak2/DEF \
      >& do.log &
    tail -f do.log
    ln -s blastz.droSim1.swap /cluster/data/droYak2/bed/blastz.droSim1


###########################################################################
# SWAP/CHAIN/NET DM3 (DONE 6/7/07 angie)
    ssh kkstore03
    mkdir /cluster/data/droYak2/bed/blastz.dm3.swap
    cd /cluster/data/droYak2/bed/blastz.dm3.swap
    doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.droYak2/DEF >& do.log &
    tail -f do.log
    ln -s blastz.dm3.swap /cluster/data/droYak2/bed/blastz.dm3

