# for emacs: -*- mode: sh; -*-


# Apis mellifera -- 
# 
# Baylor HGSC's January 20, 2005 (Amel_2.0) assembly
# http://www.hgsc.bcm.tmc.edu/projects/honeybee/
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +--------------+---------------------+
#  | tableName    | type                |
#  +--------------+---------------------+
#  | modelRefGene | genePred .          |
#  | ensGene      | genePred ensPep     |
#  | brhInparalog | genePred .          |
#  | geneid       | genePred geneidPep  |
#  | genscan      | genePred genscanPep |
#  +--------------+---------------------+


# DOWNLOAD SEQUENCE (DONE 2/1/05 Andy)
    ssh kksilo
    mkdir /cluster/store10/apiMel2
    cd /cluster/data
    ln -s /cluster/store10/apiMel2 apiMel2
    cd /cluster/data/apiMel2
    mkdir -p jkStuff bed downloads/fa downloads/contigs
    cd downloads
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/README_Amel_2.0_20050120.TXT
    cd contigs
    for grp in `seq 1 16` Un; do
      file=Group${grp}
      wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/contigs/${file}_20050120.agp
      wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/contigs/${file}_20050120.fa.gz
      mv ${file}_20050120.agp $file.agp
      mv ${file}_20050120.fa.gz $file.fa.gz     
      zcat $file.fa.gz  | egrep -v '^>\?[0-9]+$' | sed -e \
         's/gnl|Amel_2.0|//; s/Amel_2.0_Contig/Contig/' | awk '{print $1}' > $file.contigs.fa
      cat $file.contigs.fa >> allContigs.fa
      sed -e 's/gnl|Amel_2.0|//g; s/Amel_2.0_Contig/Contig/' $file.agp > new.agp
      mv new.agp $file.agp
      agpToFa -simpleMultiMixed $file.agp $file $file.fa $file.contigs.fa
      cat $file.fa >> all.fa
      mv $file.fa ../fa
      gzip $file.contigs.fa
    done
    rm *.fa.gz
    faSize allContigs.fa
#224752160 bases (1637 N's 224750523 real 224750523 upper 0 lower) in 16028 sequences in 1 files
#Total size: mean 14022.5 sd 22045.0 min 607 (Contig11541) max 314907 (Contig661) median 5126
#N count: mean 0.1 sd 0.8
#U count: mean 14022.4 sd 22044.9
#L count: mean 0.0 sd 0.0
    faSize all.fa
#502497597 bases (277747074 N's 224750523 real 224750523 upper 0 lower) in 17 seq uences in 1 files
#Total size: mean 29558682.2 sd 75264208.4 min 5608962 (Group16) max 321143811 (G roupUn) median 10665743
#N count: mean 16338063.2 sd 60963796.3
#U count: mean 13220619.0 sd 14657764.0
#L count: mean 0.0 sd 0.0
    
# CREATING DATABASE (DONE 1/27/05 Andy)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  641G 1019G  39% /var/lib/mysql
    hgsql '' -e 'create database apiMel2'

# PARTITION SCAFFOLDS FOR REPEATMASKER RUN (DONE 2/2/05 Andy)
    ssh kksilo
    cd /cluster/data/apiMel2
    mkdir faSplits
    for fa in downloads/fa/*.fa; do
        c=${fa#*Group}; c=Group${c%*.fa}
        mkdir faSplits/$c
        faSplit -lift=faSplits/$c.lft gap $fa 500000 faSplits/$c/${c}_
    done

# RUN REPEAT MASKER (DONE 2/3/05 Andy)
    # January '05 version of RepeatMasker and libs.
    ssh kksilo
    cd /cluster/data/apiMel2
    # Check the library for apis repeats.
    /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species apis -stat
#>IS1#ARTEFACT  Length = 768 bp
#>IS2#ARTEFACT  Length = 1331 bp
#>IS3#ARTEFACT  Length = 1258 bp
#>IS5#ARTEFACT  Length = 1195 bp
#>IS10#ARTEFACT  Length = 1329 bp
# .....
#>Mariner_AM#DNA/Mariner  Length = 937 bp
#>MARINA#DNA/Mariner  Length = 1267 bp
#
#Total Sequence Length = 47828 bp

    # So there's honeybee in the library now, so just run repeat mask with that
cat << "_EOF_" > jkStuff/RMApis
#!/bin/bash 

file=`basename $1`
grp=${file%_*}
chunk=${file%.fa}

cd /cluster/data/apiMel2/RMOut
mkdir -p /tmp/apiMel2/$chunk
cp ../faSplits/$grp/$file /tmp/apiMel2/$chunk/
pushd /tmp/apiMel2/$chunk
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec apis $file
popd
mkdir -p $grp
cp /tmp/apiMel2/$chunk/$file.out ./$grp/$chunk.out
rm -fr /tmp/apiMel2/$chunk/*
rmdir --ignore-fail-on-non-empty /tmp/apiMel2/$chunk
rmdir --ignore-fail-on-non-empty /tmp/apiMel2
_EOF_
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMApis
    mkdir RMRun RMOut
    cd RMRun/
    find ../faSplits/ -name '*.fa' > input.lst
    cat << "_EOF_" > gsub
#LOOP
../jkStuff/RMApis $(path1) {check in line+ $(path1)} {check out line+ ../RMOut/$(lastDir1)/$(root1).out}
#ENDLOOP
_EOF_
    gensub2 input.lst single gsub RMJobs
    # Do the cluster run
    ssh kk
    cd /cluster/data/apiMel2/RMRun
    para create RMJobs
    para try
    para push
    para time
#1077 jobs in batch
#44332 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 1077 of 1077 jobs
#CPU time in finished jobs:     120949s    2015.82m    33.60h    1.40d  0.004 y
#IO & Wait Time:                  5901s      98.34m     1.64h    0.07d  0.000 y
#Average job time:                 118s       1.96m     0.03h    0.00d
#Longest job:                      262s       4.37m     0.07h    0.00d
#Submission to last job:          2609s      43.48m     0.72h    0.03d
    # lift and load results 
    ssh kksilo
    cd /cluster/data/apiMel1/RMOut
    for grp in *; do
      cd $grp
      for out in *; do
         lifted=${out%.out}.lifted.out
         liftUp $lifted ../../faSplits/$grp.lft warn $out
         if [ -e ../$grp.out ]; then 
            tail +4 $lifted >> ../$grp.out
         else
            cp $lifted ../$grp.out
         fi         
      done
      cd ../
      if [ -e all.out ]; then 
         tail +4 $grp.out
      else
         cp $grp.out all.out
      fi
    done
    ssh hgwdev
    cd /cluster/data/apiMel1/RMOut
    hgLoadOut apiMel2 all.out
    hgsql apiMel2 -e 'rename table all_rmsk to rmsk'
    hgsql apiMel2 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(11), bin); \
                  create index genoStart on rmsk (genoName(11), genoStart); \
                  create index genoEnd   on rmsk (genoName(11), genoEnd);' 

# LOAD GAP & GOLD TABLES FROM AGP (DONE 2/3/2005 Andy)
    ssh hgwdev
    cd /cluster/data/apiMel2/downloads/contigs
    cat Group?{,?}.agp >> all.agp
    hgGoldGapGl -noGl apiMel2 all.agp
    rm all.agp
    # For some reason, the indices did not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    # *** Andy's note: the same thing happened in this assembly too.
    hgsql apiMel2 -e 'analyze table gold; analyze table gap;'

# MAKE GC5BASE WIGGLE TRACK (DONE 2/17/2005 Andy)
    ssh hgwdev
    mkdir /cluster/data/apiMel2/bed/gc5Base
    cp /cluster/data/apiMel2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 apiMel2 \
       /cluster/data/apiMel2 | wigEncode stdin gc5Base.wig gc5Base.wib
    mkdir /gbdb/apiMel2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/apiMel2/wib
    hgLoadWiggle -pathPrefix=/gbdb/apiMel2/wib apiMel2 gc5Base gc5Base.wig

# SIMPLE REPEATS (TRF) (DONE 2/3/2005 Andy)
    ssh kksilo
    mkdir -p /cluster/data/apiMel2/bed/simpleRepeat
    cd /cluster/data/apiMel2/bed/simpleRepeat
    for f in ../../downloads/fa/*; do
      grp=${f##*\/}; grp=${grp%.fa}
      echo nice /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=simpleRepeat.$grp.bed -tempDir=/tmp >> jobs.sh
    done
    chmod +x jobs.sh
    ./jobs.sh | egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
      > jobs.log 2> jobs.errors.log &
    # check on this with
    tail -f jobs.log
    for bed in *.bed; do 
       cat $bed >> simpleRepeat.bed
    done
    rm *Group*.bed
    # Load this into the database as so
    ssh hgwdev
    cd /cluster/data/apiMel2/bed/simpleRepeat
    hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \
       apiMel2 simpleRepeat simpleRepeat.bed


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 2/3/2005 Andy)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/apiMel/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 2/3/2005 Andy)
    ssh kksilo
    cd /cluster/data/apiMel2
    mkdir maskedFa
    cd maskedFa 
    cp ../downloads/fa/*.fa .
    for fa in *; do
       grp=${fa%.fa}
       egrep "^${grp}\>" ../bed/simpleRepeat/trfMask.bed > $grp.trf.bed
       maskOutFa -soft $fa $grp.trf.bed $grp.masked.fa
       maskOutFa -softAdd $grp.masked.fa ../RMOut/$grp.out $grp.masked.fa  
       rm $fa
       mv $grp.masked.fa $fa
    done
    rm *.bed

# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 2/4/05 Andy)
    # Translate to nib
    ssh kksilo
    cd /cluster/data/apiMel2
    mkdir nib
    for fa in maskedFa/*; do
       nib=${fa##*\/}; nib=nib/${nib%.fa}.nib
       faToNib -softMask $fa $nib
    done
    faToTwoBit maskedFa/Group?{,?}.fa apiMel2.2bit
    # Make links in /gbdb   
    ssh hgwdev
    mkdir -p /gbdb/apiMel2/nib
    cd /cluster/data/apiMel2/nib
    for nib in *; do 
       ln -s /cluster/data/apiMel2/nib/$nib /gbdb/apiMel2/nib/$nib
    done
    ln -s /cluster/data/apiMel2/apiMel2.2bit /gbdb/apiMel2/apiMel2.2bit
    # Load into database
    hgsql apiMel2 < ~/kent/src/hg/lib/chromInfo.sql
    cd ../
    hgNibSeq -preMadeNib apiMel2 /gbdb/apiMel2/nib /cluster/data/apiMel2/maskedFa/Group?{,?}.fa
    echo "select chrom,size from chromInfo" | hgsql apiMel2 | tail +2 \
      > /cluster/data/apiMel2/chrom.sizes

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2/4/2005 Andy)
    # Copy all the data from the table "grp" 
    # in an existing database to the new database
    ssh hgwdev
    hgsql apiMel2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 2/4/2005 Andy)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
             defaultPos, active, orderKey, genome, scientificName, \
             htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("apiMel2", "Jan. 2005", "/gbdb/apiMel2", "A. mellifera", \
             "Group10:100001-130000", 1, 57, \
             "A. mellifera", \
             "Apis mellifera", "/gbdb/apiMel2/html/description.html", \
             0, 0, "Baylor HGSC Amel_2.0");' \
      | hgsql -N hgcentraltest
    echo 'update defaultDb set name="apiMel2" where genome="A. mellifera"' \
      | hgsql -N hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs update
    # Edit trackDb/makefile to add apiMel1 to the DBS variable.
    mkdir apis/apiMel2
    # Create a simple apis/apiMel2/description.html file.
    cvs add apis/apiMel2
    cvs add apis/apiMel2/description.html
    make update DBS=apiMel2 ZOO_DBS=
    # go public on genome-test
    cvs ci -m "Added apiMel2 (Apis mellifera, honeybee)." makefile
    cvs ci -m "First-cut trackDb.ra and description.html for apiMel2 (Apis mellifera, honeybee)." apis
    mkdir /gbdb/apiMel1/html
    # in a clean, updated tree's kent/src/hg/makeDb/trackDb:
    make alpha

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 1/29/2005 Andy)
    ssh hgwdev
    echo 'insert into blatServers values("apiMel2", "blat14", "17792", 1, 0); \
          insert into blatServers values("apiMel2", "blat14", "17793", 0, 1);' \
      | hgsql -N hgcentraltest

# MAKE DOWNLOADABLE FILES (DONE 2/6/2005 Andy)
    ssh kksilo
    cd /cluster/data/apiMel2
    mkdir zips
    zip -j zips/chromOut.zip RMOut/Group?{,?}.out
    zip -j zips/chromFa.zip maskedFa/Group?{,?}.fa
    zip -j zips/GroupTrf.zip bed/simpleRepeat/trfMask.bed    
    zip -j zips/GroupAgp.zip downloads/contigs/*.agp
    for fa in maskedFa/Group?{,?}.fa; do
      maskOutFa $fa hard $fa.hardMasked
    done
    zip -j zips/chromFaMasked.zip maskedFa/Group?{,?}.fa.hardMasked
    cd /usr/local/apache/htdocs/goldenPath/apiMel2
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cp -p /cluster/data/apiMel2/zips/*.zip bigZips
    md5sum *.zip > md5sum.txt

# MAKE 11.OOC FILE FOR BLAT (DONE 2/8/2005 Andy)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kkr1u00
    mkdir /cluster/bluearc/apiMel2
    blat /cluster/data/apiMel2/apiMel2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/apiMel2/11.ooc -repMatch=100
#Wrote 39886 overused 11-mers to /cluster/bluearc/apiMel2/11.ooc
    cp -p /cluster/bluearc/apiMel2/*.ooc /iscratch/i/apiMel2/
    iSync

# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 2/8/2005 Andy)
    # First, agglomerate small scaffolds into chunks of ~500k median 
    # (many scaffolds are larger than that) so we don't have too many 
    # files for one dir, but keep a reasonably low job run time:
    ssh kksilo
    cd /cluster/data/apiMel2
    ssh kkr1u00
    cp -pR /cluster/data/apiMel2/faSplits /iscratch/i/apiMel2/
    cp -p /cluster/data/apiMel2/apiMel2.2bit /iscratch/i/apiMel2/
    cp -pR /cluster/data/apiMel2/nib /iscratch/i/apiMel2
    iSync

# PRODUCING GENSCAN PREDICTIONS (DONE 1/28/2005 Andy)
    ssh kksilo
    # Make hard-masked scaffolds and split up for processing:
    cd /cluster/data/apiMel2
    mkdir hardMaskedFaSplits
    cd maskedFa/
    for fa in *.hardMasked; do
      grp=${fa%.fa.hardMasked}
      faSplit -lift=../hardMaskedFaSplits/${grp}.lft gap $fa 1000000 ../hardMaskedFaSplits/${grp}_
    done
    cd ../hardMaskedFaSplits
    cat Group?{,?}.lft > all.lft
    mkdir /cluster/data/apiMel2/bed/genscan
    cd /cluster/data/apiMel2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    ls -1S ../../hardMaskedFaSplits/*.fa > chunks.list
    cat << _EOF_ > gsub
#LOOP
gsBig {check in line+ \$(path1)} {check out line gtf/\$(root1).gtf} -trans={check out line pep/\$(root1).pep} -subopt={check out line subopt/\$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
_EOF_
    # << this line keeps emacs coloring happy
    gensub2 chunks.list single gsub jobList
    ssh kk
    cd /cluster/data/apiMel2/bed/genscan
    para create jobList
    para try
    para check
    para push 
#538 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 537 of 538 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:     113471s    1891.18m    31.52h    1.31d  0.004 y
#IO & Wait Time:                  7324s     122.07m     2.03h    0.08d  0.000 y
#Average job time:                 225s       3.75m     0.06h    0.00d
#Longest job:                    32966s     549.43m     9.16h    0.38d
#Submission to last job:         34674s     577.90m     9.63h    0.40d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    # Wow, 1.2M and 1M crashed too.  Dropped down to .6M:
    ssh kolossus
    cd /cluster/data/apiMel2/bed/genscan
    gsBig ../../hardMaskedFaSplits/Group9_05.fa gtf/Group9_05.gtf \
        -trans=pep/Group9_05.pep -subopt=subopt/Group9_05.bed \
        -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat \
        -tmp=/tmp -window=600000
    # Concatenate scaffold-level results:
    ssh kksilo
    cd /cluster/data/apiMel2/bed/genscan

    # Lift and cat things
    cat gtf/*.gtf > /tmp/genscan.gtf
    liftUp genscan.gtf ../../hardMaskedFaSplits/all.lft warn /tmp/genscan.gtf
    cat subopt/*.bed > /tmp/genscanSubopt.bed
    liftUp genscanSubopt.bed ../../hardMaskedFaSplits/all.lft warn /tmp/genscanSubopt.bed
    cat pep/*.pep > genscan.pep
    # Clean up
    rm -r /cluster/data/apiMel2/hardMaskedFaSplits 
    # Change gene names around a little (errored on loading with this)
    #sed 's/\_[0-9Un]\{1,2\}//g' < genscan.gtf > /tmp/genscan.gtf 
    #cp /tmp/genscan.gtf .
    #sed 's/\_[0-9Un]\{1,2\}//g' < genscan.pep > /tmp/genscan.pep 
    #cp /tmp/genscan.pep .
    #sed 's/\_[0-9Un]\{1,2\}//g' < genscanSubopt.bed > /tmp/genscanSubopt.bed
    #cp /tmp/genscanSubopt.bed .

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/apiMel2/bed/genscan
    ldHgGene -gtf apiMel2 genscan genscan.gtf
    hgPepPred apiMel2 generic genscanPep genscan.pep
    hgLoadBed apiMel2 genscanSubopt genscanSubopt.bed

# GENBANK mRNA AND EST COUNTS (DONE 2/14/2005 Andy)
    # Go to the latest GenBank full release dir and get an idea of how
    # many mRNAs and ESTs there are to align.
    ssh eieio
    cd /cluster/data/genbank/data/processed/genbank.145.0/full
    awk '$4 == "Apis" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c
#    199 Apis mellifera
    awk '$4 == "Apis" {print $4 " " $5;}' est*.gbidx | sort | uniq -c
#  24643 Apis mellifera

# AUTO UPDATE GENBANK MRNA RUN  (DONE 2/8/2005 Andy)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .

# apiMel2 (A. mellifera)
apiMel2.genome = /iscratch/i/apiMel2/nib/*.nib
apiMel2.lift = no
apiMel2.refseq.mrna.native.load = no
apiMel2.genbank.mrna.xeno.load = yes
apiMel2.genbank.est.xeno.load = no
apiMel2.downloadDir = apiMel2
apiMel2.perChromTables = no

    cvs commit -m 'apiMel2 added to genbank update.' etc/genbank.conf 
    # Edit src/align/gbBlat to add /iscratch/i/apiMel2/11.ooc
    cvs commit -m '' src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server
    ssh `fileServer /cluster/data/genbank/`
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial apiMel2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad apiMel2 &
    featureBits apiMel2 all_mrna
#171268 bases of 249601222 (0.069%) in intersection
    featureBits apiMel2 xenoMrna
#7237381 bases of 249601222 (2.900%) in intersection
    # Clean up:
    rm -rf work/initial.apiMel2

    # -initial for ESTs
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -initial apiMel2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 apiMel2 &
    featureBits apiMel2 all_est
#5809581 bases of 249601222 (2.328%) in intersection
    # Clean up:
    rm -rf work/initial.apiMel2

# SWAP DM2-APIMEL2 BLASTZ (DONE 2/9/2005 Andy)
# OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below...

# CHAIN MELANOGASTER BLASTZ (DONE 2/9/2005)
# OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below...

# NET MELANOGASTER BLASTZ (DONE 2/9/2005 Andy)
# OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below...

# MAKE AXTNET (DONE 2/10/2005 Andy)
# OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below...

# MAKE VSDM2 DOWNLOADABLES (DONE 2/10/2005 Andy)
# OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below...


# MAKE Drosophila Proteins track  (DONE 2005-02-24 braney)
    ssh kksilo
    mkdir -p /cluster/data/apiMel2/blastDb
    cd /cluster/data/apiMel2/blastDb
    ln -s /cluster/data/apiMel2/fa*/*/*.fa .
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/apiMel2/blastDb
    cd /cluster/data/apiMel2/blastDb
    cp * /iscratch/i/apiMel2/blastDb
    iSync > sync.out
    
    mkdir -p /cluster/data/apiMel2/bed/tblastn.dm1FB
    cd /cluster/data/apiMel2/bed/tblastn.dm1FB
    ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
    exit

    # back to kksilo
    cd /cluster/data/apiMel2/bed/tblastn.dm1FB
    mkdir fbfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
    # 18735/(150000/1077) = 134.517300
    split -l 135 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2  
	liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.3

        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 bug.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/apiMel2/bed/tblastn.dm1FB
    para create blastSpec
    para try, push

# Completed: 149703 of 149703 jobs
# CPU time in finished jobs:    4747336s   79122.27m  1318.70h   54.95d  0.151 y
# IO & Wait Time:               1196093s   19934.88m   332.25h   13.84d  0.038 y
# Average job time:                  40s       0.66m     0.01h    0.00d
# Longest job:                      221s       3.68m     0.06h    0.00d
# Submission to last job:          7580s     126.33m     2.11h    0.09d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    para create chainSpec

    # should run this on the mini-cluster or with my shove script
    # so you can limit the number of jobs starting to 3 or 4
    para try, push...

# Completed: 139 of 139 jobs
# CPU time in finished jobs:       3259s      54.32m     0.91h    0.04d  0.000 y
# IO & Wait Time:                  3745s      62.41m     1.04h    0.04d  0.000 y
# Average job time:                  50s       0.84m     0.01h    0.00d
# Longest job:                      930s      15.50m     0.26h    0.01d
# Submission to last job:          1580s      26.33m     0.44h    0.02d

    exit
    # back to kksilo
    cd /cluster/data/apiMel2/bed/tblastn.dm1FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq  >  /cluster/data/apiMel2/bed/tblastn.dm1FB/blastDm1FB.psl

    ssh hgwdev
    cd /cluster/data/apiMel2/bed/tblastn.dm1FB

    hgLoadPsl apiMel2 blastDm1FB.psl

    #once everything checks out, rm the temp dir
    rm -rf blastOut
# End tblastn

# MAKE SCAFFOLDS TRACK (DONE 2/23/2005 Andy)
    # NOTE: all intermediate files from this run were immediately cleaned up.
    # To see more details about the alignments, see the scaffolds2 directory 
    # (RE-ALIGN SCAFFOLDS below).
    ssh kksilo
    cd /cluster/data/apiMel2/bed
    mkdir scaffolds
    cd scaffolds
    for g in `seq 1 16` Un; do 
       wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/linearScaffolds/Group${g}_20050120.fa.gz; 
       gunzip Group${g}_20050120.fa.gz
       nice blat -fastMap -maxIntron=0 -dots=1000000 ../../apiMel2.2bit:Group$g -ooc=../../11.ooc Group${g}_20050120.fa Group${g}.psl
       perfectBlatBed4 Group${g}.psl Group${g}.scaffolds.bed
    done
    # Fix up the Group10, and GroupUn ones by hand.  Checking line counts for each bed file (Y1) with GroupX.Y2 last record,
    # Y1 should = Y2.  GroupUn had two full-length hits to GroupUn.1669, so I deleted the one that didn't seem to jive.
    rm *.fa *.psl
    cat *.bed >> scaffolds.bed
    rm Group*.bed
    sed 's/gnl|.*|//' scaffolds.bed > tmp
    mv tmp scaffolds.bed
    hgLoadBed apiMel2 scaffolds scaffolds.bed
    # trackDb.ra entry:
track scaffolds
shortLabel Scaffolds
longLabel Assembly from Scaffolds
group map
priority 9.5
visibility hide
color 99,66,20
altColor 199,147,35
type bed 4 .
    # The description needs some editing.

# MAKE Human Proteins track (hg17 DONE 2005-03-26 braney)
    ssh kolossus
    mkdir -p /cluster/data/apiMel2/blastDb
    cd /cluster/data/apiMel2/blastDb
    ln -s /cluster/data/apiMel2/faSplits/*/*.fa .
    for i in *.fa
    do
	formatdb -i $i -p F
    done
    rm *.log *.fa

    ssh kkr1u00
    mkdir -p /iscratch/i/apiMel2/blastDb
    cd /cluster/data/apiMel2/blastDb
    cp * /iscratch/i/apiMel2/blastDb
    iSync > sync.out
    exit # back to kolossus

    mkdir -p /cluster/data/apiMel2/bed/tblastn.hg17KG
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > query.lst

    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(350000/1077) = 129.720034
    mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/kgfa
    split -l 130 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut 
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2  
	liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 350025 of 350025 jobs
# CPU time in finished jobs:   18368399s  306139.99m  5102.33h  212.60d  0.582 y
# IO & Wait Time:               8819533s  146992.21m  2449.87h  102.08d  0.280 y
# Average job time:                  78s       1.29m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5725s      95.42m     1.59h    0.07d
# Submission to last job:         37495s     624.92m    10.42h    0.43d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 325 of 325 jobs
# CPU time in finished jobs:       9334s     155.56m     2.59h    0.11d  0.000 y
# IO & Wait Time:                 25507s     425.12m     7.09h    0.30d  0.001 y
# Average job time:                 107s       1.79m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2999s      49.98m     0.83h    0.03d
# Submission to last job:          3888s      64.80m     1.08h    0.04d

    exit
    # back to kolossus
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/apiMel2/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    hgLoadPsl apiMel2 blastHg17KG.psl

    # back to kolossus
    rm -rf blastOut
# End tblastn


# RE-ALIGN SCAFFOLDS (DONE 3/30/05 angie)
    # Andy cleaned up so well after his scaffold alignment that all of the 
    # strand info is gone, but I need that to make the liftUp file below...
    # would be good to include it in the track as well, if there are both 
    # + and - alignments.  
    ssh kolossus
    mkdir /cluster/data/apiMel2/bed/scaffolds2
    cd /cluster/data/apiMel2/bed/scaffolds2
    foreach g (`seq 1 16` Un)
      wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/linearScaffolds/Group${g}_20050120.fa.gz
    end
    faSize *.fa.gz
#228567597 bases (3817074 N's 224750523 real 224750523 upper 0 lower) in 15293 sequences in 17 files
#Total size: mean 14945.9 sd 83937.6 min 0 (?35000) max 2209875 (gnl|Amel_2.0|Group12.11) median 771
    # Have to uncompress for blat.
    gunzip *.gz
    ssh kki
    cd /cluster/data/apiMel2/bed/scaffolds2
    mkdir psl
    ls -1S *.fa | sed -e 's/_20050120.fa//' > group.lst
    cat > gsub <<EOF
#LOOP
/cluster/bin/x86_64/blat -fastMap -maxIntron=0 /iscratch/i/apiMel2/apiMel2.2bit:\$(root1) -ooc=/iscratch/i/apiMel2/11.ooc {check in line+ \$(root1)_20050120.fa} {check out line+ psl/\$(root1).psl}
#ENDLOOP
EOF
    gensub2 group.lst single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 17 of 17 jobs
#Average job time:                  86s       1.44m     0.02h    0.00d
#Longest job:                      248s       4.13m     0.07h    0.00d
#Submission to last job:           248s       4.13m     0.07h    0.00d
    # EXPERIMENT: try psLayout since it was recommended for this purpose:
    mkdir psLayoutIn
    ln -s /cluster/data/apiMel2/maskedFa/*.fa psLayoutIn
    foreach f (/cluster/data/apiMel2/RMOut/*.out)
      ln -s $f psLayoutIn/$f:t:r.fa.out
    end
    mkdir psLayout
    cat > gsub <<EOF
#LOOP
psLayout {check in line+ psLayoutIn/\$(root1).fa} {check in line+ \$(root1)_20050120.fa} g2g /iscratch/i/apiMel2/11.ooc {check out line+ psLayout/\$(root1).psl}
#ENDLOOP
EOF
    gensub2 group.lst single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 16 of 17 jobs
#Crashed: 1 jobs
#Average job time:                  41s       0.68m     0.01h    0.00d
#Longest job:                      145s       2.42m     0.04h    0.00d
#Submission to last job:           145s       2.42m     0.04h    0.00d
    # It wasn't happy with GroupUn:
    para problems
#[host kkr2u00 ] Too many blocks, can only handle 471039
    # Compare pslReps-filtered Group1 results:
    # back on kolossus:
    pslReps -nohead psl/Group1.psl Group1.reps.psl /dev/null
    pslReps -nohead psLayout/Group1.psl Group1.reps.psL /dev/null
    grep Group Group1_20050120.fa | wc -l
#     64
    wc -l Group1.reps.*
#     68 Group1.reps.psL
#     64 Group1.reps.psl
    less Group1.reps.psl Group1.reps.psL
    # 64 scaffold sequences in Group1_20050120.fa, and 64 gapless alignments 
    # (1 per sequence) in the filtered blat -fastMap output (.psl) which is 
    # exactly what we need.  The filtered psLayout output (.psL) has gaps 
    # and a few cases of multiple alignments per scaffold... not as easy to 
    # translate into a scaffolds track / liftUp file.  
    # So use the blat -fastMap results.
    # END EXPERIMENT

    # Use pslReps to pick the best alignment for each scaffold.  
    mkdir pslReps
    foreach f (psl/*.psl)
      pslReps -nohead -noIntrons -ignoreNs -minCover=0.99 -singleHit $f \
        pslReps/$f:t /dev/null
    end
    # Get counts of scaffolds per group:
    foreach f ( *.fa )
     echo -n $f " "
     grep Group $f | wc -l
    end
    # Compare with alignment counts:
    wc -l pslReps/*.psl
    # The only one that ended up with too many is GroupUn:
#  7202 pslReps/GroupUn.psl
    # Get a list of scaffolds with multiple alignments that passed pslReps 
    # and examine some:
    awk '{print $10;}' pslReps/GroupUn.psl | sort | uniq -c \
    | awk '$1 != 1 {print;}' | sort -nr
#      2 gnl|Amel_2.0|GroupUn.5526
#      2 gnl|Amel_2.0|GroupUn.1699
#      2 gnl|Amel_2.0|GroupUn.1669
    egrep 'GroupUn.(5526|1699|1669)' pslReps/GroupUn.psl 
    # Wow, GroupUn.1669 has two perfect mappings, one to 
    # GroupUn:26535330-26536289 and one to GroupUn:108268474-108269435
    # The former is completely contained in GroupUn.191 (Contig7152).
    # The latter fits perfectly between two gaps ==> edit out the former.
    # GroupUn.5526 and .1699 have imperfect mappings that slipped in 
    # under the pslReps wire -- edit out the ones with multiple blocks.

    # Having whittled down the psl to one perfect alignment per scaffold,
    # make bed 6 to preserve the strand info.  
    awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $14, $16, $17, $10, $9;}' \
      pslReps/*.psl \
    | sed 's/gnl|Amel_2.0|//' > scaffolds.bed
    # Compare coordinates to Andy's bed4 from previous run:
    sort ../scaffolds/scaffolds.bed > /tmp/1
    awk '{print $1 "\t" $2 "\t" $3 "\t" $4;}' scaffolds.bed | sort > /tmp/2
    diff /tmp/1 /tmp/2 | wc -l
#0
    # OK, now after all that fuss, are there any - strand alignments?
    awk '{print $6;}' scaffolds.bed | sort | uniq
#+
    # No!  So no need to reload the scaffolds track.  However, we can 
    # proceed with confidence to make a strandless liftUp file.  
    # (liftUp doesn't support all formats when there are - strand items)
    # Cleanup:
    gzip *.fa psl/*.psl
    rm *.tab


# MAKE SCAFFOLDS -> GROUPS LIFTUP FILE (DONE 3/30/05 angie)
    ssh kolossus
    cd /cluster/data/apiMel2/bed/scaffolds2
    cp /dev/null ../../jkStuff/scaffolds.lft
    foreach g (`awk '{print $1;}' ../../chrom.sizes`)
      set size = `awk '$1 == "'$g'" {print $2;}' ../../chrom.sizes`
      echo $g $size
      awk '$1 == "'$g'" {printf "%d\t%s\t%d\t%s\t%d\n", $2, $4, ($3 - $2), $1, '$size';}' \
        scaffolds.bed >> ../../jkStuff/scaffolds.lft
    end
    wc -l scaffolds.bed ../../jkStuff/scaffolds.lft
#  7655 scaffolds.bed
#  7655 ../../jkStuff/scaffolds.lft


# HOMOLOGY GENE PREDICTIONS FROM ZDOBNOV (DONE 5/13/05 angie)
    # Updated data -- GFF loads cleanly now, and is split into high-
    # and low-confidence sets.
    ssh hgwdev
    mkdir /cluster/data/apiMel2/bed/brhInparalog
    cd /cluster/data/apiMel2/bed/brhInparalog
    wget http://azra.embl-heidelberg.de/~zdobnov/Bee2/amel_set1.gff.txt
    wget http://azra.embl-heidelberg.de/~zdobnov/Bee2/amel_set2.gff.txt
    # Discard header line, change "exon" to "CDS", and lift up from 
    # scaffolds to group coords:
    tail +2 amel_set1.gff.txt | sed -e 's/exon/CDS/' \
    | liftUp brhInparalog.gff ../../jkStuff/scaffolds.lft warn stdin
    tail +2 amel_set2.gff.txt | sed -e 's/exon/CDS/' \
    | liftUp brhInparalogLow.gff ../../jkStuff/scaffolds.lft warn stdin
    ldHgGene apiMel2 brhInparalog brhInparalog.gff
    ldHgGene apiMel2 brhInparalogLow brhInparalogLow.gff


# ENSEMBL GENES (DONE 4/12/05 angie)
    ssh hgwdev
    mkdir /cluster/data/apiMel2/bed/ensGene
    cd /cluster/data/apiMel2/bed/ensGene
    wget ftp://ftp.ensembl.org/pub/misc/vvi/ensembl_genes_8Apr05.tar.gz
    wget ftp://ftp.ensembl.org/pub/misc/vvi/ensembl_translations8Apr05.fa.gz
    tar xvzf ensembl_genes_8Apr05.tar.gz
    zcat ensembl_translations8Apr05.fa.gz \
    | perl -wpe 's/^>\S+ Translation id (ENSAPMT\d+) .*/>$1/' \
    > ensPep.fa
    ldHgGene -gtf -genePredExt apiMel2 ensGene gtf_dump*
    hgPepPred apiMel2 generic ensPep ensPep.fa


# NCBI ANNOTATIONS (XM_* PREDICTED REFSEQS) (REDONE 5/26/05 angie)
    # Originally done 5/17; reloaded w/new data from Barbara Ruef @ncbi 5/26.
    ssh hgwdev
    mkdir /cluster/data/apiMel2/bed/ncbi
    cd /cluster/data/apiMel2/bed/ncbi
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/Amel2.0_Scaffolds_NCBI_BCM.map
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/NCBI.gff.gz
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/NCBI-mrna.fa.gz
    wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/NCBI-prot.fa.gz
    # NCBI has their own scaffold IDs... make a liftUp spec file from 
    # scaffolds.lft and their mapping.
    perl -e 'open(MAP, "Amel2.0_Scaffolds_NCBI_BCM.map") || die doh; \
             %b2n = (); \
             while (<MAP>) { \
               chomp;  my ($ncbi, undef, $baylor) = split; \
               $b2n{$baylor} = $ncbi; \
             } \
             while (<>) { \
               my @words = split; \
               next if (! exists $b2n{$words[1]}); \
               $words[1] = $b2n{$words[1]}; \
               print join("\t", @words) . "\n"; \
             }' ../../jkStuff/scaffolds.lft > ../../jkStuff/ncbiScaffolds.lft
    liftUp -type=.gff apiMel2.RefSeq.gff ../../jkStuff/ncbiScaffolds.lft warn \
      NCBI.gff.gz
    # Some warnings about NC_001566.1, the mitochondrion, which wasn't 
    # included in apiMel2 -- OK.  
    # At first I wasn't sure if we wanted to load this up, so I made it 
    # available as a custom track file
    # http://genome-test.soe.ucsc.edu/angie/apiMel2.RefSeq.gff
    ln -s /cluster/data/apiMel2/bed/ncbi/apiMel2.RefSeq.gff \
      /usr/local/apache/htdocs/angie/
    # But Jim said that we should show these, so load it up:
    ldHgGene -gtf -genePredExt apiMel2 modelRefGene apiMel2.RefSeq.gff
    # The proteins XP_* don't seem to correspond to the XM_* genes with 
    # the same numbers, so don't load them up -- the predicted proteins 
    # are straight from genomic sequence anyway.  


# EXONERATE TRACK (IN PROGRESS 5/23/2005 Andy)
    ssh kk
    mkdir /cluster/data/apiMel2/bed/exonerateDm2
    cd /cluster/data/apiMel2/bed/exonerateDm2
    cat << EOF > exonerate.sh
#!/bin/bash
dirGroup=\`dirname \$2\`
group=\`basename \$dirGroup\`
gSplit=\`basename \$2 .fa\`
pSplit=\`basename \$1 .fa\`
outFile=\${gSplit}_\${pSplit}.gff

alias exonerate=/cluster/home/aamp/bin/i386/exonerate
exonerate --model  protein2genome --percent 50 --showvulgar false --showalignment false --showtargetgff true \$1 \$2 | grep -v "^Command" | grep -v "^Hostname" | grep -v "^#" | grep -v "^--" > /tmp/\$outFile
mkdir -p /cluster/bluearc/apiMel2/exonerateOutput/\$group
cp /tmp/\$outFile \$3
rm /tmp/\$outFile
EOF
    find /iscratch/i/apiMel2/faSplits/ -name '*.fa' > honey.lst
    mkdir /santest/scratch/dm2
    pushd /santest/scratch/dm2
    cp /cluster/data/dm2/bed/flybase/flybasePep.fa .
    faSplit sequence flybasePep.fa 25 fly_
    rm flybasePep.fa
    popd
    find /santest/scratch/dm2 -name '*.fa' > fly.lst
    cat << EOF > gsub
#LOOP
./exonerate.sh {check in line+ \$(path2)} {check in line+ \$(path1)} {check out exists /cluster/bluearc/apiMel2/exonerateOutput/\$(lastDir1)/\$(root1)_\$(root2).gff}
#ENDLOOP
EOF
    gensub2 honey.lst fly.lst gsub spec
    para create spec
    para try
    para check
#19386 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 19386 of 19386 jobs
#CPU time in finished jobs:    1349378s   22489.63m   374.83h   15.62d  0.043 y
#IO & Wait Time:                192915s    3215.25m    53.59h    2.23d  0.006 y
#Average job time:                  80s       1.33m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            7474s     124.57m     2.08h    0.09d
#Submission to last job:         14456s     240.93m     4.02h    0.17d
     ssh hgwdev
     cd /cluster/data/apiMel2/bed/exonerateDm2
     find /panasas/store/apiMel2/exonerateOutput -name "*.gff" -size +0b -exec cat '{}' ';' > dm2.gff
     ssh kk
     cd /cluster/data/apiMel2/bed/exonerateDm2
     cat /iscratch/i/apiMel2/faSplits/*.lft > liftAll.lft
     ssh hgwdev
     cd /cluster/data/apiMel2/bed/exonerateDm2
     liftUp fly.gff liftAll.lft warn dm2.gff 
     ldHgGene apiMel2 exonerateDm2 fly.gff 
#Reading fly.gff
#Read 0 transcripts in 52886 lines in 1 files
#  0 groups 17 seqs 1 sources 6 feature types
#0 gene predictions
      # Yeah that's no good.

# MAKE Drosophila Proteins track (DONE 06-30-05 braney)
    ssh kk
    mkdir -p /cluster/data/apiMel2/bed/tblastn.dm2FB
    cd /cluster/data/apiMel2/bed/tblastn.dm2FB
    ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
    mkdir fbfa
    # calculate a reasonable number of jobs 
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\)
    # 18929/(164630/1077) = 123.832430
    split -l 124 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/apiMel2/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2  
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.3
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/apiMel2/bed/tblastn.dm2FB
    para create blastSpec
    para push

# Completed: 164781 of 164781 jobs
# CPU time in finished jobs:    2133855s   35564.25m   592.74h   24.70d  0.068 y
# IO & Wait Time:               1168861s   19481.02m   324.68h   13.53d  0.037 y
# Average job time:                  20s       0.33m     0.01h    0.00d
# Longest finished job:             276s       4.60m     0.08h    0.00d
# Submission to last job:         39379s     656.32m    10.94h    0.46d

    exit
    # back to kksilo

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/apiMel2/bed/tblastn.dm2FB
    para create chainSpec
    para push
 
# Completed: 153 of 153 jobs
# CPU time in finished jobs:       8279s     137.98m     2.30h    0.10d  0.000 y
# IO & Wait Time:                  4933s      82.22m     1.37h    0.06d  0.000 y
# Average job time:                  86s       1.44m     0.02h    0.00d
# Longest finished job:            3445s      57.42m     0.96h    0.04d
# Submission to last job:          3823s      63.72m     1.06h    0.04d

    cd /cluster/data/apiMel2/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/apiMel2/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/apiMel2/bed/tblastn.dm2FB
    hgLoadPsl apiMel2 blastDm2FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn

# MAKE Human Proteins track (DONE 06-30-05 braney)
    mkdir -p /cluster/data/apiMel2/bed/tblastn.hg17KG
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst

    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(150000/1077) = 268.280700

    split -l 268 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    rm -rf  /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
    mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
    ln -s  /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2  
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3
        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 150780 of 150780 jobs
# CPU time in finished jobs:   14906565s  248442.75m  4140.71h  172.53d  0.473 y
# IO & Wait Time:               3654875s   60914.58m  1015.24h   42.30d  0.116 y
# Average job time:                 123s       2.05m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             610s      10.17m     0.17h    0.01d
# Submission to last job:        122371s    2039.52m    33.99h    1.42d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 140 of 140 jobs
# CPU time in finished jobs:       5416s      90.27m     1.50h    0.06d  0.000 y
# IO & Wait Time:                 11122s     185.37m     3.09h    0.13d  0.000 y
# Average job time:                 118s       1.97m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1427s      23.78m     0.40h    0.02d
# Submission to last job:          1696s      28.27m     0.47h    0.02d

    ssh kkstore01
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/apiMel2/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/apiMel2/bed/tblastn.hg17KG
    hgLoadPsl apiMel2 blastHg17KG.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# BEEBASE GENE MODELS (DONE 7/27/05 angie)
    mkdir /cluster/data/apiMel2/bed/beeBase
    cd /cluster/data/apiMel2/bed/beeBase
    set bbpassword = # emailed from Chris Elsik at TAMU
    wget ftp://guest:$bbpassword@trixie.tamu.edu/official_gene_sets/Amel_abinitio.gff
    wget ftp://guest:$bbpassword@trixie.tamu.edu/official_gene_sets/Amel_predicted_glean3.gff
    sed -e 's/gene_id //' Amel_abinitio.gff \
    | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \
    | ldHgGene apiMel2 beeBaseAbinitio stdin
    perl -wpe 's/GenePrediction (\w+-\w+).*/$1/' Amel_predicted_glean3.gff \
    | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \
    | ldHgGene apiMel2 beeBaseGlean3 stdin
    # Make a custom track for Hugh Robertson and Chris to use in the meantime 
    # (until they can send more complete descriptions).
    echo 'track name=beeBaseGlean3 description="BeeBase GLEAN3 Gene Models" color=175,175,0 grp=genes priority=41 visibility=pack' > beeBaseCustomTracks.txt
    perl -wpe 's/GenePrediction (\w+-\w+).*/$1/' Amel_predicted_glean3.gff \
    | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \
    >> beeBaseCustomTracks.txt
    echo 'track name=beeBaseAbinitio description="BeeBase Ab Initio Gene Models" color=150,120,10 grp=genes priority=42 visibility=pack' >> beeBaseCustomTracks.txt
    sed -e 's/gene_id //' Amel_abinitio.gff \
    | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \
    >> beeBaseCustomTracks.txt
    cp beeBaseCustomTracks.txt /usr/local/apache/htdocs/angie/


# GENEID PREDICTIONS (DONE 1/4/06 angie)
    ssh hgwdev
    mkdir /cluster/data/apiMel2/bed/geneid
    cd /cluster/data/apiMel2/bed/geneid
    foreach chr (`awk '{print $1;}' /cluster/data/apiMel2/chrom.sizes`)
      wget http://genome.imim.es/genepredictions/A.mellifera/golden_path_200501/geneidv1.2/$chr.gtf
    end
    ldHgGene -gtf -genePredExt apiMel2 geneid *.gtf


# SWAP/CHAIN/NET DM2 (DONE 1/13/06 angie)
    # dm2-apiMel2 was redone with better parameters in May '05... 
    # pick those up and also get the latest file conventions.
    ssh hgwdev
    rm /cluster/data/apiMel2/bed/blastz.dm2
    mkdir /cluster/data/apiMel2/bed/blastz.dm2.swap
    cd /cluster/data/apiMel2/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.apiMel2/DEF >& do.log &
    tail -f do.log
    ln -s blastz.dm2.swap /cluster/data/apiMel2/bed/blastz.dm2


