# for emacs: -*- mode: sh; -*-


# Bos taurus 
# Btau_1.0 from Baylor
# September 2004
# contigs and scaffolds, no chromosomes
# WGS, Atlas assembler, 3x coverage
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze
# anonymous ftp, no login/password needed

# DOWNLOAD SEQUENCE (DONE Sept. 30, 2004 Heather)
    ssh kksilo
    mkdir /cluster/store8/bosTau1
    cd /cluster/data
    ln -s /cluster/store8/bosTau1 bosTau1
    cd /cluster/data/bosTau1
    mkdir downloads
    cd downloads

    # we won't use these, but download them for completeness
    mkdir Bin0
    cd Bin0
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/Bin0/Btau20040927-freeze-bin0.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/Bin0/Btau20040927-freeze-bin0.fa.qual.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/Bin0/Btau_lib_insert_size_20040927.tbl.gz
    cd ..

    mkdir contigs
    cd contigs
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/Btau20040927-freeze-contigs.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/Btau20040927-freeze-contigs.fa.qual.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/Btau_v1.0_20040927.agp.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/contigs/AAFC01_accs.gz
    cd ..

    mkdir linearScaffolds
    cd linearScaffolds
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/linearScaffolds/Btau20040927-freeze-assembly.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/linearScaffolds/Btau20040927-freeze-assembly.fa.qual.gz
    cd ..

    mkdir repeat-reads
    cd repeat-reads
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/repeat-reads/Btau20040927-freeze-repeat-reads.fa.gz
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20040927-freeze/repeat-reads/Btau20040927-freeze-repeat-reads.fa.qual.gz
    cd ..

# GUNZIP
  gunzip */*.gz

# CHECK FILES

# Check that IDs in .fa match IDs in .qual
  
  ssh kksilo
  cd /cluster/data/bosTau1/downloads
  # foreach Bin0 contigs linearScaffolds repeat-reads
  grep ">" *.fa > id.fa
  grep ">" *.qual > id.qual
  diff id.fa id.qual

  # this takes about 5-10 minutes
  /cluster/bin/i386/checkAgpAndFa contigs/Btau_v1.0_20040927.agp linearScaffolds/Btau20040927-freeze >& check.out
  /cluster/bin/i386/checkAgpAndFa contigs/Btau_v1.0_20040927.agp linearScaffolds/Btau20040927-freeze > check.err

# SELECTED DOWNLOAD FILES (DONE, Feb. 9, 2005, Heather)

  ssh hgwdev

  cd /cluster/data/bosTau1/downloads
  cd contigs
  cp *agp /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips/scaffoldAgp
  cd /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
  nice gzip scaffoldAgp

  # takes about 30 minutes for gzip
  cd /cluster/data/bosTau1/downloads
  cd scaffolds
  cp Btau20040927-freeze-assembly.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips/scaffoldFa
  cd /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
  nice gzip scaffoldFa


# SPLIT SCAFFOLDS (DONE Sep. 30, 2004, Heather)

  ssh eieio
  cd /cluster/data/bosTau1
  mkdir scaffolds
  /cluster/bin/i386/faSplit byname downloads/linearScaffolds/Btau20040927-freeze-assembly.fa scaffolds -outDirDepth=3
  # not sure why these didn't get put under scaffolds directory
  mv 1 scaffolds
  mv 2 scaffolds
  mv 3 scaffolds
  mv 4 scaffolds
  etc.
  # generate list
  find scaffolds -print > scaffolds.list.0
  grep fa scaffolds.list.0 > scaffolds.list

# MAKE 2BIT NIB FILE (DONE Oct.1, 2004, Heather)

  ssh kksilo
  cd /cluster/data/bosTau1/linearScaffolds
  /cluster/bin/i386/faToTwoBit Btau20040927-freeze-assembly.fa bosTau1.2bit
  /cluster/bin/i386/twoBitToFa bosTau1.2bit check.fa
  mv bosTau1.2bit ..
  diff -q Btau20040927-freeze-assembly.fa check.fa
  # could also use cmp
  # note: size match
  ssh hgwdev
  mkdir /gbdb/bosTau1
  mkdir /gbdb/bosTau1/nib
  # could get rid of the nib in this path, 2bit isn't a nib (4bit)
  ln -s /cluster/data/bosTau1/bosTau1.2bit /gbdb/bosTau1/nib

# CREATE DATABASE

  ssh hgwdev
  hgsql hg17
  create database bosTau1;
  use bosTau1;
  create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;
  # add rows to hgcentraltest on hgwbeta in dbDb and defaultDb
  # set orderKey past dog, before mouse

# CREATE GAP TABLE

  ssh hgwdev
  cd kent/src/hg/lib
  # remove bin column for now
  hgsql bosTau1 < gap2.sql
  cd /cluster/data/bosTau1/downloads/contigs
  grep fragment Btau_v1.0_20040927.agp > fragment.txt
  hgsql bosTau1
  load data local infile 'fragment.txt' into table gap
  # create trackDb/cow/bosTau1/gap.html

# CREATE CONTIG TABLE (DONE, Oct. 5, 2004, Heather)

  ssh hgwdev
  cd kent/src/hg/lib
  # this doesn't include strand or contig start/end
  hgsql bosTau1 < ctgPos2.sql
  cd /cluster/data/bosTau1/downloads/contigs
  grep Contig Btau_v1.0_20040927.agp > Contig.out
  ssh kksilo
  cd /cluster/data/bosTau1/downloads/contigs
  getContig.pl < Contig.out > ctgPos2
  ssh hgwdev
  load data local infile 'ctgPos2' into table ctgPos2
  # 9 skipped
  # Contig24811
  # Contig368174
  # Contig44331
  # Contig55201
  # Contig59195
  # Contig64664
  # Contig67471
  # Contig88377
  # Contig89200
  # 9 with size 0

# CREATE CHROMINFO TABLE (DONE Oct. 5, 2004, Heather)
# an improvement here would be to have getMaxCoord output the 2bit file name
  ssh hgwdev
  cd /cluster/data/bosTau1
  # get coordinates from agp and put in database table
  getcoords.pl < downloads/contigs/Btau_v1.0_20040927.agp > scaffolds.coords
  hgsql bosTau1 < coords.sql
  load data local infile 'scaffolds.coords' into table scaffoldCoords
  # calculate maximum coords; check that start coord is always 1
  /cluster/home/heather/bin/i386/GetMaxCoord > getMaxCoord.bosTau1
  # edit chromInfo.sql; allow fileName to be null
  cp ~heather/kent/src/hg/lib/chromInfo.sql .
  hgsql bosTau1 < chromInfo.sql
  load data local infile 'getMaxCoord.bosTau1' into table chromInfo
  update chromInfo set fileName = "/gbdb/bosTau1/nib/bosTau1.2bit"

# REPEATMASKER (DONE, Oct. 5, 2004, Heather)
# using the split scaffold fa files generated earlier

# note the version of RepeatMasker in use; will want this for download README
  cat /cluster/bluearc/RepeatMasker/Libraries/version
  # RepBase Update 9.04, RM database version 20040702

# do a trial run
  ssh hgwdev
  cd /cluster/data/bosTau1/scaffolds/0/0/0
  /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec bos SCAFFOLD1000.fa
# configure
  cd /cluster/data/bosTau1
  mkdir jkStuff
  cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMBosTaurus
  # change references anoGam1 --> bosTau1
  mkdir RMRun

  # /bin/csh makeJoblist-RM.csh

  cd scaffolds
  foreach i (0 1 2 3 4 5 6 7 8 9)
    cd $i
    foreach j (0 1 2 3 4 5 6 7 8 9)
      cd $j
      foreach k (0 1 2 3 4 5 6 7 8 9)
        cd $k
        foreach f (*.fa)
          echo /cluster/data/bosTau1/jkStuff/RMBosTaurus \
	       /cluster/data/bosTau1/scaffolds/$i/$j/$k $f \
	      '{'check out line+ /cluster/data/bosTau1/$i/$j/$k/$f.out'}' \
	  >> /cluster/data/bosTau1/RMRun/RMJobs
	end
      cd ..
      end
    cd ..
    end
  cd ..
  end

  # do the run
  ssh kk
  cd /cluster/data/bosTau1/RMRun
  para create RMJobs
  para try
  para push
  para check
  para time
  etc.

  # concatenate into one output file; took about 90 minutes
  ssh kksilo
  cd /cluster/data/bosTau1
  mkdir repeats
  /bin/tcsh concatRM.tcsh
  cd repeats
  # use grep -v to get rid of all the headers
  # then, add back in an initial header
  # this is so hgLoadOut is happy
  grep SCAFFOLD repeats.all > repeats.clean
  cat repeats.header repeats.clean > repeats.final
  grep "There were no repetitive sequences found" repeats.final > repeats.notfound
  grep -v "There were no repetitive sequences found" repeats.final > repeats.out
  ssh hgwdev
  hgLoadOut bosTau1 repeats.out
  # Strange perc. fields:
  # -2.3 line 925441
  # -18.2 line 1644720
  # -18.2 line 1644722
  hgsql bosTau1
  rename table repeats_rmsk to rmsk

  # select count(*) from rmsk;
  # 4,244,045
  # select count(*) from rmsk where repName like "Bov%";
  # 1,445,972

# SIMPLE REPEATS (DONE, Oct. 6, 2004, Heather)
# put the results throughout the scaffold 0/0/0 directories,
# same as RepeatMasker, to avoid too many files in the same directory
  ssh kksilo
  cd /cluster/data/bosTau1
  mkdir bed
  cd bed
  mkdir simpleRepeat
  cd simpleRepeat

  /bin/csh makeJoblist-trf.csh
  # do the run; took about 12 hours
  tcsh trf-run.csh > & ! trf.log &

  # concatenate into one output file; took about an hour
  /bin/tcsh concatTRF.csh

  # load
  /cluster/bin/i386/hgLoadBed bosTau1 simpleRepeat trf.bed \
    -sqlTable = /cluster/home/heather/kent/src/hg/lib/simpleRepeat.sql

  # Reading trf.all
  # Loaded 248313 elements of size 16
  # Sorted
  # Saving bed.tab
  # Loading bosTau1

  create index chrom_2 on simpleRepeat (chrom(16), chromStart);
  create index chrom_3 on simpleRepeat (chrom(16), chromEnd);


# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES (DONE Oct. 10, 2004, Heather) 

  ssh kksilo
  cd /cluster/data/bosTau1
  /cluster/bin/i386/maskOutFa -soft downloads/linearScaffolds/Btau20040927-freeze-assembly.fa repeats/repeat.out bosTau1.softmask.fa
  # 71 warnings about negative rEnd
  # matches select count(*) from rmsk where repEnd < 0;

  /cluster/bin/i386/maskOutFa -softAdd bosTau1.softmask.fa bed/simpleRepeat/trf.bed bosTau1.softmask2.fa

  # hard masking (Ns instead of lower case) for download files
  # split for use by genscan
  /cluster/bin/i386/maskOutFa bosTau1.softmask2.fa hard bosTau1.hardmask.fa
  mkdir hardmask-split
  /cluster/bin/i386/faSplit about bosTau1.hardmask.fa 2000000 hardmask-split

# DOWNLOAD FILES (DONE, Feb. 9, 2005, Heather)

  ssh hgwdev
  cd /cluster/data/bosTau1
  cp bosTau1.softmask.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
  cp bosTau1.softmask2.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
  cp bosTau1.hardmask.fa /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips

  cd /usr/local/apache/htdocs/goldenPath/bosTau1/bigZips
  nice gzip bosTau1.softmask.fa
  nice gzip bosTau1.softmask2.fa
  nice gzip bosTau1.hardmask.fa

# REGENERATE 2BIT NIB (DONE, Oct. 10, 2004, Heather)
  ssh kksilo
  cd /cluster/data/bosTau1
  /cluster/bin/i386/faToTwoBit bosTau1.softmask2.fa bosTau1.softmask.2bit
  /cluster/bin/i386/twoBitToFa bosTau1.softmask.2bit check.softmask.fa
  diff -q bosTau1.softmask2.fa check.softmask.fa
  mv bosTau1.2bit bosTau1.2bit.unmasked
  mv bosTau1.softmask.2bit bosTau1.2bit
  # note: size match

# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, Oct. 10, 2004, Heather)
  # could get rid of the nib in this path, 2bit isn't a nib (4bit)
  ssh kkr1u00
  mkdir -p /iscratch/i/bosTau1/nib
  cp -p /cluster/data/bosTau1/bosTau1.2bit /iscratch/i/bosTau1/nib
  iSync

  ssh kksilo
  mkdir -p /cluster/bluearc/bosTau1/nib
  cp -p /cluster/data/bosTau1/nib/bosTau1.2bit /cluster/bluearc/bosTau1/nib

# GENERATE split masked fas for blastz (DONE, Oct. 19, 2004, Heather)
  ssh kkr1u00
  mkdir -p /iscratch/i/bosTau1/splitFas
  cd /cluster/data/bosTau1
  /cluster/bin/i386/faSplit sequence /cluster/data/bosTau1/bosTau1.softmask2.fa 350 \
    /iscratch/i/bosTau1/splitFas
  # slight problem with directory name
  cd /iscratch/i/bosTau1
  mkdir splitDir
  mv splitFas* splitDir
  rmdir splitFas
  # check that I got all scaffolds
  grep SCAFFOLD *.fa > SCAFFOLD.list
  # sync
  /cluster/bin/scripts/iSync

# MAKE 11.OOC FILE FOR BLAT (DONE, Oct. 10, 2004, Heather)
  ssh kkr1u00
  mkdir /cluster/data/bosTau1/bed/ooc
  cd /cluster/data/bosTau1/bed/ooc
  ls -1 /cluster/data/bosTau1/bosTau1.2bit > nib.lst
  /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
    -makeOoc=/cluster/bluearc/bosTau1/11.ooc -repMatch=100
  # Wrote 1327650 overused 11-mers to /cluster/bluearc/bosTau1/11.ooc
  cp -p /cluster/bluearc/bosTau1/11.ooc /iscratch/i/bosTau1/
  iSync

# GENBANK
  ssh hgwdev
  cd /cluster/home/heather/kent/src/hg/makeDb/genbank
  # check for missing commits
  diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
  # edit etc/genbank.conf and add these lines, starting with the comment:
  # bosTau1 (B. taurus)
  # could get rid of the nib in this path, 2bit isn't a nib (4bit)
  bosTau1.genome = /iscratch/i/bosTau1/nib/bosTau1.2bit
  bosTau1.lift = no
  bosTau1.refseq.mrna.native.load = no
  # bosTau1.genbank.mrna.xeno.load = yes
  bosTau1.genbank.mrna.xeno.load = no
  bosTau1.genbank.est.xeno.load = no
  bosTau1.downloadDir = bosTau1
  bosTau1.perChromTables = no
  cvs commit -m "added cow" etc/genbank.conf
  #revision 1.56

  # edit src/lib/gbGenome.c
  make
  cvs commit -m "added cow" src/lib/gbGenome.c
  # revision 1.19

  # edit src/align/gbBlat
  make
  cvs commit -m "added cow" src/align/gbBlat
  # revision 1.29

  make install-server

  ssh eieio
  cd /cluster/data/genbank
  nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 bosTau1 &
  # logged to 
  # /cluster/data/genbank/var/build/logs/YYMMDDHHMMSS.bosTau1.initalign.log
  # load
  ssh hgwdev
  cd /cluster/data/genbank
  nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad bosTau1
  # logged to /cluster/data/genbank/var/dbload/hgwdev/logs/YYMMDDHHMMSS.dbload.log

  # ESTs required modification to the build process to deal with the huge
  # number of scaffolds.  Once completed, the following line was
  # added to genbank.conf to enable it's uses:
    bosTau1.mondoTwoBitParts = 5000
  ssh eieio
  cd /cluster/data/genbank
  bin/gbAlignStep -initial -srcDb=genbank -type=est bosTau1
  
  # load the ESTs, reloading mRNAs as well
  ssh hgwdev
  cd /cluster/data/genbank
  nice bin/gbDbLoadStep -drop -initialLoad bosTau1

  # add bosTau1 to list of databases to align in
  #  /cluster/data/genbank/etc/align-genbank

# GENBANK: add Non-Cow Refseqs (DONE May 9, 2005, Heather)

  # add to kent/src/hg/makeDb/genbank/etc/genbank.conf and commit:
  # bosTau1.refseq.mrna.xeno.load = yes
  cd /cluster/data/genbank/etc
  cvs update
  # new tables: xenoRefGene, xenoRefFlat, xenoRefSeqAli, refLink, refSeqStatus
  # refSeqSummary

# GC PERCENT (DONE Oct. 17, 2004, Heather)
  # use old-style (non-wiggle)
  # run on kolossus (serialized)
  # use default window size (20K); only applies to 25k scaffolds
  # ran from Friday night 4pm to Sunday morning 7am
  # dominated one CPU
  ssh kolossus
  mkdir /cluster/data/bosTau1/bed/gcPercent
  cd /cluster/data/bosTau1/bed/gcPercent
  # gcPercent modified to recognize 2bit file
  /cluster/bin/x86_64/hgGcPercent -noLoad bosTau1 /gbdb/bosTau1/nib
  ssh hgwdev
  cd /cluster/home/heather/kent/src/hg/lib
  # change index from 12 to 14 characters
  hgsql bosTau1 < gcPercent.sql
  load data local infile 'gcPercent.bed' into table gcPercent
  # 54007 rows

# GC5BASE - (DONE 2004-10-27 - Hiram)
    ssh kksilo
    mkdir /cluster/data/bosTau1/bed/gc5Base
    cd /cluster/data/bosTau1/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 bosTau1 \
      /cluster/data/bosTau1 | wigBedToBinary stdin gc5Base.wig gc5Base.wib
    #       Calculating gcPercent with window size 5
    #       Using twoBit: /cluster/data/bosTau1/bosTau1.2bit
    #       File stdout created
    #	Converted stdin, upper limit 100.00, lower limit 0.00
    #	real    254m41.199s
    #	user    139m32.740s
    #	sys     85m1.450s
    #	Now load those results into the database
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/gc5Base
    mkdir /gbdb/bosTau1/wib
    ln -s `pwd`/gc5Base.wib /gbdb/bosTau1/wib
    hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 gc5Base gc5Base.wig
    #	The index is no good for this organism due to the scaffold
    #	naming scheme, drop the index and create one that works:
    hgsql -e "drop index chrom on gc5Base;" bosTau1
    hgsql -e "create index chrom on gc5Base (chrom(16));" bosTau1
    #	The speeds up an operation on the hgTables browser by orders of
    #	magnitude - whole genome statistics display

# LOAD GAP & GOLD TABLES FROM AGP (DONE 11/10/04 angie)
    ssh hgwdev
    # There is one line of AGP containing a 0-length contig segment, and 
    # this causes a problem for findBin in hgGoldGapGl.  awk that out:
    awk '$3 != 0 {print;}' \ 
      /cluster/data/bosTau1/downloads/contigs/Btau_v1.0_20040927.agp \
    | hgGoldGapGl -noGl bosTau1 stdin
    # For some reason, the indices did not get built correctly --
    # "show index from gap/gold" shows NULL cardinalities for chrom.  
    # Rebuild indices with "analyze table".
    hgsql bosTau1 -e 'analyze table gold; analyze table gap;'

# FIX AGP FILE
    # Ali detected some truncated scaffolds
    # I added agpFragValidate to kent/src/hg/lib/agpFrag.c (revision 1.9)
    # and added a call to it from kent/src/hg/makeDb/hgGoldGapGl/hgGoldGapGl.c (revision 1.20)
    ssh hgwdev
    cd /cluster/data/bosTau1/downloads/contigs
    cp Btau_v1.0_20040927.agp copy.agp
    # edit copy.agp:
    # delete 0 length contig that Angie found (Contig92264 in SCAFFOLD230837)
    # delete -1 length contigs found by hgGoldGapGl:
    # SCAFFOLD36273	9126	9125	3	W	Contig55199	1	0	+
    # SCAFFOLD120295	9153	9152	4	W	Contig64664	1	0	+
    # SCAFFOLD130877	2412	2411	3	W	Contig89199	1656	1655	-
    # SCAFFOLD165124	13935	13934	11	W	Contig67469	2691	2690	-
    # SCAFFOLD195025	37500	37499	17	W	Contig59193	1196	1195	-
    # SCAFFOLD215637	2564	2563	3	W	Contig24808	1	0	+
    # SCAFFOLD225287	10250	10249	7	W	Contig88376	998	997	-
    # SCAFFOLD300287	13363	13362	7	W	Contig44330	1417	1416	-
    # drop gold and gap tables; rerun
    /cluster/home/heather/bin/i386/hgGoldGapGl -noGl bosTau1 copy.agp
    # check
    cd /cluster/home/heather/jdbc
    source javaenv
    ScaffoldGapChain bosTau1 
    # No anomalies detected
    # Took about 3 hours to run
    

# QUALITY TRACK (DONE April 2005 Heather)
# The challenge was a sorting problem.
# The quality file from Baylor was ordered by contig.
# We need quality data by scaffold.
# Solution was to load quality data into database,
# then extract it by scaffold.
    ssh kolossus
    cd /cluster/data/bosTau1/bed
    mkdir quality
    cd quality

    # load quality data into database table
    hgsql mysql -e 'create database bosTau1;'
    hgsql bosTau1 < contigQuality.sql
    source javaenv
    # took about 2 hours to load
    /usr/java/jdk1.5.0_01/bin/java LoadQuality ../../downloads/contigs/Btau20040927-freeze-contigs.fa.qual bosTau1 \
      > LoadQuality.log.mar9

    # create test table of quality data
    hgsql bosTau1 < contigQualityTest.sql
    source javaenv
    /usr/java/jdk1.5.0_01/bin/java LoadQualityTest quality.test bosTau1

    # transfer gold table from hgwdev to kolossus
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/quality
    hgsqldump bosTau1 gold > gold.dump
    ssh kolossus
    cd /cluster/data/bosTau1/bed/quality
    hgsql bosTau1 < gold.dump

    # read 
    # run a test case
    hgsql bosTaul < goldClean.dump
    source javaenv
    /usr/java/jdk1.5.0_01/bin/java ScaffoldQualityTest bosTau1 goldClean > goldClean.run

    # run for real; took about 7 hours (3pm - 10pm)
    # should have written to local scratch on kolossus
    /usr/java/jdk1.5.0_01/bin/java ScaffoldQuality bosTau1 gold > gold.run

    # check
    grep fixedStep gold.run > fixedStep.out

    # wigEncode (actually copied data to eieio; no logins to kksilo; copied back to kksilo)
    # took about 20 minutes
    wigEncode gold.run quality.wig quality.wib

    # load
    # took about 10 minutes
    ssh hgwdev
    ln -s /cluster/data/bosTau1/bed/quality/quality.wib /gbdb/bosTau1/wib
    hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 quality quality.wig

    # Connected to database bosTau1 for track quality
    # Creating table definition with 13 columns in bosTau1.quality
    # Saving wiggle.tab
    # WARNING: Exceeded SCAFFOLD195025 size 51200 > 50221. dropping 980 data point(s)
    # WARNING: Exceeded SCAFFOLD195025 size 51416 > 50221. dropping 1196 data point(s)
    # WARNING: Exceeded SCAFFOLD215637 size 11264 > 10768. dropping 497 data point(s)
    # WARNING: Exceeded SCAFFOLD215637 size 12288 > 10768. dropping 1521 data point(s)
    # WARNING: Exceeded SCAFFOLD215637 size 13159 > 10768. dropping 2392 data point(s)
    # Loading bosTau1

# Rerun Quality with new gold table

    # transfer gold table from hgwdev to kolossus
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/quality
    hgsqldump bosTau1 gold > gold.dump.2
    ssh kolossus
    cd /cluster/data/bosTau1/bed/quality
    # mysql: rename table gold to goldBackup
    hgsql bosTau1 < gold.dump.2

    source javaenv
    # should have written to local scratch on kolossus
    /usr/java/jdk1.5.0_01/bin/java ScaffoldQuality bosTau1 gold > gold.run.2

    # check
    ssh kkstore01
    cd /cluster/data/bosTau1/bed/quality
    grep fixedStep gold.run.2 > fixedStep.out.2
    # fixedStep.out.2 matches fixedStep.out

    # wigEncode 
    ssh kkstore01
    cd /cluster/data/bosTau1/bed/quality
    wigEncode gold.run.2 quality.wig.2 quality.wib.2

    # load
    ssh hgwdev
    ln -s /cluster/data/bosTau1/bed/quality/quality.wib.2 /gbdb/bosTau1/wib
    cd /cluster/data/bosTau1/bed/quality
    hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 quality quality.wig.2

# Rerun without using zeroes for gaps (new version of ScaffoldQuality)
# (Heather, May 24, 2005)

    ssh kkstore01
    cd /cluster/data/bosTau1/bed/quality
    source javaenv
    # should have written to local scratch on kolossus
    /usr/java/jdk1.5.0_01/bin/java ScaffoldQuality bosTau1 gold > gold.run.3
    wigEncode gold.run.3 quality.wig.3 quality.wib.3
    # upper limit 90.0, lower limit 0.0
    ssh hgwdev
    ln -s /cluster/data/bosTau1/bed/quality/quality.wib.3 /gbdb/bosTau1/wib
    cd /cluster/data/bosTau1/bed/quality
    hgLoadWiggle -pathPrefix=/gbdb/bosTau1/wib bosTau1 quality quality.wig.3



# GENSCAN (DONE Nov. 17, 2004 Heather)
  # uses hard-masked sequence files
  ssh hgwdev
  mkdir /cluster/data/bosTau1/bed/genscan
  cd /cluster/data/xenTro1/bed/genscan
  cvs co hg3rdParty/genscanlinux
  mkdir gtf pep subopt

  # Run on small cluster (more memory than big cluster).
  ssh kki

  # generate list of hard-masked scaffolds that are not all-N's
  # (genscan crashes if all-N's)
  foreach f ( `ls -1S /cluster/data/bosTau1/hardmask-split/*` )
    egrep '[ACGT]' $f > /dev/null
    if ($status == 0) echo $f >> genome.list
  end
  wc -l genome.list
  # Create gsub
  cat << '_EOF_' > gsub
  #LOOP
  /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
  #ENDLOOP
  '_EOF_'
  # << this line makes emacs coloring happy
  /cluster/bin/i386/gensub2 genome.list single gsub jobList
  para create jobList
  para try, check, push, check, ...
  ls -1 gtf | wc -l
  endsInLf gtf/*

  # Concatenate
  ssh kksilo
  cd /cluster/data/bosTau1/bed/genscan
  cat gtf/*.gtf > genscan.gtf
  cat pep/*.pep > genscan.pep
  cat subopt/*.bed > genscanSubopt.bed

  # Load
  ssh hgwdev
  cd /cluster/data/bosTau1/bed/genscan
  ldHgGene -gtf bosTau1 genscan genscan.gtf
  hgPepPred bosTau1 generic genscanPep genscan.pep
  hgLoadBed bosTau1 genscanSubopt genscanSubopt.bed
  featureBits bosTau1 genscan
  # 49346362 bases of 2261116798 (2.182%) in intersection

# QA NOTE [ASZ: 9-11-2006]: used mytouch on genscanPep.
# sudo mytouch bosTau1 genscanPep 200504081400.00


# RE-INDEX TABLES TO ACCOMODATE LONG SCAFFOLD NAMES

  ssh hgwdev
# Note: next time around rmsk index redo won't be necessary,
# because of modifications to hgLoadOut.  Note we're just
# doing an extremely simplified (chrom-only) index in most
# of these cases because the scaffolds are so small.
  hgsql bosTau1 -e 'drop index bin on rmsk'
  hgsql bosTau1 -e 'create index genoName on rmsk(genoName(16))'
  hgsql bosTau1 -e 'drop index genoStart on rmsk'
  hgsql bosTau1 -e 'drop index genoEnd on rmsk'

# Next time around simpleRepeat index redo also not necessary,
# because of changes to simpleRepeat.sql
  hgsql bosTau1 -e 'drop index chrom on simpleRepeat'
  hgsql bosTau1 -e 'drop index chrom_2 on simpleRepeat'
  hgsql bosTau1 -e 'drop index chrom_3 on simpleRepeat'
  hgsql bosTau1 -e 'create index chrom on simpleRepeat(chrom(16),bin)'

# I'm not sure if the all_est/intronEst/all_mrna will need
# redoing next time or not.  Please check with show index from table.
  hgsql bosTau1 -e 'drop index tName on all_est'
  hgsql bosTau1 -e 'drop index tName_2 on all_est'
  hgsql bosTau1 -e 'drop index tname_3 on all_est'
  hgsql bosTau1 -e 'create index tName on all_est(tName(16))'

  hgsql bosTau1 -e 'drop index tName on intronEst'
  hgsql bosTau1 -e 'drop index tName_2 on intronEst'
  hgsql bosTau1 -e 'drop index tname_3 on intronEst'
  hgsql bosTau1 -e 'create index tName on intronEst(tName(16))'

  hgsql bosTau1 -e 'drop index tName on all_mrna'
  hgsql bosTau1 -e 'drop index tName_2 on all_mrna'
  hgsql bosTau1 -e 'drop index tname_3 on all_mrna'
  hgsql bosTau1 -e 'create index tName on all_mrna(tName(16))'


# MAKE Human Proteins track (hg17 DONE 2004-12-20 braney)
    ssh kksilo
    mkdir -p /cluster/data/bosTau1/blastDb
    cd /cluster/data/bosTau1/blastDb
    faSplit sequence ../downloads/linearScaffolds/Btau20040927-freeze-assembly.fa 500 x 
    for i in x*
    do
	formatdb -i $i -p F
    done
    rm *.log *.fa

    ssh kkr1u00
    mkdir -p /iscratch/i/bosTau1/blastDb
    cp /cluster/data/bosTau1/blastDb/* /iscratch/i/bosTau1/blastDb
    (iSync) > sync.out

    mkdir -p /cluster/data/bosTau1/bed/tblastn.hg17KG
    cd /cluster/data/bosTau1/bed/tblastn.hg17KG
    ls -1S /iscratch/i/bosTau1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst
    # back to kksilo
    exit

    # we want around 200000 jobs
    cd /cluster/data/bosTau1/bed/tblastn.hg17KG
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(200000/494) = 104.125320
    mkdir -p /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/kgfa
    split -l 104 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/bosTau1/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/bosTau1/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 200564 of 200564 jobs
# CPU time in finished jobs:   44956749s  749279.15m 12487.99h  520.33d  1.426 y
# IO & Wait Time:               2159792s   35996.53m   599.94h   25.00d  0.068 y
# Average job time:                 235s       3.92m     0.07h    0.00d
# Longest job:                      843s      14.05m     0.23h    0.01d
# Submission to last job:        102195s    1703.25m    28.39h    1.18d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/bosTau1/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 406 of 406 jobs
# CPU time in finished jobs:       1354s      22.57m     0.38h    0.02d  0.000 y
# IO & Wait Time:                 18021s     300.35m     5.01h    0.21d  0.001 y
# Average job time:                  48s       0.80m     0.01h    0.00d
# Longest job:                      116s       1.93m     0.03h    0.00d

    exit
    # back to kksilo
    cd /cluster/data/bosTau1/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/bosTau1/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/bosTau1/bed/tblastn.hg17KG
    hgLoadPsl bosTau1 blastHg17KG.psl

    # back to kksilo
    rm -rf blastOut
# End tblastn

# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
    ssh hgwdev
    cd /tmp
    wget \
	'http://www.genome.gov/Pages/News/Photos/Science/bovine_image.jpg'
	-O bovine_image.jpg
    #	view that image in 'display' to determine crop edges, then:
    convert -crop 1280x800+280+240 -quality 80 -sharpen 0 \
	-normalize bovine_image.jpg bbt.jpg
    convert -geometry 300x200 -quality 80 bbt.jpg Bos_taurus.jpg
    rm -f bbt.jpg bovine_image.jpg

    cp -p Bos_taurus.jpg /usr/local/apache/htdocs/images
    #	add links to this image in the description.html page, request push


# MAKE 11.OOC FILE FOR BLAT (REDONE, Jan. 18, 2004, Heather)
  ssh kkr1u00
  mkdir /cluster/data/bosTau1/bed/ooc
  cd /cluster/data/bosTau1/bed/ooc
  ls -1 /cluster/data/bosTau1/bosTau1.2bit > nib.lst
  /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
    -makeOoc=/cluster/bluearc/bosTau1/11.ooc -repMatch=800
  # Wrote 1327650 overused 11-mers to /cluster/bluearc/bosTau1/11.ooc
  cp -p /cluster/bluearc/bosTau1/11.ooc /iscratch/i/bosTau1/
  iSync

# GENBANK
  # gbAlign for mrna
  ssh eieio
  cd /cluster/data/genbank
  nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 bosTau1 &
  # logged to 
  # /cluster/data/genbank/var/build/logs/YYMMDDHHMMSS.bosTau1.initalign.log

  # load mrna
  ssh hgwdev
  cd /cluster/data/genbank
  nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad bosTau1
  # logged to /cluster/data/genbank/var/dbload/hgwdev/logs/YYMMDDHHMMSS.dbload.log

  # gbAlign for est
  ssh eieio
  cd /cluster/data/genbank
  nice bin/gbAlignStep -initial -srcDb=genbank -type=est bosTau1
  
  # load the ESTs, reloading mRNAs as well
  ssh hgwdev
  cd /cluster/data/genbank
  nice bin/gbDbLoadStep -drop -initialLoad bosTau1

  # add bosTau1 to list of databases to align in
  #  /cluster/data/genbank/etc/align-genbank

# GENBANK DOWNLOAD FILES (DONE, Jan. 26 Heather)

  ssh hgwbeta
  cd /genbank
  ./bin/gbDbLoadStep -getDownloadSeqs=1 bosTau1
  # output goes to /genbank/download/bosTau1/bigZips
  # these are automatically rsynced to hgnfs1 (which hgdownload reads)


# SWAP human(target)/cow(query) alignments to make bosTau1.chainHg17
    ssh kksilo
    cd /cluster/data/bosTau1/bed
    mkdir blastz
    mkdir blastz/axtChain
    /cluster/bin/i386/chainSwap zb.hg17/axtChain/all.chain.jan5.filtered blastz/axtChain/all.chain
    cp all.chain all.chain.backup

#   out of memory loading in one piece
#   can't split by scaffold, too many files
#   solution: split into 2 pieces
    cd blastz/axtChain
    mkdir split-2
    /cluster/bin/i386/chainSplit -lump=2 split-2 all.chain 

    ssh hgwdev
    hgsql bosTau1
    drop table chainHg17
    drop table chainHg17Link
    cd split-2
    # each of these takes about 4 hours
    /cluster/bin/i386/hgLoadChain -tIndex bosTau1 chainHg17 000.chain
    /cluster/bin/i386/hgLoadChain -tIndex -oldTable bosTau1 chainHg17 001.chain

    # cleanup
    ssh kksilo
    cd /cluster/data/bosTau1/bed/blastz/axtChain
    rm all.chain.backup

#   bosTau1.netHg17

    ssh kolossus
    cd /cluster/data/bosTau1/bed/blastz/axtChain

    # PRE
    /cluster/bin/x86_64/chainPreNet all.chain ../../zb.hg17/S2.len ../../zb.hg17/S1.len chainPreNet.out

    # chainNet
    /cluster/bin/x86_64/chainNet chainPreNet.out 
      -minSpace=1 ../../zb.hg17/S2.len ../../zb.hg17/S1.len hg17.net.raw /dev/null

    # syntenic (using revision 1.7)
    /cluster/home/heather/bin/x86_64/netSyntenic hg17.net.raw hg17.net.syn
    # memory usage 2733064192, utime 5681 s/100, stime 617

    # gzip
    ssh kksilo
    cd /cluster/data/bosTau1/bed/blastz/axtChain
    gzip chainPreNet.out
    gzip bosTau1.net.raw

    # netClass
    # lots of feedback as each scaffold is processed; should have redirected output
    # takes about an hour
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/blastz/axtChain
    /cluster/bin/i386/netClass -noAr hg17.net.syn bosTau1 hg17 hg17.net 

    # load
    # again, lots of feedback as each scaffold is processed
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/blastz/axtChain
    /cluster/bin/i386/netFilter -minGap=10 hg17.net | /cluster/bin/i386/hgLoadNet bosTau1 netHg17 stdin 

    # generate axts 
    # don't need to split?
    # kksilo or kolossus?
    # yet again, lots of feedback as each scaffold is processed
    # WORKING HERE
    ssh kksilo
    mkdir ../axtNet
    /cluster/bin/i386/netToAxt hg17.net.syn all.chain /cluster/data/bosTau1/bosTau1.2bit /cluster/data/hg17/nib ../axtNet/all.axt

    # sort?

    # cleanup, compress, etc.
    ssh kksilo
    cd /cluster/data/bosTau1/bed/blastz/axtChain
    gzip all.chain
    gzip hg17.net.syn
    rm chain.tab link.tab

    cd /cluster/data/bosTau1/bed/blastz/axtNet
    cp all.axt all.axt.backup

# MAKE VSHG17 DOWNLOADABLES (DONE Feb. 15, 2005 Heather)
  ssh hgwdev
  cd /usr/local/apache/htdocs/goldenPath/bosTau1
  mkdir vsHg17
  cd /cluster/data/bosTau1/bed/blastz/axtNet
  cp -p all.axt.gz /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17/axtNet.gz
  cd ../axtChain
  cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17/human.chain.gz
  cp -p hg17.net.gz /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17/human.net.gz

  cd /usr/local/apache/htdocs/goldenPath/bosTau1/vsHg17
  # Make a README.txt which explains the files & formats.
  md5sum *.gz > md5sum.txt


# SELF BLASTZ / BLASTZ-RUN-UCSC EXPERIMENT (DONE 1/27/05 angie)
    # DELETED 9/20/05 angie (never used)
    # Put chrom.sizes in /iscratch/ since blastz-run-ucsc reads it when 
    # lifting, and bosTau1 chrom.sizes is large.
    ssh kkr1u00
    cp /cluster/data/bosTau1/chrom.sizes /iscratch/i/bosTau1/
    iSync
    ssh kk
    mkdir /cluster/data/bosTau1/bed/blastz.bosTau1.2005-01-26
    cd /cluster/data/bosTau1/bed/blastz.bosTau1.2005-01-26
    ln -s blastz.bosTau1.2005-01-26 /cluster/data/bosTau1/bed/blastz.bosTau1
    # 10M chunks led to hour-long average job time, so try 5M chunks:
    cat << '_EOF_' > DEF
# Cow vs. self
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - Cow
SEQ1_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=5000000
SEQ1_LAP=10000

# QUERY - Cow
SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=5000000
SEQ2_LAP=10000

BASE=/cluster/data/bosTau1/bed/blastz.bosTau1.2005-01-26

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=/iscratch/i/bosTau1/chrom.sizes
SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes

'_EOF_'
    # << this line keeps emacs coloring happy

    mkdir run
    cd run
    mkdir lstFiles
    partitionSequence.pl 5000000 10000 /iscratch/i/bosTau1/nib/bosTau1.2bit \
      /iscratch/i/bosTau1/chrom.sizes \
      -xdir xdir.sh -rawDir ../axtParts -lstDir lstFiles \
    > bosTau1.lst
    csh -ef xdir.sh
    cat << '_EOF_' > gsub
#LOOP
/cluster/home/angie/kent/src/utils/blastz-run-ucsc -outFormat axt -dropSelf  $(path1) $(path2) ../DEF {check out exists ../axtParts/$(file1)/$(file1)_$(file2).axt}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 bosTau1.lst bosTau1.lst gsub jobList
    para create jobList
    para try, check, push, check, ...
    # Here's the time for the first 10 jobs, to .axt:
#Completed: 10 of 220900 jobs
#CPU time in finished jobs:       9177s     152.95m     2.55h    0.11d  0.000 y
#IO & Wait Time:                    85s       1.41m     0.02h    0.00d  0.000 y
#Average job time:                 926s      15.44m     0.26h    0.01d
#Longest job:                     1047s      17.45m     0.29h    0.01d
#Submission to last job:          1047s      17.45m     0.29h    0.01d
    # Here's the time for the first 10 jobs, to .psl.gz (should we believe the I/O?):
#Completed: 10 of 220900 jobs
#CPU time in finished jobs:      10709s     178.48m     2.97h    0.12d  0.000 y
#IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
#Average job time:                1008s      16.79m     0.28h    0.01d
#Longest job:                     1238s      20.63m     0.34h    0.01d
#Submission to last job:          1238s      20.63m     0.34h    0.01d
    # And from a previous run of the first 10 jobs, to lav:
#Completed: 10 of 220900 jobs
#CPU time in finished jobs:       9273s     154.55m     2.58h    0.11d  0.000 y
#IO & Wait Time:                    84s       1.40m     0.02h    0.00d  0.000 y
#Average job time:                 936s      15.60m     0.26h    0.01d
#Longest job:                     1060s      17.67m     0.29h    0.01d
#Submission to last job:          1062s      17.70m     0.29h    0.01d
    # Jim said we don't really want this track, so no need to carry through 
    # with the whole thing.  But I did go through the steps below with 
    # partial results to make sure that they work.

    # Combine the many small .psl.gz files into per-target-partition files 
    # for chaining.
#NOT DONE
    ssh kksilo
    cd /cluster/data/bosTau1/bed/blastz.bosTau1
    mkdir psl
    foreach t (pslParts/*)
      zcat $t/*.psl.gz \
      | sort -k 14,14 -k 16n,17n \
      | gzip -c \
      > psl/$t:t:r.psl.gz
    end

    # chain psl/*.psl.gz


# LOAD CPGISSLANDS (DONE, Nov. 27, 2004, Heather)
    ssh hgwdev
    mkdir -p /cluster/data/bosTau1/bed/cpgIsland
    cd /cluster/data/bosTau1/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/bosTau1/bed/cpgIsland/
    
    ssh kksilo
    cd /cluster/data/bosTau1/bed/cpgIsland
    foreach f (../../softMasked2/*.fa)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/cpgIsland
    hgLoadBed bosTau1 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    wc -l cpgIsland.bed 
#  44091 cpgIsland.bed
    featureBits bosTau1 cpgIslandExt


# LOAD GENEID PREDICTIONS (DONE 2/25/05)
    ssh kksilo
    mkdir /cluster/data/bosTau1/bed/geneid
    cd /cluster/data/bosTau1/bed/geneid
    wget http://genome.imim.es/genepredictions/B.taurus/golden_path_200409/geneidv1.2/bosTau1.gtf
    wget http://genome.imim.es/genepredictions/B.taurus/golden_path_200409/geneidv1.2/bosTau1.prot
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    perl -wpe 's/^(>SCAFFOLD\S+)/$1.1/' bosTau1.prot > geneid.fa
    ssh hgwdev
    cd /cluster/data/bosTau1/bed/geneid
    ldHgGene -gtf -genePredExt bosTau1 geneid bosTau1.gtf 
    hgPepPred bosTau1 generic geneidPep geneid.fa

# QA NOTE [ASZ: 9-11-2006]: mytouch for geneidPep
# sudo mytouch bosTau1 geneidPep 200504081400.00


# SWAP CHAINS FROM MM5, BUILD NETS ETC. (DONE 2/26/05 angie)
    ssh kksilo
    mkdir /cluster/data/bosTau1/bed/blastz.mm5.swap
    cd /cluster/data/bosTau1/bed/blastz.mm5.swap
    # Small cluster is quiet, workhorse needs to see cluster /scratch (and 
    # kolossus has its own local /scratch/), so use kkr7u00:
    doBlastzChainNet.pl -swap /cluster/data/mm5/bed/blastz.bosTau1/DEF \
      -workhorse=kkr7u00 >& do.log &
    tail -f do.log
    # Add {chain,net}Mm5 to trackDb.ra if necessary.
    # Add /usr/local/apache/htdocs/goldenPath/bosTau1/vsMm5/README.txt

# MAKE COW TO HUMAN LIFTOVER (DONE Mar. 4, 2005, Heather)

    ssh kkr1u00
    cd /cluster/data/bosTau1/bed/blastz/axtChain
    /cluster/bin/x86_64/netChainSubset hg17.net.gz all.chain.gz stdout | gzip -c > bosTau1.hg17.over.chain.gz
    cp bosTau1.hg17.over.chain.gz /cluster/data/bosTau1/bed/bedOver

    ssh hgwdev
    cd /gbdb/bosTau1
    mkdir liftOver
    cd liftOver
    cp /cluster/data/bosTau1/bed/bedOver/bosTau1.hg17.over.chain.gz .
    
    cd /usr/local/apache/htdocs/goldenPath/bosTau1/liftOver
    cp /cluster/data/bosTau1/bed/bedOver/bosTau1.hg17.over.chain.gz .

    # hgsql hgcentraltest
    # insert into liftOverChain 
    # (fromDb, toDb, path, minMatch, minSizeT, minSizeQ, multiple, minBlocks, fudgeThick) 
    # values ("bosTau1", "hg17", "/gbdb/bosTau1/liftOver/bosTau1.hg17.over.chain", 0.95, 0, 0, "Y", 1, "N");


# ACCESSIONS FOR CONTIGS (DONE 2005-03-22 kate)
#  Used for Assembly track details, and also ENCODE AGP's
    cd /cluster/data/bosTau1/bed
    mkdir -p contigAcc
    cd contigAcc
    
    # extract WGS contig to accession mapping from Entrez Nucleotide summaries
    # To get the summary file, access the Genbank page for the project 
    # by searching:
    #       genus[ORGN] AND WGS[KYWD]
    # At the end of the page, click on the list of accessions for the contigs.
    # Select summary format, and send to file.
    # output to file <species>.acc
    #  The cow WGS project accesion is: AAFC01000000
    grep Bos cow.acc | wc -l
    # 795203

    # edit hg/encode/regionAgp/contigAccession.pl to recognize cow format
    # and install in /cluster/bin/scripts
    contigAccession cow.acc > contigAcc.tab
    wc -l contigAcc.tab
    # 795203
    hgsql bosTau1 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql bosTau1
    hgsql bosTau1 -e "SELECT COUNT(*) FROM contigAcc"
        #


#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)


TODO:
 
# LIFTOVER CHAIN TO BOSTAU2 (DONE 2005-09-08 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir liftSplits/
    mkdir liftSplits/lift
    mkdir liftSplits/split
    cat << _EOF_ > split.csh
#!/bin/tcsh
set liftDir = /cluster/data/bosTau2/liftSplits
cd /cluster/data/bosTau2/softmask-split
foreach n (*.fa)
    echo \$n
    faSplit -lift=\$liftDir/lift/\$n:r.lft size \$n -oneFile 3000 \$liftDir/split/\$n:r
end
_EOF_
    chmod +x split.csh
    ./split.csh
    mkdir /san/sanvol1/scratch/bosTau2
    cp -r liftSplits/ /san/sanvol1/scratch/bosTau2
    
    cd /cluster/data/bosTau1
    mkdir softTwoBit
    cd softTwoBit
    faSplit sequence ../bosTau1.softmask2.fa 33 x
    # 33 files avg size 75 MB

    foreach f ( *.fa )
    faToTwoBit $f $f:r.2bit
    rm $f
    end
    # 33 files avg size 20MB

    cd ..
    mkdir /san/sanvol1/scratch/bosTau1
    cp -r softTwoBit/ /san/sanvol1/scratch/bosTau1

    cd /cluster/data/bosTau1
    cp `which makeLoChain-align` .
    vi makeLoChain-align
    cp /cluster/bluearc/bosTau1/11.ooc /san/sanvol1/scratch/bosTau1/
    # change so that it accepts 2bit files, not just nibs 
    #  Jim says using ooc file on target bosTau1 is important

    
    ssh pk 
    ./makeLoChain-align bosTau1 /san/sanvol1/scratch/bosTau1/softTwoBit \
                        bosTau2 /san/sanvol1/scratch/bosTau2/liftSplits/split \
			/san/sanvol1/scratch/bosTau1/11.ooc
			

#Setting up blat in /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20
#Checking input files
#3003 jobs written to batch
# 
#First two lines of para spec:
#blat /san/sanvol1/scratch/bosTau1/softTwoBit/x31.2bit /san/sanvol1/scratch/bosTau2/liftSplits/split/x089.fa {check out line+ ../raw/x31_x089.psl}  -tileSize=11 -ooc=/san/sanvol1/scratch/bosTau1/11.ooc -minScore=100 -minIdentity=98 -fastMap
#blat /san/sanvol1/scratch/bosTau1/softTwoBit/x09.2bit /san/sanvol1/scratch/bosTau2/liftSplits/split/x089.fa {check out line+ ../raw/x09_x089.psl}  -tileSize=11 -ooc=/san/sanvol1/scratch/bosTau1/11.ooc -minScore=100 -minIdentity=98 -fastMap
# 
#DO THIS NEXT:
#    cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/run
#    para try, check, push, check, ...
#    ssh kkstore02
#    ./makeLoChain-lift.csh bosTau1 bosTau2 <lift-dir>

    cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/run
    /parasol/bin/para push

#3003 jobs in batch
#1525 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 3003 of 3003 jobs
#CPU time in finished jobs:     150921s    2515.34m    41.92h    1.75d  0.005 y
#IO & Wait Time:                 12002s     200.04m     3.33h    0.14d  0.000 y
#Average job time:                  54s       0.90m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             651s      10.85m     0.18h    0.01d
#Submission to last job:          3472s      57.87m     0.96h    0.04d

    # Lifting
    ssh kkstore02
    cd /cluster/data/bosTau1/
    cp `which makeLoChain-lift` .
    vi makeLoChain-lift
    # changed so uses softmask-split/ instead of nib/
    #  and uses x??_$c.psl instead of chr*_$c.psl
    # use kolossus because we lost our mapping to /san/sanvol1/ again
    ssh kolossus 
    cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/
    ../../makeLoChain-lift bosTau1 bosTau2 /san/sanvol1/scratch/bosTau2/liftSplits/lift > lift.log &
    tail -f lift.log

    ssh kki
    cd /cluster/data/bosTau1
    cp `which makeLoChain-chain` .
    vi makeLoChain-chain
    # fix to tolerate 2bit instead of only nib
    cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/
    
    ../../makeLoChain-chain bosTau1 /cluster/data/bosTau1/bosTau1.2bit \
                            bosTau2 /cluster/data/bosTau2/bosTau2.2bit
    
    cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/chainRun
    para push
    para check
    para time

#[kki:/cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/chainRun> para time
#91 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 91 of 91 jobs
#CPU time in finished jobs:     560372s    9339.53m   155.66h    6.49d  0.018 y
#IO & Wait Time:                 14128s     235.47m     3.92h    0.16d  0.000 y
#Average job time:                6313s     105.22m     1.75h    0.07d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:          220728s    3678.80m    61.31h    2.55d
#Submission to last job:        220728s    3678.80m    61.31h    2.55d


    #------ Optional Speedup -------------------------------
    # Because I am having trouble with the bin0 x089.psl 1.4G 12million lines,
    # I am going to split it carefully into a multiple cluster job, since this
    # hippo has been running for days.  I checked out the structure of the 
    # psl and chain files and the order that everything is sorted in.
    # I believe this will work correctly without side-effects.
    ssh kkstore02
    cd /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/psl/
    ~/bin/i386/pslSplitOnTarget -maxTargetCount=100 -lump x089.psl bin0/
    # made 100 psl files 000.psl to 099.psl in bin0/ from 7MB to 32MB in size

    ls -1S *.psl > pslList
    wc -l pslList
    # Create gsub
    {
    cat << '_EOF_' > gsub
    #LOOP
    axtChain -psl {check in line+ $(path1)} /cluster/data/bosTau1/bosTau1.2bit /cluster/data/bosTau2/bosTau2.2bit {check out line $(root1).chain} 
    #ENDLOOP
    '_EOF_'
    # << this line makes emacs coloring happy
    }
    /cluster/bin/i386/gensub2 pslList single gsub jobList
    ssh kk
    /cluster/data/bosTau1/bed/blat.bosTau2.2005-09-20/psl/bin0
    para create jobList
    para try, check, push, check, ...

    #re-merge using chainMergeSort to x089.chain:
    chainMergeSort *.chain > ../../chainRaw/x089.chain

    #This approach seems to work well for this situation.
    #Feel free to use it if you have 
    # one big artificial chrom query against many small scaffolds target
    #-------------------------------------

    ssh kkstore02
    cd /cluster/data/bosTau1/
    cp `which makeLoChain-net` .
    vi makeLoChain-net
    # added option -lump=33 to chainSplit command 
    #  (because otherwise it says "too many files open" since cow1 has many SCAFFOLDS)
    #  chainSplit splits by target by default (cow1)
    #  script expects directory with current date
    cd bed
    ln -s blat.bosTau2.2005-09-20 blat.bosTau2.2005-09-23
    cd ..
    # do it
    ./makeLoChain-net bosTau1 bosTau2 >& mklc-net.log

    ssh hgwdev
    cd /cluster/data/bosTau1/
    cp /cluster/scripts/makeLoChain-load .
    # Make sure your .hg.conf central and backupcentral
    # are pointing to the right thing for hgAddLiftOverChain
    makeLoChain-load bosTau1 bosTau2 >& mklc-load.log

    #Connected to central database
    #Inserting record in liftOverChain
    #Finished loading bosTau1ToBosTau2.over.chain
    #Now, add download link to hgLiftOver for 
    #/usr/local/apache/htdocs/goldenPath/bosTau1/liftOver/bosTau1ToBosTau2.over.chain.gz

