# for emacs: -*- mode: sh; -*-


# Bos taurus 
# Btau_2.0 from Baylor
# March 2005
# contigs and chromosomes
# WGS, Atlas assembler, 6x coverage
# ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20050310-freeze/
# anonymous ftp, no login/password needed
# http://www.hgsc.bcm.tmc.edu/projects/bovine/
# bovine chrom map: http://locus.jouy.inra.fr/fpc/cattle/WebAGCoL/WebChrom/index.html
# agp format: http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html
#  also old page here: http://bioinfo.hku.hk/ucsc/goldenPath/datorg.html

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | refGene     | genePred refPep refMrna         |
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | sgpGene     | genePred sgpPep                 |
#  | geneid      | genePred geneidPep              |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+


# Create storage area on store11 (DONE Jul. 18, 2005 galt)
    ssh kkstore02
    mkdir /cluster/store11/bosTau2
    cd /cluster/data
    ln -s /cluster/store11/bosTau2 bosTau2

# DOWNLOAD SEQUENCE (DONE Jul. 18, 2005 Galt)
    cd /cluster/data/bosTau2
    mkdir downloads
    cd downloads

    # download all for completeness
    wget -r "ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20050310-freeze/"
    mv ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20050310-freeze/* .
    rm -fr ftp.hgsc.bcm.tmc.edu
    

# GUNZIP
    gunzip */*.gz

  # note: this turned up an error which I sent to Kim Worley 
  # Chr10.fa.qual.gz, Chr11.fa.qual.gz, and ChrUn.fa.qual.gz:
  # "gunzip: linearScaffolds/Chr10.fa.qual.gz: invalid compressed data--format violated"
  # The next day she posted new ones, which did download, decompress correctly

# CHECK FILES

# Check that IDs in .fa match IDs in .qual
  
    ssh kkstore02
    cd /cluster/data/bosTau2/downloads
    # foreach Bin0 contigs linearScaffolds repeat-reads
    grep -h ">" *.fa > fa.id
    grep -h ">" *.qual > qual.id
    diff fa.id qual.id
    # note: also re-did for the 3 re-posted qual.gz files mentioned above
    #  and the ids did match the .fa ok

# I'm thinking really it would be better to keep
# the chromosomes with real numbers as chromosomes,
# and to keep chrUn as scaffolds.  These days we
# handle scaffolds pretty well.  This approach
# has the advantage of avoiding "false joins" when
# we align RNA to chrUn.  It also has the advantage
# of making us more interoperable with ensembl.
# -Jim

# For the bin0 stuff, how many are there?
# They're probably small enough and enough of them
# that it'd make the performance not so great.
# It'd be good then to put them in one pseudo-chromosome,
# perhaps one with not much space between them so
# as to keep the whole thing less than 4 gig.
# 
# The system handles chromosomes up to 512 MB.
# The whole genome needs to come in under 4 gig for BLAT.
# Each chromosome needs to be 512 meg or less for the
# binning scheme in the browser. 
# -Jim

# No chrom/scaffold is individually over 372M,
# so it's less than 512M bin scheme max size,
# and collectively, cow2 genome is 3.4 GB,
# which is less then the 4GB BLAT limit.
# -Galt

# Let's not split any table for the cow.
# -Jim


# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 7/18/05 galt)
    cd /cluster/data/bosTau2/downloads
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "bos taurus mitochondrion genome".  That shows the gi number:
    # 12800
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
       'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=12800&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the
    # Bos taurus mitochondrion complete genome, and then replace the
    # header line with just ">chrM".
    echo "chrM   1       16338    1       W       chrM    1       16338    +" > chrM.agp

    
# RE-DID THIS WHOLE SECTION (DONE 7/23/05 galt)

# Fix the chrom ids in contigs: "Chr" to "chr" and chr30 to chrX
    sed -e 's/^Chr/chr/' contigs/Btau20050310.agp | sed 's/^chr30/chrX/' > temp.agp

# Create chrBin0.agp from bin0/ 
    cd /cluster/data/bosTau2/downloads/bin0
    scaffoldFaToAgp bin0-reads.fa
    cd ..
    # fix bin0-reads.agp which has D instead of W and contig instead of clone
    #  and called chrUn instead of chrBin0
    #  and ids gnl|baylor|9999999 are too long, shorten to gnl9999999
    sed -e 's/[\t]D[\t]/\tW\t/' bin0/bin0-reads.agp \ 
     | sed 's/[\t]contig[\t]/\tclone\t/' \
     | sed 's/chrUn[\t]/\tchrBin0\t/' \
     | sed 's/|baylor|//' \
     > tempBin0.agp
    # gaps of size 1000 lead to chrUn with positions > 512MB,
    # So use my agpTweak program to reduce the size of clone gaps
    agpTweak tempBin0.agp tempBin0-500.agp -agpSeq=chrBin0 -cloneGapSize=500
    cat temp.agp tempBin0-500.agp chrM.agp > tempAll.agp
    
# Split chrUn into Scaffolds
    agpTweak tempAll.agp UCSC_Cow2.0.agp -agpSeq=chrUn -splitIntoScaffolds


# Create chrom and scaffold lists
    awk '{print $1;}' UCSC_Cow2.0.agp | uniq | grep -v scaffold > ../chrom.lst
    awk '{print $1;}' UCSC_Cow2.0.agp | uniq | grep scaffold > ../scaffold.lst

    # make chrBin0.fa
    # also fix gnl ids here
    sed -e 's/|baylor|//' bin0/bin0-reads.fa > tempBin0.fa

    cat contigs/Btau20050310.contigs.fa tempBin0.fa chrM.fa > tempAll.fa

    # found out that agpAllToFaFile does not preserve order
    # so I extended agpAllToFaFile with -sort option to preserve agp chrom order
    agpAllToFaFile -sort UCSC_Cow2.0.agp tempAll.fa UCSC_Cow2.0.fa

    # checkAgpAndFa for scaffolds (only works if agp and fa are sorted the same order)
    checkAgpAndFa UCSC_Cow2.0.agp UCSC_Cow2.0.fa | tail -1
    #All AGP and FASTA entries agree - both files are valid

    faSize UCSC_Cow2.0.fa
    #3525488607 bases (713506215 N's 2811982392 real 2811982392 upper 0 lower) in 98090 sequences in 1 files
    #Total size: mean 35941.4 sd 1613747.0 min 101 (scaffold97899) max 381246137 (chrBin0) median 1186
    #N count: mean 7274.0 sd 634636.2
    #U count: mean 28667.4 sd 1055740.1
    #L count: mean 0.0 sd 0.0
    

# SELECTED DOWNLOAD FILES (DONE 07/23/05, galt)
  ssh hgwdev
  cd /usr/local/apache/htdocs/goldenPath
  mkdir bosTau2
  cd bosTau2
  mkdir bigZips
  cd bigZips
  cp /cluster/data/bosTau2/downloads/UCSC_Cow2.0.agp .
  nice gzip UCSC_Cow2.0.agp
  cp /cluster/data/bosTau2/downloads/UCSC_Cow2.0.fa .
  nice gzip UCSC_Cow2.0.fa
  # takes 45 min.


# CREATING DATABASE (DONE Jul.20, 2005 galt)
    hgsql mysql -e 'create database bosTau2'
    # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
    #Filesystem            Size  Used Avail Use% Mounted on
    #/dev/sdc1             1.8T  955G  706G  58% /var/lib/mysql


# CREATING GRP TABLE FOR TRACK GROUPING (DONE Jul.20, 2005 galt)
    hgsql bosTau2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 07/21/05 galt)
    # Make .2bit, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    ssh kkstore02
    cd /cluster/data/bosTau2
    #OLDWAY nice faToTwoBit ?{,?}/chr*.fa Bin0/chrBin0.fa scaffolds/scaffolds.fa bosTau2.2bit
    nice faToTwoBit downloads/UCSC_Cow2.0.fa bosTau2.2bit
    # Make symbolic links from /gbdb/bosTau2/ to the real .2bit.
    ssh hgwdev
    mkdir /gbdb/bosTau2
    ln -s /cluster/data/bosTau2/bosTau2.2bit /gbdb/bosTau2/
    mkdir -p bed/chromInfo

    # Load /gbdb/bosTau2/bosTau2.2bit paths into database and save size info.
    twoBitInfo bosTau2.2bit stdout \
    | awk '{print $1 "\t" $2 "\t/gbdb/bosTau2/bosTau2.2bit";}' \
      > bed/chromInfo/chromInfo.tab
    hgsql bosTau2  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql bosTau2 -e 'load data local infile \
      "/cluster/data/bosTau2/bed/chromInfo/chromInfo.tab" \
      into table chromInfo;'
    echo "select chrom,size from chromInfo" | hgsql -N bosTau2 > chrom.sizes
    # take a look at chrom.sizes, should be 41 lines
    wc chrom.sizes
    #  98090  196180 1833156 chrom.sizes
    # large because scaffolds as well as chroms  


# GOLD AND GAP TRACKS (DONE 07/22/05 galt)
    ssh hgwdev
    cd /cluster/data/bosTau2
    # create chrN_gold and chrN_gap tables
    hgGoldGapGl -noGl bosTau2 downloads/UCSC_Cow2.0.agp
    # unexpected coords (10239932, 10239932) for frag Contig53769 in chrom chr21
    # chr21   10239932        10239932        1248    W       Contig53769     2146    2146    -
    # unexpected coords (30727, 30727) for frag Contig108449 in chrom scaffold4717
    # unexpected coords (1, 1) for frag Reptig1055060 in chrom scaffold19165
    # This warning is not an error because there are over 10000 records in the original agp
    # where fragstart is not 1. This contig may not have helped much, but it did fill in
    # one base next to the succeeding adjoining contig.
    
    # Refresh index stats with "analyze table".
    hgsql bosTau2 -e 'analyze table gold; analyze table gap;'
    # check cardinalities: 
    hgsql bosTau2 -e 'show index from gap'
    hgsql bosTau2 -e 'show index from gold'
    # cardinalities ok

    # The best index is chrom+bin because chrom helps if you have many scaffolds
    # and bin helps when you are zoomed in to a particular position.
    # But having chrom+start and chrom+end are only confusing mysql which uses
    # the wrong index. So I am removing these moot indexes, and now it goes really fast
    # when zoomed in on a chrom or scaffold:
    hgsql bosTau2 -e 'drop index chrom_2 on gap'
    hgsql bosTau2 -e 'drop index chrom_3 on gap'
    hgsql bosTau2 -e 'drop index chrom_2 on gold'
    hgsql bosTau2 -e 'drop index chrom_3 on gold'

    cd /cluster/home/heather/jdbc
    source javaenv
    java AssemblyGapChain bosTau2 
    # No anomalies detected except "Contiguous rows detected at gold pos ..."
    # There seem to be lots of these in the Baylor .agp. This is ok.
    # Note that these seem to be the case where a reptig is right next to a contig.
    #
    # Heather's comment:
    # "It's fine about the contiguous gaps.  Those are from overlapping
    # contigs; they have to truncate one of them."
    #
    
    
    

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR BOSTAU2 (DONE 07/22/04 galt)
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add bosTau2 in all the right places and do
    make update
    cvs commit -m "added bosTau2" makefile
    mkdir -p cow/bosTau2
    cvs add cow/bosTau2
    cvs ci -m "trackDb dir for cow genome" cow/bosTau2
    make alpha

    # I copied trackDb.ra from bosTau1, and removed all but gold and gap
    # to start it off. 

    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism,  \
          defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk, hgPbOk, sourceName)  \
          values("bosTau2", "Mar. 2005", \
          "/gbdb/bosTau2/nib", "Cow", "chr1:10666612-10673232", 1, \
          19, "Cow", "Bos taurus", \
          "/gbdb/bosTau2/html/description.html", 0, 0, \
          "Baylor Btau_2.0");' \
    | hgsql -h genome-testdb hgcentraltest
    # add rows to hgcentraltest on hgwbeta in dbDb
    # set orderKey past cow1 temporarily
    # update orderKey to swap cow2, cow1 so cow2 comes first in dropdownlist
    echo 'update dbDb set orderKey=20 where name="bosTau1"' \ 
    | hgsql -h genome-testdb hgcentraltest

    # update defaultDb to make cow2 the default
    # If there's another cow assembly, this will be an update not insert:
    echo 'update defaultDb set name="bosTau2" where name="bosTau1"' \ 
    | hgsql -h genome-testdb hgcentraltest


    # clone the description.html page from bosTau1
    cd /cluster/data/bosTau2
    cp -r /cluster/data/bosTau1/html/ . 
    ln -s /cluster/data/bosTau2/html /gbdb/bosTau2/
    # make basic edits necessary
    vi html/description.html
    cd ${HOME}/kent/src/hg/makeDb/trackDb/cow/bosTau2/
    cp /cluster/data/bosTau2/html/description.html .
    cvs add description.html
    cvs ci -m "added initial description page" description.html

# CREATE CONTIGORIENT TRACK (DONE 08/03/05 galt)
    # This was done to provide the user with info in the agp comment field
    # that would otherwise be lost.  
    # Some of the contigs say "#unorientedScaffold" in field 10.
    # This info is extracted as a bed 6 + and loaded as follows:
    ssh hgwdev
    cd /cluster/data/bosTau2/bed
    mkdir contigOrient
    cd contigOrient
    gawk '{if (($5 == "W") && ($1 != "chrUn")) {if ($10=="") score="1000"; else score="500";print $1 "\t" ($2 -1) "\t" $3 "\t" $6 "\t" score "\t" $9}}' ../../downloads/temp.agp > oriented.bed
    hgLoadBed bosTau2 contigOrient oriented.bed
    #Reading oriented.bed
    #Loaded 118581 elements of size 6
    #Sorted
    #Creating table definition for contigOrient
    #Saving bed.tab
    #Loading bosTau2

    cd ~/kent/src/hg/makeDb/trackDb/
    vi cow/bosTau2/contigOrient.html
    #------------
    #track contigOrient
    #shortLabel Contig Orient
    #longLabel Contig Orientation
    #priority 15
    #group map
    #visibility hide
    #type bed 6 +
    #useScore 1
    #chromosomes chr1,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr2,chr20,chr21,chr22,chr23,chr24,chr25,chr26,chr27,chr28,chr29,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chrX
    make update DBS=bosTau2 ZOO_DBS=


# SPLIT SCAFFOLDS (DONE 07/23/05 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir scaffolds
    cd scaffolds
    /cluster/bin/i386/faSplit byname ../downloads/UCSC_Cow2.0.fa scaffolds -outDirDepth=3
    # find big files over 500k to split
    du -ka | grep "[.]fa" | gawk '{if ($1 > 500) print $2}' > bigfiles
    #splitBig.csh
    #/bin/tcsh -ef
    foreach b (`cat bigfiles`)
	echo "splitting $b:h $b:t"
    	pushd $b:h
	set contig = $b:t:r
    	echo $contig
	faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft -minGapSize=100
    	popd
    end
    
    chmod +x splitBig.csh
    splitBig.csh
 

# REPEATMASKER (DONE 07/23/05 galt)
# using the split scaffold fa files generated earlier

# note the version of RepeatMasker in use; will want this for download README
    cat /cluster/bluearc/RepeatMasker/Libraries/version
    # RepBase Update 9.11, RM database version 20050112

# do a trial run
    ssh hgwdev
    cd /cluster/data/bosTau2/scaffolds/0/0/0
    /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec bos scaffold1000.fa
    #took about 40 min, no errors
    
# configure
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir jkStuff
    cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMBosTaurus
    # change references anoGam1 --> bosTau2 including -spec bos
    mkdir RMRun

    #makeJoblist-RM.csh
    # /bin/tcsh -ef
    cd scaffolds
    foreach i (0 1 2 3 4 5 6 7 8 9)
      	cd $i
    	foreach j (0 1 2 3 4 5 6 7 8 9)
	    cd $j
      	    foreach k (0 1 2 3 4 5 6 7 8 9)
	      	cd $k
		foreach f (*.fa)
		    # big files already split have a .lft so skip the big ones
		    if (! -e $f:r.lft) then
    			echo /cluster/data/bosTau2/jkStuff/RMBosTaurus \
			/cluster/data/bosTau2/scaffolds/$i/$j/$k $f \
	    		    '{'check out line+ /cluster/data/bosTau2/scaffolds/$i/$j/$k/$f.out'}' \
			    >> /cluster/data/bosTau2/RMRun/RMJobs
		    endif				
		end
		cd ..
      	    end
	    cd ..
    	end
      	cd ..
    end

    chmod +x makeJoblist-RM.csh
    makeJoblist-RM.csh

    # do the run
    ssh kk
    cd /cluster/data/bosTau2/RMRun
    para create RMJobs
    # Checking input files
    # 102932 jobs written to batch
    para try
    para push
    para check
    para time
    etc.

# CHECK REPEATMASKER FINISHED, LIFT BIGFILES AND CONCATENATE OUTPUT (DONE 07/24/05 galt)

#[kk:/cluster/data/bosTau2/RMRun> para time
#102932 jobs in batch
#3 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 102932 of 102932 jobs
#CPU time in finished jobs:   33107057s  551784.28m  9196.40h  383.18d  1.050 y
#IO & Wait Time:               6194171s  103236.19m  1720.60h   71.69d  0.196 y
#Average job time:                 382s       6.36m     0.11h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            7956s     132.60m     2.21h    0.09d
#Submission to last job:         77640s    1294.00m    21.57h    0.90d

    #- Lift up the 500KB chunk .out's to scaffold level
    ssh kkstore02
    cd /cluster/data/bosTau2/scaffolds
    #liftBig.csh
    #/bin/tcsh -ef
    foreach b (`cat bigfiles`)
	echo "lifting $b:h $b:t"
    	pushd $b:h
	set contig = $b:t:r
    	echo $contig
	liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
    	popd
    end
    
    chmod +x liftBig.csh
    liftBig.csh
 


# concatenate into one output file; took about 90 minutes
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir repeats
    #create script concatRM.csh
    {
    #!/bin/tcsh -ef
    if (-e /cluster/data/bosTau2/repeats/repeats.all) then
	rm /cluster/data/bosTau2/repeats/repeats.all
    endif
    cd scaffolds
    foreach i (0 1 2 3 4 5 6 7 8 9)
      cd $i
      foreach j (0 1 2 3 4 5 6 7 8 9)
	cd $j
	foreach k (0 1 2 3 4 5 6 7 8 9)
	  cd $k
	  foreach f (*.fa.out)
	    # make sure we only get big lifted files, not small split files
	    set pf = `echo $f | sed -e 's/[_][0-9][0-9]*[.]fa[.]out$/.fa.out/'`
	    if ("$pf" == "$f") then
		echo $f
		cat $f >> /cluster/data/bosTau2/repeats/repeats.all
	    endif
	  end
	  cd ..
	end
	cd ..
      end
      cd ..
    end
    cd ..

    chmod +x concatRM.csh
    concatRM.csh > concatRM.list
    wc concatRM.list
    # 98090   98090 2048519 concatRM.list

    cd repeats
    wc repeats.all
    # 6044161  89020033 723977658 repeats.all

    # use grep -v to get rid of all the headers
    # then, add back in an initial header
    # this is so hgLoadOut is happy
    head -3 repeats.all > repeats.header
    egrep "chr|scaffold" repeats.all > repeats.clean
    cat repeats.header repeats.clean > repeats.final
    grep "There were no repetitive sequences detected" repeats.final > repeats.notdetected
    wc repeats.notdetected
    # 5211  41688 333503 repeats.notdetected
    grep -v "There were no repetitive sequences detected" repeats.final > repeats.out
    wc repeats.out
    # 5760316  86470639 704384016 repeats.out

    ssh hgwdev
    hgLoadOut bosTau2 /cluster/data/bosTau2/repeats/repeats.out
#Processing /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -5.2 line 726783 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.3 line 1844558 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -24.1 line 2227209 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -396.9 line 2244938 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -763.4 line 2244938 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -10363.0 line 2434452 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -3147.2 line 2434452 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -1217.1 line 2434452 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.9 line 2613009 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -4.0 line 2745397 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.1 line 3534731 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.1 line 3726610 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -6355.5 line 4849142 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -1155.4 line 4849142 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -174.3 line 4849142 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.3 line 5117371 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.2 line 5117371 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -0.9 line 5365557 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -1.1 line 5365557 of /cluster/data/bosTau2/repeats/repeats.out
#Strange perc. field -11.0 line 5561481 of /cluster/data/bosTau2/repeats/repeats.out
#Loading up table repeats_rmsk
#note: 116 records dropped due to repStart > repEnd
#
#      run with -verbose=2 for details


    hgsql bosTau2 -e 'alter table repeats_rmsk rename rmsk'

    hgsql bosTau2 -e 'select count(*) from rmsk'
    # 5760197
    hgsql bosTau2 -e 'select count(*) from rmsk where repName like "Bov%"'
    # 2193098
    nice featureBits bosTau2 rmsk
    # 1291561904 bases of 2812203870 (45.927%) in intersection

    # indexes slow and not working, because they were optimized for
    # having a small number of chroms, with a fairly big max position 200+, ~30chrom
    # but we have many small scaffolds, so need to index on genoName.
    # but bin can also be useful for when zoomed-in.
    # I experimented and found that having just one compound index 
    # with the key of genoName+bin solves this problem well.
    drop index bin on rmsk;
    drop index genoStart on rmsk;
    drop index genoEnd on rmsk;
    create index genoNameBin on rmsk(genoName,bin);
    # I tested this and it is very fast when zoomed in, or when on full scaffolds.
    # Even on the largest full-view for chr1 and chrBin0, the response time is ok.



# SIMPLE REPEATS (DONE 07/25/05 galt)
# put the results throughout the scaffold 0/0/0 directories,
# same as RepeatMasker, to avoid too many files in the same directory
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir -p bed/simpleRepeat
    cd bed/simpleRepeat

    # create script makeJoblist-trf.csh
    #!/bin/tcsh -ef
    if (-e /cluster/data/bosTau2/bed/simpleRepeat/trf-run.csh) then
	rm /cluster/data/bosTau2/bed/simpleRepeat/trf-run.csh
    endif
    pushd /cluster/data/bosTau2/scaffolds
    foreach i (0 1 2 3 4 5 6 7 8 9)
	cd $i
	foreach j (0 1 2 3 4 5 6 7 8 9)
	    cd $j
	    foreach k (0 1 2 3 4 5 6 7 8 9)
		cd $k
		foreach f (*.fa)
		    # make sure we only get big files, not small split files
		    set pf = `echo $f | sed -e 's/[_][0-9][0-9]*[.]fa$/.fa/'`
		    if ("$pf" == "$f") then
			set fout = $f:t:r.bed
			echo "/cluster/bin/i386/trfBig  -trf=/cluster/bin/i386/trf /cluster/data/bosTau2/scaffolds/$i/$j/$k/$f  /dev/null  -bedAt=/cluster/data/bosTau2/scaffolds/$i/$j/$k/$fout  -tempDir=/tmp" \
    >> /cluster/data/bosTau2/bed/simpleRepeat/trf-run.csh
		    endif
		end
		cd ..
	    end
	    cd ..
	end
	cd ..
    end
    popd

    chmod +x makeJoblist-trf.csh
    makeJoblist-trf.csh
    wc trf-run.csh
    # 98090   588540 19399078 trf-run.csh
    # This is correct: 98058 scaffolds + chr1-29 + chrM,X,Bin0 = 98090
    chmod +x trf-run.csh

  
    # do the run 
    trf-run.csh > & ! trf.log &
    #did re-nice on it
    #took about 4 hours
    grep "Done.faFile" trf.log | wc
    # 98090 1569440 11944238
    # count good.
    
      

    # concatenate into one output file; took about an hour
    # create concatTRF.csh
    #!/bin/tcsh -ef
    if (-e /cluster/data/bosTau2/bed/simpleRepeat/trf.bed) then
	rm -i /cluster/data/bosTau2/bed/simpleRepeat/trf.bed
    endif
    cd ../../scaffolds
    foreach i (0 1 2 3 4 5 6 7 8 9)
      cd $i
      foreach j (0 1 2 3 4 5 6 7 8 9)
	cd $j
	foreach k (0 1 2 3 4 5 6 7 8 9)
	  cd $k
	  foreach f (*.bed)
	    echo $f
	    cat $f >> /cluster/data/bosTau2/bed/simpleRepeat/trf.bed
	  end
	  cd ..
	end
	cd ..
      end
      cd ..
    end

    chmod +x concatTRF.csh
    concatTRF.csh > concatTRF.log
    wc concatTRF.log
    #  98090   98090 1754249 concatTRF.log
    # good.	

    
    # load
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/simpleRepeat
    hgLoadBed bosTau2 simpleRepeat trf.bed -sqlTable=${HOME}/kent/src/hg/lib/simpleRepeat.sql

    # Reading trf.bed
    # Loaded 417896 elements of size 16
    # Sorted
    # Saving bed.tab
    # Loading bosTau2
    
    hgsql bosTau2 -e 'select count(distinct chrom) from simpleRepeat'
    # +-----------------------+
    # | count(distinct chrom) |
    # +-----------------------+
    # |                 37407 |
    # +-----------------------+
    # Many small scaffolds didn't have tandem repeats.

    hgsql bosTau2 -e 'select count(distinct chrom) from simpleRepeat where chrom like "chr%"'
    # +-----------------------+
    # | count(distinct chrom) |
    # +-----------------------+
    # |                    31 |
    # +-----------------------+
    # All the larger chroms had tandem repeats as expected.

    # Get rid of these indexes that just slow down mysql and confuse it
    # as far as I can tell we don't need them.
    hgsql bosTau2 -e 'drop index chrom_2 on simpleRepeat'    
    hgsql bosTau2 -e 'drop index chrom_3 on simpleRepeat'    


# CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 07/26/05 galt) 

    ssh kkstore02
    cd /cluster/data/bosTau2
    maskOutFa -soft downloads/UCSC_Cow2.0.fa repeats/repeats.out bosTau2.softmask.fa
    # 117 warnings about negative rEnd
    # matches this query:
    hgsql bosTau2 -e 'select count(*) from rmsk where repEnd < 0'

    maskOutFa -softAdd bosTau2.softmask.fa bed/simpleRepeat/trf.bed bosTau2.softmask2.fa

    # hard masking (Ns instead of lower case) for download files
    # split for use by genscan
    maskOutFa bosTau2.softmask2.fa hard bosTau2.hardmask.fa
    mkdir hardmask-split
    cd hardmask-split
    faSplit about ../bosTau2.hardmask.fa 2000000 hardmask-split
    # made 687 files, useful with genescan later
     
# DOWNLOAD FILES (DONE 07/26/05 galt)

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/bosTau2/bigZips
    cp /cluster/data/bosTau2/bosTau2.softmask.fa .
    cp /cluster/data/bosTau2/bosTau2.softmask2.fa .
    cp /cluster/data/bosTau2/bosTau2.hardmask.fa .

    nice gzip bosTau2.softmask.fa
    nice gzip bosTau2.softmask2.fa
    nice gzip bosTau2.hardmask.fa

# REGENERATE 2BIT NIB (DONE 07/26/05 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mv bosTau2.2bit bosTau2.2bit.unmasked
    faToTwoBit bosTau2.softmask2.fa bosTau2.2bit
    twoBitToFa bosTau2.2bit check.fa
    diff -q bosTau2.softmask2.fa check.fa
    # note: size match
    # note: you can also use faCmp

    # Now check in the browser
    # RepeatMasker (rmsk) and trf (simpleRepeat)
    # should show lower case when masking option used
    # and be sure to use Extended DNA Case/Color Options 
    # for simpleRepeat because it doesn't have strand info
    # and otherwise lower-casing does not occur with standard getDna 
    # Result: bosTau2 looks good for both rmsk and simpleRepeat	


# GC5BASE - (DONE 07/26/05 galt)
    ssh kolossus
    mkdir /cluster/data/bosTau2/bed/gc5Base
    cd /cluster/data/bosTau2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 bosTau2 \
      /cluster/data/bosTau2 | wigEncode stdin gc5Base.wig gc5Base.wib
    #       Calculating gcPercent with window size 5
    #       Using twoBit: /cluster/data/bosTau2/bosTau2.2bit
    #       File stdout created
    #   Converted stdin, upper limit 100.00, lower limit 0.00
    #	about 10 minutes
    #	Now load those results into the database
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/gc5Base
    mkdir /gbdb/bosTau2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/bosTau2/wib
    hgLoadWiggle -pathPrefix=/gbdb/bosTau2/wib bosTau2 gc5Base gc5Base.wig
    #	The index is no good for this organism due to the scaffold
    #	naming scheme, drop the index and create one that works:
    hgsql bosTau2 -e 'drop index chrom on gc5Base'
    hgsql bosTau2 -e 'create index chromBin on gc5Base (chrom,bin)'
    #	The speeds up an operation on the hgTables browser by orders of
    #	magnitude - whole genome statistics display


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 07/26/05 galt)
    ssh kkr1u00
    mkdir -p /iscratch/i/bosTau2
    cp -p /cluster/data/bosTau2/bosTau2.2bit /iscratch/i/bosTau2/
    iSync  
    #if you get error "Permission denied" it means you have to add ~/.rhosts file
    # and be sure to chmod 600 ~/.rhosts
  
    mkdir -p /cluster/bluearc/bosTau2
    cp -p /cluster/data/bosTau2/bosTau2.2bit /cluster/bluearc/bosTau2/

# MAKE 11.OOC FILE FOR BLAT (DONE 07/26/05 galt)
    # Use -repMatch=1005 (based on size -- for human we use 1024, and 
    # dog size is 2812203870 / 2866216770 = ~98.1% of human judging by 
    # gapless genome size from featureBits)
    ssh kkr1u00
    mkdir -p /cluster/data/bosTau2/bed/ooc
    cd /cluster/data/bosTau2/bed/ooc
    ls -1 /cluster/data/bosTau2/bosTau2.2bit > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
	-makeOoc=/cluster/bluearc/bosTau2/11.ooc -repMatch=1005
    # Wrote 31532 overused 11-mers to /cluster/bluearc/bosTau2/11.ooc
    cp -p /cluster/bluearc/bosTau2/11.ooc /iscratch/i/bosTau2/
    cp -p /cluster/bluearc/bosTau2/11.ooc .
    iSync 


#> My question is, can I still use just a .2bit and no lift?
#
#With the largest chr being 97M, no need for a lift, so don't
#bother.  You actually get better results without a lift.
#
#You probably want use a smaller value for bosTau2.mondoTwoBitParts,
#I would try 1000.
#-mark
#
#oops chrBin0 is 372M.


# GENBANK  (NOTE: All of this was redone by MarkD to test his new process, 
#    see next section below)

    ssh hgwdev
    cd ${HOME}/kent/src/hg/makeDb/genbank
    # check for missing commits
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
    # (there were none)
    # edit etc/genbank.conf and add these lines, starting with the comment:
    vi etc/genbank.conf
    # bosTau2 (B. taurus) 30 chromosomes plus 98058 scaffolds
    # This uses the mondo 2bit file functionality, where multiple
    # sequences from a two bit are alignmed from a single jobs.
    # Genbank B. taurus: (these are the old numbers from bosTau1)
    #     3720 mRNAs
    #   493370 ESTs
    # partation 2bit into 1000 parts for initial build
    bosTau2.genome = /iscratch/i/bosTau2/bosTau2.2bit
    bosTau2.lift = no
    bosTau2.refseq.mrna.native.load = no
    bosTau2.genbank.mrna.xeno.load = no
    bosTau2.genbank.est.xeno.load = no
    bosTau2.downloadDir = bosTau2
    bosTau2.perChromTables = no
    bosTau2.mondoTwoBitParts = 1000
    # note: MarkD later decided he thought 2000 is better for mondo,
    # so we will use that for ESTs below
    cvs commit -m "added bosTau2" etc/genbank.conf
    #revision 1.116,1.117

    # edit src/lib/gbGenome.c
    vi src/lib/gbGenome.c
    #(no changes were necessary)

    # edit src/align/gbBlat
    vi src/align/gbBlat
    make
    cvs commit -m "added bosTau2" src/align/gbBlat
    # revision 1.61
    # actually I just change the .ooc path from bosTau1 to bosTau2

    make install-server

    # I tried doing this from kkstore02 but it didn't work, so went back to eieio
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 bosTau2 &
    # logFile: var/build/logs/2005.08.02-12:00:54.bosTau2.initalign.log
    # /panasas/store/genbank/work/initial.bosTau2/
    # and now it is re-running again on eieio. Started around 12:15pm on Tues.Aug2nd.
    # This died by 3pm, with an error
    # I did a para push again, removed a few dead machines, it finally finished
    # para check says all ran ok.
    # instructions: repeat with -continue=finish which must go before the database argument.
    # note: screen is handy for long-running operations
    screen
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial -verbose=1 -continue=finish bosTau2
    # logFile: var/build/logs/2005.08.03-00:07:50.bosTau2.initalign.log
    #ctrl-a ctrl-d to release terminal.
    #screen -r -d to recapture terminal.
    #exit to close screen when done.

    # load
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad bosTau2
    # logFile: var/dbload/eieio/logs/2005.08.03-11:07:08.dbload.log
    #  finished no errs

    # checked out the new tables list which seems right
    # added all_mrna clause to bosTau2/trackDb.ra, copied from from cow1
    # did make update on bosTau2
    # looked at the new Cow mRNAs track
    # one problem is some are split up with parts on different scaffolds
    #  just because the assembly is an early draft and we still have scaffolds.
    # another problem is some are split on opp strands of say chr1,
    # probably because some of the contigs in the chroms are unoriented.
    # So, now I will have to create a track to show which regions are
    # oriented and which are not.

    # MarkD thinks maybe the mondoTwoBitParts should be increased to 2000.
    ssh hgwdev
    cd ${HOME}/kent/src/hg/makeDb/genbank
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
    vi etc/genbank.conf
    #change to bosTau2.mondoTwoBitParts = 2000
    cvs commit -m "MarkD advises changing to bosTau2.mondoTwoBitParts = 2000" etc/genbank.conf
    #revision 1.119
    make
    make install-server


    # ESTs with a huge number of scaffolds.  
    ssh eieio
    cd /cluster/data/genbank
    screen
    bin/gbAlignStep -initial -srcDb=genbank -type=est bosTau2
    #logFile: var/build/logs/2005.08.05-02:12:43.bosTau2.initalign.log
    #ctrl-a,d to detach
    tail -f var/build/logs/2005.08.05-02:12:43.bosTau2.initalign.log
    # to check progres: [kk:/cluster/data/genbank/work/initial.bosTau2/align, para check

    #Markd:
    #Yea, chrBin0.psl is too big and pslIntronsOnly (intronEst builder)
    #A short-term fix is to finish it on the kki minicluster.  
    #So please do the following:
    # on kk, kill the para make that is running on kk, if you are not running
    cd  /cluster/store3/genbank/work/initial.bosTau2/align/
    para stop
    #use  parasol list batches to determine if they have really stopped.
    #then do a  
    para recover align.jobs align2.jobs

    #So now create a directory under the align directory to save the parasol bits
    mkdir firstRun
    mv batch firstRun/
    mv batch.bak firstRun/
    mv err firstRun/
    mv para.results firstRun/

    #Check the align2.jobs file (it should have just the ~21 jobs that
    #were still running and any that had failed 4 times).
    ssh kki
    cd  /cluster/store3/genbank/work/initial.bosTau2/align/
    para create align2.jobs
    para push
    #etc
    #once this has finished, rerun the gbAlignStep command, 
    # only adding the -continue=finish option (before the database).

  ssh eieio
  cd /cluster/data/genbank
  screen
  nice bin/gbAlignStep -srcDb=genbank -type=est -initial -verbose=1 -continue=finish bosTau2
  #

# Markd wanted a lift file for the big artificial chrBin0
ssh hgwdev
cd /cluster/data/bosTau2/downloads
# need the size of chrBin0:
hgsql bosTau2 -e 'select size from chromInfo where chrom="chrBin0"'
# 381246137 
# now use gawk to make the lift file
gawk '{ \
if ($5=="W") { \
  print ($2 - 1) "\t" $6 "\t" ($3 - $2 + 1) "\t" "chrBin0" "\t" "381246137" \
} else { \
  print ($2 - 1) "\t" "gap" "\t" ($3 - $2 + 1) "\t" "chrBin0" "\t" "381246137" \
} \
}' tempBin0-500.agp > chrBin0.lft



  # load the ESTs, reloading mRNAs as well
  ssh hgwdev
  cd /cluster/data/genbank
  nice bin/gbDbLoadStep -drop -initialLoad bosTau2

  # add bosTau2 to list of databases to align in
  #  /cluster/data/genbank/etc/align-genbank

# GENBANK: check Non-Cow Refseqs
    # Jim says he doesn't care about these?

  # add to kent/src/hg/makeDb/genbank/etc/genbank.conf and commit:
  # bosTau1.refseq.mrna.xeno.load = yes
  cd /cluster/data/genbank/etc
  cvs update
    # new tables: xenoRefGene, xenoRefFlat, xenoRefSeqAli, refLink, refSeqStatus
    #     refSeqSummary



# MARKD re-does it again to test his new genbank process:
# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN (DONE, 2005-09-05, markd)
    # redo genbank revised alignment procedure once again to
    # pickup local near best pslCDnaFilter

    # align with revised genbank process
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add bosTau2
# bosTau2 (B. taurus) 30 chromosomes plus 98058 scaffolds
# Lift file partitions unplaced sequence pseudo-chroms
bosTau2.serverGenome = /cluster/data/bosTau2/bosTau2.2bit
bosTau2.clusterGenome = /iscratch/i/bosTau2/bosTau2.2bit
bosTau2.ooc = /iscratch/i/bosTau2/11.ooc
bosTau2.align.unplacedChroms = chrBin0
bosTau2.lift = /cluster/data/bosTau2/downloads/chrBin0.lft
bosTau2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
bosTau2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
bosTau2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
bosTau2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
bosTau2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
bosTau2.genbank.est.xeno.pslCDnaFilter    = ${lowCover.genbank.est.xeno.pslCDnaFilter}
bosTau2.refseq.mrna.native.load = no
bosTau2.refseq.mrna.xeno.load = yes
bosTau2.genbank.mrna.xeno.load = no
bosTau2.genbank.est.xeno.load = no
bosTau2.downloadDir = bosTau2
bosTau2.perChromTables = no

# N.B. above was changed later to include refseqs don't just copy this or a spell will be
# cast on your descendents.

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial bosTau2 &

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad  bosTau2&


    # enable daily alignment and update of hgwdev
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # add bosTau2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs commit
    make etc-update

# enabled native refSeq: 2006-05-11 markd
    # set this in genbank.conf:
    bosTau2.refseq.mrna.native.load = yes
    # kick off an alignment to verify
    ssh kkstore02
    cd /cluster/data/genbank
    (nice ./bin/gbAlignStep bosTau2)|&mail markd&


# GENBANK DOWNLOAD FILES

#Nope, no need. It will now done automatically on hgdownload.
#-MarkD
  # output goes to /genbank/download/bosTau1/bigZips
  # these are automatically rsynced to hgnfs1 (which hgdownload reads)



# DROP INDEXES FROM GENBANK TABLES (galt)
#(these would just slow down most operations)
    ssh hgwdev
    hgsql bosTau2 -e 'drop index tName_2 on all_est'
    hgsql bosTau2 -e 'drop index tname_3 on all_est'
    hgsql bosTau2 -e 'drop index tName_2 on intronEst'
    hgsql bosTau2 -e 'drop index tname_3 on intronEst'
    hgsql bosTau2 -e 'drop index tName_2 on all_mrna'
    hgsql bosTau2 -e 'drop index tname_3 on all_mrna'



# REQUESTED BLAT SERVER 2005-08-07
#  blat server working.


# GENSCAN (DONE 2005-08-12 galt)
    # uses hard-masked sequence files
    ssh hgwdev
    mkdir /cluster/data/bosTau2/bed/genscan
    cd /cluster/data/bosTau2/bed/genscan
    cvs co hg3rdParty/genscanlinux
    mkdir gtf pep subopt

    # Run on small cluster (more memory than big cluster).
    ssh kki
    cd /cluster/data/bosTau2/bed/genscan

    # generate list of hard-masked scaffolds that are not all-N's
    # (genscan crashes if all-N's) (this is slow started 6:45pm)
    foreach f ( `ls -1S /cluster/data/bosTau2/hardmask-split/*` )
      	egrep '[ACGT]' $f > /dev/null
    	if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # 779 should be 689, has some duplicates which sorting shows
    # I found dupes for some reason, so I will remove those
    mv genome.list temp.list
    sort -u temp.list > genome.list
    rm temp.list
    wc -l genome.list
    # 689
    # Create gsub (I changed the method to make it more readable)
    # move this block to the left by putting your cursor on the
    # next line and pressing <%, then paste into window, then press >% to restore
    {    
    echo "#LOOP" > gsub
    echo '/cluster/bin/x86_64/gsBig \
    {check in line+ $(path1)} \
    {check out line gtf/$(root1).gtf} \
    -trans={check out line pep/$(root1).pep} \
    -subopt={check out line subopt/$(root1).bed} \
    -exe=hg3rdParty/genscanlinux/genscan \
    -par=hg3rdParty/genscanlinux/HumanIso.smat \
    -tmp=/tmp \
    -window=2400000' \ 
    | tr -d '\n' >> gsub
    echo "\n#ENDLOOP">>gsub
    }
    /cluster/bin/i386/gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
    ls -1 gtf | wc -l
    # 689
    endsInLf gtf/*
    # no messages, so ok

    # Concatenate
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat pep/*.pep > genscan.pep
    cat subopt/*.bed > genscanSubopt.bed
    
    # Load
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/genscan
    ldHgGene -gtf bosTau2 genscan genscan.gtf
    #Reading genscan.gtf
    #Read 61380 transcripts in 358757 lines in 1 files
    #  61380 groups 18721 seqs 1 sources 1 feature types
    #  61380 gene predictions
    hgPepPred bosTau2 generic genscanPep genscan.pep
    hgLoadBed bosTau2 genscanSubopt genscanSubopt.bed
    #Reading genscanSubopt.bed
    #Loaded 589698 elements of size 6
    #Sorted
    #Creating table definition for genscanSubopt
    #Saving bed.tab
    #Loading bosTau2
    featureBits bosTau2 genscan
    # 58980161 bases of 2812203870 (2.097%) in intersection
    



# QUALITY TRACK (DONE 2005-08017 galt)
    # The challenge was a sorting problem.
    # The quality file from Baylor was ordered by contig.
    # We need quality data by scaffold, so used the gold table
    # It won't have any data for chrM or for chrBin0 which is fine.
    ssh hgwdev
    cd /cluster/data/bosTau2/bed
    mkdir quality
    cd quality
    # I wrote a c program /cluster/store11/bosTau2/bed/quality/quality.c to process the database.gold
    # and also the contigs.fa.qual data from Baylor.
    # Because the qual file is so big, I do not load 
    # it all at once, instead I read it once and 
    # create an index into where each section lies e.g. '>Contig999'
    # and then using the gold resultset, I fetch each
    # section from the qual file as needed.
    # this creates the simple output format needed by wigEncode
    # I later broke the program into 3 stages for convenience and speed

    # run on hgwdev for sql connection, just dumps a simple query of db.gold
    # this is not super fast because I removed some indexes, 
    # but it's still less than a minute
    loadQuality getGold bosTau2 gold.tab
    
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/quality

    # run this on kkstore02 where it is fast and we won't ever have to run index step again
    # takes about 3 minutes to read and index a 7GB quality file
    loadQuality indexQuality ../../downloads/contigs/Btau20050310.contigs.fa.qual qual.index
    
    # run this on kkstore02 where it is fast (less than an hour!)
    loadQuality build gold.tab qual.index ../../downloads/contigs/Btau20050310.contigs.fa.qual quality.out > quality.log
    # inspect quality.log. only expected things like chrM and chrBin0 
    # should have a message saying they are missing.
    
    # wigEncode 
    wigEncode quality.out quality.wig quality.wib
    # Converted quality.out, upper limit 90.00, lower limit 0.00

    # load
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/quality
    ln -s /cluster/data/bosTau2/bed/quality/quality.wib /gbdb/bosTau2/wib/
    hgLoadWiggle -pathPrefix=/gbdb/bosTau2/wib bosTau2 quality quality.wig
    # loaded with no error messages

    # Hiram and I spot-checked several contigs for the qual data,
    # both full contig forward and reverse strand, and one reptig that
    # had fragStart=781 on - strand and they all were perfect matches
    # for the track data using hgTables. Looks good!


# MAKE Human Proteins track (hg17 DONE 2005-09-08 galt)
    ssh kkstore02
    mkdir /cluster/data/bosTau2/blastDb
    cd /cluster/data/bosTau2/blastDb
    
    #old: faSplit sequence ../downloads/linearScaffolds/Btau20040927-freeze-assembly.fa 500 x
    faSplit sequence ../downloads/UCSC_Cow2.0.fa 500 x

    # find big files over 10M to split
    du -ka | grep '[.]fa$' | gawk '{if ($1 > 10000) print $2}' > bigfiles
    #splitBig.csh
    {    
    #/bin/tcsh -ef
    foreach b (`cat bigfiles`)
	echo "splitting $b"
	set contig = $b:t:r
	echo $contig
	faSplit gap $contig.fa 10000000 ${contig}_ -lift=$contig.lft -minGapSize=100
    end
    }    
    chmod +x splitBig.csh
    splitBig.csh
 
    
    #I will use the scaffolds/ which has this stuff already split up for me.
    #use formatdb to prepare for blastall
    
    #prepForBlast.csh
    {    
    #!/bin/tcsh -ef
    foreach f (*.fa)
	# big files already split have a .lft so skip the big ones
	if (! -e $f:r.lft) then
	    /projects/compbio/bin/i686/formatdb -i $f -p F
	endif				
    end
    }		

    chmod +x prepForBlast.csh
    prepForBlast.csh
    #takes about 5 min
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | wc -l
    #2627
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | grep '[.]fa[.]nsq$' | wc -l
    #640
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | grep '[.]fa[.]nhr$' | wc -l
    #640
    #[kkstore02:/cluster/data/bosTau2/blastDb> ls | grep '[.]fa[.]nin$' | wc -l
    #640

    rm *.log
    rm *.fa
    
    mkdir /cluster/data/bosTau2/bed/tblastn.hg17KG

    ssh kkr1u00
    cp -r /cluster/data/bosTau2/blastDb/ /iscratch/i/bosTau2/
    ls /iscratch/i/bosTau2/blastDb/*.nsq  | sed "s/\.nsq//" > /cluster/data/bosTau2/bed/tblastn.hg17KG/query.lst
    (iSync) > ~/bosTau2sync.out
    # took a few hours
    # Hiram now says not to bother with iSync, it's too slow.
    # Better to just run rsync on the directory you need, 8x.
    # See one of his more recent makedocs for an example.

    # back to kkstore02
    exit

    # we want around 200,000 jobs
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk '{print $1}'`/\(200000/`wc query.lst | awk '{print $1}'`\)
    # 37365/(200000/640) = 119.6
    mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/kgfa
    split -l 120 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/kgfa/kg
    # 120 created 312 psl files
    ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/kgfa kgfa
    pushd kgfa
    # go compile kent/src/hg/pslxToFa on kolossus for 64-bit util
    # this runs fast
    foreach i (*)
	pslxToFa $i $i.fa
	rm $i
    end
    popd
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg17KG/blastOut
    #old for i in (`cat kg.lst`) do;mkdir blastOut/`basename $i`;end
    foreach i (`cat kg.lst`)
	mkdir blastOut/$i:t:r 
    end
    
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    /cluster/bin/i386/gensub2 query.lst kg.lst blastGsub blastSpec
    wc -l blastSpec
    $199680 blastSpec
    

    ssh kk
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    para create blastSpec
    # from previous failed attempts, counts are:
    # galt        9      0    238    10  10  -1 /cluster/store11/bosTau2/bed/tblastn.hg17KG/

    para try
    para push

    # There were major power and cluster problems that delayed this batch

[kk:/cluster/data/bosTau2/bed/tblastn.hg17KG> para check
#199680 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#ranOk: 199680
#total jobs in batch: 199680

[kk:/cluster/data/bosTau2/bed/tblastn.hg17KG> para time
#199680 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 199680 of 199680 jobs
#CPU time in finished jobs:   95239579s 1587326.31m 26455.44h 1102.31d  3.020 y
#IO & Wait Time:               4986306s   83105.10m  1385.09h   57.71d  0.158 y
#Average job time:                 502s       8.37m     0.14h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            4991s      83.18m     1.39h    0.06d
#Submission to last job:        560248s    9337.47m   155.62h    6.48d

    # I have to do the lift now!

    # I also accidentally moved files with the same name together into one directory
    # while trying to do liftup, which wiped out one-third of my files,
    # which were redone with para recover blastSpec blastSpec2, etc.

    #- Lift up the 10MB chunk psl's to chrom/scaffold level
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG/blastOut
    #create liftBig.csh
    {
    #/bin/tcsh -ef
    set nonomatch = 1
    foreach d ( */ )
	pushd $d
	set flist =  ( *_*.psl )
	if ( "$flist" != "*_*.psl" ) then
	    foreach f ( $flist )
		mv $f $f:r.qsl
	    end
	endif
	foreach l ( /cluster/data/bosTau2/blastDb/*.lft )
	    set contig = $l:t:r
	    echo "lifting $d $contig"
	    liftUp -nohead q.${contig}.psl $l warn q.${contig}_*.qsl >> lift.log
	end
	popd
    end
    }                                                                                                                                                                                          
    chmod +x liftBig.csh
    liftBig.csh >& liftBig.log

    #started at 5:30 pm
    #ended ?


    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'

    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    para create chainSpec
    para push

# Completed: 312 of 312 jobs
# CPU time in finished jobs:     198486s    3308.09m    55.13h    2.30d  0.006 y
# IO & Wait Time:                 20318s     338.64m     5.64h    0.24d  0.001 y
# Average job time:                 701s      11.69m     0.19h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6778s     112.97m     1.88h    0.08d
# Submission to last job:         99549s    1659.15m    27.65h    1.15d

# actually, I had run into errors where there weren't 21 words per line in the psl files,
#foreach d ( */ )
#cd $d
#ls -1 *.[pq]sl | xargs -iX awk '{if (NF != 21) print ENVIRON["PWD"] " " FILENAME " " NR " " NF}' X >> ../badPslList2
#cd ..
#end
#
# and also problems with strange things that look like unlifted but are just junk
# due to cluster power outage crashes I guess - I found those, removed them
# grep -c 'chr[0-9][0-9]_' *.psl > problems
#
# I deleted the problem files and then I did para recover, and re-ran the missing bits.
# Finally I got it to complete with no errors.

    exit
    # back to kkstore02
    pushd /cluster/data/bosTau2/bed/tblastn.hg17KG/blastOut
    {
    foreach i ( kg?? )
	awk '($13 - $12)/$11 > 0.60 {print}' c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk '$1 / $11 > 0.60 {print}' c60.$i.psl > m60.$i.psl
	echo $i
    end
    }
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq \
      > /cluster/data/bosTau2/bed/tblastn.hg17KG/blastHg17KG.psl
    popd

    ssh hgwdev
    cd /cluster/data/bosTau2/bed/tblastn.hg17KG
    hgLoadPsl bosTau2 blastHg17KG.psl

    # back to kksilo
    rm -rf blastOut  # TODO this not done yet, waiting to see if needed for debugging

    # search not working for all expected terms, sent question to braney
    #On Fri, 9 Sep 2005, Brian Raney wrote:
    #
    #> Hey Galt,
    #>
    #> I checked in some search rules for blastHg17KG in
    #> trackDb/cow/bosTau2/trackDb.ra .
    #>
    #> brian

    #the main difference seems to be:
    #xrefTable hg17.blastKGRef01 (in main trackDb.ra)
    #and
    #xrefTable hg17.blastKGRef02 (in bosTau2 trackDb.ra that you added)
    #
    #I see however that searches are now working much better.
    #-Galt
    
# End tblastn




# MAKE SPLIT MASK2 FA (done 2005-09-08 galt)  (for CpG Islands)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir softmask-split
    cd softmask-split
    faSplit sequence ../bosTau2.softmask2.fa 100 x



# LOAD CPGISSLANDS (DONE, 2005-09-08, galt)
    ssh hgwdev
    mkdir -p /cluster/data/bosTau2/bed/cpgIsland
    cd /cluster/data/bosTau2/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/bosTau2/bed/cpgIsland/
    
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/cpgIsland

    foreach f (../../softmask-split/*.fa)
	set fout=$f:t:r:r.cpg
    	echo running cpglh on $f to $fout
	./cpglh.exe $f > $fout
    end
    
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk x*.cpg > cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/cpgIsland
    hgLoadBed bosTau2 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #Loaded 108010 elements of size 10
    wc -l cpgIsland.bed 
    #  108010 cpgIsland.bed
    featureBits bosTau2 cpgIslandExt
    # 69207600 bases of 2812203870 (2.461%) in intersection

#CHROMSOMES DOWNLOAD DIRECTORY (done 2005-09-26 galt)
    ssh kkstore02
    cd /cluster/data/bosTau2
    mkdir chromosomes
    cd chromosomes
    faFilter -name=scaffold* ../bosTau2.softmask2.fa scaffolds.fa
    /cluster/home/galt/bin/i386/faFilter -v -name=scaffold* ../bosTau2.softmask2.fa nonscaffolds.fa
    faSplit byname nonscaffolds.fa ./
    rm nonscaffolds.fa
    gzip *.fa
    md5sum *.gz > md5sum.txt
    hgwdev
    cd /usr/local/apache/htdocs/goldenPath/bosTau2/
    mv /cluster/data/bosTau2/chromosomes/ .

# LOAD GENEID PREDICTIONS (DONE galt 2005-10-19)
    ssh kksilo
    mkdir /cluster/data/bosTau2/bed/geneid
    cd /cluster/data/bosTau2/bed/geneid
    {
    rm -f bosTau2.gtf
    rm -f bosTau2.prot
    set base = 'http://genome.imim.es/genepredictions/B.taurus/golden_path_200503/geneidv1.2/'
    foreach f ( `htmlCheck getLinks $base | grep '[.]gtf$'` )
      echo $f
      set url = "$base$f"
      echo "$url"
      wget $url
      set url = "$base$f:r.prot"
      echo "$url"
      wget $url
    end
    cat {c,s}*.gtf > bosTau2.gtf
    cat {c,s}*.prot > bosTau2.prot
    }
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    perl -wpe 's/^(>scaffold\S+|>chr\S+)/$1.1/' bosTau2.prot > geneid.fa
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/geneid
    ldHgGene -gtf -genePredExt bosTau2 geneid bosTau2.gtf
    #Reading bosTau2.gtf
    #Read 98183 transcripts in 399106 lines in 1 files
    #  98183 groups 65717 seqs 1 sources 3 feature types
    #98183 gene predictions
    hgPepPred bosTau2 generic geneidPep geneid.fa
    #Processing geneid.fa


# LOAD SGPGENE PREDICTIONS (DONE galt 2005-12-14)
    ssh kksilo
    mkdir /cluster/data/bosTau2/bed/sgpGene
    cd /cluster/data/bosTau2/bed/sgpGene
    {
    rm -f bosTau2.gtf
    rm -f bosTau2.prot
    set base = 'http://genome.imim.es/genepredictions/B.taurus/golden_path_200503/SGP/mm6/'
    foreach f ( `htmlCheck getLinks $base | grep '[.]gtf$'` )
      echo $f
      set url = "$base$f"
      echo "$url"
      wget $url
      set url = "$base$f:r.prot"
      echo "$url"
      wget $url
    end
    cat {c,s}*.gtf > bosTau2.gtf
    cat {c,s}*.prot > bosTau2.prot
    }
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    perl -wpe 's/^(>scaffold\S+|>chr\S+)/$1.1/' bosTau2.prot > sgpGene.fa
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/sgpGene
    ldHgGene -gtf -genePredExt bosTau2 sgpGene bosTau2.gtf
    #Reading bosTau2.gtf
    #Read 44919 transcripts in 284636 lines in 1 files
    #  44919 groups 14813 seqs 1 sources 3 feature types
    #44919 gene predictions
    hgPepPred bosTau2 generic sgpGenePep sgpGene.fa
    #Processing sgpGene.fa

# SGP GENES (UPDATE 1/18/2006)
    sgpPep table dropped, replaced by hgc generated protein seq in browser

############################################################################
##  BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
    ssh pk
    cd /cluster/data/mm8/bed/blastzBosTau.2006-02-18
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits bosTau2 chainMm8Link
    #   683178156 bases of 2812203870 (24.293%) in intersection


############################################################################
# SWAP CHAINS/NET RN4 (DONE 4/2/06 angie)
    ssh kkstore02
    mkdir /cluster/data/bosTau2/bed/blastz.rn4.swap
    cd /cluster/data/bosTau2/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.bosTau2/DEF \
      -workhorse kkr8u00 >& do.log & tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/bosTau2/bed/blastz.rn4


############################################################################
##  BLASTZ swap from hg17 alignments (DONE - 2006-03-30 - 2004-04-04 - Hiram)
#  LIFTOVER BOSTAU2HG17
    #	swapping to get the lift over file in the other direction
    ssh pk
    mkdir /cluster/data/bosTau2/bed/blastz.hg17.swap
    cd /cluster/data/bosTau2/bed
    ln -s blastz.hg17.swap blastz.hg17
    cd blastz.hg17.swap
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19/DEF > swap.out 2>&1 &

    #	manually cleaned this up since the run faild during the MySQL
    #	load due to out of space problems.  These tables do not need to
    #	be loaded anyway.
    sh kkstore02
    cd /cluster/data/bosTau2/bed/blastz.hg17.swap

    rm -fr psl/

    rm -fr axtChain/run/chain/
    rm -f  axtChain/noClass.net
    rm -fr axtChain/net/
    rm -fr axtChain/chain/

############################################################################
##  BLASTZ swap from canFam2 alignments (DONE 2006-05-04 markd)
    ssh pk  # not sure why this was done
    mkdir /cluster/data/bosTau2/bed/blastz.canFam2.swap
    cd /cluster/data/bosTau2/bed
    ln -s blastz.canFam2.swap blastz.canFam2
    cd blastz.canFam2.swap
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/canFam2/bed/blastz.bosTau2/DEF >& swap.out

    # should have specified -stop=net, so kill hgLoadChain
    # create the net files
    ssh kolossus
    ./netChains.csh
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/blastz.canFam2.swap/axtChain
    nice netClass -verbose=0 -noAr noClass.net bosTau2 canFam2 bosTau2.canFam2.net
    nice gzip bosTau2.canFam2.net
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna bosTau2

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2006-06-16)
    ssh kkstore02
    bash # if not using bash shell already

    # we can use blastDb directory set up for hg17 proteins
    cd /cluster/data/bosTau2/blastDb
    mkdir -p /san/sanvol1/scratch/bosTau2/blastDb
    cd /cluster/data/bosTau2/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/bosTau2/blastDb
    done

    mkdir -p /cluster/data/bosTau2/bed/tblastn.hg18KG
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/bosTau2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 640 query.lst

   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(350000/640) = 67.157943

   mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/kgfa
   split -l 67 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/bosTau2/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/bosTau2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    ssh pk
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 290218 of 351360 jobs
# Crashed: 61142 jobs
# CPU time in finished jobs:   25182187s  419703.12m  6995.05h  291.46d  0.799 y
# IO & Wait Time:               2666154s   44435.89m   740.60h   30.86d  0.085 y
# Average job time:                  96s       1.60m     0.03h    0.00d
# Longest finished job:            1106s      18.43m     0.31h    0.01d
# Submission to last job:        296737s    4945.62m    82.43h    3.43d

# Completed: 61142 of 61142 jobs
# CPU time in finished jobs:    4075252s   67920.86m  1132.01h   47.17d  0.129 y
# IO & Wait Time:                383494s    6391.57m   106.53h    4.44d  0.012 y
# Average job time:                  73s       1.22m     0.02h    0.00d
# Longest finished job:             389s       6.48m     0.11h    0.00d
# Submission to last job:         27257s     454.28m     7.57h    0.32d


    ssh kkstore02
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    exit
    chmod +x chainOne
    ls -1dS /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.

# Completed: 549 of 549 jobs
# CPU time in finished jobs:     923373s   15389.55m   256.49h   10.69d  0.029 y
# IO & Wait Time:                 35341s     589.01m     9.82h    0.41d  0.001 y
# Average job time:                1746s      29.10m     0.49h    0.02d
# Longest finished job:           49684s     828.07m    13.80h    0.58d
# Submission to last job:         57041s     950.68m    15.84h    0.66d


    ssh kkstore02
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG/blastOut
    bash # if using another shell
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau2/bed/tblastn.hg18KG/blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
    # this is ok.
    # load table 
    ssh hgwdev
    cd /cluster/data/bosTau2/bed/tblastn.hg18KG
    hgLoadPsl bosTau2 blastHg18KG.psl
    # check coverage
    featureBits bosTau2 refGene:cds  blastHg18KG -enrichment
# refGene:cds 0.274%, blastHg18KG 0.966%, both 0.230%, cover 83.83%, enrich  86.80x

    ssh kkstore02
    rm -rf /cluster/data/bosTau2/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/bosTau2/bed/tblastn.hg18KG/blastOut
#end tblastn


##########################################################################
# LIFTOVER TO BOSTAU3 (DONE 8/17/07 angie)
    ssh kolossus # machine on which script is run doesn't really matter
    mkdir /cluster/data/bosTau2/bed/blat.bosTau3.2007-08-16
    cd /cluster/data/bosTau2/bed/blat.bosTau3.2007-08-16
    doSameSpeciesLiftOver.pl bosTau2 bosTau3 >& do.log &
    tail -f do.log


##########################################################################
