# for emacs: -*- mode: sh; -*-

# IMPORTANT NOTE:
#
# DUE TO A SIMPLISTIC APPROACH/ERROR, THE chrUn WAS CONSTRUCTED WHICH DID NOT
# PRESERVE THE BRIDGED GAPS OF ABOUT 1,000 SUPERCONTIGS WHICH ARE NOT PLACED IN
# REGULAR CHROMOSOMES.
#
# WHEN LOADING 3RD PARTY DATA, PLEASE DO NOT LOAD chrUn. 
#
# PLEASE SEE THE SECTION OF "CREATE A SINGLE chrUn" FOR MORE DETAILS.
# (Fan, 5/10/07).

# Equus caballus - Broad Institute 1.0

    wget --timestamping http://www.genome.gov/Images/press_photos/lowres/20008-72.jpg

#########################################################################
# DOWNLOAD SEQUENCE (DONE 2/6/07 Fan)

    ssh kkstore05
    mkdir /cluster/store12/equCab1
    ln -s /cluster/store12/equCab1 /cluster/data/equCab1
    mkdir /cluster/data/equCab1/downloads
    cd /cluster/data/equCab1/downloads

    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/BasicAssemblyOneLiner.out
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/Draft_v1.agp.chromosome.fasta.gz
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/Draft_v1.agp.chromosome.qual.gz
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/ForDistribution.command
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.agp
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.bases.gz
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.bases.split.tar.gz
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.links
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.quals.gz
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.quals.split.tar.gz
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/assembly.unplaced
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/chromosomes.agp
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/source
    wget --timestamping http://www.broad.mit.edu/ftp/pub/assemblies/mammals/horse/Equus1/unplaced.fasta.gz

    faSize Draft_v1.agp.chromosome.fasta.gz
#2454853977 bases (32946942 N's 2421907035 real 2421907035 upper 0 lower) in #10526 sequences in 1 files
#Total size: mean 233218.1 sd 993878.5 min 3000 (scaffold_10140.1-3000) max #5000000 (scaffold_0.1-5000000) median 4087
#N count: mean 3130.1 sd 16192.7
#U count: mean 230088.1 sd 982249.2
#L count: mean 0.0 sd 0.0

# CREATE A SINGLE chrUn

# PLEASE NOTE THE FOLLOWING STEPS THAT CREATED THE chrUn WAS
# LESS-THAN-OPTIMAL, BECAUSE IT DID NOT TAKE INTO CONSIDERATION THAT SOME
# chrUnXXXXX SCAFFOLDS CONTAINS MULTIPLE CONTIGS WITH KNOWN BRIDGED GAP  LENGTHS.
# A MORE ACCURATE APPROACH NEXT TIME WOULD BE TO CONSTRUCT THE .FA AND .AGP
# FILES THAT PRESERVE THOSE KNOWN BRIDGED GAPS IN chrUnXXXX SCAFFOLDS 
# INSTEAD OF LETTING THE scaffoldFaToAgp PROGRAM TO ASSIGN THE DEFAULT 
# 1,000 BASE GAPS BETWEEN ALL CONTIGS. (Fan 5/10/07).

    ssh kkstore05
    cd /cluster/data/equCab1
    mkdir ucsc
    cd ucsc

    cat ../chromosomes.agp |grep Un |grep -v fragment |cut -f 1,6 >un.tab

    ssh hgwdev
    hgsql equCab1

CREATE TABLE horseUnContig (
  	unId  varchar(40) NOT NULL,
  	contigId varchar(40) NOT NULL,
  	KEY (unId),
  	KEY (contigId)
  	) TYPE=MyISAM;

	load data local infile "un.tab" into table horseUnContiq;
	quit

    hgsql equCab1 -N -e 'select s.* from horseContigSeq s, horseUnContig u where s.id=u.contigId' >unSeq.tab

    hgsql equCab1 -e 'load data local infile "unSeq.tab" into table horseUnContigSeq'

    pepPredToFa equCab1 horseUnContigSeq horseUnSeq.fa

    scaffoldFaToAgp horseUnSeq.fa
    cat ../chromosomes.agp |grep -v Un >j1.tmp
    cat j1.tmp horseUnSeq.agp >ucsc.agp

    qaToQac ../assembly.quals.gz  assembly.quals.qac

    qacAgpLift ../ucsc.agp assembly.quals.qac ucscChrom.qac
    mv ucscChrom.qac ..


#########################################################################
# MAKE GENOME DB (IN PROGRESS 2/8/07 Fan)
    ssh kkstore05
    cd /cluster/data/equCab1

    cat > equCab1.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db equCab1
clade vertebrate
genomeCladePriority 40
scientificName Equus caballus
commonName Horse
assemblyDate Feb. 2007
assemblyLabel Broad Institute equCab1 (1.0)
orderKey 161
# NC_001640
mitoAcc 5835107
fastaFiles /cluster/data/equCab1/downloads/assembly.bases.gz
agpFiles /cluster/data/equCab1/downloads/ucsc.agp 
qualFiles /cluster/data/equCab1/downloads/ucscChrom.qac
dbDbSpeciesDir horse
EOF

    # Run with -debug to see what it would do / check for issues:
    bash
    makeGenomeDb.pl equCab1.config.ra -debug -verbose=3

    # Run it for real:
    makeGenomeDb.pl -continue db -verbose=2 equCab1.config.ra \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log

    # Follow the directions at the end of the log after
    #NOTES -- STUFF THAT YOU WILL HAVE TO DO --

#########################################################################
# REPEATMASKER (DONE 02/14/07, Fan)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    doRepeatMasker.pl equCab1 -verbose 3 -debug
    # Run it for real and tail the log:
    mkdir /cluster/data/equCab1/bed/RepeatMasker.2007-02-13
    cd /cluster/data/equCab1/bed/RepeatMasker.2007-02-13

    doRepeatMasker.pl equCab1 -verbose 3 \
      >& do.log & tail -f do.log
    # RM version info from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker
##    October 6 2006 (open-3-1-6) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
##CC   RELEASE 20061006;

# Took almost 2 days.

# Completed: 4942 of 4942 jobs
# CPU time in finished jobs:   34711332s  578522.21m  9642.04h  401.75d  1.101 y
# IO & Wait Time:                918255s   15304.24m   255.07h   10.63d  0.029 y
# Average job time:                7210s     120.16m     2.00h    0.08d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           13135s     218.92m     3.65h    0.15d
# Submission to last job:        125032s    2083.87m    34.73h    1.45d
    
    ssh hgwdev
    featureBits equCab1 rmsk
# 994130286 bases of 2421923695 (41.047%) in intersection

#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2/15/07 Fan)
    ssh kolossus
    nice tcsh
    mkdir /cluster/data/equCab1/bed/simpleRepeat
    cd /cluster/data/equCab1/bed/simpleRepeat
    twoBitToFa ../../equCab1.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log
    # ~2 hours
    # Make a filtered version for sequence masking:
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
    splitFileByColumn trfMask.bed trfMaskChrom

    # Load unfiltered repeats into the database:
    ssh hgwdev
    cd /cluster/data/equCab1/bed/simpleRepeat
    hgLoadBed equCab1 simpleRepeat \
      /cluster/data/equCab1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits equCab1 simpleRepeat
#40603979 bases of 2421923695 (1.677%) in intersection

#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 2/15/07 Fan)
    ssh kolossus
    cd /cluster/data/equCab1
    time twoBitMask equCab1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed equCab1.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
2.234u 4.142s 0:17.72 35.9%     0+0k 0+0io 5pf+0w
    # Link to it from /gbdb:
    ssh hgwdev
    ln -s /cluster/data/equCab1/equCab1.2bit /gbdb/equCab1/equCab1.2bit

#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 2/15/07 Fan)
    cd /cluster/data/equCab1
    /cluster/bin/scripts/makeDownloads.pl equCab1 -verbose=2 \
      >& jkStuff/downloads.log & tail -f jkStuff/downloads.log

#########################################################################
## Send sequence to san for kluster runs (DONE - 2007-02-15 - Fan)
    ssh kkstore05
    cd /cluster/data/equCab1
    mkdir /san/sanvol1/scratch/equCab1
    cp -p equCab1.2bit /san/sanvol1/scratch/equCab1/
    cp -p chrom.* /san/sanvol1/scratch/equCab1/

#########################################################################
## Photograph (DONE - 2007-02-20 - Hiram)
    mkdir /cluster/data/equCab1/photograph
    wget --timestamping \
	"http://www.genome.gov/Images/press_photos/highres/20008-300.jpg" \
	-O Twilight_equCab1.jpg
    convert -quality 80 -sharpen 0 -normalize -geometry 320x200 \
	Twilight_equCab1.jpg Equus_caballus.jpg
    # Add Equus_caballus.jpg to the browser/images source tree and copy
    #	to /usr/local/apache/htdocs/images
    #	Remember to use the -kb option when adding this binary data:
    cvs add -kb Equus_caballus.jpg

##########################################################################
## WindowMasker masked sequence (DONE - 2007-02-20 - Hiram)
    ssh kolossus
    mkdir /cluster/data/equCab1/bed/WindowMasker.2007-02-20
    cd /cluster/data/equCab1/bed/WindowMasker.2007-02-20
    ## copy the .csh scripts from xenTro2 WindowMasker run, edit to fixup
    #	reference to correct DB and work directory.
    time nice -n +19 ./doCount.csh > doCount.out 2>&1
    #	real    78m24.832s
    #	user    61m36.550s
    #	sys     2m28.392s
    time nice -n +19 ./doSdust.csh > doSdust.out 2>&1

    ### Surprisingly, this did not mask more than RepeatMasker ?
    
    ssh kkstore05
    cd /cluster/data/equCab1/bed/WindowMasker.2007-02-20
    gzip *.counts *.bed
    nice -n +19 ./applyMask.csh
    #	this addTrf properly gets the n's changed to N which WM masked
    nice -n +19 ./addTrf.csh
    #	measuring faSize of resulting equCab1.sdTrf.2bit:
    #	2462493800 bases (40570105 N's 2421923695 real 1686809847 upper
    #	735113848 lower) in 34 sequences in 1 files
    #	%29.85 masked total, %30.35 masked real

    #	vs. existing equCab1.2bit
    #	2462493800 bases (40570105 N's 2421923695 real
    #	1427706342 upper 994217353 lower) in 34 sequences in 1 files
    #	%40.37 masked total, %41.05 masked real

    ssh hgwdev
    cd /cluster/data/equCab1/bed/WindowMasker.2007-02-20
    time nice -n +19 ./load.csh > load.out 2>&1
    #	Loaded 14600828 elements of size 3

##########################################################################
## ADD BLAT SERVER (DONE - 2007-02-20 - Fan)

# ask admin to create blat server.

    echo 'INSERT INTO blatServers (db, host, port, isTrans) \
         VALUES ("equCab1", "blat11", "17778", "1"); \
         INSERT INTO blatServers (db, host, port, isTrans) \
	 VALUES ("equCab1", "blat11", "17779", "0");' \
    | hgsql -h genome-testdb hgcentraltest

##########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 2/20/07, Fan)
    # Use -repMatch=860 (based on size -- for human we use 1024, and
    # horse size is 84% of human judging by twoBitInfo -noNs)
    ssh kolossus
    blat /cluster/data/equCab1/equCab1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/equCab1/11.ooc -repMatch=860
# Wrote 22800 overused 11-mers to /cluster/bluearc/equCab1/11.ooc

    ssh kkr1u00
    mkdir /iscratch/i/equCab1
    cd /iscratch/i/equCab1
    cp -p /cluster/bluearc/equCab1/11.ooc .
    cp /cluster/data/equCab1/equCab1.2bit . -p
    iSync

##########################################################################
# BLASTZ/CHAIN/NET RN4 (DONE 2/18/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.rn4.2007-02-18
    cd /cluster/data/equCab1/bed/blastz.rn4.2007-02-18
    cat << '_EOF_' > DEF
# Horse vs. Rat

BLASTZ_M=50

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Mouse rn4
SEQ2_DIR=/scratch/hg/rn4/nib
SEQ2_LEN=/cluster/data/rn4/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.rn4.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/equCab1Rn4 >& do.log &
    tail -f do.log

    ssh hgwdev
    cd /cluster/data/equCab1/bed
    ln -s blastz.rn4.2007-02-18 /cluster/data/equCab1/bed/blastz.rn4
    nice featureBits equCab1 -chrom=chr1 chainRn4Link
# 67408374 bases of 177498097 (37.977%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainRn4Link \
	> fb.equCab1.chainRn4Link.txt 2>&1
    #	859116859 bases of 2421923695 (35.472%) in intersection

    ssh kkstore05
    mkdir /cluster/data/rn4/bed/blastz.equCab1.swap
    cd /cluster/data/rn4/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.rn4.2007-02-18/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-verbose=2 -swap -bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/equCab1Rn4 > swap.log 2>&1 &

    ssh hgwdev
    cd /cluster/data/rn4/bed/blastz.equCab1.swap
    bash
    time nice -n 19 featureBits rn4 chainEquCab1Link \
	> fb.rn4.chainEquCab1Link.txt 2>&1
    # 867029914 bases of 2571531505 (33.716%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET CANFAM2 (DONE 2/19/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
    cd /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
    cat << '_EOF_' > DEF
# Horse vs. Dog

BLASTZ_M=50

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Dog canFam2
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do.log &
    tail -f do.log

# 6 jobs failed.
# Robert re-ran those 6 jobs. (2/22/07)

doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-continue cat -bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do2.log &
tail -f do2.log
    ln -s blastz.canFam2.2007-02-18 /cluster/data/equCab1/bed/blastz.canFam2
    
    ssh hgwdev
    nice featureBits equCab1 -chrom=chrI chainCanFam2Link
    # 128747357 bases of 177498097 (72.534%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainCanFam2Link \
	> fb.equCab1.chainCanFam2Link.txt 2>&1
    # 1717664968 bases of 2421923695 (70.922%) in intersection

    ssh kkstore04
    mkdir /cluster/data/canFam2/bed/blastz.equCab1.swap
    cd /cluster/data/canFam2/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
    # real    115m55.880s

    cat *.txt
    # 1673177547 bases of 2384996543 (70.154%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET PANTRO2 (DONE 2/21/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19
    cd /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19
    cat << '_EOF_' > DEF
# Horse vs. Chimp

BLASTZ_M=50

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Chimp panTro2
SEQ2_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ2_LEN=/cluster/data/panTro2/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.panTro2.2007-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/equCab1PanTro2 >& do.log &
    tail -f do.log
    ln -s blastz.panTro2.2007-02-19 /cluster/data/equCab1/bed/blastz.panTro2
    nice featureBits equCab1 -chrom=chr1 chainPanTro2Link
# 121391547 bases of 177498097 (68.390%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainPanTro2Link \
	> fb.equCab1.chainPanTro2Link.txt 2>&1
    # 1593914101 bases of 2421923695 (65.812%) in intersection

    ssh kkstore04
    mkdir /cluster/data/panTro2/bed/blastz.equCab1.swap
    cd /cluster/data/panTro2/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.panTro2.2007-02-19/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-verbose=2 -swap -bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/equCab1PanTro2 > swap.log 2>&1 &

    # The -blastzOutRoot /cluster/bluearc/equCab1PanTro2 option above
    # should not be there.  It caused an error message during cleaning up.
    # Don't specify it for -swap next time.

    ssh hgwdev
    cd /cluster/data/panTro2/bed/blastz.equCab1.swap
    bash
    time nice -n 19 featureBits panTro2 chainEquCab1Link \
	> fb.panTro2.chainEquCab1Link.txt 2>&1
    # 1636782924 bases of 2909485072 (56.257%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET mm8 (DONE 2/21/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
    cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
    cat << '_EOF_' > DEF
# Horse vs. Mouse

BLASTZ_M=50

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Mouse mm8
SEQ2_DIR=/scratch/hg/mm8/mm8.2bit
SEQ2_LEN=/cluster/data/mm8/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.mm8.2007-02-17
TMPDIR=/scratch/tmp
'_EOF_'

    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab1/blastz.mm8 >& do.log &
    tail -f do.log

    ssh hgwdev
    cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
    ln -s blastz.mm8.2007-02-17 /cluster/data/equCab1/bed/blastz.mm8
    nice featureBits equCab1 -chrom=chr1 chainMm8Link
# 70800969 bases of 177498097 (39.888%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainMm8Link \
	> fb.equCab1.chainMm8Link.txt 2>&1
# 903993981 bases of 2421923695 (37.325%) in intersection

    ssh kkstore05
    mkdir /cluster/data/mm8/bed/blastz.equCab1.swap
    cd /cluster/data/mm8/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.mm8.2007-02-17/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
    tail -f swap.log
# real    76m34.873s

    ssh hgwdev
    cd /cluster/data/mm8/bed/blastz.equCab1.swap
    bash
    time nice -n 19 featureBits mm8 chainEquCab1Link \
	> fb.mm8.chainEquCab1Link.txt 2>&1
    # 906568751 bases of 2567283971 (35.312%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET galGal3 (DONE 2/27/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
    cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
    cat << '_EOF_' > DEF
# Horse vs. Chicken

BLASTZ_M=50

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3
SEQ2_DIR=/scratch/hg/galGal3/galGal3.2bit
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
TMPDIR=/scratch/tmp
'_EOF_'

    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=5000 -chainLinearGap=loose \
      -blastzOutRoot /cluster/bluearc/equCab1/blastz.galGal3 >& do.log &
    tail -f do.log

    ssh hgwdev
    cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
    ln -s blastz.galGal3.2007-02-22 /cluster/data/equCab1/bed/blastz.galGal3
    nice featureBits equCab1 -chrom=chr1 chainGalGal3Link
# 9136777 bases of 177498097 (5.148%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainGalGal3Link \
	> fb.equCab1.chainGalGal3Link.txt 2>&1
# 114216918 bases of 2421923695 (4.716%) in intersection

    ssh kkstore05
    mkdir /cluster/data/galGal3/bed/blastz.equCab1.swap
    cd /cluster/data/galGal3/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
    tail -f swap.log

# the above script stopped with the following lines in the .log file:

# netToAxt axtChain/net/chrM.net axtChain/chain/chrM.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
# axtSort stdin stdout
# gzip -c
# Processing chrM
# end
# netToAxt axtChain/net/chrUn_random.net axtChain/chain/chrUn_random.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
# axtSort stdin stdout
# gzip -c
# Processing chrUn_random

# Hiram helped to created the following temp .csh:

    cat finiNet.csh
#!/bin/csh -efx
# This script was automatically generated by /cluster/bin/scripts/doBlastzChainNet.pl
# from /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF
# It is to be executed on kolossus in /cluster/data/galGal3/bed/blastz.equCab1.swap/axtChain .
# It generates nets (without repeat/gap stats -- those are added later on
# hgwdev) from chains, and generates axt, maf and .over.chain from the nets.
# This script will fail if any of its commands fail.

cd /cluster/data/galGal3/bed/blastz.equCab1.swap

foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.galGal3.equCab1.net.axt.gz
end

# Make mafNet for multiz: one .maf per galGal3 seq.
mkdir mafNet
foreach f (axtNet/*.galGal3.equCab1.net.axt.gz)
  axtToMaf -tPrefix=galGal3. -qPrefix=equCab1. $f \
        /cluster/data/galGal3/chrom.sizes  /san/sanvol1/scratch/equCab1/chrom.sizes        \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

    ssh kolossus
    cd /cluster/data/galGal3/bed/blastz.equCab1.swap
    fininet.csh

    ssh hgwdev
    cd /cluster/data/galGal3/bed/blastz.equCab1.swap
    bash
    time nice -n 19 featureBits galGal3 chainEquCab1Link \
	> fb.galGal3.chainEquCab1Link.txt 2>&1
    # 104418042 bases of 1042591351 (10.015%) in intersection
    
###########################################################################

# HUMAN (hg18) PROTEINS TRACK (DONE Fan 2007-02-19)
    ssh kkstore05
    bash 

    mkdir /cluster/data/equCab1/blastDb
    cd /cluster/data/equCab1
    zcat /cluster/data/equCab1/goldenPath/chromosomes/*.fa.gz > temp.fa
    faSplit sequence temp.fa 500 blastDb/
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    mkdir -p /san/sanvol1/scratch/equCab1/blastDb
    cd /cluster/data/equCab1/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/equCab1/blastDb
    done

    mkdir -p /cluster/data/equCab1/bed/tblastn.hg18KG
    cd /cluster/data/equCab1/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/equCab1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 34 query.lst

   # we want around 100000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(100000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(100000/34) = 12.487180
# 13 did not work, 50 did not work either, finally 100 seems to work

   mkdir -p /cluster/bluearc/equCab1/bed/tblastn.hg18KG/kgfa
   split -l 100 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/equCab1/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/equCab1/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/equCab1/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2

        if pslCheck -prot $3.tmp                                                  
        then                                                                      
            mv $3.tmp $3                                                          
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0                                                                    
    fi                                                                            
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    ssh pk
    cd /cluster/data/equCab1/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time
#Completed: 12512 of 12512 jobs
#CPU time in finished jobs:   20551564s  342526.06m  5708.77h  237.87d  0.652 y
#IO & Wait Time:                119733s    1995.56m    33.26h    1.39d  0.004 y
#Average job time:                1652s      27.54m     0.46h    0.02d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:           29231s     487.18m     8.12h    0.34d
#Submission to last job:        162262s    2704.37m    45.07h    1.88d

    ssh kkstore05
    cd /cluster/data/equCab1/bed/tblastn.hg18KG
    tcsh
    mkdir chainRun
    cd chainRun
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh kk
    cd /cluster/data/equCab1/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para try, check, push, check etc.
# Completed: 368 of 368 jobs
# CPU time in finished jobs:     297634s    4960.57m    82.68h    3.44d  0.009 y
# IO & Wait Time:                221852s    3697.53m    61.63h    2.57d  0.007 y
# Average job time:                1412s      23.53m     0.39h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           47807s     796.78m    13.28h    0.55d
# Submission to last job:         47828s     797.13m    13.29h    0.55d

    ssh kkstore05
    cd /cluster/data/equCab1/bed/tblastn.hg18KG/blastOut
    bash 
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/equCab1/bed/tblastn.hg18KG/blastHg18KG.psl
    cd /cluster/data/equCab1/bed/tblastn.hg18KG 
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/equCab1/bed/tblastn.hg18KG
    hgLoadPsl equCab1 blastHg18KG.psl

    # check coverage
    nice featureBits equCab1 blastHg18KG 
# 32605747 bases of 2421923695 (1.346%) in intersection
# In comparison to cat and dog:
    nice featureBits felCat3  blastHg18KG
# 15218612 bases of 1642698377 (0.926%) in intersection
    nice featureBits canFam2 blastHg18KG
# 32565727 bases of 2384996543 (1.365%) in intersection

    featureBits equCab1 refGene:cds blastHg18KG  -enrichment
# refGene:cds 0.011%, blastHg18KG 1.346%, both 0.010%, cover 90.61%, enrich 67.31x

    ssh kkstore04
    rm -rf /cluster/data/equCab1/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/equCab1/bed/tblastn.hg18KG/blastOut
#end tblastn

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-02-21 - Fan)
    # Make a liftAll.lft that specifies 5M chunks for genbank:
    ssh kkstore05
    cd /cluster/data/equCab1

    simplePartition.pl equCab1.2bit 5000000 /tmp/equCab1P
    cat /tmp/equCab1P/*/*.lft > jkStuff/liftAll.lft
    rm -r /tmp/equCab1P

    cat equCab1.agp | agpToLift -revStrand > jkStuff/genbank.lft
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add equCab1 just after gasAcu1

# equCab1 (Equus caballus)
equCab1.serverGenome = /cluster/data/equCab1/equCab1.2bit
equCab1.clusterGenome = /iscratch/i/equCab1/equCab1.2bit
equCab1.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
equCab1.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
equCab1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
equCab1.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
equCab1.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
equCab1.ooc = /iscratch/i/equCab1/11.ooc
equCab1.lift = /cluster/data/equCab1/jkStuff/genbank.lft
equCab1.align.unplacedChroms = chrUn
equCab1.genbank.mrna.xeno.load = yes
equCab1.downloadDir = equCab1

    cvs ci -m "Added equCab1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Encountered a problem:

    # test -e /cluster/data/genbank/etc/CVS || cp -r etc/CVS /cluster/data/genbank/etc
(cd /cluster/data/genbank/etc && cvs update -d)
    # Permission denied.
    # cvs [update aborted]: end of file from server (consult above messages if any)
    # make[1]: *** [install-etc] Error 1

    # added the following line to my .cshrc fix the problem.
    setenv CVS_RSH ssh

    # Edit src/lib/gbGenome.c to add new species.
    cvs ci -m "Added equCab (horse)." src/lib/gbGenome.c

    make install-server

    ssh kkstore02
    cd /cluster/data/genbank
    bash
    nice -n +19 bin/gbAlignStep -initial equCab1 &

    # cluster jobs crashed when KK shut down due to power outage
    # manually pushed to finish the remaining jobs.
 
    ssh kkstore02
    cd /cluster/data/genbank
    bash
    nice -n +19 bin/gbAlignStep -initial -continue=finish equCab1 &

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    bash
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad equCab1
    # real    16m19.370s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add equCab1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added equCab1." etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################
# BLASTZ/CHAIN/NET HG18 PART I (STARTED 2/16/07, DONE 2/21/07, Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
    cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
# NOTE: THE TARGET WAS ORIGINALLY INTENDED TO BE HORSE, BUT I DID NOT
# DISCOVER THIS UNTIL THE TASK IS DONE.
    cat << '_EOF_' > DEF
# Horse vs. Human

BLASTZ_M=50

# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes 
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Horse equCab1
SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ2_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500     
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-15
TMPDIR=/scratch/tmp
'_EOF_'

    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab1/blastz.hg18 >& do.log &
    tail -f do.log

    ln -s blastz.hg18.2007-02-15 /cluster/data/hg18/bed/blastz.equCab1

    nice featureBits hg18 -chrom=chr1 chainEquCab1Link
    # 132947074 bases of 224999719 (59.088%) in intersection

    ssh hgwdev
    cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15

    bash
    time nice -n 19 featureBits hg18 chainEquCab1Link \
	> fb.hg18.chainEquCab1Link.txt 2>&1 &
    # 1643928877 bases of 2881515245 (57.051%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET HG18 PART II (DONE 2/27/07 Fan)
# NOTE: THE PART II WAS NOT DONE BY USING -swap OPTION
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-25
    cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-25
    cat << '_EOF_' > DEF
# Horse vs. Human

BLASTZ_M=50

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Human hg18
SEQ2_DIR=/scratch/hg/hg18/hg18.2bit
SEQ2_LEN=/cluster/data/hg18/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/equCab1Hg18 >& do.log &
    tail -f do.log

    rm /cluster/data/equCab1/bed/blastz.hg18
    ln -s blastz.hg18.2007-02-25 /cluster/data/equCab1/bed/blastz.hg18
    nice featureBits equCab1 -chrom=chr1 chainHg18Link
    # 122777285 bases of 177498097 (69.171%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainHg18Link \
	> fb.equCab1.chainHg18Link.txt 2>&1
    # 1618657165 bases of 2421923695 (66.834%) in intersection

############################################################################
## Reset default position the HMGB1 gene (DONE - 2004-04-09 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chr17:9817422-9822403"
	where name="equCab1";' hgcentraltest

##########################################################################
# NSCAN track - (2007-03-08 markd)
# uses hg18 as informat, with pseudogene masking 
#
    # remove old, alpha N-SCAN track built by robert
    cd /cluster/data/equCab1/bed/nscan/

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    wget -nv http://mblab.wustl.edu/predictions/horse/equCab1.gtf
    wget -nv http://mblab.wustl.edu/predictions/horse/equCab1.prot.fa

    bzip2 equCab1.*
    chmod a-w *.bz2

    # load tracks.
    ldHgGene -bin -gtf -genePredExt equCab1 nscanGene equCab1.gtf.bz2
    hgPepPred equCab1 generic nscanPep equCab1.prot.fa.bz2
    rm -f *.tab

    # update trackDb; need a equCab1-specific page to describe informants
    horse/equCab1/nscanGene.html    
    horse/equCab1/trackDb.ra

    # request push of 
    nscanGene
    nscanPep

# QA NOTE (ASZ 06-11-2007): mytouch of nscanPep table to bring into line with nscanGene.
# sudo mytouch equCab1 nscanPep sudo mytouch equCab1 nscanPep 200705081600.00

##########################################################################
## ADD chrM record into chrM_gold.  (DONE, 2007-05-02, Fan)

# get a template record from oryLat1

    hgsql oryLat1 -e 'select * from chrM_gold' equCab1.chrM_gold.tab

# edit it with equCab1 chrM legth and accession.
    
    vi equCab1.chrM_gold.tab

# load it into chrM_gold table

hgsql equCab1 -e 'load data local infile "equCab1M.tab" into table chrM_gold'

##########################################################################
# SGP GENES (DONE - 2007-12-20 - Hiram)
    ssh kkstore05
    mkdir  /cluster/data/equCab1/bed/sgp
    cd  /cluster/data/equCab1/bed/sgp

    mkdir gtf
for C in `awk '{print $1}' /cluster/data/equCab1/chrom.sizes`
do
    wget --timestamping \
"http://genome.imim.es/genepredictions/E.caballus/ecJan2007/SGP/200603mm8/${C}.gtf" \
        -O "gtf/${C}.gtf"
done
    ssh hgwdev
    cd /cluster/data/equCab1/bed/sgp
    ldHgGene -gtf -genePredExt equCab1 sgpGene gtf/chr*.gtf
    #	Read 30403 transcripts in 267753 lines in 33 files
    #	30403 groups 33 seqs 1 sources 3 feature types
    #	30403 gene predictions

    featureBits equCab1 sgpGene
    #	32915117 bases of 2421923695 (1.359%) in intersection
    featureBits equCab1 -enrichment refGene:CDS sgpGene
# refGene:CDS 0.015%, sgpGene 1.359%, both 0.014%, cover 89.17%, enrich 65.61x

#####################################################################
# LOAD GENEID GENES (DONE - 2007-12-20 - Hiram)
    ssh kkstore05
    mkdir -p /cluster/data/equCab1/bed/geneid
    cd /cluster/data/equCab1/bed/geneid
    mkdir gtf prot

for C in `awk '{print $1}' /cluster/data/equCab1/chrom.sizes`
do
  echo $C
  wget --timestamping \
"http://genome.imim.es/genepredictions/E.caballus/ecJan2007/geneid_v1.2/${C}.gtf" \
	-O "gtf/${C}.gtf"
      wget --timestamping \
"http://genome.imim.es/genepredictions/E.caballus/ecJan2007/geneid_v1.2/${C}.prot" \
	-O "prot/${C}.prot"
done

    # Add missing .1 to protein id's
    cd prot
    foreach f (*.prot)
      perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
    end

    ssh hgwdev
    cd /cluster/data/equCab1/bed/geneid
    ldHgGene -genePredExt -gtf equCab1 geneid gtf/*.gtf
# Read 35577 transcripts in 285553 lines in 33 files
# 35577 groups 33 seqs 1 sources 3 feature types
# 35577 gene predictions

    #	load proteins
    hgPepPred equCab1 generic geneidPep prot/chr*-fixed.prot
    featureBits equCab1 geneid
    #	38938548 bases of 2421923695 (1.608%) in intersection
    featureBits equCab1 -enrichment refGene:CDS geneid
# refGene:CDS 0.015%, geneid 1.608%, both 0.013%, cover 86.04%, enrich 53.52x

#########################################################################
# EQUCAB1->EQUCAB2 LIFTOVER (DONE - 2008-07-08 - larrym)
    ssh kkstore02
    cd /cluster/data/equCab1
    cp -p /cluster/data/equCab2/equCab2.2bit /cluster/bluearc/equCab2

    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug equCab1 equCab2
    # Real run:
    cd /cluster/data/equCab1/bed/blat.equCab2.2008-07-08
    doSameSpeciesLiftOver.pl equCab1 equCab2 >& do.log &
    tail -f do.log
    rm -f /cluster/bluearc/equCab2/equCab2.2bit

##########################################################################

#########################################################################
# REDO EQUCAB1->EQUCAB2 LIFTOVER (DONE - 2008-09-09 - larrym)

    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -workhorse=hgwdev -bigClusterHub=pk -debug equCab1 equCab2

    # Real run:
    cd /cluster/data/equCab1/bed/blat.equCab2.2008-07-08
    doSameSpeciesLiftOver.pl -ooc /-workhorse=hgwdev -bigClusterHub=pk equCab1 equCab2 >& do.log &
    tail -f do.log
    rm -f /cluster/bluearc/equCab2/equCab2.2bit

    # redo to make sure we used the right .ooc file:
    doSameSpeciesLiftOver.pl -ooc /hive/data/genomes/equCab2/equCab2.11.ooc -workhorse=hgwdev -bigClusterHub=pk -debug equCab1 equCab2
    HgStepManager: executing from step 'align' through step 'cleanup'.
    HgStepManager: executing step 'align' Thu Sep  4 12:59:45 2008.
    align: can't find equCab2/equCab2.2bit in /cluster/bluearc/, /san/sanvol1/scratch/, /scratch/hg/ -- please distribute.

    # fix scratch data for equCab2
    cd /san/sanvol1/scratch/equCab2
    ln -s /hive/data/genomes/equCab2/equCab2.2bit .
    cd /cluster/data/equCab1
    doSameSpeciesLiftOver.pl -ooc /hive/data/genomes/equCab2/equCab2.11.ooc -workhorse=hgwdev -bigClusterHub=pk equCab1 equCab2
