# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on 
# NCBI build 34 (July 18, 2003 freeze)

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------

# Make gs.17 directory and its build34, agp, and ffa subdirectories.
    mkdir /cluster/store4/gs.17
    mkdir /cluster/store4/gs.17/build34
    mkdir /cluster/store4/gs.17/agp
    mkdir /cluster/store4/gs.17/ffa

#    Make a symbolic link from /cluster/store1 to this location
	
    cd /cluster/store1
    ln -s /cluster/store4/gs.17 ./gs.17

#    Make a symbolic link from your home directory to the build dir:

    ln -s /cluster/store4/gs.17/build34 ~/oo

# NCBI download site:

    ftp ftp.ncbi.nih.gov
    # user and password from /cse/guests/kent/buildHg6.doc
    cd build_34    
 
# Download all finished agp's and fa's into gs.17/agp

    mget chr*.agp
    mget chr*.fa.gz
    gunzip *.gz

# Download contig agp's into gs.17/build34

    get ref_placed.agp   # used to be in reference.agp
    get ref_unplaced.agp # used to be in reference.agp
    get DR51.agp
    get PAR.agp          # new for this build - PAR regions added to chrY
    cat ref_placed.agp ref_unplaced.agp DR51.agp > ncbi_build34.agp

# Download contig fa's into gs.17/ffa

    get ref_placed.fa.gz   # used to be in reference.fa
    get ref_unplaced.fa.gz # used to be in reference.fa
    get DR51.fa.gz
    get PAR.fa.gz          # new for this build - PAR regions added to chrY
    get sequence.inf
    # The downloads are gzip-compressed; uncompress them first because the
    # cat below reads the plain .fa files.
    gunzip *.gz
    cat ref_placed.fa ref_unplaced.fa DR51.fa > ncbi_build34.fa

# Download assembly related files into gs.17/build34

    get seq_contig.md
    get contig_overlaps.agp

# Download questionable join certificates file

    get e-certificates.txt
    mkdir certificates
    mv e-certificates.txt certificates

# Save a copy of the original seq_contig.md file

    cp seq_contig.md seq_contig.md.orig

# For build34, edit the seq_contig.md file to remove the alternative chr7
# sequence supplied by the Toronto group: NT_079590, NT_079591, NT_079592,
# NT_079593, NT_079594, NT_079595, NT_079596, NT_079597

# Edit seq_contig.md to make the DR51 alternative haplotype look like a
# chr6_random sequence:
# 9606  6       32491690        32629063        +       NG_002432       GI:28212469     CONTIG    DR51    1
# to 
# 9606  6|NG_002432     1       137374  +       NG_002432       GI:28212469     CONTIG  DR51      1

# Move this edited DR51 line next to other chr6_random contigs (for creating
#	the lift file)
    
# Sanity check
    /cluster/bin/i386/checkYbr build34/ncbi_build34.agp ffa/ncbi_build34.fa \
      build34/seq_contig.md

# Convert fa files into UCSC style fa files and place in "contigs" directory
# inside the gs.17/build34 directory 

    cd build34
    mkdir contigs
    /cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build34.fa \
      contigs
   
# Copy over chrM contig from previous version
    cd ~/oo
    cp -r gs.17/build33/M .

# Determine the chromosome sizes from agps

    /cluster/bin/scripts/getChromSizes ../agp

# Create lift files (this will create chromosome directory structure) and
#	inserts file
  
    /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .

# Create contig agp files (will create contig directory structure)
	
    /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build34.agp .

# Create chromosome random agp files.

    /cluster/bin/scripts/createNcbiChrAgp -randomonly .

# Copy the original chrN.agp files from the gs.17/agp directory 
#    into each of the chromosome directories since they contain better 
#    gap information. Delete the comments at top from these.

# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).

    # create global data link for everyone.  No more home directory
    # links required.
    ln -s /cluster/store4/gs.17/build34 /cluster/data/hg16
    cd /cluster/data/hg16
    /cluster/bin/scripts/distNcbiCtgFa contigs .
    rm -r contigs

# Copy over jkStuff from previous build (??)
#    Copies the helper scripts (*.sh, *.csh) and gensub templates (*.gsub).
#    NOTE(review): the *.csh line was truncated in the original doc to
#    "/build31/jkStuff/*.csh jkStuff" (no cp, no path prefix).  It is restored
#    here as a cp from build33 to match the neighboring lines -- confirm that
#    build33 (and not build31) is the intended source.
    mkdir jkStuff
    cp /cluster/store1/gs.17/build33/jkStuff/*.sh jkStuff
    cp /cluster/store1/gs.17/build33/jkStuff/*.csh jkStuff
    cp /cluster/store1/gs.17/build33/jkStuff/*.gsub jkStuff

# Create contig gl files

    /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md

# Create chromosome gl files

     jkStuff/liftGl.sh contig.gl

# Files ready for repeat-masking and trf


# CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP 
#        (DONE 2003-07-23 Terry)

# Create directory structure to hold information for these tracks
        cd /projects/hg2/booch/psl/

# Change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
        make new

# Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files

# Makefiles in:
#     /gs.17/build33/
#     /gs.17/build33/bacends
#     /gs.17/build33/cytobands
#     /gs.17/build33/cytoPlots
#     /gs.17/build33/fish
#     /gs.17/build33/fosends
#     /gs.17/build33/g2g
#     /gs.17/build33/geneticPlots
#     /gs.17/build33/primers
#     /gs.17/build33/recombrate
#     /gs.17/build33/sts
#     /gs.17/build33/stsPlots

# Create accession_info file *****
	make accession_info.rdb


# UPDATE STS INFORMATION (DONE 2003-07-23 Terry)
# Download and unpack updated information from dbSTS:

    	cd /projects/hg2/booch/psl/update
	wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
	wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
	wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.Z
	mv sts.Z dbSTS.FASTA.dailydump.Z
	gunzip dbSTS.FASTA.dailydump.Z


# Make new directory for this info and move files there
        mkdir /cluster/store1/sts.8
        cp all.STS.fa /cluster/store1/sts.8
        cp all.primers /cluster/store1/sts.8
        cp all.primers.fa /cluster/store1/sts.8

# Copy new files to cluster
#    (the source glob must be a single word: "sts.8/*.*"; a space before
#    "/*.*" would make the shell expand it against the filesystem root)
        ssh kkstore
        cd /cluster/store1/sts.8
        cp /cluster/store1/sts.8/*.* /scratch/hg/STS

# Ask for propagation from sysadmin

# Load the sequences into the database (after database created)
	ssh hgwdev
	mkdir /gbdb/hg16/sts.8
	cd /gbdb/hg16/sts.8
	ln -s /cluster/store1/sts.8/all.STS.fa ./all.STS.fa
	ln -s /cluster/store1/sts.8/all.primers.fa ./all.primers.fa
	cd /cluster/store2/tmp
	hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.STS.fa
	hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.primers.fa


# CREATE STS MARKER ALIGNMENTS (DONE 2003-08-03 Terry)

# Create full sequence alignments
        ssh kk
        cd /cluster/home/booch/sts

# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
        make new
        make jobList 
        para create jobList
        para push 
	# wait until alignments done
        make stsMarkers.psl

# Copy files to final destination and remove originals
        make copy.assembly
        make clean

# Create primer alignments
        ssh kk
        cd /cluster/home/booch/primers
        
# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
        make new
        make jobList.scratch
        para create jobList
        para push

# Do an initial quick filter of results (takes a while, still) and create 
# final file - best done on eieio since disks local
	ssh eieio
	make filter
        make primers.psl

# Copy files to final destination and remove
        make copy.assembly
        make clean
        
# Create ePCR alignments
        ssh kk
        cd /cluster/home/booch/epcr

# Update Makefile with latest OOVERS and GSVERS
        make new
        make jobList
        para create jobList
        para push
        make all.epcr

# Copy files to final destination and remove
        make copy.assembly
        make clean
        

# CREATE AND LOAD STS MARKERS TRACK (DONE 2003-08-03 Terry)

# Copy in current stsInfo2.bed and stsAlias.bed files
        cd /projects/hg2/booch/psl/gs.17/build33
        cp ../update/stsInfo2.bed .
        cp ../update/stsAlias.bed .

# Create final version of sts sequence placements
        ssh kks00
        cd /projects/hg2/booch/psl/gs.17/build33/sts
        make stsMarkers.final

# Create final version of primers placements
# Make sure PRIMERS variable in Makefile is pointing to current version
        cd /projects/hg2/booch/psl/gs.17/build33/primers
        make primers.final

# Create bed file
        cd /projects/hg2/booch/psl/gs.17/build33
        make stsMap.bed

# Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg16 < all_sts_primer.sql
        hgsql hg16 < all_sts_seq.sql
        hgsql hg16 < stsAlias.sql
        hgsql hg16 < stsInfo2.sql
        hgsql hg16 < stsMap.sql

# Load the tables
	cd /projects/hg2/booch/psl/gs.17/build34/sts/
        echo 'load data local infile "stsMarkers.psl.filter.lifted" into table all_sts_seq;' | hgsql hg16
	cd /projects/hg2/booch/psl/gs.17/build34/primers/
        echo 'load data local infile "primers.psl.filter.lifted" into table all_sts_primer;' | hgsql hg16
	cd /projects/hg2/booch/psl/gs.17/build34/
        echo 'load data local infile "stsAlias.bed" into table stsAlias;' | hgsql hg16
        echo 'load data local infile "stsInfo2.bed" into table stsInfo2;' | hgsql hg16
	echo 'load data local infile "stsMap.bed" into table stsMap;' | hgsql hg16


# CREATE AND LOAD RECOMBINATION RATE TRACK (DONE 2003-08-05 Terry)
# (must be done after STS Markers track) 

# Create bed file
	cd /projects/hg2/booch/psl/gs.17/build34/recombrate
	make recombRate.bed

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg16 < recombRate.sql
        
# Load the table
	cd /projects/hg2/booch/psl/gs.17/build34/recombrate/
	echo 'load data local infile "recombRate.bed" into table recombRate;' | hgsql hg16


# UPDATE BACEND SEQUENCES (DONE 2003-07-23 Terry)

# **** Sequences were determined to not have changed since bacends.4 *****
# **** No new sequences downloaded - See makeHg15.doc for download instructions  ***** 

# Load the sequences into the database (after database created)
	ssh hgwdev
	mkdir /gbdb/hg16/bacends.4
	cd /gbdb/hg16/bacends.4
	ln -s /cluster/store1/bacends.4/BACends.fa ./BACends.fa
	cd /cluster/store2/tmp
	hgLoadRna addSeq hg16 /gbdb/hg16/bacends.4/BACends.fa


# BACEND SEQUENCE ALIGNMENTS (DONE 2003-08-01 Terry)
# (alignments done without RepeatMasking)

# Create full sequence alignments
	ssh kk
        cd /cluster/home/booch/bacends

# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
        make new
        make jobList
        para create jobList
        para push 

# Compile alignments and lift the files (takes a while)
	ssh eieio
	make bacEnds.psl.lifted

# Copy files to final destination and remove
        make copy.assembly
        make clean # (may want to wait until sure they're OK)

# BACEND PAIRS TRACK (DONE 2003-08-01 Terry)

# Add /projects/compbiousr/booch/booch/scripts to your path

# Update Makefile with new location of pairs/singles 
# files, if necessary (DONE)
        cd /projects/hg2/booch/psl/gs.17/build33/bacends

# Make initial file of alignments
	make bacEnds.rdb
 
# Try to fish out more pairs
	make bacEndsMiss.psl

# Re-make bacEnds.rdb with new info
	make bacEnds.rdb
 
# Create bacEndPairs track file
        make bacEndPairs.bed

# Create bacEndPairsBad and bacEndPairsLong files
	make bacEndPairsBad.bed

# Create psl file to load
	make bacEnds.load.psl

# Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg16 < all_bacends.sql
        hgsql hg16 < bacEndPairs.sql
        hgsql hg16 < bacEndPairsBad.sql
        hgsql hg16 < bacEndPairsLong.sql

# Load the tables
	cd /projects/hg2/booch/psl/gs.17/build34/bacends/
        echo 'load data local infile "bacEnds.load.psl" into table all_bacends;' | hgsql hg16
        echo 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' | hgsql hg16
        echo 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' | hgsql hg16
        echo 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' | hgsql hg16


# FOSEND SEQUENCE ALIGNMENTS (DONE 2003-08-03 Terry)

# Create full sequence alignments
        ssh kk
        cd /cluster/home/booch/fosends

# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
        make new
        make jobList
	para create jobList
        para push

# Compile alignments and lift the files (takes a while)
	ssh eieio
        cd /cluster/home/booch/fosends
	make fosEnds.psl.lifted

# Copy files to final destination and remove
        make copy.assembly
        make clean

# FOSEND PAIRS TRACK (DONE 2003-08-01 Terry)

# Update Makefile with location of pairs files, if necessary
        ssh kks00
        cd /projects/hg2/booch/psl/gs.17/build33/fosends

# Make initial file of alignments
	make fosEnds.rdb

# Try to fish out more pairs
	make fosEndsMiss.psl

# Re-make fosEnds.rdb with new info
	make fosEnds.rdb
 
# Create fosEndPairs track file
        make fosEndPairs.bed

# Create fosEndPairsBad and fosEndPairsLong files
	make fosEndPairsBad.bed

# Create psl file to load
	make fosEnds.load.psl

# Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg16 < all_fosends.sql
        hgsql hg16 < fosEndPairs.sql
        hgsql hg16 < fosEndPairsBad.sql
        hgsql hg16 < fosEndPairsLong.sql

# Load the tables
	cd /projects/hg2/booch/psl/gs.17/build34/fosends/
        echo 'load data local infile "fosEnds.load.psl" into table all_fosends;' | hgsql hg16        
        echo 'load data local infile "fosEndPairs.bed" into table fosEndPairs;' | hgsql hg16
        echo 'load data local infile "fosEndPairsBad.bed" into table fosEndPairsBad;' | hgsql hg16
        echo 'load data local infile "fosEndPairsLong.bed" into table fosEndPairsLong;' | hgsql hg16

# Load the sequences (change fosends.# to match correct location) (done for hg15 early 4/9/2003)
	mkdir /gbdb/hg15/fosends.3
	cd /gbdb/hg15/fosends.3
	ln -s /cluster/store1/fosends.3/fosEnds.fa ./fosEnds.fa
	cd /cluster/store2/tmp
	hgLoadRna addSeq hg15 /gbdb/hg15/fosends.3/fosEnds.fa
                

# UPDATE FISH CLONES INFORMATION (DONE 2003-07-23 Terry)

# Download the latest info from NCBI
        # point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
        # change "Show details on sequence-tag" to "yes"
        # change "Download or Display" to "Download table for UNIX"
        # press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.20030723.table

# Format file just downloaded.  
        cd /projects/hg2/booch/psl/fish/

# Edit Makefile to point at file just downloaded (variables HBRC, HBRCFORMAT)
        make HBRC

# (Manually added 21 results from FHCRC)

# Copy it to the new freeze location
        cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.17/build34/fish/

# Save it as the new "gold" file
	cp all.fish.format all.fish.format.gold

# CREATE AND LOAD FISH CLONES TRACK (DONE 2003-08-08 Terry)
# (must be done after Coverage, STS markers track and BAC end pairs track)

# Extract the file with clone positions from database
        ssh hgwdev
        echo 'select * into outfile "/tmp/booch/clonePos.txt" from clonePos' | hgsql hg16
        mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.17/build34/fish

# Get current clone/accession information
	ssh kks00
        cd /projects/hg2/booch/psl/gs.17/build34/fish
	wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out

# Create initial placement file
	cp /projects/hg2/booch/psl/gs.17/build33/fish/extract.pl . 
	make cyto.markers.bed

# Get sequences for accessions not in genome
	# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
	# select file "/projects/hg2/booch/psl/gs.17/build33/fish/not.found.acc
	# change output to FASTA format
	# download results to "/projects/hg2/booch/psl/gs.17/build33/fish/not.found.fa"

# Place sequences against genome
	make blat

# Try to incorporate new placements
	make cyto.markers.bed2

# Create bed file
        make fishClones.bed

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg16 < fishClones.sql

# Load the table
	cd /projects/hg2/booch/psl/gs.17/build34/fish/
        echo 'load data local infile "fishClones.bed" into table fishClones;' | hgsql hg16
        

# CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE 2003-08-08 Terry)
# (must be done after FISH Clones track) 

# Create bed file
        ssh kks00
	cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
        make setBands.txt   # NOTE: may get errors if inserts file out-of-sync with pctSetBands file 
        make cytobands.pct.ranges
        make predict

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg16 < cytoBand.sql
        
# Load the table
	cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
	echo 'load data local infile "cytobands.bed" into table cytoBand;' | hgsql hg16

# Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # For human cytoBandIdeo is just a replicate of the cytoBand track.
    # Make the cytoBand track (above) and then:
    echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql hg16

# CREATING DATABASE  (DONE - 2003-07-26 - Hiram)
    ssh hgwdev
    # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
#	Filesystem            Size  Used Avail Use% Mounted on
#	/dev/sda1             472G  416G   31G  93% /var/lib/mysql
    # Create the database.
    echo 'create database hg16' | hgsql hg15
    # make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
    #	(I have not seen a use for this in any procedures ? -Hiram)
    #		alias hg16 mysql -u hguser -phguserstuff -A hg16
    #	(use 'hgsql hg16' instead)
    # Initialize the relational-mrna and external sequence info tables:
    hgLoadRna new hg16
    # Copy over grp table (for track grouping) from another database:
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg15.grp" \
    | hgsql hg16
    # add ENCODE track.  Move Repeats lower in priority
    echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg16

    echo 'INSERT INTO grp (name, label, priority) VALUES ("encode", "ENCODE Tracks", 8)' | hgsql hg16

    # New ENCODE groups
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg16
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg16
    

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2003-07-26 - Hiram)
    ssh hgwdev
    # Enter hg16 into hgcentraltest.dbDb so test browser knows about it:
    echo 'insert into dbDb (name, description, nibPath, organism, \
	defaultPos, active, orderKey, genome, scientificName) \
	values("hg16", "July 2003", "/gbdb/hg16/nib", "Human", \
	"chr7:26828631-26938371", 1, 10, "Human", "Homo sapiens");' \
	| hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    cd ~kent/src/hg/makeDb/trackDb
    cvs up -d -P .
    # Edit that makefile to add hg16 in all the right places and do
    make update
    make alpha
    cvs commit makefile

# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2003-07-26 - Hiram)
    cd /cluster/data/hg16
    mkdir -p jkStuff
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
    # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
    # Note: this ncbi.lft will not lift floating contigs to chr_random coords,
    # but it will show the strand orientation of the floating contigs 
    # (grep for '|').
    mdToNcbiLift seq_contig.md jkStuff/ncbi.lft 
    # If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft 
    # to match. If no step 6.2.5 then no editing needed

# REPEAT MASKING (DONE - 2003-07-25 - Hiram, REDONE 2003-08-02)
    # Split contigs, run RepeatMasker, lift results
    # Notes: 
    # * Using new RepeatMasker in /cluster/bluearc/RepeatMasker030619
    #	Always check for new RepeatMasker before proceeding
    # * Contigs (*/N{T,G}_*/N{T,G}_*.fa) are split into 500kb chunks to make 
    #   RepeatMasker runs manageable on the cluster ==> results need lifting.
    # * For the NCBI assembly we repeat mask on the sensitive mode setting
    #   (RepeatMasker -s)

    #- Split contigs into 500kb chunks:

    ssh eieio
    cd /cluster/data/hg16
    # For every chromosome directory (1- or 2-character name), visit each
    # NT_/NG_ contig directory and split its .fa into 500000-base chunks,
    # writing a matching .lft lift file next to the chunks.
    foreach chromDir ( ?{,?} )
        foreach ctgDir ( $chromDir/N{T,G}_?????? )
            set ctg = $ctgDir:t
            echo "splitting ${chromDir}/${ctg}/${ctg}.fa"
            faSplit size ${chromDir}/${ctg}/${ctg}.fa 500000 \
                ${chromDir}/${ctg}/${ctg}_ \
                -lift=${chromDir}/${ctg}/${ctg}.lft \
                -maxN=500000
        end
    end

    #- Make the run directory and job list:
    cd /cluster/data/hg16
    mkdir -p jkStuff
    #  According to RepeatMasker help file, no arguments are required to
    #	specify species because its default is set for primate (human)
    #  This run script saves the .tbl file to be sent to Arian.  He uses
    # those for his analysis.  Sometimes he needs the .cat and .align files for
    # checking problems.  Krish needs the .align files, they are large.

    #  RMHuman usage: RMHuman <directory> <fa file name>
    #	The generated script cd's to <directory>, copies <fa file name> into
    #	/tmp/hg16/<fa file name>/, runs RepeatMasker -ali -s on it there,
    #	copies the .out (and the .align and .tbl when they exist) back to
    #	<directory>, and then removes the /tmp work area.  Copying the .cat
    #	file back is left commented out.
    #  The delimiter is quoted ('_EOF_') so that $1 and $2 are written to the
    #	script literally instead of being expanded while creating it.
    cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/hg16/$2
/bin/cp $2 /tmp/hg16/$2/
cd /tmp/hg16/$2
/cluster/bluearc/RepeatMasker030619/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg16/$2/$2.out ./
 if (-e /tmp/hg16/$2/$2.align) /bin/cp /tmp/hg16/$2/$2.align ./
if (-e /tmp/hg16/$2/$2.tbl) /bin/cp /tmp/hg16/$2/$2.tbl ./
# if (-e /tmp/hg16/$2/$2.cat) /bin/cp /tmp/hg16/$2/$2.cat ./
/bin/rm -fr /tmp/hg16/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMHuman

    ssh eieio
    cd /cluster/data/hg16
    # Start from an empty job list in a fresh RMRun directory.
    mkdir RMRun
    rm -f RMRun/RMJobs
    touch RMRun/RMJobs
    # Append one job line per split-contig chunk (N?_*_*.fa):
    #   RMHuman <contig directory> <chunk .fa> {check out line+ <chunk>.fa.out}
    foreach chromDir ( ?{,?} )
        foreach chunkPath ( $chromDir/N{T,G}_*/N{T,G}_*_*.fa )
            set chunkFa = $chunkPath:t
            set ctgPath = $chunkPath:h
            set ctg = $ctgPath:t
            echo /cluster/store4/gs.17/build34/jkStuff/RMHuman \
                /cluster/store4/gs.17/build34/${chromDir}/${ctg} $chunkFa \
                '{'check out line+ /cluster/store4/gs.17/build34/${chromDir}/${ctg}/$chunkFa.out'}' \
                >> RMRun/RMJobs
        end
    end

    # We have 6015 jobs in RMJobs:
    wc RMRun/RMJobs
#	6015   42105 1184896 RMRun/RMJobs

    #- Do the run
    ssh kk
    cd /cluster/data/hg16/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    #- While that is running, you can run TRF (simpleRepeat) on the small
    # cluster.  See SIMPLE REPEAT section below
# CPU time in finished jobs:   33575296s  559588.26m  9326.47h  388.60d  1.065 y
# IO & Wait Time:                238878s    3981.30m    66.36h    2.76d  0.008 y
# Average job time:                7513s     125.21m     2.09h    0.09d
# Longest job:                    18457s     307.62m     5.13h    0.21d
# Submission to last job:         55537s     925.62m    15.43h    0.64d


    #- Lift up the split-contig .out's to contig-level .out's
    ssh eieio
    cd /cluster/data/hg16
    foreach d ( ?{,?}/N{T,G}_* )
        cd $d
        set contig = $d:t
        liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out 
        cd ../..
    end

    #- Lift up RepeatMask .out files to chromosome coordinates via
    # picked up jkStuff/liftOut2.sh from the hg15 build.  Reset the
    # liftUp command from ~kent/bin/$MACHTYPE to be from
    # /cluster/bin/i386.  Took the redirection to dev/null off of the
    # command and capture the output here to see what errors we have.

    ./jkStuff/liftOut2.sh > liftOut2.out 2>&1 &

    #- By this point, the database should have been created (above):
    ssh hgwdev
    cd /cluster/data/hg16
    hgLoadOut hg16 ?/*.fa.out ??/*.fa.out

    # errors during this load:
# Processing 2/chr2.fa.out
# Strange perc. field -6.1 line 243430 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243430 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243432 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243432 of 2/chr2.fa.out
# Processing 5/chr5.fa.out
# Strange perc. field -0.3 line 4339 of 5/chr5.fa.out
# Processing 19/chr19.fa.out
# Strange perc. field -18.6 line 77032 of 19/chr19.fa.out

# SIMPLE REPEAT [TRF] TRACK (DONE - 2003-07-25 - Hiram)
    # Distribute contigs to /iscratch/i
    ssh kkr1u00
    rm -rf /iscratch/i/gs.17/build34/contigs
    mkdir -p /iscratch/i/gs.17/build34/contigs
    cd /cluster/data/hg16
    cp -p contigs/*.fa /iscratch/i/gs.17/build34/contigs
    # Make sure the total size looks like what you'd expect:
    du ./contigs /iscratch/i/gs.17/build34/contigs
    # 2839768 ./contigs
    # 2839768 /iscratch/i/gs.17/build34/contigs
    ~kent/bin/iSync

    # Create cluster parasol job like so:
    mkdir -p /cluster/data/hg16/bed/simpleRepeat
    cd /cluster/data/hg16/bed/simpleRepeat
    mkdir trf
    #  runTrf usage: runTrf <input .fa path> <output .bed path>
    #	The generated script copies the input into /tmp/<output file name>/,
    #	runs trfBig there with -bedAt=<output file name>, copies the resulting
    #	bed file to <output .bed path> (removing any existing file first), and
    #	then cleans up the /tmp work directory.
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf

    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    ls -1S /iscratch/i/gs.17/build34/contigs/*.fa > genome.lst
    gensub2 genome.lst single gsub spec
    para create spec
    para try
    para check
    para push
    para check
#  Completed: 472 of 472 jobs
# CPU time in finished jobs:      36177s     602.95m    10.05h    0.42d  0.001 y
# IO & Wait Time:                  2038s      33.97m     0.57h    0.02d  0.000 y
# Average job time:                  81s       1.35m     0.02h    0.00d
# Longest job:                     6992s     116.53m     1.94h    0.08d
# Submission to last job:         10703s     178.38m     2.97h    0.12d
    # When cluster run is done, a couple of extra files not caught in
    # the above sequence
    ./runTrf /cluster/store4/gs.17/build34/M/NT_999999/NT_999999.fa trf/NT_999999.bed
    # That produces an empty .bed file, mark it so:
    echo "# trf run produces nothing for this one" >> trf/NT_999999.bed

    liftUp simpleRepeat.bed /cluster/data/hg16/jkStuff/liftAll.lft \
	warn trf/*.bed  > lu.out 2>&1

    # Load into the database:
    ssh hgwdev
    cd /cluster/data/hg16/bed/simpleRepeat
    /cluster/bin/i386/hgLoadBed hg16 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
#	stringTab = 0
#	Reading simpleRepeat.bed
#	Loaded 627883 elements
#	Sorted
#	Saving bed.tab
#	Loading hg16

# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-07-27 - Hiram - REDONE 07-30)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/hg16/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/hg16
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2003-07-27)
#							 -Hiram
    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
    ssh eieio
    cd /cluster/data/hg16

    # Make chr*.fa from contig .fa
    #  Copied chrFa.sh from hg15/jkStuff - reset  path from ~kent to
    #  /cluster for the ctgToChromFa command
    tcsh ./jkStuff/chrFa.sh > chrFa.out 2>&1 &

    # copied these three scripts from hg15 - fixup path names to
    # reference /cluster/bin instead of ~kent/bin

    #- Soft-mask (lower-case) the contig and chr .fa's
    tcsh ./jkStuff/makeFaMasked.sh > maFaMasked.out 2>&1
    #- Make hard-masked .fa.masked files as well:
    tcsh ./jkStuff/makeHardMasked.sh > maHardMasked.out 2>&1
    #- Rebuild the nib, mixedNib, maskedNib files:
    tcsh ./jkStuff/makeNib.sh > maNib.out 2>&1

    # Make symbolic links from /gbdb/hg16/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/hg16/nib
    foreach f (/cluster/store4/gs.17/build34/nib/chr*.nib)
      ln -s $f /gbdb/hg16/nib
    end
    # Load /gbdb/hg16/nib paths into database and save size info.
    hgsql hg16  < ~/kent/src/hg/lib/chromInfo.sql
    cd /cluster/data/hg16
    hgNibSeq -preMadeNib hg16 /gbdb/hg16/nib ?{,?}/chr?{,?}{,_random}.fa
    echo "select chrom,size from chromInfo" | hgsql -N hg16 > chrom.sizes

    # Copy the masked contig fa to /iscratch and /scratch:
    #	And everything else we will need for blastz runs, etc ...
    ssh kkr1u00
    rm -rf /iscratch/i/gs.17/build34/trfFa
    mkdir -p /iscratch/i/gs.17/build34/trfFa
    cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /iscratch/i/gs.17/build34/trfFa
    rm -rf /iscratch/i/gs.17/build34/bothMaskedNibs
    mkdir -p /iscratch/i/gs.17/build34/bothMaskedNibs
    cp -p /cluster/data/hg16/nib/*.nib /iscratch/i/gs.17/build34/bothMaskedNibs
    rm -rf /iscratch/i/gs.17/build34/rmsk
    mkdir -p /iscratch/i/gs.17/build34/rmsk
    cp -p /cluster/data/hg16/?{,?}/*.out /iscratch/i/gs.17/build34/rmsk
    ~kent/bin/iSync

    # ssh kkstore
    #  Since kkstore is currently /cluster/bluearc/scratch, better to do
    #  this on eieio and copy to 
    rm -rf /scratch/hg/gs.17/build34/trfFa
    mkdir -p /scratch/hg/gs.17/build34/trfFa
    cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /scratch/hg/gs.17/build34/trfFa
    rm -rf /scratch/hg/gs.17/build34/bothMaskedNibs
    mkdir /scratch/hg/gs.17/build34/bothMaskedNibs
    cp -p /cluster/data/hg16/nib/*.nib /scratch/hg/gs.17/build34/bothMaskedNibs
    rm -rf /scratch/hg/gs.17/build34/rmsk
    mkdir -p /scratch/hg/gs.17/build34/rmsk
    cp -p /cluster/data/hg16/?{,?}/*.out /scratch/hg/gs.17/build34/rmsk

    # request rsync of kkstore /scratch


# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE - 2003-07-27)
# Store o+o info in database.
    ssh eieio
    cd /cluster/store4/gs.17/build34
    if (-f contig_overlaps.agp) then
      jkStuff/liftGl.sh contig.gl
    else
      ssh hgwdev
      hgGoldGapGl -noGl hg16 /cluster/store4/gs.17 build34 
      echo ""
      echo "*** Note from makeHg15.doc:"
      echo "Come back to this step later when we have contig_overlaps.agp\!"
    endif
    ssh hgwdev
    cd /cluster/store4/gs.17/build34
    # Only when contig_overlaps.agp is present: run the full hgGoldGapGl
    # load (without -noGl) and then hgClonePos.
    # csh closes an if/then block with "endif"; "end" only terminates
    # foreach/while loops.
    if (-f contig_overlaps.agp) then
      hgGoldGapGl hg16 /cluster/store4/gs.17 build34 
      cd /cluster/store4/gs.17
      /cluster/bin/i386/hgClonePos hg16 build34 ffa/sequence.inf /cluster/store4/gs.17 -maxErr=3
    endif
    cd /cluster/store4/gs.17
    # (2/27/04 angie) re-loaded -- chr{1,4,8,15}_random lift files changed
    # 7/30/04.
    hgCtgPos hg16 build34 


# CREATE NON-STANDARD JOIN CERTIFICATES WEB PAGE AND TABLE

# Filter certificates file to only contain those relevant to current assembly

    cd ~/hg16/certificates
    /cluster/bin/scripts/extractCertificates.pl e-certificates.txt ~/hg16 \
    > e-certificates.filter.txt

# Create initial web page and table for loading into database

    hgCert e-certificates.filter.txt > certificates.html

# Donna's edits to html page

# (3/2/04 angie: edit cert.tab to remove some extra tab characters in comments
#  so mySql doesn't truncate them, & reload)

# Load cert table into database

    ssh hgwdev
    cd ~/hg16/certificates
    echo "drop table certificate" | hgsql hg16
    hgsql hg16 < ~/kent/src/hg/lib/certificate.sql 
    echo 'load data local infile "cert.tab" into table certificate;' \
    | hgsql hg16


# AUTO UPDATE GENBANK MRNA RUN  (WORKING - 2003-07-30 - Hiram)

    ssh eieio
    cd /cluster/store5/genbank
    # This is a new organism, edit the etc/genbank.conf file and add:
	# hg16
	hg16.genome = /scratch/hg/gs.17/build34/bothMaskedNibs/chr*.nib
	hg16.lift = /cluster/store4/gs.17/build34/jkStuff/liftAll.lft
	hg16.genbank.est.xeno.load = yes
	hg16.mgcTables.default = full
	hg16.mgcTables.mgc = all
	hg16.downloadDir = hg16

    ssh eieio
    cd /cluster/store5/genbank
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
	-srcDb=genbank -type=mrna -verbose=1 -initial hg16
# Completed: 49591 of 49591 jobs
# CPU time in finished jobs:    3853288s   64221.47m  1070.36h   44.60d  0.122 y
# IO & Wait Time:                246323s    4105.38m    68.42h    2.85d  0.008 y
# Average job time:                  83s       1.38m     0.02h    0.00d
# Longest job:                    21265s     354.42m     5.91h    0.25d
# Submission to last job:         22930s     382.17m     6.37h    0.27d

    # Load the results from the above
    ssh hgwdev
    cd /cluster/store5/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg16

#	To get this next one started, the above results need to be
#	moved out of the way.  These things can be removed if there are
#	no problems to debug
    ssh eieio
    cd /cluster/bluearc/genbank/work
    mv initial.hg16 initial.hg16.genbank.mrna

    ssh eieio
    cd /cluster/store5/genbank
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
	-srcDb=refseq -type=mrna -verbose=1 -initial hg16
# Completed: 68740 of 68740 jobs
# CPU time in finished jobs:    1253290s   20888.16m   348.14h   14.51d  0.040 y
# IO & Wait Time:                309126s    5152.10m    85.87h    3.58d  0.010 y
# Average job time:                  23s       0.38m     0.01h    0.00d
# Longest job:                    13290s     221.50m     3.69h    0.15d
# Submission to last job:         13609s     226.82m     3.78h    0.16d

    # The iservers came back on-line, so use them for this run.
    #  The batch file can be found in:
    #	/cluster/store5/genbank/work/initial.hg16/align
    ssh hgwdev
    cd /cluster/store5/genbank
    nice bin/gbDbLoadStep -verbose=1 hg16

    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg16

# GC PERCENT (DONE 2003-07-31 - Hiram)
     ssh hgwdev
     mkdir -p /cluster/data/hg16/bed/gcPercent
     cd /cluster/data/hg16/bed/gcPercent
     hgsql hg16  < ~/kent/src/hg/lib/gcPercent.sql
     hgGcPercent hg16 ../../nib


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2003-07-31 - Hiram)
    ssh hgwdev
    # The host name (blat6 here) and the port numbers must match the blat
    # servers actually set up for hg16:
    echo 'insert into blatServers values("hg16", "blat6", "17778", "1"); \
          insert into blatServers values("hg16", "blat6", "17779", "0");' \
    | hgsql -h genome-testdb hgcentraltest


# PRODUCING GENSCAN PREDICTIONS (DONE - 2003-08-01 - Hiram)

    ssh eieio
    mkdir -p /cluster/data/hg16/bed/genscan
    cd /cluster/data/hg16/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir -p gtf pep subopt
    # Generate a list file, genome.list, of all the contigs
    # *that do not have pure Ns* (due to heterochromatin, unsequencable 
    # stuff) which would cause genscan to run forever.
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store4/gs.17/build34/?{,?}/N{T,G}_*/N{T,G}_??????.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
        
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 - kkr8u00).  Genscan has problems running on the
    # big cluster, due to the limited memory and swap space on each
    # processing node.
    ssh kkr1u00
    cd /cluster/data/hg16/bed/genscan
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/home/hiram/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try
    para check
    para push
# Completed: 491 of 491 jobs		(this was with only 6 CPUs available)
# CPU time in finished jobs:     216220s    3603.67m    60.06h    2.50d  0.007 y
# IO & Wait Time:                 85597s    1426.62m    23.78h    0.99d  0.003 y
# Average job time:                 615s      10.24m     0.17h    0.01d
# Longest job:                    10986s     183.10m     3.05h    0.13d
# Submission to last job:         54395s     906.58m    15.11h    0.63d


    # Issue either one of the following two commands to check the
    # status of the cluster and your jobs, until they are done.
    parasol status
    para check
    # If there were out-of-memory problems (run "para problems"), then 
    # re-run those jobs by hand but change the -window arg from 2400000
    # to 1200000.  In build33, this was 22/NT_011519.
    #  In build34 there were NO failures !
    # Convert these to chromosome level files as so:     
    ssh eieio
    cd /cluster/data/hg16/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N{T,G}*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/N{T,G}*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/hg16/bed/genscan
    ldHgGene hg16 genscan genscan.gtf
#	Reading genscan.gtf
#	Read 42974 transcripts in 326300 lines in 1 files
#	  42974 groups 41 seqs 1 sources 1 feature types
#	42974 gene predictions
    hgPepPred hg16 generic genscanPep genscan.pep
#	Processing genscan.pep
    hgLoadBed hg16 genscanSubopt genscanSubopt.bed
#	stringTab = 0
#	Reading genscanSubopt.bed
#	Loaded 518038 elements
#	Sorted
#	Creating table definition for 
#	Saving bed.tab
#	Loading hg16


# CPGISLANDS (DONE - 2003-08-01 - Hiram)
    ssh eieio
    mkdir -p /cluster/data/hg16/bed/cpgIsland
    cd /cluster/data/hg16/bed/cpgIsland
    # Copy program as built for previous hg build:
    mkdir cpg_dist
    cp -p ~/hg15/bed/cpgIsland/cpg_dist/cpglh.exe ./cpg_dist
    #  This step used to read, but I do not immediately see the .tar
    #  file anywhere:  (there is a copy in ~/rn3/bed/cpgIsland)
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    # tar xvf cpg_dist.tar 
    # cd cpg_dist
    # gcc readseq.c cpg_lh.c -o cpglh.exe
    # cd ..
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
      set fout=$f:t:r:r.cpg
      echo producing $fout...
      ./cpg_dist/cpglh.exe $f > $fout
    end
    # Write the awk filter that reformats the cpglh.exe output (*.cpg) into
    # the columns loaded as cpgIsland.bed.  awk has no /* ... */ comments:
    # a line of that form is parsed as a regular-expression pattern rule,
    # so the notes inside the script use '#' instead.
    cat << '_EOF_' > filter.awk
# chr1\t1325\t3865\t754\tCpG: 183\t64.9\t0.7
# Transforms to:  (tab separated columns above, spaces below)
# chr1  1325    3865    CpG: 183  754  183 489  64.9  0.7
{
width = $3-$2;
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n",
  $1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed
    ssh hgwdev
    cd /cluster/data/hg16/bed/cpgIsland
    hgLoadBed hg16 cpgIsland -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed
#	stringTab = 1
#	Reading cpgIsland.bed
#	Loaded 27596 elements
#	Sorted
#	Saving bed.tab
#	Loading hg16

# VERIFY REPEATMASKER RESULTS (DONE - 2003-08-01 - Hiram)

    # Run featureBits on hg16 and on a comparable genome build, and compare:
    ssh hgwdev

    featureBits hg16 rmsk
    # --> 1388770568 bases of 2865697954 (48.462%) in intersection
    # --> 1388044886 bases of 2865697954 (48.437%) in intersection
    # --> 1388157103 bases of 2863665240 (48.475%) in intersection
    featureBits hg15 rmsk
    # --> 1386879340 bases of 2866466359 (48.383%) in intersection
    featureBits hg13 rmsk
    # --> 1383216615 bases of 2860907679 (48.349%) in intersection

# PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2003-08-05 - Hiram)

    ssh eieio
    #  This is where kkstore /scratch is kept:
    cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk
    #  The following will mark each line for rat and mouse
    #	Rat first will be column 1, Mouse second will be column 2
    foreach outfl ( *.out )
	echo "$outfl"
	/cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
	${outfl} -query human -comp rat -comp mouse
    end
    #	Now extract each one, 1 = Rat, 2 = Mouse
    cd /cluster/bluearc/scratch/hg/gs.17/build34
    mkdir linSpecRep.notInRat
    mkdir linSpecRep.notInMouse
    foreach f (rmsk/*.out_rat_mus)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                        linSpecRep.notInRat/$base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 2 $f > \
                        linSpecRep.notInMouse/$base.out.spec
    end
    #  That produced no difference at all between those two targets.
    #  Have requested confirmation from Arian


#  BLASTZ MOUSE (DONE - 2003-08-07 - Hiram)

    ssh eieio
    cd /cluster/bluearc/mm3.RM030619

    foreach f (rmsk.spec/*.out_rat_hum)
	set base = $f:t:r:r
	echo $base.out.spec
	/cluster/bin/scripts/extractLinSpecReps 2 $f > \
		linSpecRep.notInHuman/$base.out.spec
    end

    ssh eieio
    mkdir -p /cluster/data/hg16/bed/blastz.mm3
    cd /cluster/data/hg16/bed/blastz.mm3

    cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Mouse
SEQ2_DIR=/iscratch/i/mm3.RM030619/mixedNib/
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman/
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store4/gs.17/build34/bed/blastz.mm3

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    DS=`date -I`
    cp DEF ~angie/hummus/DEF.mm3-hg16.$DS

    ssh kk
    # Work in the blastz.mm3 directory created above.  (A preceding
    # "cd ~hg16/bed/blastz.mm3" was dropped: ~hg16 expands to the home
    # directory of a user named hg16, not to ~/hg16, and the absolute
    # path below is the intended location either way.)
    cd /cluster/data/hg16/bed/blastz.mm3

    # source the DEF file to establish environment for following commands
    bash
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j

    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run

    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2

    # make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
	# 39663 jobs
    para try
    para check
    para push
    # ... etc ...
    # With competition on the cluster:
# Completed: 39663 of 39663 jobs
# CPU time in finished jobs:   14365996s  239433.27m  3990.55h  166.27d  0.456 y
# IO & Wait Time:                681029s   11350.48m   189.17h    7.88d  0.022 y
# Average job time:                 379s       6.32m     0.11h    0.00d
# Longest job:                     9275s     154.58m     2.58h    0.11d
# Submission to last job:         53023s     883.72m    14.73h    0.61d

    # post-process blastz
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    #   source the DEF file again in case you are coming back to this
    #	(must be bash shell)

    . ./DEF
    
    # a new run directory
    mkdir -p run.1
    
    mkdir -p $BASE/lav
    
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
                        > run.1/jobList
    cd run.1

    # make sure the job list is OK
    wc -l jobList
       # 312 jobs 
    head jobList

    # run on cluster
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3/run.1
    para create jobList
    para try
    para check
    para push
    # etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs:      11666s     194.44m     3.24h    0.14d  0.000 y
# IO & Wait Time:                 69155s    1152.58m    19.21h    0.80d  0.002 y
# Average job time:                 238s       3.97m     0.07h    0.00d
# Longest job:                     1332s      22.20m     0.37h    0.02d
# Submission to last job:          1497s      24.95m     0.42h    0.02d

    # convert lav files to axt
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir axtChrom
    
    # a new run directory
    mkdir run.2
    cd run.2

    # create template file for gensub2
    # usage:  blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.mm3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.mm3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mm3.RM030619/mixedNib/
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1S /cluster/store4/gs.17/build34/bed/blastz.mm3/lav > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 42 jobs
    head jobList

    cd /cluster/data/hg16/bed/blastz.mm3/run.2
    para create jobList
    para try
    para check
    para push
    # The two crashed jobs are chr19 and chr19_random.
    #  The chr19_random .fa file is almost all masked sequence, so the
    #  resulting .axt file is empty.  chr19 is too big (see the axtSort
    #  fixup below).
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#CPU time in finished jobs:       1908s      31.80m     0.53h    0.02d  0.000 y
#IO & Wait Time:                 22178s     369.64m     6.16h    0.26d  0.001 y
#Average job time:                 602s      10.04m     0.17h    0.01d
#Longest job:                     1723s      28.72m     0.48h    0.02d
#Submission to last job:          1802s      30.03m     0.50h    0.02d
    # To fixup the chr19 axtsort problem
    # sometimes alignments are so huge that they cause axtSort to run out 
    # of memory.  Run them in two passes like this:
    ssh kkr1u00
    cd /cluster/data/hg16/bed/blastz.mm3
    set base=/cluster/data/hg16/bed/blastz.mm3
    set seq1_dir=/iscratch/i/gs.17/build34/bothMaskedNibs
    set seq2_dir=/iscratch/i/mm3.RM030619/mixedNib/
    foreach c (lav/chr19)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
      end
      cat `ls -1 *.lav.axt | sort -g` > $base/$out
      popd
    end

    #	Remove the empty axtChrom/chr19_random.axt file to avoid future
    #	processing errors

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir -p pslChrom
    set tbl = "blastzMm3"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	That takes about 20 minutes
    #	chr19 came along later
    ssh kkr1u00
    # The paths below (axtChrom/, S1.len, S2.len, pslChrom/) are relative to
    # the blastz.mm3 directory, so change into it after logging in.
    cd /cluster/data/hg16/bed/blastz.mm3
    set tbl = "blastzMm3"
    foreach f (axtChrom/chr19.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load database tables
    ssh hgwdev
    set tbl = "blastzMm3"
    cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
    # This takes 30 minutes to an hour
    # and later chr19
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr19_${tbl}.psl
    # create trackDb/human/hg16 and get a trackDb.ra file started with:
#	track blastzMm3
#	shortLabel Mouse Blastz
#	longLabel Blastz All Mouse (Feb. 03) Alignments
#	group compGeno
#	priority 130
#	visibility hide
#	color 100,50,0
#	altColor 255,240,200
#	spectrum on
#	type psl xeno mm3
#	otherDb mm3
    # remake trackDb tables

    # redo chr1 (featureBits shows 7% lower alignments than hg16)
    # (DONE 2003-09-09 kate)
    # blastz run ended prematurely -- .tmp files leftover, not moved to .out's 
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    bash
    . ./DEF
    cd $BASE
    mkdir run.chr1
    # create job list for human chr1, with parasol output file validation
    ~angie/hummus/make-joblist $DEF | \
        /cluster/bin/scripts/blastz-clusterjob.pl $BASE | \
            grep 'run chr1.nib' | \
            sed -e 's#^#/cluster/bin/penn/#'  \
                > $BASE/run.chr1/spec
    grep 'chr1/' $BASE/xdir.sh > $BASE/xdir.chr1.sh
    mv raw/chr1 raw/chr1.old
    mkdir raw/chr1
    sh xdir.chr1.sh
    cd run.chr1
    para create spec
	# 2925 jobs
    para try
    para check
    para push
    # ... etc ...
    ssh eieio
    bash
    cd /cluster/data/hg16/bed/blastz.mm3
    # Source DEF by explicit path, as in the other blastz steps; a bare
    # ". DEF" makes bash search $PATH before the current directory.
    . ./DEF
    mv lav/chr1 lav/chr1.old
    mkdir run.chr1.lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
        | grep 'lav chr1 ' > run.chr1.lav/jobList
    cd run.chr1.lav
    wc -l jobList
       # 25 jobs 
    head jobList
    # run on cluster
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3/run.chr1.lav
    para create jobList
    para try
    para check
    para push
    # etc.
    
    # convert lav files to chrom axt
    /cluster/bin/scripts/blastz-chromlav2axt /cluster/data/hg16/bed/blastz.mm3/lav/chr1 /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    mv pslChrom/chr1_blastzMm3.psl pslChrom/chr1_blastzMm3.psl.old
    /cluster/bin/i386/axtToPsl axtChrom/chr1.axt S1.len S2.len \
                pslChrom/chr1_blastzMm3.psl
    # reload database table
    # (log in to hgwdev before touching the database, as the other table
    #  loads in this doc do; then drop the old table and load the new psl)
    ssh hgwdev
    hgsql hg16 -e "drop table chr1_blastzMm3"
    cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr1_blastzMm3.psl

    # make chain
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
    mv chain/chr1.chain chain/chr1.chain.old
    mv out/chr1.out out/chr1.out.old
    axtFilter -notQ=chrUn_random /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt | axtChain stdin \
	/cluster/data/hg16/nib \
	/cluster/data/mm3/mixedNib chain/chr1.chain > out/chr1.out 

    # sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mv all.chain all.chain.old
    chainMergeSort run1/chain/*.chain > all.chain
    mv chain chain.old
    mkdir chain
    chainSplit chain all.chain

    # reload chr1 chain into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
    hgLoadChain hg16 chr1_chainMm3 chr1.chain 
        # Loading 510456 chains into hg16.chr1_chainMm3

    # make net
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    cd chain
    /cluster/bin/i386/chainPreNet chr1.chain /cluster/data/hg16/chrom.sizes \
                        /cluster/data/mm3/chrom.sizes ../preNet/chr1.chain
    cd ..
    cd preNet
    mv ../n1/chr1.net ../n1/chr1.net.old
    /cluster/bin/i386/chainNet chr1.chain -minSpace=1 \
                /cluster/data/hg16/chrom.sizes \
                /cluster/data/mm3/chrom.sizes ../n1/chr1.net /dev/null
    cd ..
    cp hNoClass.net hNoClass.net.old
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netClass hNoClass.net hg16 mm3 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman

    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    # rm -r n1 hNoClass.net

    # Make a 'syntenic' subset of these with
    mv mouseSyn.net mouseSyn.net.old
    netFilter -syn mouse.net > mouseSyn.net

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg16 netMm3 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin

    # make tight subset of net
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    # Re-split the regenerated mouse.net first, so that mouseNet/chr1.net
    # comes from the redone chr1 run rather than from the earlier split.
    netSplit mouse.net mouseNet
    mv ../axtNet/chr1.axt ../axtNet/chr1.old.axt
    netToAxt mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib \
                /cluster/data/mm3.RM030619/mixedNib ../axtNet/chr1.axt
    mv ../axtTight/chr1.axt ../axtTight/chr1.axt.old
    cd ../axtNet
    # NOTE(review): coding.mat is taken from /cluster/data/subsetAxt here but
    # from ~kent/src/hg/mouseStuff/subsetAxt in the full axtTight step --
    # confirm they are the same matrix.
    subsetAxt chr1.axt ../axtTight/chr1.axt \
                /cluster/data/subsetAxt/coding.mat 3400

    # translate to psl
    cd ../axtTight
    axtToPsl chr1.axt ../S1.len ../S2.len ../pslTight/chr1_blastzTightMm3.psl

    # Load table into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/pslTight
    hgLoadPsl -noTNameIx hg16 chr1_blastzTightMm3.psl
        # $ featureBits -chrom=chr1 hg16 chr1_blastzTightMm3.psl
        # 14052627 bases of 221562941 (6.342%) in intersection
        # hg15: 13990547 bases of 218713898 (6.397%) in intersection


    # make axtNet300
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netSplit mouse.net mouseNet
    mv ../axtNet300/chr1.axt ../axtNet300/chr1.axt.old
    netToAxt -maxGap=300 mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../axtNet300/chr1.axt

    # create 2-way maf file for humor alignment
    set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
    cd /cluster/data/hg16
    set mouseDir = bed/blastz.mm3/axtNet300
    axtSort $mouseDir/chr1.axt $mouseDir/chr1.axt.sorted
    mv $mouseDir/chr1.axt.sorted $mouseDir/chr1.axt 
    axtToMaf $mouseDir/chr1.axt \
            /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes \
            $multizDir/maf/chr1.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
    /cluster/bin/scripts/fixmaf.pl \
            < $multizDir/maf/chr1.mm3.maf.unfixed > $multizDir/maf/chr1.mm3.maf
    rm $multizDir/maf/chr1.mm3.maf.unfixed


# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)

    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
                        /cluster/data/mm3/chrom.sizes ../preNet/$i
    end
    # This foreach loop will take about 15 min to execute.

    cd ..
    mkdir n1 
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
                            /cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 2490523648, utime 15421 s/100, stime 3665

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    ~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman

    # If things look good do
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg16 netMm3 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
    # make net
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mkdir mouseNet
    netSplit mouse.net mouseNet
    foreach n (mouseNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt mouseNet/$c.net chain/$c.chain \
		/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
		/cluster/data/mm3.RM030619/mixedNib \
		../axtNet/$c.axt
	echo "Complete: $c.net -> $c.axt"
    end


# MAKE BLASTZ BEST MOUSE MM3 (DONE - 2003-08-26 - Hiram)

    #  IMPORTANT NOTE - This axtBest process has been replaced by the
    #  chain to net to axt process.  Note procedure below continues
    #  after the chain and nets have been produced.

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChrom
    mkdir -p /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
    
    # copy chrom axt's to bluearc, to avoid hitting fileserver too hard
    cp -p *.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
    # chr19 came along later:
    cp -p chr19.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom

    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    mkdir -p axtBest pslBest
    mkdir run.3
    cd run.3

    # create script to filter files 
    cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.mm3/S1.len \
	/cluster/data/hg16/bed/blastz.mm3/S2.len $4
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doBestAxt
    cd ../axtChrom
    ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
    cd ../run.3

    # create template for cluster job
    cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/axtBest/$(root1).axt} {check out line+  /cluster/data/hg16/bed/blastz.mm3/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 41 jobs
    head jobList

    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm3
    cd run.3
    para create jobList
    para try
    para check
    para push
    #  With the chr19 situation, went back and reran this step.
    #	For some unknown reason the first time it had 9 failed jobs:
# Completed: 32 of 41 jobs
# Crashed: 9 jobs
# CPU time in finished jobs:        827s      13.78m     0.23h    0.01d  0.000 y
# IO & Wait Time:                  1299s      21.65m     0.36h    0.02d  0.000 y
# Average job time:                  66s       1.11m     0.02h    0.00d
# Longest job:                      361s       6.02m     0.10h    0.00d
# Submission to last job:          1195s      19.92m     0.33h    0.01d
    # And then rerunning those 9 failed jobs, only chr19 failed:
# Completed: 8 of 9 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:        748s      12.47m     0.21h    0.01d  0.000 y
# IO & Wait Time:                  2290s      38.16m     0.64h    0.03d  0.000 y
# Average job time:                 380s       6.33m     0.11h    0.00d
# Longest job:                     1247s      20.78m     0.35h    0.01d
# Submission to last job:          1261s      21.02m     0.35h    0.01d

    #	Better yet, Jim says to be consistent, do all the chroms in
    #	this manner:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mkdir mouseNet
    netSplit mouse.net mouseNet
    foreach n (mouseNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt mouseNet/$c.net chain/$c.chain \
		/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
		/cluster/data/mm3.RM030619/mixedNib \
		../axtNet/$c.axt
	echo "Complete: $c.net -> $c.axt"
    end

    mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtBest
    cd /cluster/data/hg16/bed/blastz.mm3/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area (DONE 2003-09-24 kate)
    cd /cluster/data/hg16/bed/blastz.mm3/axtNet
    gzip *.axt
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
    cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
    # add README.txt file to dir, if needed

    #  Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3
    # pslBest was already created by the earlier "mkdir -p axtBest pslBest";
    # -p keeps this from failing when the directory exists.
    mkdir -p pslBest
    # One psl per chromosome axt, named <chrom>_blastzBestMm3.psl
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt -> ${c}_blastzBestMm3.psl"
	/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	    S1.len S2.len pslBest/${c}_blastzBestMm3.psl
	echo "Done: ${c}_blastzBestMm3.psl"
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/data/hg16/bed/blastz.mm3"
     set tbl="blastzBestMm3"
     cd $base/pslBest
     /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl

     # check results
     # the original axtBest stuff from the axtBest operation:
#  featureBits hg16 blastzBestMm3
#  1027438291 bases of 2865248791 (35.859%) in intersection
     #  After going through the chain->net->axt operation:
#  featureBits hg16 blastzBestMm3
#   991468768 bases of 2865248791 (34.603%) in intersection
     #  And finally after fixing a blastz execution problem on chr1:
#  1007362800 bases of 2865248791 (35.158%) in intersection

#  featureBits hg15 blastzBestMm3
#  1035090465 bases of 2866466359 (36.110%) in intersection


    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg16/axtBestMm3
     cd /gbdb/hg16/axtBestMm3
     foreach f (/cluster/data/hg16/bed/blastz.mm3/axtNet/chr*.axt)
       ln -s $f .
     end
     cd /cluster/data/hg16/bed/blastz.mm3/axtNet
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg16/axtBestMm3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg16 < axtInfoInserts.sql


# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-08-25 - Hiram)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtNet
    mkdir -p ../axtTight
    tcsh
    foreach i (*.axt)
      echo $i
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end

    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
      echo "Done: $i"
    end

    # Load tables into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/pslTight
    hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm3.psl

    # copy tight axt's to download area (DONE 2003-09-24 kate)
    cd /cluster/data/hg16/bed/blastz.mm3/axtTight
    gzip *.axt
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
    cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
    # add README.txt file to dir, if needed

# CHAIN MOUSE BLASTZ (DONE 2003-08-28 - Hiram)

# Run axtChain on little cluster
    ssh kkr1u00
    mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
    mkdir out chain

    ls -1S /cluster/data/hg16/bed/blastz.mm3/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

#    axtFilter -notQ=chrUn_random $1 | axtChain stdin

    # Write the per-chromosome job script used by the gsub template above.
    # Arguments, as supplied by that template:
    #   $1 = input axt file, $2 = output chain file, $3 = captured stdout.
    # Alignments whose query is chrUn_random are filtered out before chaining.
    cat << '_EOF_' > doChain
#!/bin/csh
    axtFilter -notQ=chrUn_random $1 | axtChain stdin \
	/iscratch/i/gs.17/build34/bothMaskedNibs \
	/iscratch/i/mm3.RM030619/mixedNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    # out/ and chain/ were already created right after entering run1;
    # -p keeps this repeat from failing with "File exists".
    mkdir -p out chain

    # 41 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs:      31379s     522.98m     8.72h    0.36d  0.001 y
# IO & Wait Time:                 10761s     179.35m     2.99h    0.12d  0.000 y
# Average job time:                1028s      17.13m     0.29h    0.01d
# Longest job:                    10327s     172.12m     2.87h    0.12d
# Submission to last job:         10327s     172.12m     2.87h    0.12d

    # now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # these steps take ~20 minutes
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainMm3 $i
        echo done $c
    end

# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)

    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
                        /cluster/data/mm3/chrom.sizes ../preNet/$i
    end
    # This foreach loop will take about 15 min to execute.

    cd ..
    mkdir n1 
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
                            /cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 2490523648, utime 15421 s/100, stime 3665

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    ~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman

    # If things look good do
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg16 netMm3 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin

    # Add entries for net and chain to human/hg16 trackDb


# MAKE HUMAN-MOUSE MM3 OVER.CHAIN FOR LIFTOVER (2004-07-09 kate)
    ssh eieio
    set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
    # Uncompress the per-chromosome nets in place: makeSubset.csh below reads
    # $chainDir/mouseNet/$c.net uncompressed (the chains stay gzipped and are
    # streamed with gunzip -c).  Without this cd the gunzip would act on
    # whatever directory the login shell started in.
    cd $chainDir/mouseNet
    gunzip *.gz

    ssh kolossus
    set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
    cd $chainDir
    mkdir subset
cat > makeSubset.csh << 'EOF'
    set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
    foreach f ($chainDir/chain/*.chain.gz)
      set c = $f:t:r:r
      echo subsetting $c
      gunzip -c $f | netChainSubset $chainDir/mouseNet/$c.net stdin \
                        subset/$c.chain
    end
'EOF'
# << for emacs
    csh makeSubset.csh >&! makeSubset.log &
    tail -100f makeSubset.log
    cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain

    # test reciprocal best on chr6 for ENr233
    ssh kkstore
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain

    # renumber chains to assure unique ID's, 
    # since netting splits some (should redo the  liftOver chain with new ID's)
    # then sort by score for netter
    mkdir uniqueSubset
    chainMergeSort subset/chr6.chain > uniqueSubset/chr6.chain
    mkdir swappedSubset
    chainSwap uniqueSubset/chr6.chain swappedSubset/chr6.chain

    mkdir recipBestTest
    cd recipBestTest
    chainSort ../uniqueSubset/chr6.chain stdout | \
        chainNet stdin \
                /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes\
                    human.chr6.net mouse.chr6.net
    netChainSubset mouse.chr6.net ../swappedSubset/chr6.chain stdout | \
        chainSwap stdin chr6.chain
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/recipBestTest
    hgLoadChain hg16 rBestChainMm3 chr6.chain
    # didn't filter enuff -- perhaps didn't rechain in proper direction


#  BLASTZ RAT  (DONE - 2003-08-07 - Hiram)

    ssh eieio
    mkdir -p /cluster/data/hg16/bed/blastz.rn3
    cd /cluster/data/hg16/bed/blastz.rn3

    cat << '_EOF_' > DEF
# rat vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store4/gs.17/build34/bed/blastz.rn3

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    DS=`date -I`
    cp DEF ~angie/hummus/DEF.rn3-hg16.$DS

    ssh kk
    cd /cluster/data/hg16/bed/blastz.rn3

    # source the DEF file to establish environment for following commands
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j

    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run

    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2

    # make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
	# 39663 jobs
    para try
    para check
    para push
    # ... etc ...
# Completed: 41697 of 41697 jobs
# CPU time in finished jobs:   14155946s  235932.43m  3932.21h  163.84d  0.449 y
# IO & Wait Time:               1005629s   16760.49m   279.34h   11.64d  0.032 y
# Average job time:                 364s       6.06m     0.10h    0.00d
# Longest job:                     4310s      71.83m     1.20h    0.05d
# Submission to last job:         35086s     584.77m     9.75h    0.41d


    # post-process blastz
    ssh kk
    cd /cluster/data/hg16/bed/blastz.rn3
    #   source the DEF file again in case you are coming back to this
    #	(must be bash shell)

    . ./DEF
    
    # a new run directory
    mkdir -p run.1
    
    mkdir -p $BASE/lav
    
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
                        > run.1/jobList
    cd run.1

    # make sure the job list is OK
    wc -l jobList
       # 339 jobs 
    head jobList

    # run on cluster
    ssh kk
    cd /cluster/data/hg16/bed/blastz.rn3/run.1
    para create jobList
    para try
    para check
    para push
    # etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs:       6562s     109.37m     1.82h    0.08d  0.000 y
# IO & Wait Time:                154475s    2574.58m    42.91h    1.79d  0.005 y
# Average job time:                 475s       7.92m     0.13h    0.01d
# Longest job:                      924s      15.40m     0.26h    0.01d
# Submission to last job:           933s      15.55m     0.26h    0.01d

    # convert lav files to axt
    ssh kk
    cd /cluster/data/hg16/bed/blastz.rn3
    mkdir axtChrom
    
    # a new run directory
    mkdir run.2
    cd run.2

    # create template file for gensub2
    # usage:  blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.rn3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.rn3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1S /cluster/store4/gs.17/build34/bed/blastz.rn3/lav > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 42 jobs
    head jobList

    para create jobList
    para try
    para check
    para push
    # ... etc ...
    #  The crashed job is again chr19_random
# Completed: 41 of 42 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       1507s      25.12m     0.42h    0.02d  0.000 y
# IO & Wait Time:                 17520s     292.00m     4.87h    0.20d  0.001 y
# Average job time:                 464s       7.73m     0.13h    0.01d
# Longest job:                     1214s      20.23m     0.34h    0.01d
# Submission to last job:          1214s      20.23m     0.34h    0.01d

    #	Remove the empty axtChrom/chr19_random.axt file to avoid future
    #	processing errors

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3
    mkdir -p pslChrom
    set tbl = "blastzRn3"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	That takes about 20 minutes

    # Load database tables
    ssh hgwdev
    set tbl = "blastzRn3"
    cd /cluster/data/hg16/bed/blastz.rn3/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
    # This takes 30 minutes to an hour
    #	New entry in human/hg16/trackDb.ra
#	track blastzRn3
#	shortLabel Rat Blastz
#	longLabel Merged Blastz Rat (June 03) Alignments
#	group compGeno
#	priority 142
#	visibility hide
#	color 100,50,0
#	altColor 255,240,200
#	spectrum on
#	type psl xeno rn3
#	otherDb rn3

# MAKE BLASTZ BEST RAT RN3 (DONE - 2003-08-08 - Hiram - Redone 08-26)

    # IMPORTANT NOTE - this axtBest process has been replaced by
    #	the chain -> net -> netToAxt process.  So, after chains and
    #	nets have been created, pick up this best process below.

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtChrom
    mkdir -p /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom
    
    # copy chrom axt's to bluearc, to avoid hitting fileserver too hard
    cp -p *.axt /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom

    ssh kk
    cd /cluster/data/hg16/bed/blastz.rn3
    mkdir -p axtBest pslBest
    mkdir run.3
    cd run.3

    # create script to filter files 
    cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.rn3/S1.len \
	/cluster/data/hg16/bed/blastz.rn3/S2.len $4
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doBestAxt
    cd ../axtChrom
    ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
    cd ../run.3

    # create template for cluster job
    cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/axtBest/$(root1).axt} {check out line+  /cluster/data/hg16/bed/blastz.rn3/pslBest/$(root1)_blastzBestRn3.psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 41 jobs
    head jobList

    cd /cluster/data/hg16/bed/blastz.rn3
    cd run.3
    para create jobList
    para try
    para check
    para push
        # 106 minutes, almost all I/O time:
# Completed: 41 of 41 jobs
# CPU time in finished jobs:       2225s      37.09m     0.62h    0.03d  0.000 y
# IO & Wait Time:                 36349s     605.81m    10.10h    0.42d  0.001 y
# Average job time:                 941s      15.68m     0.26h    0.01d
# Longest job:                     6415s     106.92m     1.78h    0.07d
# Submission to last job:          6417s     106.95m     1.78h    0.07d

    #	Better yet, Jim says to be consistent, do all the chroms in
    #	this manner:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain
    mkdir ratNet
    netSplit rat.net ratNet

    mkdir ../axtNet
    foreach n (ratNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt -maxGap=25 ratNet/$c.net chain/$c.chain \
		/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
		/cluster/bluearc/rat/rn3/softNib \
		../axtNet/$c.axt
	echo "Complete: $c.net -> $c.axt"
    end

    mkdir -p /cluster/data/hg16/bed/blastz.rn3/axtBest
    cd /cluster/data/hg16/bed/blastz.rn3/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area (DONE 2003-09-24 kate)
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtNet
    gzip *.axt
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet
    cd /cluster/data/hg16/bed/blastz.rn3/axtNet
    cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet
    # add README.txt file to dir, if needed

    #  Convert those axt files to psl.
    #  Each axtBest/chrN.axt becomes pslBest/chrN_blastzBestRn3.psl, using the
    #  S1.len / S2.len size files in the blastz.rn3 directory.
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3
    # pslBest was already created (mkdir -p axtBest pslBest) for the earlier
    # axtBest cluster run; -p avoids a "File exists" failure here.
    mkdir -p pslBest
    foreach a (axtBest/chr*.axt)
        set c=$a:t:r
        echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
        /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
            S1.len S2.len pslBest/${c}_blastzBestRn3.psl
        echo "Done: ${c}_blastzBestRn3.psl"
    end

    # Load tables
     ssh hgwdev
     cd /cluster/data/hg16/bed/blastz.rn3/pslBest
     /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestRn3.psl

     # check results
#  Via the netToAxt process:
#    featureBits hg16 blastzBestRn3
#    976121391 bases of 2865248791 (34.068%) in intersection
#  With the original axtBest process, before the netToAxt process:
#    featureBits hg16 blastzBestRn3
#    1002119325 bases of 2865248791 (34.975%) in intersection

#  Hg15 results:
#    featureBits hg15 blastzBestRn3
#    992724355 bases of 2866466359 (34.632%) in intersection

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg16/axtBestRn3
     cd /gbdb/hg16/axtBestRn3
     ln -s /cluster/data/hg16/bed/blastz.rn3/axtNet/chr*.axt .

     cd /cluster/data/hg16/bed/blastz.rn3/axtNet
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg16/axtBestRn3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     #	Already done above.  This table needs definition only once
     # hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg16 < axtInfoInserts.sql


# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2003-08-26 - Hiram)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtNet
    mkdir -p ../axtTight
    tcsh
    foreach i (*.axt)
      echo $i
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end

    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
      echo "Done: $i"
    end

    # Load tables into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.rn3/pslTight
    hgLoadPsl -noTNameIx hg16 chr*_blastzTightRn3.psl

    # copy  axt's to download area (DONE 2003-09-24 kate)
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtTight
    gzip *.axt
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
    cd /cluster/data/hg16/bed/blastz.rn3/axtTight
    cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
    # add README.txt file to dir, if needed

# CHAIN RAT BLASTZ (DONE 2003-08-08 - Hiram)

# Run axtChain on little cluster
    ssh kkr1u00
    cd /cluster/data/hg16/bed/blastz.rn3
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain

    ls -1S /cluster/data/hg16/bed/blastz.rn3/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

#    axtFilter -notQ=chrUn_random $1 | axtChain stdin

    # Write the per-chromosome job script used by the gsub template above.
    # Arguments, as supplied by that template:
    #   $1 = input axt file, $2 = output chain file, $3 = captured stdout.
    cat << '_EOF_' > doChain
#!/bin/sh
axtFilter $1 | axtChain stdin \
	/iscratch/i/gs.17/build34/bothMaskedNibs \
	/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    # out/ and chain/ were already created right after entering run1;
    # -p keeps this repeat from failing with "File exists".
    mkdir -p out chain

    # 41 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
    # With only 6 CPUs available:
# Completed: 40 of 40 jobs
# CPU time in finished jobs:      21791s     363.19m     6.05h    0.25d  0.001 y
# IO & Wait Time:                 12491s     208.18m     3.47h    0.14d  0.000 y
# Average job time:                 857s      14.28m     0.24h    0.01d
# Longest job:                     2724s      45.40m     0.76h    0.03d
# Submission to last job:          5875s      97.92m     1.63h    0.07d

    # now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain
    /cluster/bin/i386/chainMergeSort run1/chain/*.chain > all.chain
    /cluster/bin/i386/chainSplit chain all.chain
    # these steps take ~20 minutes
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainRn3 $i
        echo done $c
    end

# CREATE bigZips stuff for release (DONE 2003-08-01, 08-06, 08-08 - Hiram)

    # make bigZips/mrna.zip (markd 8 aug 2003)
    # on hgbeta: 
    cd /genbank
    ./bin/i386/gbGetSeqs -get=seq -db=hg16 -native genbank mrna download/hg16/bigZips/mrna.fa 
    zip download/hg16/bigZips/mrna.zip  download/hg16/bigZips/mrna.fa 
    rm download/hg16/bigZips/mrna.fa 

    ssh hgwdev
    #  This stuff has to work in a different way because this stuff
    #	updates on a daily basis. - (DONE 2003-08-09 - Hiram)
    cd /usr/local/apache/htdocs/goldenPath/hg16/bigZips
    featureBits hg16 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    rm upstream1000.fa
    featureBits hg16 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    rm upstream2000.fa
    featureBits hg16 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa
    rm upstream5000.fa

# MAKING MOUSE AND RAT SYNTENY (MOUSE done 2003-09-16)(RAT Done 2003-08-28)

ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenyMm3
cd /cluster/data/hg16/bed/syntenyMm3
# Copy all the needed scripts from /cluster/data/hg15/bed/syntenyMouse
cp -p /cluster/data/hg15/bed/syntenyMouse/*.pl .

./syntenicBest.pl -db=hg16 -table=blastzBestMm3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg16 -table=blastzBestMm3
./synteny2bed.pl
hgLoadBed hg16 syntenyMm3 ucsc100k.bed

#	And for the Rat, same thing, different directory:
mkdir ../syntenyRn3
cd ../syntenyRn3
../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestRn3
#	smooth.pl overwrites genomeBest2phase created by the previous
#	run of this above.  Runs quickly.
../syntenyMm3/smooth.pl
# joinsmallgaps.pl overwrites genomeBest3phase created above. Runs quickly.
../syntenyMm3/joinsmallgaps.pl
# fillgap.pl creates genomeBestFinal
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestRn3
# synteny2bed.pl creates ucsc100k.bed
../syntenyMm3/synteny2bed.pl
hgLoadBed hg16 syntenyRn3 ucsc100k.bed
# Loaded 1537 elements

# NET RAT BLASTZ (WORKING - 2003-08-11 - Hiram)

    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
                        /cluster/data/rn3/chrom.sizes ../preNet/$i
    end
    # This foreach loop will take about 15 min to execute.

    cd ..
    mkdir n1 
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
                            /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 2511495168, utime 15658 s/100, stime 3383

    # The netClass operations requires an "ancientRepeat" table to exist
    # in either hg16 or rn3.  So, create the table:

    ssh hgwdev
    mkdir -p /cluster/data/hg16/bed/ancientRepeat
    cd /cluster/data/hg16/bed/ancientRepeat
    # mysqldump needs write permission to this directory
    # and you need to use your read/write enabled user with password
    chmod 777 .
    hgsqldump --all --tab=. hg15 ancientRepeat
    chmod 775 .
    hgsql hg16 < ancientRepeat.sql
    mysqlimport -u<r/w user> -p<r/w pass> hg16 ancientRepeat.txt
    # This is a hand curated table obtained from Arian.

	
    #	The mouse.net argument here should have been rat.net
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain
    /cluster/bin/i386/netClass hNoClass.net hg16 rn3 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInRat \
	-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman

    # If things look good do
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain
    rm -r n1 hNoClass.net
    #	The arguments here should have been rat.net and ratSyn.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net

    # The mouse.net argument here should have been rat.net from the
    # netClass operation above.

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg16 netRn3 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetRn3 stdin

    # Add entries for net and chain to human/hg16 trackDb


# MAKE HUMAN-RAT OVER.CHAIN FOR LIFTOVER (2004-07-09 kate)
    ssh eieio
    set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
    cd $chainDir/ratNet
    gunzip *.gz

    ssh kolossus
    cd /cluster/data/hg16/bed/liftOver
    mkdir hg16Torn3
    cd hg16Torn3
    set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
    mkdir subset
cat > makeSubset.csh << 'EOF'
    set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
    foreach f ($chainDir/chain/*.chain)
      set c = $f:t:r:r
      echo subsetting $c
      netChainSubset $chainDir/ratNet/$c.net $f subset/$c.chain
    end
'EOF'
# << for emacs
    csh makeSubset.csh >&! makeSubset.log &
    tail -100f makeSubset.log
    cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Torn3.chain


# Make Known Genes Track

#  This task has many steps and currently it is described by two documents:
#
#	1. makeProteins072003.doc
#
#	   describes how the protein databases, biosql072003 and proteins072003,
#	   were built
#
#	2. makeKgHg16.doc
#
#	   describes how the Known Genes related database tables
#	   were built for hg16.  makeKgHg16.doc could be merged
#	   with makeHg16.doc after minor editing of the format style.

# LIFTING REPEATMASKER .ALIGN FILES

# for this work, I had to delete some comments that were in the .align files.
# The edited files were
#   NT_008046_01.fa.align   (around line 10586)
#   NT_008046_75.fa.align   (around line 3320)
# The lines I deleted are:
#
# These elements can be clipped out with the options is_clip or is_only.
# The latter does not run the 'normal' RepeatMasker routine and positions in the current
# .out file will not correspond with the -is_only reconstructed sequence.
#
foreach d (?{,?}/NT_??????)
  set c=$d:t
  cd $d
  echo $c to $c.fa.align
  /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
  cd ../..
end

foreach chr (?{,?})
  cd $chr
  echo making symbolic links for chr$chr NT .fa.align files
  foreach ctg (NT_??????)
    ln -s $ctg/$ctg.fa.align
  end
  cd ..
  if (-e $chr/lift/ordered.lft) then
    echo making $chr/chr$chr.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
      > $chr/chr$chr.fa.align
  endif
  if (-e $chr/lift/random.lft) then
    echo making $chr/chr${chr}_random.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
      > $chr/chr${chr}_random.fa.align
  endif
  echo removing symbolic links for chr$chr NT .fa.align files
  rm $chr/NT_??????.fa.align
end


# TWINSCAN 1.3 GENE PREDICTIONS (2003-12-12 braney)

    # Fetch the Twinscan 1.3 (pseudogene-masked) predictions into a dated
    # directory, verify the download, then load gene models and peptides.
    cd /cluster/data/hg16/bed
    rm -fr twinscan
    mkdir twinscan.2003-12-12
    ln -s twinscan.2003-12-12 twinscan
    cd twinscan
    # csh-style assignment: the foreach loop below requires csh/tcsh, where
    # a Bourne-style "tarFile=..." would not define the variable.
    set tarFile = Hs-NCBI34-TS13-pseudo-masked.tgz
    wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/$tarFile
    wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/md5sum.txt

    # check file transferred correctly (no diff output means the sums match)
    md5sum $tarFile | diff - md5sum.txt
    tar xvfz $tarFile
    unset tarFile

    # pare down protein FASTA header to id and add missing .a:
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
      echo chr$c
      perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
    end
    ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
    hgPepPred hg16 generic twinscanPep chr_ptx/chr*-fixed.fa

# RAW TWINSCAN 1.3 GENE PREDICTIONS, WITHOUT FILTERING OF PSEUDOGENES
# (2004-01-11 acs)

    # Fetch the unfiltered Twinscan 1.3 predictions into a dated directory,
    # verify the download, then load gene models and peptides.
    cd /cluster/data/hg16/bed
    mkdir twinscan_raw.2004-01-11
    # The link must point at the directory just created; the previous target
    # (twinscan.2004-01-11) does not exist, so "cd twinscan_raw" would fail.
    ln -s twinscan_raw.2004-01-11 twinscan_raw
    cd twinscan_raw
    # csh-style assignment: the foreach loop below requires csh/tcsh, where
    # a Bourne-style "tarFile=..." would not define the variable.
    set tarFile = NCBI34_Hs_TS13_11_11_03.tgz
    wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/$tarFile
    wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/md5sum.txt

    # check file transferred correctly (no diff output means the sums match)
    md5sum $tarFile | diff - md5sum.txt
    tar xvfz $tarFile
    unset tarFile

    # pare down protein FASTA header to id and add missing .a:
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
      echo chr$c
      perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
    end
    ldHgGene hg16 twinscan_raw chr_gtf/chr*.gtf -gtf
    hgPepPred hg16 generic twinscanrawPep chr_ptx/chr*-fixed.fa

# LOAD GENEID GENES (DONE - 2003-09-02 - Hiram RELOADED -gtf 2004-04-02 kate)
    mkdir -p /cluster/data/hg16/bed/geneid/download
    cd /cluster/data/hg16/bed/geneid/download

    # Now download *.gtf and *.prot from 
    set dir = genome.imim.es/genepredictions/H.sapiens/golden_path_200307/geneid_v1.1/
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un)
      wget http://$dir/chr$c.gtf
      wget http://$dir/chr${c}_random.gtf
      wget http://$dir/chr$c.prot
      wget http://$dir/chr${c}_random.prot
    end
    wget http://$dir/readme
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
      echo "done $f"
    end
    cd ..
    ldHgGene hg16 geneid download/*.gtf -gtf
#	Read 32255 transcripts in 281180 lines in 40 files
#	  32255 groups 40 seqs 1 sources 3 feature types
#	32255 gene predictions
    hgPepPred hg16 generic geneidPep download/*-fixed.prot

# QA NOTE: [ASZ 2007-10-02] sudo mytouch hg16 geneidPep 200404021400.00


# HUMAN/MOUSE/RAT ALIGNMENT USING HUMOR(MULTIZ) (IN PROGRESS 2003-08-29 kate)
# Multiple alignment with Mm3, Rn3

    ssh eieio

    # make mouse axtNet300
    cd /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseNet
    mkdir -p ../../axtNet300
    foreach f (chr*.net)
        set c = $f:r
        echo "mouse axtNet300 on $c"
        netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../../axtNet300/$c.axt
    end

    # make rat axtNet300
    cd /cluster/data/hg16/bed/blastz.rn3/axtChain/ratNet
    mkdir -p ../../axtNet300
    foreach f (chr*.net)
        set c = $f:r
        echo "rat axtNet300 on $c"
        netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/rn3/nib ../../axtNet300/$c.axt
    end

    # create 2-way maf files
    #set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
    set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
    mkdir -p $multizDir/maf
    cd /cluster/data/hg16
    set mouseDir = bed/blastz.mm3/axtNet300
    set ratDir = bed/blastz.rn3/axtNet300
    foreach c (`cut -f 1 chrom.sizes`)
        echo "making mouse mafs on $c"
        # NOTE: this sort should probably be earlier in the pipeline
        axtSort $mouseDir/$c.axt $mouseDir/$c.axt.sorted
        mv $mouseDir/$c.axt.sorted $mouseDir/$c.axt 
        axtToMaf $mouseDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes $multizDir/maf/$c.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
        /cluster/bin/scripts/fixmaf.pl \
                < $multizDir/maf/$c.mm3.maf.unfixed > $multizDir/maf/$c.mm3.maf

        echo "making rat mafs on $c"
        axtSort $ratDir/$c.axt $ratDir/$c.axt.sorted
        mv $ratDir/$c.axt.sorted $ratDir/$c.axt
        axtToMaf $ratDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/rn3/chrom.sizes $multizDir/maf/$c.rn3.maf.unfixed -tPrefix=hg16. -qPrefix=rn3.
        /cluster/bin/scripts/fixmaf.pl \
                < $multizDir/maf/$c.rn3.maf.unfixed > $multizDir/maf/$c.rn3.maf
        rm $multizDir/maf/*.unfixed
    end


    # copy the pairwise maf files to bluearc for the cluster run
    # (doHumor.kk below reads them from $clusterDir/blastz.{mm3,rn3}/mafNet300)
    set clusterDir = /cluster/bluearc/hg16/bed
    # -p: create missing parents and do not fail if this step is re-run
    mkdir -p $clusterDir/blastz.mm3/mafNet300
    cp $multizDir/maf/*.mm3.maf $clusterDir/blastz.mm3/mafNet300
    mkdir -p $clusterDir/blastz.rn3/mafNet300
    cp $multizDir/maf/*.rn3.maf $clusterDir/blastz.rn3/mafNet300

    # create scripts to run on cluster
    # run "humor"
    cd $multizDir
    mkdir hmr
    mkdir run
    cd run
cat << EOF > doHumor.kk
/cluster/bin/penn/humor.v4 $clusterDir/blastz.mm3/mafNet300/\$1.mm3.maf $clusterDir/blastz.rn3/mafNet300/\$1.rn3.maf > $multizDir/hmr/\$1.hmr.maf
EOF
    chmod +x doHumor.kk
    
cat << EOF > gsub 
#LOOP
doHumor.kk \$(root1) {check out line+ $multizDir/hmr/\$(root1).hmr.maf}
#ENDLOOP
EOF
    cd $clusterDir/blastz.mm3/mafNet300
    # NOTE: probably want a better way to make the chrom list 
    ls *.maf | awk -F. '{print $1}' > $multizDir/run/chrom.list
    cd $multizDir/run
    gensub2 chrom.list single gsub jobList

    # run jobs
    ssh kkr9u01
    #set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
    set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
    cd $multizDir/run
    para create jobList
    para try
    para check
    para push
    # longest job 27 minutes

    # setup external files for database reference
    ssh hgwdev
    mkdir -p /gbdb/hg16/humorMm3Rn3
    cd /gbdb/hg16/humorMm3Rn3
    foreach f ($multizDir/hmr/*.maf)
        ln -s $f .
    end

    # load into database
    #cd $multizDir/hmr/*.maf
    /cluster/bin/i386/hgLoadMaf -warn hg16 humorMm3Rn3

    # copy files to download area (2003-10-24 kate)
    set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
    mkdir -p $dir
    cp -p /gbdb/hg16/humorMm3Rn3/*.maf $dir
    cd $dir
    gzip *
    # edit downloads page to add link to humorMm3Rn3

    # add pairwise mafs to downloads page (2003-11-25 kate)
    set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
    mkdir $dir/{rn3,mm3}
    cd /cluster/data/hg16/bed/humor/maf
    cp *.mm3.maf $dir/mm3
    cp *.rn3.maf $dir/rn3
    gzip $dir/mm3/*
    gzip $dir/rn3/*

    # Create upstream files (kent)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
    echo hg16 mm3 rn3 > org.txt
    foreach i (1000 2000 5000)
    featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    mafFrags hg16 humorMm3Rn3 up.bed upstream$i.maf -orgs=org.txt
    rm up.bed
    end


#  MAKING BLASTZ SELF (DONE - 2003-08-08 - Hiram)

    # The procedure for lineage spec business with self is to simply
    # use the actual repeat masker output for this human assembly as
    # the lineage specific repeats for itself.  Thus, merely make
    # symlinks to the repeat masker out files and name them as expected
    # for blastz.  In this case they are called notInHuman but they
    # really mean InHuman.  Yes, it is confusing, but that's just the
    # nature of the game in this case.

    ssh eieio
    mkdir -p /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
    cd /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
    foreach f (../rmsk/*.fa.out)
	set base = $f:t:r:r
	echo $base.out.spec
	ln -s $f $base.out.spec
    end

    ssh eieio
    mkdir -p /cluster/data/hg16/bed/blastzSelf
    cd /cluster/data/hg16/bed/blastzSelf

    cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/store4/gs.17/build34/bed/blastzSelf

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    DS=`date -I`
    cp DEF ~angie/hummus/DEF.hg16-hg16.$DS

    ssh kk
    cd /cluster/data/hg16/bed/blastzSelf

    # source the DEF file to establish environment for following commands
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j

    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run

    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    # 114921 j
    head j2

    # make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
	# 114921 jobs
    para try
    para check
    para push
    # ... etc ...
    #  With some cluster difficulties, bluearc hangups, etc:
#Completed: 114921 of 114921 jobs
#CPU time in finished jobs:   19898031s  331633.85m  5527.23h  230.30d  0.631 y
#IO & Wait Time:              42606494s  710108.24m 11835.14h  493.13d  1.351 y
#Average job time:                 544s       9.06m     0.15h    0.01d
#Longest job:                   111877s    1864.62m    31.08h    1.29d
#Submission to last job:        344744s    5745.73m    95.76h    3.99d

    # post-process blastz
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf
    #   source the DEF file again in case you are coming back to this
    #	(must be bash shell)

    . ./DEF
    
    # a new run directory
    mkdir -p run.1
    
    mkdir -p $BASE/lav
    
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
                        > run.1/jobList
    cd run.1

    # make sure the job list is OK
    wc -l jobList
       # 339 jobs 
    head jobList

    # run on cluster
    ssh kk
    cd /cluster/data/hg16/bed/blastzSelf/run.1
    para create jobList
    para try
    para check
    para push
    # etc.
#Completed: 339 of 339 jobs
#CPU time in finished jobs:      21101s     351.68m     5.86h    0.24d  0.001 y
#IO & Wait Time:                 74915s    1248.58m    20.81h    0.87d  0.002 y
#Average job time:                 283s       4.72m     0.08h    0.00d
#Longest job:                     2028s      33.80m     0.56h    0.02d
#Submission to last job:          2993s      49.88m     0.83h    0.03d

    # convert lav files to axt
    ssh kk
    cd /cluster/data/hg16/bed/blastzSelf
    mkdir axtChrom
    
    # a new run directory
    mkdir run.2
    cd run.2

    # create template file for gensub2
    # usage:  blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastzSelf/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/gs.17/build34/bothMaskedNibs
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1S /cluster/store4/gs.17/build34/bed/blastzSelf/lav > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 42 jobs
    head jobList

    cd /cluster/data/hg16/bed/blastzSelf/run.2
    para create jobList
    para try
    para check
    para push
    # We have two crashed jobs here.  The data for chr7 and chr19 is
    # too much for the processing.  Have to run those separately on
    #	the file server eieio.
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#CPU time in finished jobs:       4737s      78.95m     1.32h    0.05d  0.000 y
#IO & Wait Time:                 57154s     952.57m    15.88h    0.66d  0.002 y
#Average job time:                1547s      25.79m     0.43h    0.02d
#Longest job:                     7969s     132.82m     2.21h    0.09d
#Submission to last job:          8029s     133.82m     2.23h    0.09d
    # Fixup chr7 and chr19 by running them in two passes like this:
    #   pass 1: convert each .lav piece of the chrom to its own small,
    #           self-dropped, sorted .axt file
    #   pass 2: concatenate the small .axt files into axtChrom/<chr>.axt
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf
    # base is used to write the result back under blastzSelf after the pushd
    set base=/cluster/data/hg16/bed/blastzSelf
    # target and query nib directories are the same (self alignment)
    set seq1_dir=/cluster/data/hg16/nib
    set seq2_dir=/cluster/data/hg16/nib
    foreach c (lav/chr19 lav/chr7)
      # work inside the chrom's lav directory so *.lav matches only its pieces
      pushd $c
      # $c:t is the last path component, i.e. chr19 or chr7
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      foreach d (*.lav)
        # each piece is written next to its .lav file as <piece>.lav.axt
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
      end
      # join the per-piece files, in `sort -g` order of their names,
      # into the chrom-level axt
      cat `ls -1 *.lav.axt | sort -g` > $base/$out
      popd
    end

    # drop overlapping alignments from the sorted chrom axt files
    # (the translation into psl is done in the step that follows)
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf
    #  Need to drop overlaps to eliminate diagonals
    #	DropOverlap seems to drop more than axtDropSelf above
    #  The output directory on bluearc must exist before the loop writes to it
    mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
	# (a former "mv axtChrom/$c.axt axtChrom/$c.axt" here moved the file
	#  onto itself, which only produces an error, so it has been removed)
	/cluster/bin/i386/axtDropOverlap axtChrom/$c.axt \
		/cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/$c.axt
	echo "Done: $c"
    end
    cd /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
    gzip *.axt
    #  Needed a delivery of these right away:  (REMOVED 2005-01-27)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
    cp -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt.gz .
	
    ssh eieio
    mkdir -p /cluster/data/hg16/bed/blastzSelf/pslChrom
    cd /cluster/data/hg16/bed/blastzSelf
    set tbl = "blastzSelf"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      zcat /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
	/cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	That takes about 20 minutes

# XXXX Pick this up tomorrow, 03-09-12 with pslChromDroppedFix
    # Load database tables
    ssh hgwdev
    set tbl = "blastzSelf"
    cd /cluster/data/hg16/bed/blastzSelf/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzSelf.psl
    # This takes 30 minutes to an hour
    # create trackDb/human/hg16 and get a trackDb.ra file started with:

    # remake trackDb tables


# PRODUCE FUGU BLAT ALIGNMENT (IN PROGRESS 2003-08-22 kate)

    # Use masked scaffolds from fr1 assembly (same sequence as
    # previous BlatFugu, however it's repeat and TRF-masked).

    # NOTE: can't access /iscratch/i from fileserver
    ssh kk
    mkdir /cluster/data/hg16/bed/blatFr1
    cd /cluster/data/hg16/bed/blatFr1
    mkdir psl 
    # next time, use N?_?????? (to pick up NG_ contigs)
    foreach f (/cluster/data/hg16/?{,?}/NT_??????/NT_??????.fa)
      set c=$f:t:r
      echo $c
      mkdir -p psl/$c
    end
    # special case for NG_002432
    mkdir -p psl/NG_002432

    # create cluster job
    # blatFr1 was just created and only psl/ exists in it so far,
    # so make the run directory before changing into it
    mkdir -p run
    cd run
    # file lists for gensub2, largest files first (ls -S)
    ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
    ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg16/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
    # << this line makes emacs coloring happy
    gensub2 human.lst fugu.lst gsub spec
    para create spec
       # 283798 jobs  
    para try
    para check
    para push
    para check
        # cd psl
        # count files with alignments
        # find . -not -size 427c | wc -l
        # 89878
        # count files with no alignments
        # find . -size 427c | wc -l
        # 195265

   # When cluster run is done, sort alignments
   # into chrom directory
    ssh eieio
    cd /cluster/data/hg16/bed/blatFr1
    pslCat -dir psl/N?_?????? | \
      liftUp -type=.psl stdout \
        /cluster/data/hg16/jkStuff/liftAll.lft warn stdin | \
      pslSortAcc nohead chrom temp stdin
        # 15 minutes ?
        # Processed 855648 lines into 4 temp files

    # Rename to correspond with tables and load into database:
    ssh hgwdev
    cd /cluster/data/hg16/bed/blatFr1/chrom
    rm -f chr*_blatFr1.psl
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        echo $r
        mv $i ${r}_blatFr1.psl
    end

    # Next assembly, lift fugu scaffolds to Fugu browser chrUn,
    # so you can link to other browser.  And don't need to load sequence
    # liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl

    hgLoadPsl -noTNameIx hg16 *.psl
        # $ featureBits hg16 blatFr1 refGene:CDS
        # 12787423 bases of 2865248791 (0.446%) in intersection
        # $ featureBits hg15 blatFugu refGene:CDS
        # 12427544 bases of 2866466359 (0.434%) in intersection

    # Edit trackDb.ra to include blatFr1
    # NOTE: already in top-level trackDb.ra

    # Make fugu /gbdb/ symlink and load Fugu sequence data.
    # NOTE: don't need to do this in next assembly
    mkdir /gbdb/hg16/fuguSeq
    cd /gbdb/hg16/fuguSeq
    ln -s /cluster/data/fr1/fugu_v3.masked.fa
    # hide .tab file
    cd /cluster/store2/tmp
    hgLoadSeq hg16 /gbdb/hg16/fuguSeq/fugu_v3.masked.fa


# MAKE BLASTZ BEST SELF (RE-DONE - 2003-08-28 - Hiram)

    #	Pick up on this process below after chain and nets have been
    #	done.  This run.3 business is obsolete

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChrom
    mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChrom
    
    # copy chrom axt's to bluearc, to avoid hitting fileserver too hard
    cp -p *.axt /cluster/bluearc/hg16/bed/blastzSelf/axtChrom

    ssh kk
    cd /cluster/data/hg16/bed/blastzSelf
    mkdir -p axtBest pslBest
    mkdir run.3
    cd run.3

    # create script to filter files 
    cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastzSelf/S1.len \
	/cluster/data/hg16/bed/blastzSelf/S2.len $4
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doBestAxt
    cd ../axtChrom
    ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
    cd ../run.3

    # create template for cluster job
    cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastzSelf/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/axtBest/$(root1).axt} {check out line+  /cluster/data/hg16/bed/blastzSelf/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 42 jobs
    head jobList

    ssh kkr1u00
    cd /cluster/data/hg16/bed/blastzSelf/run.3
    para create jobList
    para try
    para check
    para push
#Completed: 38 of 42 jobs
#Crashed: 4 jobs
#CPU time in finished jobs:       1884s      31.41m     0.52h    0.02d  0.000 y
#IO & Wait Time:                  8421s     140.34m     2.34h    0.10d  0.000 y
#Average job time:                 271s       4.52m     0.08h    0.00d
#Longest job:                     2061s      34.35m     0.57h    0.02d
#Submission to last job:          2277s      37.95m     0.63h    0.03d
    #  Some of these files are getting too big for this operation
    # We will have to get back to these via the chains, nets and a
    # netToAxt trick

    # Problems:
#/cluster/data/hg16/bed/blastzSelf/axtBest/chr19.axt is empty
#/cluster/data/hg16/bed/blastzSelf/pslBest/chr19_blastzBestMm3.psl is empty
#Out of memory - request size 1564 bytes
#/cluster/data/hg16/bed/blastzSelf/axtBest/chr7.axt is empty
#/cluster/data/hg16/bed/blastzSelf/pslBest/chr7_blastzBestMm3.psl is empty
#Out of memory - request size 634045604 bytes
#/cluster/data/hg16/bed/blastzSelf/axtBest/chr1.axt is empty
#/cluster/data/hg16/bed/blastzSelf/pslBest/chr1_blastzBestMm3.psl is empty
#Out of memory - request size 984185908 bytes
#/cluster/data/hg16/bed/blastzSelf/axtBest/chr2.axt is empty
#/cluster/data/hg16/bed/blastzSelf/pslBest/chr2_blastzBestMm3.psl is empty
#Out of memory - request size 973662824 bytes

    #   Here is the replacement process for the above sequence
    #	Better yet, Jim says to be consistent, do all the chroms in
    #	this manner:
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChain
    mkdir humanNet
    mkdir ../axtNet
    netSplit human.net humanNet
    foreach n (humanNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt humanNet/$c.net chain/$c.chain \
		/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
		/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
		../axtNet/$c.axt
	echo "Complete: $c.net -> $c.axt"
    end

    mkdir -p /cluster/data/hg16/bed/blastzSelf/axtBest
    cd /cluster/data/hg16/bed/blastzSelf/axtBest
    ln -s ../axtNet/chr*.axt .

    #  Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
    /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	S1.len S2.len pslBest/${c}_blastzBestSelf.psl
	echo "Done: ${c}_blastzBestSelf.psl"
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/data/hg16/bed/blastzSelf"
     set tbl="blastzBestSelf"
     cd $base/pslBest
     /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl

    # check results
    #  After going through the chain->net->axt operation:
    #	featureBits hg16 blastzBestSelf
    #	1388295977 bases of 2865248791 (48.453%) in intersection

    # Hg15 doesn't have a BestSelf, gave this a try with the following
    # result:
    #	featureBits hg15 blastzSelf
    #	Out of memory - request size 6 bytes

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg16/axtBestSelf
     cd /gbdb/hg16/axtBestSelf
     ln -s /cluster/data/hg16/bed/blastzSelf/axtNet/chr*.axt .

     cd /cluster/data/hg16/bed/blastzSelf/axtNet
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg16/axtBestSelf/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('hg16','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     #  This table has already been created above
     # hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg16 < axtInfoInserts.sql

# MAKE BLASTZ BEST SELF (NOT NECESSARY - NOT USEFUL - NOT NEEDED - NOT DONE)

# MAKING CHAIN SELF BLASTZ (DONE - 2003-08-27 - Hiram)
# MAKING CHAIN SELF BLASTZ (RE-DONE - 2003-09-04 - Hiram)
#				2003-09-04 - with dropped overlap axtChrom

# Run axtChain on little cluster
    ssh kkr1u00
    mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1
    cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1
    mkdir out chain

  ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

#	The -notQ_random (new argument to axtFilter) will omit any
#	*_random from the query.

    cat << '_EOF_' > doChain
#!/bin/csh
~/bin/i386/axtFilter -notQ_random $1 | axtChain stdin \
	/iscratch/i/gs.17/build34/bothMaskedNibs \
	/iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    # out/ and chain/ were already made right after entering run1;
    # -p keeps this repeated mkdir from failing on the existing directories
    mkdir -p out chain

    gensub2 input.lst single gsub jobList
	# edit jobList and remove the first one that does chr19
	#	It is a job that would fail anyway after more than an
	#	hour of run time.  It will be done separately below
    para create jobList
    # 41 jobs
    para try
    para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs:      27107s     451.78m     7.53h    0.31d  0.001 y
# IO & Wait Time:                 16236s     270.60m     4.51h    0.19d  0.001 y
# Average job time:                1057s      17.62m     0.29h    0.01d
# Longest job:                     4989s      83.15m     1.39h    0.06d
# Submission to last job:        240988s    4016.47m    66.94h    2.79d

    #  The chr19 recovery process:
    ssh kk
    mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19
    cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19
    cat << '_EOF_' > gsubQ
#LOOP
doChainQ.sh $(path2) $(path1) {check out line+ chain/$(root1).$(path2).chain} {check out line+ out/$(root1).$(path2).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChainQ.sh
#!/bin/sh
~/bin/i386/axtFilter -notQ_random -q=$1 $2 | axtChain stdin \
	/cluster/store4/gs.17/build34/nib \
	/cluster/store4/gs.17/build34/nib $3 > $4
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x doChainQ.sh
    #  This is a mistake, this should have been chr19.axt only
  ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst
    pushd /cluster/data/hg16
    ls -d ?{,?} | sed -e "s/^/chr/" | grep -v chr19 \
	> /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19/chrom19.lst
    popd

    mkdir out chain
    gensub2 input.lst chrom19.lst gsubQ spec19
    para create spec19
    para try
    para check
    para push
    # ... etc ...
#Completed: 948 of 1050 jobs
#Crashed: 102 jobs
#CPU time in finished jobs:      45918s     765.30m    12.75h    0.53d  0.001 y
#IO & Wait Time:               1700328s   28338.80m   472.31h   19.68d  0.054 y
#Average job time:                1842s      30.70m     0.51h    0.02d
#Longest job:                    13247s     220.78m     3.68h    0.15d
#Submission to last job:         13268s     221.13m     3.69h    0.15d
    # the "crashed 102" jobs are empty chains.
    # This mistakenly did them all, the input.lst should have been
    #	chr19 only.
# So, copy the chr19 results to the ../run1/chain result location
cp -p chain/chr19*.chain ../run1/chain

    # now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # these steps take ~20 minutes
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainSelf $i
        echo done $c
    end

    #	DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
    # compress the per-chrom chain files in place on the file server
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
    gzip chr*.chain

    # copy the compressed chains into the vsSelf download directory
    ssh hgwdev
    # vsSelf was created earlier for the axt delivery, so use -p here
    # to succeed whether or not the directory is still present
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
    cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
    cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
    #	fixup README file, request push

# NET SELF BLASTZ (RE-DONE 2003-09-09 - DONE - 2003-08-27 - Hiram)

    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
                        /cluster/data/hg16/chrom.sizes ../preNet/$i
    end

    # This foreach loop will take about 15 min to execute.

    cd ..
    mkdir n1 
    cd preNet
#	Probably OK to make this minSpace=10, used to be 1
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=10 \
			/cluster/data/hg16/chrom.sizes \
                        /cluster/data/hg16/chrom.sizes ../n1/$n /dev/null
    end
    #  The above takes about 5 minutes

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    #	memory usage 200167424, utime 2489 s/100, stime 161

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastzSelf/axtChain
    ~/bin/i386/netClass hNoClass.net hg16 hg16 human.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman \
	-qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman

    # If things look good do
    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastzSelf/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg16 netSelf stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 syntenyNetSelf stdin

    # Add entries for net and chain to human/hg16 trackDb

# MAKING SELF AXTTIGHT FROM AXTCHROM (DONE - 2003-09-09 - Hiram)

    ssh eieio
    cd /cluster/data/hg16/bed/blastzSelf/axtChrom
    mkdir -p /cluster/data/hg16/bed/blastzSelf/axtTight
    tcsh

    foreach i (*.axt)
      echo $i
      subsetAxt  $i /cluster/data/hg16/bed/blastzSelf/axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/90.mat 5000
    end

    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightSelf.psl
      echo "Done: $i"
    end

    # Load tables into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastzSelf/pslTight
    hgLoadPsl -noTNameIx hg16 chr*_blastzTightSelf.psl

# MAKING SELF SYNTENY - Can be done after Best (NEEDS TO BE REDONE 2003-09-09)

ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenySelf
cd /cluster/data/hg16/bed/syntenySelf
# Use the scripts that were already copied to ../syntenyMm3

# The first one takes 3 to 4 hours.

../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestSelf > synBest.out 2>&1
# XXXX - Running 2003-08-27 21:32
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestSelf
../syntenyMm3/synteny2bed.pl
#    Load results
hgLoadBed hg16 syntenySelf ucsc100k.bed


# SGP GENE PREDICTIONS vs Mm4 (DONE - 2003-12-30 - Hiram)
    mkdir -p /cluster/data/hg16/bed/sgp_mm4/download
    cd /cluster/data/hg16/bed/sgp_mm4/download
    foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget --timestamping \
	http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.gtf
      wget --timestamping \
	http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.prot
    end
    wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.gtf -O chrUn_random.gtf
    wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.prot -O chrUn_random.prot
    wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/readme
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    # since this is a reload of this table updating the data
    #	from Mm3 to Mm4.  First check what is there:
#  featureBits hg16 sgpGene
#  39781330 bases of 2865248791 (1.388%) in intersection
    #	now drop that table, and reload
    hgsql -e "drop table sgpGene;" hg16
    #	This used to be done with -exon=CDS but it will do the same
    #	thing _AND_ add stop codons when done with -gtf, so do this
    #	with -gtf
    ldHgGene -gtf hg16 sgpGene download/*.gtf
#	Read 42880 transcripts in 322086 lines in 39 files
#	  42880 groups 39 seqs 1 sources 3 feature types
#	42880 gene predictions
    hgsql -e "drop table sgpPep;" hg16
    hgPepPred hg16 generic sgpPep download/*-fixed.prot
#  featureBits hg16 sgpGene
#  39698249 bases of 2865248791 (1.386%) in intersection
#  featureBits hg15 sgpGene
#  40395614 bases of 2866466359 (1.409%) in intersection

# SGP GENE PREDICTIONS - Mm3 (DONE - 2003-09-14 - Hiram - to be verified)
    mkdir -p /cluster/data/hg16/bed/sgp/download
    cd /cluster/data/hg16/bed/sgp/download
    foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.gtf
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.prot
    end
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.gtf -O chrUn_random.gtf
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.prot -O chrUn_random.prot
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg16 sgpGene download/*.gtf -exon=CDS
#  Read 43109 transcripts in 323911 lines in 39 files
#    43109 groups 39 seqs 1 sources 3 feature types
#  43109 gene predictions
    hgPepPred hg16 generic sgpPep download/*-fixed.prot
#  featureBits hg16 sgpGene
#  39781330 bases of 2865248791 (1.388%) in intersection
#  featureBits hg15 sgpGene
#  40395614 bases of 2866466359 (1.409%) in intersection

# SGP GENES (UPDATE 1/18/2006)
#    sgpPep table dropped, replaced by hgc generated protein seq in browser

# LOAD NCI60 (DONE: Fan 10/20/2003)
    # ssh hgwdev
    cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
    mkdir hg16
    cd hg16
    findStanAlignments hg16 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg16.image.psl >& hg16.image.log
    cp ../experimentOrder.txt ./
    sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
    egrep -v unknown hg16.image.psl > hg16.image.good.psl
    stanToBedAndExpRecs  hg16.image.good.psl hg16.nci60.exp hg16.nci60.bed `cat epo.txt`
    hgsql hg16 < ../../scripts/nci60.sql
    echo "load data local infile 'hg16.nci60.bed' into table nci60" | hgsql hg16
    mkdir /cluster/store4/gs.17/build34/bed/nci60
    mv hg16.nci60.bed /cluster/store4/gs.17/build34/bed/nci60
    rm *.psl

# LOAD AFFYRATIO [GNF in progress jk Sept 19, 2003] 
# LOAD AFFYRATIO U95Av2 sequences [DONE hartera Feb 2, 2004]
# Used consensus/exemplar sequences instead of target sequences
# LOAD AFFYRATIO [in progress, Feb 4, 2004]
# changed pslReps parameters as minAli = 0.97 was too stringent
    # Set up cluster job to align consensus/exemplars to hg16
    ssh kkr1u00
    cd /cluster/data/hg16/bed
    rm -rf affyGnf.2004-02-04/
    mkdir affyGnf.2004-02-04
    cd affyGnf.2004-02-04/
    mkdir -p /iscratch/i/affy
    cp /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /iscratch/i/affy
    iSync

    ssh kk
    cd /cluster/data/hg16/bed/affyGnf.2004-02-04
    ls -1 /iscratch/i/affy/HG-U95Av2_all.fa > affy.lst
    ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec

    # Actually do the job with usual para try/check/push/time etc.
# para time 2/4/04
#Completed: 491 of 491 jobs
#CPU time in finished jobs:       8344s     139.06m     2.32h    0.10d  0.000 y
#IO & Wait Time:                  2281s      38.02m     0.63h    0.03d  0.000 y
#Average job time:                  22s       0.36m     0.01h    0.00d
#Longest job:                      289s       4.82m     0.08h    0.00d
#Submission to last job:           388s       6.47m     0.11h    0.00d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyU95.psl
    pslSort dirs raw.psl tmp psl
    
    # change filter parameters for these sequences. only use alignments that
    # cover 30% of sequence and have at least 95% identity in aligned region. 
    # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl

    # Merge with spot data and load into database. added -chip flag to 
    # affyPslAndAtlasToBed to allow correct parsing
    ssh hgwdev
    cd /cluster/data/hg16/bed/affyGnf.2004-02-04
    /cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 affyU95.psl   /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log 
    hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg16 affyRatio affyRatio.bed    
    #	This affyU95 load was later changed to eliminate the long names
    # hgLoadPsl hg16 affyU95.psl
    #	by the following:
    sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
    hgLoadPsl hg16 -table=affyU95 affyU95shortQname.psl
    # Clean up
    rm -r psl tmp err affyRatio.bed affyRatio.exr bed.tab scores.tab *.debug batch.bak contig.psl raw.psl

# LOAD AffyUclaRatio [in progress jk Sept 19, 2003] 
#LOAD AffyUclaRatio and AFFY U133A and U133B sequences[DONE hartera Feb 3, 2004]
# Used consensus/exemplar sequences instead of target sequences
    # Set up cluster job to align consensus/exemplars to hg16
    ssh kkr1u00
    cd /cluster/data/hg16/bed
    rm -rf affyUcla.2004-02-04/
    mkdir affyUcla.2004-02-04
    cd affyUcla.2004-02-04/
    mkdir -p /iscratch/i/affy
    cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa /iscratch/i/affy
    iSync

    ssh kk
    cd /cluster/data/hg16/bed/affyUcla.2004-02-04/
    ls -1 /iscratch/i/affy/HG-U133AB_all.fa > affy.lst
    ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec

    # Actually do the job with usual para try/check/push/time etc.
# on 2/4/04:
#Completed: 491 of 491 jobs
#CPU time in finished jobs:      23137s     385.61m     6.43h    0.27d  0.001 y
#IO & Wait Time:                 23057s     384.29m     6.40h    0.27d  0.001 y
#Average job time:                  94s       1.57m     0.03h    0.00d
#Longest job:                      617s      10.28m     0.17h    0.01d
#Submission to last job:           747s      12.45m     0.21h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyU133.psl.
    pslSort dirs raw.psl tmp psl
    
    # change filter parameters for these sequences. only use alignments that
    # cover 30% of sequence and have at least 95% identity in aligned region. 
    # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyU133.psl ../../jkStuff/liftAll.lft warn contig.psl

    # Merge with spot data and load into database.
    # Work in the affyUcla.2004-02-04 directory created above; that is where
    # liftUp wrote affyU133.psl.
    ssh hgwdev
    cd /cluster/data/hg16/bed/affyUcla.2004-02-04/
    # added to hashPsls to process shorter Affy probe set names
    # assumes that names has 2 colons but when shortened to fit in the seq 
    # database, there is only 1.
    # e.g. full name: "consensus:HG-U133A:212933_x_at;" short name: "HG-U133A:212933_x_at;"
    affyUclaMergePslData -pslFile=affyU133.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/030602_ucla_normal_human_tissue_snapshot.txt -bedOut=affyUcla.bed -expRecordOut=affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames -toDiffFile=toDiff.txt
    hgLoadBed -sqlTable=$HOME/src/hg/lib/affyUcla.sql hg16 affyUcla affyUcla.bed
    hgLoadPsl hg16 affyU133.psl
    
    # Clean up
    rm -r psl tmp err affyUcla.bed affyUcla.expRecords bed.tab *.debug batch.bak contig.psl raw.psl
    
    # Add in sequence data for affyU95 and affyU133 tracks.
    # Copy probe sequence to /gbdb if it isn't already
    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    ln -s /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa .
    ln -s /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
    
    # use perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # in the HG-U95Av2_all.fa sequence file.
    # reload sequences with "U95Av2" prefix removed so acc matches name used 
    # in other dependent tables for affyU95Av2 only
    hgLoadSeq -abbr=U95Av2: hg16 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
    hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa

# QA repush 2006-02-08 seq/extFile to correct mismatched ID for affyU133 alignment data (Jen)

# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set.
    mkdir ~sugnet/store1/
    cd hg16
    mkdir affyUcla
    cd affyUcla/
    ssh kk
    cd /cluster/store1/sugnet/hg16/affyUcla
    cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all ./
    ls -1 /scratch/hg/gs.17/build34/trfFa/* > allctg.lst
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    echo "HG-U133AB_all" > affy.lst
    gensub2 allctg.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
       Checking input files
       491 jobs written to batch
       updated job database on disk
    para push
    # Wait until jobs run...
    exit
    pslSort dirs hg16.affyU133AB_all.psl tmp psl
    # Lots of messages
       writing hg16.affyU133AB_all.psl
       Cleaning up temp files
    wc hg16.affyU133AB_all.psl 
         60962 1280141 13677509 hg16.affyU133AB_all.psl
    ls /cluster/data/hg16/jkStuff/liftAll.lft
       /cluster/data/hg16/jkStuff/liftAll.lft
    liftUp hg16.affyU133AB_all.lifted.psl  /cluster/data/hg16/jkStuff/liftAll.lft warn hg16.affyU133AB_all.psl 
       Got 491 lifts in /cluster/data/hg16/jkStuff/liftAll.lft
       Lifting hg16.affyU133AB_all.psl
    pslReps -minCover=0.5 -sizeMatters -minAli=0.97 -nearTop=0.005  hg16.affyU133AB_all.lifted.psl hg16.affyU133AB_all.lifted.pslReps.psl out.psr
       Processing hg16.affyU133AB_all.lifted.psl to hg16.affyU133AB_all.lifted.pslReps.psl and out.psr
       Processed 60957 alignments
    affyUclaMergePslData -pslFile=hg16.affyU133AB_all.lifted.pslReps.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt -bedOut=hg16.affyUcla.bed -expRecordOut=hg16.affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
       Reading psls from: hg16.affyU133AB_all.lifted.pslReps.psl
       Outputing beds:
       ............................................
       Freeing Memory.
       Done.
    addUclaAnnotations.pl hg16.affyUcla.expRecords  /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg16.affyUcla.annotations.expRecords
       
    # Load the databases
    cp ~/jk/hg/lib/affyRatio.sql ./
    sed -e 's/affyRatio/affyUclaNorm/' < affyRatio.sql > affyUclaNorm.sql
    # Just use the hgLoadBed program specifying sqlFile
    hgLoadBed hg16 affyUclaNorm hg16.affyUcla.bed -sqlTable=affyUclaNorm.sql
       Reading hg16.affyUcla.bed
       Loaded 44446 elements of size 15
       Sorted
       Saving bed.tab
       Loading hg16
    cp ~/jk/hg/lib/expRecord.sql ./
    sed -e 's/expRecord/affyUclaNormExps/' < expRecord.sql > affyUclaNormExps.sql
    hgFixedS -A < affyUclaNormExps.sql 
    echo "load data local infile 'hg16.affyUcla.annotations.expRecords' into table affyUclaNormExps" | hgFixedS -A
    
    # Cleanup
    rm HG-U133AB_all 

# DO FAMILY BROWSER VERSIONS OF AFFYUCLANORMAL TRACK (In Progress -jk 3/2/2004)
# (This is suspended because GNF Gene Atlas data is available and public!)
   # Create affyU133Orient table data
   # Run polyInfo on each per-chromosome psl produced by pslSortAcc and
   # concatenate the results into ../affyU133OrientInfo.bed.
   ssh eieio
   cd /cluster/data/hg16/bed/affyUcla.2004-02-04
   pslSortAcc nohead chrom temp affyU133.psl
   rm -r temp
   cd chrom
   #This loop takes about 15 minutes
   foreach i (*.psl)
     polyInfo $i /cluster/data/hg16/nib/$i:r.nib \
       /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa \
       $i:r.polyInfo
     echo done $i
   end
   cat *.polyInfo > ../affyU133OrientInfo.bed
   rm *.polyInfo

   # Load orientation table data 
   # The table definition is derived from mrnaOrientInfo.sql by renaming the
   # table to affyU133OrientInfo.
   ssh hgwdev
   cd /cluster/data/hg16/bed/affyUcla.2004-02-04
   sed 's/mrnaOrientInfo/affyU133OrientInfo/' \
       $HOME/kent/src/hg/lib/mrnaOrientInfo.sql > affyU133OrientInfo.sql
   hgLoadBed hg16 affyU133OrientInfo affyU133OrientInfo.bed \
      -sqlTable=affyU133OrientInfo.sql > /dev/null
       
   # Do clustering (this takes about 10 minutes to run)
   clusterRna hg16 u133Cluster.bed /dev/null -noEst -noRefSeq -group=u133Group.tab -mRNAOrient=affyU133OrientInfo -rna=affyU133
~~~

# GNF ATLAS 2  [Done jk 3/29/2004]
    # Align probes from GNF1H chip.
    ssh kk
    cd /cluster/data/hg16/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run
    mkdir -p /cluster/bluearc/geneAtlas2
    cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
    ls -1 /scratch/hg/gs.17/build34/trfFa/ > genome.lst
    ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
    echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/11.ooc  /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    para time
#Completed: 491 of 491 jobs
#CPU time in finished jobs:      10718s     178.63m     2.98h    0.12d  0.000 y
#IO & Wait Time:                  1499s      24.99m     0.42h    0.02d  0.000 y
#Average job time:                  25s       0.41m     0.01h    0.00d
#Longest job:                      652s      10.87m     0.18h    0.01d
#Submission to last job:           723s      12.05m     0.20h    0.01d
    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create ../affyGnf1h.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
    rm -r contig.psl raw.psl psl

    # Load probes and alignments from GNF1H into database.
    ssh hgwdev
    cd /cluster/data/hg16/bed/geneAtlas2
    ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /gbdb/hgFixed/affyProbes
    hgLoadPsl hg16 affyGnf1h.psl
    hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/gnf1h.fa
    grep -v U133B ../affyUcla.2004-02-04/affyU133.psl | sed 's/exemplar://' \
	| sed 's/consensus://' \
        | sed 's/HG-U133A://' | sed 's/;//' > affyU133A.psl
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
    	affyU133A.psl  /cluster/data/hg16/bed/geneAtlas2/affyGnf1h.psl
    # Note that the unmapped 11000 records are from all-N sequences.
    hgLoadBed hg16 gnfAtlas2 gnfAtlas2.bed

    


# GENE BOUNDS (RNACLUSTER) (DONE 10-05-03 Chuck)
    # Create rnaCluster table (depends on {est,mrna}OrientInfo created but not checked in)
    cd /cluster/store4/gs.17/build34/
    # Create a list of accessions that come from RAGE libraries and need to
    # be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
    ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg16 \
      rage.libs
    mkdir -p bed/rnaCluster/chrom
    # Exclude accesions in the RAGE file
    foreach f (?{,?}/chr*.fa)
      set c = $f:t:r
      set out = bed/rnaCluster/chrom/$c.bed
      echo clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
      clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
    end

    cd bed/rnaCluster
    hgLoadBed hg16 rnaCluster chrom/*.bed > /dev/null

    
# MAKE UNIGENE ALIGNMENTS (DONE - 2003-10-09 - Hiram)
    # Download of the latest UniGene version is now automated by a 
    # cron job -- see /cluster/home/angie/crontab , 
    # /cluster/home/angie/unigeneVers/unigene.csh .  
    # If hgwdev gets rebooted, that needs to be restarted... maybe there's 
    # a more stable place to set up that cron job.  

    # substitute XXX -> the uniGene version used by SAGE, if building the 
    # uniGene/SAGE track;  or just the latest uniGene version in 
    # /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
    # set Version = XXX
    set Version = 162			# (bash: export Version=162)
    cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
    gunzip Hs.seq.uniq.gz Hs.data.gz
    ../countSeqsInCluster.pl Hs.data counts.tab
    ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
    # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
    ssh kkstore
    set Version = 162 # same as above
    mkdir -p /iscratch/i/uniGene.$Version
    cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
      /iscratch/i/uniGene.$Version
    ssh kkr1u00
    ~kent/bin/iSync
    ssh kk
    set Version = 162 # same as above
    mkdir -p /cluster/data/hg16/bed/uniGene.$Version
    cd /cluster/data/hg16/bed/uniGene.$Version
    ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > allctg.lst
    ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2)  {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 allctg.lst uniGene.lst template.sub para.spec
    para create para.spec
    mkdir psl
    para try
    para check
    para push
# Checking finished jobs
# Completed: 491 of 491 jobs
# CPU time in finished jobs:      39689s     661.49m    11.02h    0.46d  0.001 y
# IO & Wait Time:                 38269s     637.81m    10.63h    0.44d  0.001 y
# Average job time:                 159s       2.65m     0.04h    0.00d
# Longest job:                     1805s      30.08m     0.50h    0.02d
# Submission to last job:          1972s      32.87m     0.55h    0.02d

    # ssh eieio
    set Version = 162 # same as above
    cd /cluster/data/hg16/bed/uniGene.$Version
    pslSort dirs raw.psl tmp psl >& pslSort.log
    liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
      stdin hg16.uniGene.lifted.pslReps.psl /dev/null
    # use hg16.uniGene.lifted.pslReps.psl for building SAGE track (next).
    
# LOAD SAGE DATA (TBD)
    ssh hgwdev
    cd ~/kent/src/hg/sage
    make
    # XXX = uniGene build for which SAGE was built -- not necessarily current!
    # Figure out the build number by peeking at this file:
    wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
    # Or, look at the contents of this directory:
    ls /projects/cc/hg/sugnet/uniGene
#   set Version = XXX
    set Version=162
    mkdir /projects/cc/hg/sugnet/sage/sage.$Version
    cd /projects/cc/hg/sugnet/sage/sage.$Version
    ncftp ftp://ftp.ncbi.nih.gov/pub/sage
      mget -R map/readme.txt map/info.txt extr info map/Hs
      quit
#  That downloaded about 380 Mb of data
    mkdir map
    mv Hs map
    cd map/Hs/NlaIII
    unzip -j SAGEmap_tag_ug-rel.zip
    cd ../../../extr/
    ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
    ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
    ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
      ./SAGE_*
    ../../scripts/countsPerExp.pl expCounts.tab expList.tab

    cd ../map/Hs/NlaIII/ 
    cat << '_EOF_' > /tmp/t.pl
#!/usr/local/bin/perl

while (<>) {
 chomp($_);
 @p = split(/\t/, $_);
 print "$p[2]\t$p[3]\t$p[0]\n";
}
'_EOF_'
    chmod +x /tmp/t.pl

    cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \
      > SAGEmap_ug_tag-rel_Hs
    cd ../../../extr
    createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
      tagExpArrays.tab sageSummary.sage
    # Create the uniGene alignments 
    # /cluster/data/hg16/bed/uniGene.$Version/hg16.uniGene.lifted.pslReps.psl
    # -- see "MAKE UNIGENE ALIGNMENTS" above
    # continuing from above, we are already in this extr directory
    cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
    addAveMedScoreToPsls \
      /cluster/data/hg16/bed/uniGene.$Version/hg16.uniGene.lifted.pslReps.psl \
      sageSummary.sage  uniGene.wscores.bed
    hgLoadBed hg16 uniGene_2 uniGene.wscores.bed
    hgsql hg16 < ~kent/src/hg/lib/sage.sql 
    echo "load data local infile 'sageSummary.sage' into table sage" \
        | hgsql hg16
    cd ../info
    ../../scripts/parseRecords.pl ../extr/expList.tab  > sageExp.tab
    hgsql hg16 < ~/kent/src/hg/lib/sageExp.sql 
    echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg16
    # update ~/kent/src/hg/makeDb/trackDb/human/hg16/uniGene_2.html 
    # with current uniGene date. 

# MAKING FOLDUTR TABLES (DONE - jk - 2003-10-14, REDONE jk 2004-04-07)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    mkdir -p /cluster/data/hg16/bed/rnaStruct
    cd /cluster/data/hg16/bed/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa hg16 knownGene utr3 utr3/utr.fa
    utrFa hg16 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/hg16/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
#CPU time in finished jobs:     842416s   14040.26m   234.00h    9.75d  0.027 y
#IO & Wait Time:                 78541s    1309.02m    21.82h    0.91d  0.002 y
#Average job time:                  32s       0.53m     0.01h    0.00d
#Longest job:                     3318s      55.30m     0.92h    0.04d
#Submission to last job:          4282s      71.37m     1.19h    0.05d
#
#Completed: 37250 of 37250 jobs
#CPU time in finished jobs:    1028594s   17143.24m   285.72h   11.91d  0.033 y
#IO & Wait Time:                125807s    2096.78m    34.95h    1.46d  0.004 y
#Average job time:                  31s       0.52m     0.01h    0.00d
#Longest job:                     3396s      56.60m     0.94h    0.04d
#Submission to last job:          4422s      73.70m     1.23h    0.05d


# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
#Completed: 25808 of 25808 jobs
#CPU time in finished jobs:      51700s     861.67m    14.36h    0.60d  0.002 y
#IO & Wait Time:                114430s    1907.16m    31.79h    1.32d  0.004 y
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest job:                     1044s      17.40m     0.29h    0.01d
#Submission to last job:          1164s      19.40m     0.32h    0.01d
#
#Completed: 29770 of 29770 jobs
#CPU time in finished jobs:     100407s    1673.45m    27.89h    1.16d  0.003 y
#IO & Wait Time:                 93019s    1550.32m    25.84h    1.08d  0.003 y
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest job:                     2209s      36.82m     0.61h    0.03d
#Submission to last job:          2596s      43.27m     0.72h    0.03d



# Load database
    ssh hgwdev
    cd /cluster/data/hg16/bed/rnaStruct/utr5
    hgLoadRnaFold hg16 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold hg16 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak


# TBA (Webb Miller's Threaded Blockset Aligner) Alignments (CFTR region)   2003-10-17 kate
# 9-way alignment: human, chimp, baboon, mouse, rat, dog, cat, cow, pig
# Using sequences from browser (human, mouse, rat), and from
# Elliot Margulies at NISC (via Webb)

    # unrolled sequences and ran TBA in /cluster/data/nisc/targets/cftr/tba9Mammal
    ssh kksilo
    mkdir -p /cluster/data/hg16/bed/nisc/cftr
    ln -s /cluster/data/nisc/targets/cftr/tba9Mammal/human.out \
                /cluster/data/hg16/bed/nisc/cftr/tba9Mammal.maf

    # setup external files for database reference
    # (link the maf once under /gbdb; repeating the ln -s would fail because
    # tba.maf already exists)
    ssh hgwdev
    set table = tba9MammalCFTR
    mkdir -p /gbdb/hg16/$table
    cd /gbdb/hg16/$table
    ln -s /cluster/data/hg16/bed/nisc/cftr/tba9Mammal.maf tba.maf

    # load into database
    cd /cluster/data/hg16/bed/nisc/cftr
    /cluster/bin/i386/hgLoadMaf -WARN hg16 $table

# TBA with Non-mammalian species included (Fugu & Chicken)  2003-10-20 kate
    ssh hgwdev
    ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/human.out \
                /cluster/data/hg16/bed/nisc/cftr/tbaFishBird.maf
    set table = tbaFishBirdCFTR
    mkdir -p /gbdb/hg16/$table
    cd /gbdb/hg16/$table
    ln -s /cluster/data/hg16/bed/nisc/cftr/tbaFishBird.maf tba.maf
    cd /cluster/data/hg16/bed/nisc/cftr
    /cluster/bin/i386/hgLoadMaf -WARN hg16 $table
        # 1072 warnings (mostly score=0's, a few minus scores)
        # 4377 rows

# TBA 25-species CFTR region  (DONE 2003-10-28 kate)
# run in /cluster/data/nisc/targets/cftr/25way, using makefile

    ssh hgwdev
    ln -s /cluster/data/nisc/targets/cftr/25way/human.maf \
                /cluster/data/hg16/bed/nisc/cftr/tba25.maf
    set table = tba25CFTR
    mkdir -p /gbdb/hg16/$table
    cd /gbdb/hg16/$table
    ln -s /cluster/data/hg16/bed/nisc/cftr/tba25.maf tba.maf
    cd /cluster/data/hg16/bed/nisc/cftr
    /cluster/bin/i386/hgLoadMaf -WARN hg16 $table
        # 22267 rows
        # 24 warnings


# MAKE HG16-PANTRO1 MAF FOR MULTIZ/TBA (DONE 3/8/04 angie)
    ssh kolossus
    mkdir /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
    # use the combined blastz-blat reciprocal best human-pt0 chain, but 
    # assign unique IDs:
    chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \
    | chainMergeSort stdin \
    | chainSplit pt0RBestChain stdin
    # re-net with the new IDs:
    mkdir pt0RBestNet
    foreach f (pt0RBestChain/*.chain)
      echo chaining $f
      chainNet $f /cluster/data/hg16/chrom.sizes \
        /cluster/data/pt0/scaffold.sizes pt0RBestNet/$f:t:r.net /dev/null
    end
    # Now lift chain to panTro1 coords:
    # (the output directory must be rBestChain: liftUp writes into it here and
    # the later net, axt and hgLoadChain steps read from it)
    mkdir rBestChain
    foreach f (pt0RBestChain/*.chain)
      liftUp -chainQ rBestChain/$f:t \
        /cluster/data/panTro1/jkStuff/scaffolds.lft warn $f
    end
    # re-net with panTro1 coords (liftUp -netQ doesn't like - strand lifting):
    mkdir rBestNet
    foreach f (rBestChain/*.chain)
      echo chaining $f
      chainNet $f /cluster/data/hg16/chrom.sizes \
        /cluster/data/panTro1/chrom.sizes rBestNet/$f:t:r.net /dev/null
    end
    # make axt and maf from the hg16-panTro1 net:
    mkdir axtRBestNet mafRBestNet
    foreach f (rBestNet/chr*.net)
      set chr = $f:t:r
      netToAxt $f rBestChain/$chr.chain /cluster/data/hg16/nib \
        /cluster/data/panTro1/nib stdout \
      | axtSort stdin axtRBestNet/$chr.axt
      axtToMaf axtRBestNet/$chr.axt /cluster/data/hg16/chrom.sizes \
        /cluster/data/panTro1/chrom.sizes mafRBestNet/$chr.maf \
        -tPrefix=hg16. -qPrefix=panTro1.
    end

    # copy reciprocal net axt's for download (2004-10-04 kate)
    cd axtRBestNet
    gzip *.axt

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
    mkdir axtRBestNet
    cd axtRBestNet
    cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet/*.gz .
    md5sum *.gz > md5sum.txt
    
    # load renumbered chains into database  (2004-03-14 kate)
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_rBestChainPanTro1 $i
        echo done $c
    end

    # save for download (2004-05-14 kate)
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
    chainMergeSort -saveId *.chain > ../rBest.chain
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
    set dir = /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
    mkdir -p $dir
    cp -p ../rBest.chain $dir/human.best.chain
    cd $dir
    gzip *.chain
    # copy README file
    

    # load net into database  (2004-03-14 kate)
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
    cat rBestNet/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    netClass noClass.net hg16 panTro1 human.net
    netFilter -chimpSyn human.net > rBest.net
    hgLoadNet -warn hg16 rBestNetPanTro1 rBest.net


# EXPERIMENT: TBA WHOLE CHROM 5 SPECIES (DONE ENOUGH 3/8/04 angie)
    # Put 2-ways in /cluster/bluearc
    ssh eieio
    mkdir /cluster/bluearc/hg16/tba
    mkdir /cluster/bluearc/hg16/tba/{hp,hg}
    cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr*.maf \
      /cluster/bluearc/hg16/tba/hp
    # hg16-mm3 already in /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300
    # hg16-rn3 already in /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
    cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
      /cluster/bluearc/hg16/tba/hg

    ssh kolossus
    mkdir /cluster/data/hg16/bed/tbaExperiment
    cd /cluster/data/hg16/bed/tbaExperiment
    # tba needs to run multiz, so make sure they're in $PATH:
    set path = (/cluster/bin/penn $path)
    # Try just one chromosome:
    set chr = chr16
    # tba needs filenames to correspond to its tree input, so make links to 
    # maf and fasta:
    rm -f human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf \
          human
    # human-chimp mafs were copied to tba/hp as $chr.maf (from mafRBestNet)
    mafSort /cluster/bluearc/hg16/tba/hp/$chr.maf > human.chimp.maf
    mafSort /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300/$chr.mm3.maf > \
      human.mouse.maf
    mafSort /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300/$chr.rn3.maf > \
      human.rat.maf
    mafSort /cluster/bluearc/hg16/tba/hg/$chr.hg.maf > human.chicken.maf
    # ?{,?} matches the one- or two-character chromosome subdirectory
    ln -s /cluster/data/hg16/?{,?}/$chr.fa human
    tba "(((human chimp) (mouse rat)) chicken)" \
      human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf
    # Doh -- looks like tba wants *all* pairwise inputs, and how do we 
    # tell which rat-chicken alignments to include for a given human chr??
    # The error that tba is dying with is this:
    # pair2tb.v4: alignments of human out of order around 172596-175110
    # ... even though inputs are sorted...?  Oh well, clean up:
    rm human*
    rm -r /cluster/bluearc/hg16/tba/


# CREATING knownToSuper (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from 
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc.  Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
    zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin


# BLASTZ CHICKEN (done, 11/3/2003, Adam)
# (adapted from BLASTZ mouse/rat, above)

# NOTE: this first time we're using the contigs that Terry has
# installed at /cluster/bluearc/gg0 (see fa and split100
# subdirectories).  When we have an assembly, things should be able to
# proceed more as with mouse and rat

    ssh kk
    mkdir -p /cluster/data/hg16/bed/blastz.gg0
    cd /cluster/data/hg16/bed/blastz.gg0

    # first it looks like we need to run TRF on the contigs (realizing
    # this on second time through!)
    mkdir trf
    cd trf
    rm -rf jobList
    foreach file (/cluster/bluearc/gg0/split100/*.fa)
	set root=$file:t:r
	echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $file /dev/null -bedAt=/cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed -tempDir=/tmp" >> jobList
    end
    #(run jobList on cluster)  -- took 2.5 min.

    # add new softmasking to reflect TRF output
    # Writes one maskOutFa -softAdd command per split100 contig to jobList,
    # pairing each fasta with the TRF bed of the same root name.
    mkdir /cluster/bluearc/gg0/split100_with_trf
    rm -rf jobList
    foreach file (/cluster/bluearc/gg0/split100/*.fa)
	set root=$file:t:r
	echo "/cluster/bin/i386/maskOutFa -softAdd $file /cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed /cluster/bluearc/gg0/split100_with_trf/${root}.fa" >> jobList
    end
    #(run jobList on cluster)  -- took <1 min.

    # now set up for BLASTZ (picking up with instructions above for
    # mouse and rat)

    cat << '_EOF_' > DEF
# chicken vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/penn/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Chicken
SEQ2_DIR=/cluster/bluearc/gg0/split100_with_trf
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=
SEQ2_LAP=

BASE=/cluster/store4/gs.17/build34/bed/blastz.gg0

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
    DS=`date -I`
    cp DEF ~angie/hummus/DEF.gg0-hg16.$DS

    # source the DEF file to establish environment for following commands
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j

    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run

    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2

    # make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
    para try
    para check
    para push
    # ... etc ...

#Completed: 33561 of 33561 jobs
#CPU time in finished jobs:   11426279s  190437.98m  3173.97h  132.25d  0.362 y
#IO & Wait Time:                212940s    3549.01m    59.15h    2.46d  0.007 y
#Average job time:                 347s       5.78m     0.10h    0.00d
#Longest job:                     4036s      67.27m     1.12h    0.05d
#Submission to last job:         16433s     273.88m     4.56h    0.19d

    # post-process blastz
    ssh kk
    cd /cluster/data/hg16/bed/blastz.gg0
    #   source the DEF file again in case you are coming back to this
    #	(must be bash shell)

    . ./DEF
    
    # a new run directory
    mkdir -p run.1
    
    mkdir -p $BASE/lav
    
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
                        > run.1/jobList
    cd run.1

    # make sure the job list is OK
    wc -l jobList
       # 339 jobs 
    head jobList

    # run on cluster
    # (run.1/jobList was written under blastz.gg0 above, so run it from there)
    ssh kk
    cd /cluster/data/hg16/bed/blastz.gg0/run.1
    para create jobList
    para try
    para check
    para push
    # etc.

#Completed: 339 of 339 jobs
#CPU time in finished jobs:       8611s     143.52m     2.39h    0.10d  0.000 y
#IO & Wait Time:                106450s    1774.17m    29.57h    1.23d  0.003 y
#Average job time:                 339s       5.66m     0.09h    0.00d
#Longest job:                      456s       7.60m     0.13h    0.01d
#Submission to last job:           465s       7.75m     0.13h    0.01d

    # convert lav files to axt
    ssh kk
    cd /cluster/data/hg16/bed/blastz.gg0
    mkdir axtChrom
    
    # a new run directory
    mkdir run.2
    cd run.2

    # create custom version of blastz-chromlav2axt with -fa option,
    # because nibs aren't available for chicken
    cp /cluster/bin/scripts/blastz-chromlav2axt .
    # (hand edit: add -fa option to call to lavToAxt)

    # create template file for gensub2
    # usage:  blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
    cat << '_EOF_' > gsub
#LOOP
/cluster/store4/gs.17/build34/bed/blastz.gg0/run.2/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.gg0/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.gg0/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/fa/chicken_with_trf.fa
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1S /cluster/store4/gs.17/build34/bed/blastz.gg0/lav > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 42 jobs
    head jobList

    para create jobList
    para try
    para check
    para push
    # ... etc ...

#Completed: 39 of 42 jobs
#Crashed: 3 jobs
#CPU time in finished jobs:      32763s     546.05m     9.10h    0.38d  0.001 y
#IO & Wait Time:                 48182s     803.03m    13.38h    0.56d  0.002 y
#Average job time:                2076s      34.59m     0.58h    0.02d
#Longest job:                     5291s      88.18m     1.47h    0.06d
#Submission to last job:          5291s      88.18m     1.47h    0.06d

    # The crashes are three of the "randoms" (chr8, 18, 19) -- parasol
    # thinks they crashed because of 0-length output files
    
    # This run took quite a bit longer than with mouse and rat, presumably
    # because of the use of the fa file

    #	Remove the empty axtChrom/chr*_random.axt files to avoid future
    #	processing errors

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.gg0
    mkdir -p pslChrom
    set tbl = "blastzGg0"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # (~5 minutes)

    # Load database tables
    ssh hgwdev
    set tbl = "blastzGg0"
    cd /cluster/data/hg16/bed/blastz.gg0/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
    #	New entry in human/hg16/trackDb.ra
#	track blastzGg0
#	shortLabel Chicken Blastz
#	longLabel Blastz Chicken (Gg0-contigs, 5.2x coverage)
#	group compGeno
#	priority 145.9
#	visibility hide
#	color 100,50,0
#	altColor 255,240,200
#	spectrum on
#	type psl xeno 


# MAKE BLASTZ BEST CHICKEN (finished, Adam, 11/3/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.gg0/axtChrom
    mkdir -p /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom
    
    # copy chrom axt's to bluearc, to avoid hitting fileserver too hard
    cp -p *.axt /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom

    ssh kk
    cd /cluster/data/hg16/bed/blastz.gg0
    mkdir -p axtBest pslBest
    mkdir run.3
    cd run.3

    # create script to filter files 
    cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.gg0/S1.len \
	/cluster/data/hg16/bed/blastz.gg0/S2.len $4
'_EOF_'
    # << this line makes emacs coloring happy

    # NOTE: in a subsequent run, we have used -minScore=6000 and added
    # the -matrix option to use HoxD55.q (need to add a line with gap
    # penalties to the bottom of the score matrix file, e.g., "O =
    # 400, E = 30"; see
    # /cluster/data/hg16/bed/blastz.gg0/run.3.2003-11-11).  These new
    # options should be considered part of the standard procedure, at
    # least for now.

    chmod +x doBestAxt
    cd ../axtChrom
    ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
    cd ../run.3

    # create template for cluster job
    cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/axtBest/$(root1).axt} {check out line+  /cluster/data/hg16/bed/blastz.gg0/pslBest/$(root1)_blastzBestGg0.psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList

    cd /cluster/data/hg16/bed/blastz.gg0
    cd run.3
    para create jobList
    para try
    para check
    para push
#Checking finished jobs
#Completed: 39 of 39 jobs
#CPU time in finished jobs:       1111s      18.52m     0.31h    0.01d  0.000 y
#IO & Wait Time:                  7775s     129.58m     2.16h    0.09d  0.000 y
#Average job time:                 228s       3.80m     0.06h    0.00d
#Longest job:                     1375s      22.92m     0.38h    0.02d
#Submission to last job:          1375s      22.92m     0.38h    0.02d

    # create human/chicken mafs
    cd /cluster/data/hg16/bed/blastz.gg0
    mkdir maf
    foreach file (axtBest/*.axt)
	set root=$file:t:r
	echo $root
	/cluster/bin/i386/axtToMaf $file S1.len S2.len maf/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=gg0.
	/cluster/bin/scripts/fixmaf.pl < maf/${root}.maf.unfixed > maf/${root}.maf
    end

# MULTIZ HUMAN/MOUSE/RAT/CHICKEN (Finished, Adam, 11/3)
# (chicken added to human/mouse/rat alignments described above [HUMOR])

    ssh kk
    mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0
    mkdir hmrc

    # wrapper script for multiz
    cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
    chmod +x mz

    # put the MAFs on bluearc
    ssh eieio
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hc
    cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr
    cp /cluster/data/hg16/bed/blastz.gg0/maf/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hc
    logout  # back to kk

    # set up joblist
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.hg16mm3rn3gg0/hmr/*.maf) 
	set root=`echo $file:t:r | sed 's/\.hmr//'`
	echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0/hc/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/${root}.maf" >> jobList
    end
    # (run on cluster)  41 jobs, ~10 min

    # FIXME: maybe should run on the common denominator of the two
    # sets, then copy over remaining MAFs (?)  In this case, copied
    # chr8_random and chr18_random from hmr

    # clean up bluearc (these are big files!)
    rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0

    # setup external files for database reference
    ssh hgwdev
    mkdir -p /gbdb/hg16/multizMm3Rn3Gg0
    ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/*.maf /gbdb/hg16/multizMm3Rn3Gg0

    # load into database
#    cd $multizDir/hmr/*.maf
    /cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Gg0

    # add dummy entry to dbDb so that name shows up as "Chicken"
    echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName) values ("gg0", "November 2003", "", "Chicken", "", 0, 0, "Chicken", "Gallus gallus");' | hgsql -h genome-testdb hgcentraltest


# BLASTZ Mm4  (DONE - 2003-10-31 - Hiram)

    ssh kk
    mkdir -p /cluster/data/hg16/bed/blastz.mm4.2003-10-29
    cd /cluster/data/hg16/bed
    ln -s  blastz.mm4.2003-10-29 blastz.mm4
    cd blastz.mm4

    cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/gs.17/build34/rmsk
# FLAG not currently used
SEQ1_FLAG=-primate
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000


# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg16/bed/blastz.mm4

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg16/bed/blastz.mm4
    source DEF
    /cluster/data/mm4/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# Completed: 43390 of 43392 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:   15770466s  262841.10m  4380.69h  182.53d  0.500 y
# IO & Wait Time:                626227s   10437.11m   173.95h    7.25d  0.020 y
# Average job time:                 378s       6.30m     0.10h    0.00d
# Longest job:                     8052s     134.20m     2.24h    0.09d
# Submission to last job:         45886s     764.77m    12.75h    0.53d
    # the two crashed jobs:
# /cluster/home/angie/schwartzbin/blastz-run chr10.nib 40000001 50010000 chrX.nib 120000001 150000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# blastz: Illegal character '@' in sequence file.
# /cluster/home/angie/schwartzbin/blastz-run chr18.nib 1 10010000 chr15.nib 60000001 90000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# seq_read(/tmp/blastz.zstcGa/s1.fa): Input/output error
    # unusual errors.  Simply try them again and they work

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kkr1u00
    cd /cluster/data/hg16/bed/blastz.mm4
    source DEF
    /cluster/data/mm4/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 339 of 339 jobs
# CPU time in finished jobs:      15434s     257.23m     4.29h    0.18d  0.000 y
# IO & Wait Time:                  2393s      39.89m     0.66h    0.03d  0.000 y
# Average job time:                  53s       0.88m     0.01h    0.00d
# Longest job:                     1128s      18.80m     0.31h    0.01d
# Submission to last job:          2561s      42.68m     0.71h    0.03d

    #	Third cluster run to convert lav's to axt's
    #	cd before sourcing: the DEF file lives in the blastz.mm4 directory,
    #	and the previous step left the shell in run.1
    cd /cluster/data/hg16/bed/blastz.mm4
    source DEF
    /cluster/data/mm4/jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
# Completed: 38 of 42 jobs
# Crashed: 4 jobs
# CPU time in finished jobs:       1826s      30.44m     0.51h    0.02d  0.000 y
# IO & Wait Time:                  9781s     163.01m     2.72h    0.11d  0.000 y
# Average job time:                 305s       5.09m     0.08h    0.00d
# Longest job:                     1489s      24.82m     0.41h    0.02d
# Submission to last job:          5125s      85.42m     1.42h    0.06d
    # FAILED: chr1, chr19, chr19_random, chr5
    # try these on kolossus
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.mm4/run.2
    /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
	/cluster/data/hg16/bed/blastz.mm4/lav/chr1 \
	/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr1.axt \
	/cluster/data/hg16/nib /cluster/data/mm4/nib
    /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
	/cluster/data/hg16/bed/blastz.mm4/lav/chr19 \
	/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19.axt \
	/cluster/data/hg16/nib /cluster/data/mm4/nib
    /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
	/cluster/data/hg16/bed/blastz.mm4/lav/chr19_random \
	/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19_random.axt \
	/cluster/data/hg16/nib /cluster/data/mm4/nib
    /cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
	/cluster/data/hg16/bed/blastz.mm4/lav/chr5 \
	/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr5.axt \
	/cluster/data/hg16/nib /cluster/data/mm4/nib
    # about 26 minutes total time for those four
    # chr19_random.axt is still empty, remove it to avoid errors later

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4
    mkdir -p pslChrom
    set tbl = "blastzMm4"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	That takes about 30 minutes

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzMm4.psl
    # this is a 55 minute job

    # featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
    # memory.  But if you reset your ~/.hg.conf to use the read-only
    #	user and contact the hgwdev host, and build featureBits as a
    #	x86_64 binary, you can run it on kolossus:
    # featureBits hg16 blastzMm3
    # 1050190071 bases of 2865248791 (36.653%) in intersection
    # featureBits hg16 blastzMm4
    # 1056761609 bases of 2865248791 (36.882%) in intersection


# CHAIN Mm4 BLASTZ (DONE - 2003-11-03 - Hiram)

# The axtChain is best run on the small kluster, or the kk9 kluster
#	in this case, it was run on the kk kluster
    ssh kkr1u00
    mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtChain/run1
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain/run1
    mkdir out chain

    ls -1S /cluster/data/hg16/bed/blastz.mm4/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
    axtFilter -notQ_random $1 | axtChain stdin \
	/iscratch/i/gs.17/build34/bothMaskedNibs \
	/iscratch/i/mm4/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain

    # 41 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs:      24547s     409.12m     6.82h    0.28d  0.001 y
# IO & Wait Time:                  3955s      65.91m     1.10h    0.05d  0.000 y
# Average job time:                 695s      11.59m     0.19h    0.01d
# Longest job:                     7336s     122.27m     2.04h    0.08d
# Submission to last job:          8251s     137.52m     2.29h    0.10d

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    # real    10m5.525s
    # user    8m9.350s
    # sys     0m48.450s

    time chainSplit chain all.chain
    # real    10m23.201s
    # user    7m51.930s
    # sys     0m53.910s

    # these steps take ~20 minutes
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainMm4 $i
        echo done $c
    end

# NET Mm4 (DONE - 2003-11-03 - Hiram)

    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
                        /cluster/data/mm4/chrom.sizes ../preNet/$i
    end
    # real    11m58.018s
    # user    4m10.390s
    # sys     2m10.780s

    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
                            /cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    # memory usage 2505211904, utime 15891 s/100, stime 3245

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain
    time netClass hNoClass.net hg16 mm4 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman
    # real    14m2.042s
    # user    10m6.450s
    # sys     1m46.950s

    # If things look good do
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net
    # real    9m44.445s
    # user    6m42.660s
    # sys     1m10.100s

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg16 netMm4 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm4 stdin
#   real    12m53.070s
#   user    6m6.540s
#   sys     0m50.580s
    # check results
    # featureBits hg16 netMm4
    # 2823565051 bases of 2865248791 (98.545%) in intersection
    # featureBits hg16 netMm3
    # 2834484276 bases of 2865248791 (98.926%) in intersection

    # featureBits hg16 syntenyNetMm3
    # 2804467412 bases of 2865248791 (97.879%) in intersection
    # featureBits hg16 syntenyNetMm4
    # 2786960572 bases of 2865248791 (97.268%) in intersection

    # Add entries for net and chain to mouse/hg16 trackDb

    # make net
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4/axtChain
    mkdir mouseNet
    time netSplit mouse.net mouseNet
    # real    10m44.479s
    # user    6m43.680s
    # sys     1m20.860s

    mkdir ../axtNet
    foreach n (mouseNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt mouseNet/$c.net chain/$c.chain \
		/cluster/data/hg16/nib \
		/cluster/data/mm4/nib \
		../axtNet/$c.axt
	echo "Complete: $c.net -> axtNet/$c.axt"
    end

    ssh hgwdev
    mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtBest
    cd /cluster/data/hg16/bed/blastz.mm4/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/axtNet
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
    gzip *.axt
    # add README.txt file to dir, if needed

    #  Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
    /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	S1.len S2.len pslBest/${c}_blastzBestMm4.psl
	echo "Done: ${c}_blastzBestMm4.psl"
    end

    # Load tables
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/pslBest
    time /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestMm4.psl
    # real    10m47.853s
    # user    2m48.700s
    # sys     0m24.250s

     # check results
    # featureBits hg16 blastzBestMm4
    # 996722004 bases of 2865248791 (34.787%) in intersection
    # featureBits hg16 blastzBestMm3
    # 1007362800 bases of 2865248791 (35.158%) in intersection

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg16/axtBestMm4
     cd /gbdb/hg16/axtBestMm4
     ln -s /cluster/data/hg16/bed/blastz.mm4/axtNet/chr*.axt .
     cd /cluster/data/hg16/bed/blastz.mm4/axtNet
     rm -f axtInfoInserts.sql
     foreach f (/gbdb/hg16/axtBestMm4/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
    hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
    #	table axtInfo may already exist, ignore create error.
    hgsql hg16 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-11-04 - Hiram)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4/axtNet
    mkdir ../axtTight
    tcsh
    foreach i (*.axt)
      echo subsetAxt $i ../axtTight/$i
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end

    # translate to psl
    cd ../axtTight
    mkdir ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
      echo "Done: $i"
    end

    # Load tables into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm4/pslTight
    hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm4.psl

    # check results
    # featureBits hg16 blastzTightMm4
    # 162641577 bases of 2865248791 (5.676%) in intersection
    # featureBits hg16 blastzTightMm3
    # 164148288 bases of 2865248791 (5.729%) in intersection

    # copy axt's to download area
    cd /cluster/data/hg16/bed/blastz.mm4/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
    cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
    gzip *.axt
    # add README.txt file to dir, if needed


# RUNNING AXTBEST (DONE 12/2/03 angie)
    # Penn State complained of a loss in coverage when using axtNet instead 
    # of axtBest.  So run axtBest for them, and axtToMaf in prep for multiz.
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
    # I removed links from axtBest/* to axtNet/*
    foreach f (axtChrom/chr*.axt)
      set chr=$f:t:r
      echo axtBesting $chr
      axtBest $f $chr axtBest/$chr.axt -minScore=300
    end
    # As usual, ran out of mem on chr19, so use kolossus & 2 passes:
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
    set chr = chr19
    foreach d (lav/$chr/*.lav)
      set smallout=$d.axt
      lavToAxt $d /cluster/data/hg16/nib /cluster/data/mm4/nib stdout \
      | axtSort stdin $smallout
      axtBest $smallout $chr $smallout:r.axtBest
    end
    cat `ls -1 lav/$chr/*.axtBest | sort -g` \
    > lav/$chr/$chr.axtBestPieces
    axtBest lav/$chr/$chr.axtBestPieces $chr axtBest/$chr.axt
    rm lav/$chr/*.axt*


# MAKE MAF FROM AXTBEST FOR PENN STATE (DONE 12/2/03 angie)
    ssh eieio
    cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
    mkdir mafBest
    foreach f (axtBest/chr*.axt)
      set maf = mafBest/$f:t:r.hm.maf
      echo translating $f to $maf
      axtToMaf $f \
            /cluster/data/hg16/chrom.sizes /cluster/data/mm4/chrom.sizes \
            $maf -tPrefix=hg16. -qPrefix=mm4.
    end


# MAKING MOUSE MM4 SYNTENY (DONE 2003-11-05 - Hiram)

    ssh hgwdev
    mkdir -p /cluster/data/hg16/bed/syntenyMm4
    cd /cluster/data/hg16/bed/syntenyMm4

#	updating the scripts in use here from 
#	/cluster/data/hg16/bed/syntenyMm3
    cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .

#	fix the syntenicBest script to not try and work on empty
#	results from its queries.  Also, set the db and table name
#	in the script itself so the arguments are not needed

    ./syntenicBest.pl
#	on the order of 3 to 4 hours to complete syntenicBest
#	almost no time, or only a few minutes at most for any of
#	the rest
    ../syntenyMm3/smooth.pl
    ../syntenyMm3/joinsmallgaps.pl
#	set db and table name in fillgap.pl
    ./fillgap.pl
    ../syntenyMm3/synteny2bed.pl
    hgLoadBed hg16 syntenyMm4 ucsc100k.bed
# featureBits hg16 syntenyMm3
# 2651945520 bases of 2865248791 (92.556%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection
    # hgTracks.c needed to be updated to recognize syntenyMm4 so it
    # would color properly.


# TIGR GENE INDEX (DONE 2004-05-20 Fan)
    mkdir -p /cluster/data/hg16/bed/tigr
    cd /cluster/data/hg16/bed/tigr
    wget ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_hg16_05-2004.tgz
    tar xvzf TGI*.tgz

    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end

    # For each organism, convert every matching chr*_<organism>*s file to
    # <file>.gff: drop the first line (presumably a header -- confirm),
    # rewrite the first "THC" on each line to "TC", and prefix the first
    # TC/THC id on each line with the organism name (passed to perl via
    # the O environment variable).
    # "tail -n +2" is used instead of the obsolete "tail +2" spelling,
    # which POSIX-conforming tail implementations do not accept.
    foreach o (mouse cow human pig rat)
      echo $o
      setenv O $o
      foreach f (chr*_$o*s)
        tail -n +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end

    ssh hgwdev
    cd /cluster/data/hg16/bed/tigr
    hgsql hg16 -e "drop table tigrGeneIndex"
    hgsql hg16 < ~/kent/src/hg/lib/tigrGeneIndex.sql

    foreach f (*.gff)
        echo Processing $f ...
        /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg16 tigrGeneIndex $f
        hgsql hg16 -e "select count(*) from tigrGeneIndex"
    end
    # Total of 354491 entries created in tigrGeneIndex table.

    hgsql hg16 -e "update tigrGeneIndex set cdsStart = txStart;"
    hgsql hg16 -e "update tigrGeneIndex set cdsEnd = txEnd;"

    checkTableCoords hg16 tigrGeneIndex

    gzip *.gff *TCs

# LOAD VEGA GENES AND PSEUDOGENES (DONE 2003-11-11 braney )
    #####
    ##### WARNING: vega procedure changed, use process later in file
    #####
    mkdir ~/hg16/bed/vega
    cd ~/hg16/bed/vega
    
    wget "http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_core_4_0.gtf.gz"
    gunzip vega_homo_sapiens_core_4_0.gtf.gz

# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks
    awk '$2 != "Pseudogene" && $2 != "Ig_Pseudogene_Segment" && $2 != "Ig_Segment" {print "chr"$0}' \
	vega_homo_sapiens_core_4_0.gtf  > vega_fixed.gtf
    awk '$2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment" {print "chr"$0}' \
    	vega_homo_sapiens_core_4_0.gtf  > vega_pseudo.gtf
    ldHgGene hg16 vegaGene vega_fixed.gtf -gtf
    ldHgGene hg16 vegaPseudoGene vega_pseudo.gtf -gtf

    # Fetch the Vega peptide dump and load it as the vegaPep table.
    # The download is gzip-compressed; uncompress it first so that the
    # .fa file name given to hgPepPred actually exists.
    wget "http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz"
    gunzip vega_pep_dump_ncbi34.fa.gz
    hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa

    vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
    hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
    echo "load data local infile 'vegaInfo.tab' into table vegaInfo" | hgsql hg16

# Set cdsStart and cdsEnd to 0 if method is Novel_Transcript
    foreach ntname (`echo 'select name from vegaGene,vegaInfo \
                           where vegaGene.name = vegaInfo.transcriptId AND \
                                 vegaInfo.method = "Novel_Transcript"' \
                     | hgsql -N hg16`)
      echo "update vegaGene set cdsStart = 0 where name = '$ntname'" \
      | hgsql hg16
      echo "update vegaGene set cdsEnd = 0 where name = '$ntname'" \
      | hgsql hg16
    end



# LOAD FIRSTEF TRACK  Done 2003-07-31 braney
# Create firstEF track from Zhang lab at CSHL
# contacts
# Gengxin Chen <cheng@cshl.edu>
# Ivo Grosse <grosse@ipk-gatersleben.de>
# Michael Zhang <mzhang@cshl.edu>

    mkdir /cluster/data/hg16/bed/firstEF
    cd /cluster/data/hg16/bed/firstEF
# Got firstEF.txt from Gengxin 7/30/03
    hgLoadBed hg16 firstEF firstEF.txt

# Load chicken sequence loaded & processed by booch & acs (2003-11-4 kate)
    hgLoadSeq hg16 /gbdb/gg0/chicken.fa
        # 73234 sequences


# LOAD ENSEMBL GENES (DONE 2003-11-07 angie)
    mkdir /cluster/data/hg16/bed/ensembl
    cd /cluster/data/hg16/bed/ensembl
    # Get the ensembl protein data from 
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the output.  choose gzip compression.  hit export.
    # Save as ensemblGene.gtf.gz

    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Finally, get rid of the ".1" or ".2" after the name
    gunzip -c ensemblGene.gtf.gz \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
                 || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/\..\"/\"/g' \
    > ensGene.gtf
    ssh hgwdev
    /cluster/bin/i386/ldHgGene hg16 ensGene \
      /cluster/data/hg16/bed/ensembl/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    gunzip ensemblPep.fa.gz
    hgPepPred hg16 ensembl ensemblPep.fa


# LOAD GENOMIC DUPES (DONE - 2003-11-11 - Hiram)
# o - Load genomic dupes
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/genomicDups
    cd /cluster/data/hg16/bed/genomicDups
    # pick up Build34GenomicDups.gz from
    #	http://humanparalogy.cwru.edu/build34/files_for_ucsc/build34_ucsc.htm
    #	it has a user and password login.  you can use this wget command
    #	with the user/password:
    wget --http-user=X --http-passwd=X \
    "http://humanparalogy.cwru.edu/build34/files_for_ucsc/Build34GenomicDups.gz"
    gunzip *.gz
    # awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    hgsql hg16 < ~/kent/src/hg/lib/genomicDups.sql
    hgLoadBed hg16 -oldTable genomicDups Build34GenomicDups
    # load of genomicDups did not go as planned: 57702 record(s), 0 row(s) skipped, 57702 warning(s) loading bed.tab
    # There was an error in this data delivery.  To fixup:
    hgsql -e \
	'update genomicDups set name = concat(otherChrom,":",otherStart);' \
	hg16

# LOAD CHIMP NET (2003-11-20 kate)
# NOTE: Net preparation doc'ed in makePt0.doc
    ssh hgwdev
    cd /cluster/data/pt0/bed/blastz.hg16/axtChain
    netFilter -minGap=10 chimp.net |  hgLoadNet hg16 netPt0 stdin
    netFilter -minGap=10 chimpSyn.net | hgLoadNet hg16 syntenyNetPt0 stdin


# CHIMP BEST CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
#       NOTE: start with scaffold-based human-reference reciprocal best chains 
#       doc'ed in makePt0.doc, then lift using scaffold lift file in panTro1
#  NOTENOTENOTE:  Angie redid this with chain renumbering
    ssh kksilo
    mkdir -p /cluster/data/hg16/bed/blastz-blat.panTro1
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    liftUp -chainQ best.chain \
        /cluster/data/panTro1/jkStuff/scaffolds.lft \
        warn /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain
    chainSplit bestChain best.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1/bestChain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain hg16 ${c}_bestChainPanTro1 $i
    end


# CHIMP ALL CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    liftUp -chainQ all.chain \
        /cluster/data/panTro1/jkStuff/scaffolds.lft \
        warn /cluster/data/pt0/bed/blastz-blatHg16/all.chain
    chainSplit chain all.chain

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainPanTro1 $i
        echo done $c
    end

# CHIMP RECIPROCAL BEST NET, IN CHROMOSOME COORDS (kate)
#    Redo the netting on chrom-based chain files
    # Step 1 (kolossus, x86_64 binary): net the lifted all.chain, writing
    # human.net and chimp.net in the blastz-blat.panTro1 directory.
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
     ~/bin/x86_64/chainNet all.chain -minSpace=10 \
        /cluster/data/hg16/chrom.sizes /cluster/data/panTro1/chrom.sizes \
        human.net chimp.net

    # Step 2 (kksilo): swap all.chain into all.swap.chain, take the subset
    # of those chains selected by chimp.net, and sort the result into
    # chimpNet.chain.
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    chainSwap all.chain all.swap.chain
    ~/bin/i386/netChainSubset chimp.net all.swap.chain stdout | \
        chainSort stdin chimpNet.chain

    # Step 3 (kolossus): second netting pass.
    # NOTE(review): this command is identical to the chainNet call in
    # step 1 (same all.chain input, same human.net/chimp.net outputs), so
    # as written it only overwrites the step 1 results.  For a reciprocal
    # best net the input was presumably meant to be derived from
    # chimpNet.chain -- confirm against makePt0.doc before reusing.
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    ~/bin/x86_64/chainNet all.chain -minSpace=10 \
        /cluster/data/hg16/chrom.sizes /cluster/data/panTro1/chrom.sizes \
        human.net chimp.net



# UPDATE WOODY BINARIES (see PHYLOHMM CONSERVATION entries below)
# done, acs, 2003-11-19
    ssh hgwdev
    cd /cluster/data/woody   # better place?  don't have permission in /cluster/install
    cvs update -dP
    cd src
    make
    # make sure Makefile has INSTALLDIR = /cluster/bin/woody
    make install

# CFTR PHYLOHMM CONSERVATION
# done, acs, 2003-11-19 (currently using 9-way alignment)

# NOTE: essentially the same procedure applies for any Zoo or ENCODE
# target, as long as a suitable tree topology is available for the
# species in question (when distant species are included, e.g.,
# chicken and fish, the branch-length estimation procedure may need to
# be adapted slightly -- details to come)

    ssh hgwdev
    # (update woody binaries, if necessary -- see above)
    # make sure /cluster/bin/penn/tbaBin and /cluster/bin/woody in path

    mkdir -p /cluster/data/nisc/targets/cftr/phyloHMMcons
    cd /cluster/data/nisc/targets/cftr/phyloHMMcons
    # extract sufficient stats for phylog. analysis from MAF file
    CFTR_START=115365025      # boundaries of CFTR region in hg16 coords 
    CFTR_END=117242450	      # (these don't have to be perfect)
    maf_project /cluster/data/nisc/targets/cftr/tba9way.maf /cluster/data/nisc/targets/cftr/tba9Mammal/human > cftr9_humanref.maf
    msa_view cftr9_humanref.maf -i MAF -o SS -s ${CFTR_START} -e ${CFTR_END} -r 1 -O hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog > cftr9.ss
    head cftr9.ss            
#NSEQS = 9
#LENGTH = 2063003
#TUPLE_SIZE = 1
#NTUPLES = 57302
#NAMES = hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog
#ALPHABET = ACGTN
#IDX_OFFSET = 115365024
#NCATS = -1
# 
#0       C--------       26480

    # fit a phylogenetic model to the data, with rate variation
    echo "((((1,2),3),(4,5)),((6,7),(8,9)))" > cftr9.nh
    # (indexes refer to sequences in the order of the NAMES line in
    # the *.ss file)
    fit_tree_model -m cftr9.ss -i SS -t cftr9.nh -s REV -o cftr9_rev -E -l fit.log -k 5 -a 4.8 -T -p MED
    # (takes about 5 min.  Watch log file for convergence --
    # single lines correspond to outer maximization algorithm,
    # interleaved sets of lines correspond to inner maximization
    # algorithms [see http://www.soe.ucsc.edu/~acs/Siepel-03-0304.pdf
    # for background])
    # Note: k=5 is adequate for a good estimate of the alpha
    # parameter, even though we'll use k=10 in the next step.  The -a
    # argument just provides a reasonable starting value for alpha, to
    # speed convergence

    # (check estimated branch lengths to be sure they make sense)
    cat cftr9_rev.nh
#((((hg16:0.005601,chimp:0.005707):0.019356,baboon:0.034458):0.080743,(mm3:0.072487,rn3:0.079445):0.287368):0.035643,((cow:0.107791,pig:0.102431):0.040419,(cat:0.074444,dog:0.104476):0.053251):0.035643);
    # (small deviations from one data set to the next are normal)

    # you can also do "draw_tree cftr9_rev.nh > cftr9_rev.ps" to get a
    # simple postscript rendering of the tree.  Zero- or
    # near-zero-length branches usually indicate a problem, e.g.,
    # incorrect topology

    # (also check cftr9_rev.mod; look in particular at ALPHA)
    cat cftr9_rev.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 5
#ALPHA: 4.778715
#TRAINING_LNL: -6471907.615171
#BACKGROUND: 0.304536 0.191156 0.191907 0.312401
#RATE_MAT:
#  -0.848833    0.150792    0.552489    0.145552
#   0.240232   -1.259134    0.166198    0.852704
#   0.876738    0.165547   -1.285792    0.243507
#   0.141887    0.521764    0.149586   -0.813238
#TREE: ((((1:0.005601,2:0.005707):0.019356,3:0.034458):0.080743,(4:0.072487,5:0.079445):0.287368):0.035643,((6:0.107791,7:0.102431):0.040419,(8:0.074444,9:0.104476):0.053251):0.035643);

    # now compute the posterior probabilities of interest, according to
    # a phylo-HMM
    label -m cftr9.ss -d cftr9_rev.mod -i SS -o cftr9 -k 10 -L 0.9 -A -p 0 -j 1 -x -s chr7
    # (takes 12 min)
    # (check postprob file)
    wc cftr9.postprob
#1752168 3504336 31539024 cftr9.postprob
    head cftr9.postprob
#115370785       0.0664
#115370786       0.0583
#115370787       0.0448
#115370788       0.0271
#115370789       0.0217
#115370790       0.0232
#115370791       0.0331
#115370792       0.0396
#115370793       0.0417
#115370794       0.0557

    # load as a (Hiramesque) wiggle track
    cd /cluster/data/nisc/targets/cftr/phyloHMMcons
    zcat cftr9.postprob.gz | wigAsciiToBinary -chrom=chr7 -binsize=1024 \
	-dataSpan=1 -wibFile=chr7_phyloHMMcons_CFTR -name=cftr9 stdin
    rm -r /gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib
    ln -s \
     /cluster/data/nisc/targets/cftr/phyloHMMcons/chr7_phyloHMMcons_CFTR.wib \
	/gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib
    hgLoadWiggle hg16 chr7_phyloHMMcons_CFTR chr7_phyloHMMcons_CFTR.wig
    chmod 664 chr7_phyloHMMcons_CFTR.wib
    chmod 775 .

    # add trackDb.ra entry, e.g.,
#track phyloHMMcons_CFTR
#shortLabel phyloHMMcons CFTR
#longLabel phylo-HMM-based conservation, CFTR (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScale Off

    # adapt HTML for details page, if necessary (e.g., copy an existing
    # phyloHMMcons*.html page to phyloHMMcons_CFTR.html, edit to
    # reflect data set, do "make update", don't forget to cvs add and
    # commit)

    # cleanup
    rm cftr9.ss cftr9_humanref.maf   # easy to regenerate
    gzip cftr9.postprob

# CFTR PHYLOHMM CONSERVATION, 25-way alignment
# done, acs, 2003-11-21

# This can be done exactly as above for the 9-way alignment, except
# that the tree estimation procedure has to be adjusted to circumvent
# the problem that the distant species align only in conserved regions
# (so that a tree estimated from the whole data set will have
# disproportionally short branches to and among these species).  The
# procedure I've used is semi-manual and somewhat ad hoc, but I'll
# record the main steps here for completeness.  I'll only cover the
# tree estimation procedure (running 'label' and loading the track is
# the same as before) .

    ssh hgwdev
    mkdir /cluster/data/nisc/targets/cftr/phyloHMMcons25
    cd /cluster/data/nisc/targets/cftr/phyloHMMcons25

    # extract sufficient statistics for two data sets: all sites for
    # mammals and sites in 3rd codon positions for all species.  I'm
    # not including platypus with the mammals (it's technically a
    # mammal, but a monotreme, and quite distant) because it seems to
    # align mostly in conserved regions
    maf_project /cluster/data/nisc/targets/cftr/25way/tba.maf /cluster/data/nisc/targets/cftr/25way/human > cftr25_humanref.maf
    setenv CFTR_START 115365025 
    setenv CFTR_END 117242450	
    setenv SPEC_ORDER  hg16,chimp,orangutan,baboon,macaque,vervet,lemur,rabbit,rn3,mm3,cow,pig,horse,cat,dog,ajbat,cpbat,hedgehog,opossum,dunnart,platypus,chicken,zfish,tetra,fr1
    msa_view cftr25_humanref.maf -i MAF -o SS -s $CFTR_START -e $CFTR_END -r 1 -O $SPEC_ORDER > cftr25.ss 
	# whole data set, ordered suff stats -- use this for 'label'
    msa_view cftr25.ss -i SS -o SS -z -l 21,22,23,24,25 -x > cftr20.ss
				# exclude non-mammals (plus platypus)
    /bin/echo -e 'NCATS = 3\ncds 1-3' > cats.cm   # category map for cds sites
    /cluster/home/acs/woody/scripts/refFlat2gff.pl -S -P -A hg16 -w 'chrom="chr7" and cdsStart > 115365025 and cdsEnd < 117242450' | sed 's/chr7/hg16/' | egrep -v 'NM_152829|NM_001233|NM_018412' > cftr.gff
	# gets refseq annotations for this region as a gff; the egrep
	# explicitly removes some duplicate entries (should have a
	# better way of doing this); the sed changes the seq name so
	# that msa_view recognizes it's the same as the name in the
	# alignment
    msa_view cftr25_humanref.maf -i MAF -o SS -z -c cats.cm -g cftr.gff -O $SPEC_ORDER > cftr25.3.ss

    # now fit a tree model to each data set
    echo "((((((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20)),21),22),(23,(24,25)))" > cftr25.nh
    fit_tree_model -m cftr25.3.ss -C 3 -i SS -t cftr25.nh -s REV -o cftr25 -E -l cftr25.3.log -T -p MED -k 5 -a 1.8

    # (this next one may take an hour or two -- run it on a fast
    # workstation or be sure to nice if on hgwdev; you can speed it up
    # by giving it a good starting *.mod file based on the above [-M option])
    echo "(((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20))" > cftr20.nh
    fit_tree_model -m cftr20.ss -i SS -t cftr20.nh -s REV -o cftr20 -E -l cftr20.log -T -p MED -k 5 -a 4

    cp cftr20.mod cftr25_hybrid.mod
    # Now edit cftr25_hybrid.mod by hand.  Copy the tail end of the
    # TREE line from cftr25.3.mod, corresponding to all nodes and
    # branches outside of the clade for the non-monotreme mammals, and
    # append it to the TREE line in cftr25_hybrid.mod (adjusting
    # parens as necessary).  Then multiply each one of these new
    # branch lengths by a factor of 1.2 (computed as the sum of all
    # branch lengths in cftr25.mod divided by the sum of the
    # corresponding branch lengths in cftr25.3.mod).  The resulting
    # tree is well supported within the non-monotreme mammals and
    # includes a reasonable approximation of the non-mammal branch
    # lengths.  Proceed with 'label' using cftr25_hybrid.mod.

    # cleanup
    rm cftr25_humanref.maf cftr*.ss         # easy to regenerate

# HMR PHYLOHMM CONSERVATION
# (started, acs, 2003-11-11, finished 11-19)

    ssh hgwdev
    # (update woody binaries, if necessary -- see above)
    # (also, make sure /cluster/bin/woody in path)
    mkdir /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11

    # estimate a phylog. model using the entire genome-wide alignments
    # first extract sufficient statistics by chromosome
    ssh eieio
    cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    foreach file (/cluster/data/hg16/bed/humor/hmr/*.maf)
	set prefix = $file:t:r
	msa_view -i MAF $file -o SS -z -O hg16,mm3,rn3 > $prefix.ss 
    end
    logout
    # NOTE: may be worth doing the above as a small cluster job instead
    # (put the mafs on bluearc -- end up doing this below anyway)

    # now combine suff stats across chromosomes
    # (back on hgwdev)
    ls chr*.ss > files
    msa_view -i SS -o SS -A hg16,mm3,rn3 '*files' > all.ss
    # estimate the model (very fast, now that suff stats are avail)
    echo "(1,(2,3));" > tree.nh
    fit_tree_model -i SS -m all.ss -t tree.nh -s REV -k 10 -o rev_dg
    cat rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.428803
#TRAINING_LNL: -448054115.568696
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
#  -0.891523    0.166770    0.574850    0.149902
#   0.223389   -1.146311    0.153784    0.769137
#   0.769591    0.153699   -1.147159    0.223869
#   0.149605    0.573055    0.166888   -0.889548
#TREE: (1:0.192598,(2:0.076303,3:0.083043):0.192598);

    # now, break up the genome-wide MAFs into pieces; it's worth doing
    # this as a little cluster job
    ssh eieio
    mkdir -p /cluster/bluearc/hg16/bed/humor
    cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
    logout
    ssh kk
    cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    # Write the per-MAF splitting script, then build the cluster job list
    # (one doSplit job per MAF file on bluearc).
    # The here-document delimiter is quoted so that the shell does NOT
    # expand $1, $maf, $prefix, ${WOODY} or the backquoted command while
    # the script is being written; in tcsh the terminating line must
    # repeat the quotes exactly.
    cat << '_EOF_' > doSplit
#!/bin/sh

WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/WINDOWS

maf=$1
prefix=`echo $maf | awk -F\/ '{print $NF}' | awk -F\. '{print $1}'`
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$prefix.fa -O hg16,mm3,rn3 -w 1000000,0 -r /scratch/msa_split/$prefix -o SS -I 1000 -d 1 -B 5000
cd /scratch/msa_split
for file in ${prefix}.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/${prefix}.*.ss
'_EOF_'
    chmod +x doSplit
    mkdir -p WINDOWS
    rm -f WINDOWS/* jobs.lst
    foreach file (/cluster/bluearc/hg16/bed/humor/*.maf)
	# ($file, not $$file: "$$" is the shell's process id)
	echo "doSplit $file" >> jobs.lst
    end
    para create jobs.lst  
    # etc ...   (run cluster job)

    # now setup and run the cluster job to compute the conservation scores
    # NOTE: the TMP dir should be set to something other than /scratch,
    # as it is not shared between cluster nodes ?
    
cat << '_EOF_' > doPostProbs
#!/bin/sh

WOODY=/cluster/bin/woody
TMP=/scratch/phyloHMMcons

file=$1
root=`echo $file | awk -F\/ '{print $NF}' | sed 's/\.ss\.gz//'`
chrom=`echo $root | awk -F\. '{print $1}'`

mkdir -p $TMP
zcat $file | $WOODY/label -m - -d rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'_EOF_'
    # << the here-doc delimiter above is quoted so that $1, $file, $root,
    #	$chrom, $TMP, $WOODY and the backquoted commands are written to
    #	doPostProbs literally instead of being expanded while the script
    #	is created (in tcsh the terminating line must repeat the quotes).
    #	The script takes one WINDOWS/*.ss.gz file, runs 'label' on it
    #	with rev_dg.mod, and leaves POSTPROBS/<chrom>/<root>.postprob.gz
    chmod +x doPostProbs

    mkdir -p POSTPROBS
    rm -f jobs2.lst 
    foreach file (WINDOWS/chr*.ss.gz) 
	echo "doPostProbs $file" >> jobs2.lst 
    end
    para create jobs2.lst
    # etc ... (run cluster job)
    logout

    # finally, make track (run in the
    # phyloHMMcons.hg16mm3rn3.2003-11-11 dir)
    ssh eieio
    cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    mkdir wibLimits
    mkdir wib
    foreach dir (POSTPROBS/*)
	set chrom = $dir:t
	echo $chrom 
	zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	    wigAsciiToBinary -chrom=$chrom -binsize=1024 \
		-dataSpan=1 -wibFile=wib/${chrom}_phyloHMMcons -name=hmr \
		stdin > wibLimits/${chrom}
    end
    ssh hgwdev
    cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    hgLoadWiggle hg16 phyloHMMcons_HMR wib/*_phyloHMMcons.wig 
    ln -s `pwd`/wib/chr*_phyloHMMcons.wib /gbdb/hg16/wib 
    chmod 775 . wib
    chmod 664 wib/*.wib

    # add entry to trackDb.ra
#track phyloHMMcons_HMR
#shortLabel phyloHMMcons HMR
#longLabel phylo-HMM-based conservation, human-mouse-rat (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScale Off

    # cleanup (only when you're pretty sure you're done!)
    rm -r chr*.ss WINDOWS wiggle.tab para.results batch*


# CHICKEN BLAT (translated)
# (done, acs, 2003-11-19)

    # (using repeat- and TRF-masked files already created -- see
    # CHICKEN BLASTZ, above)
    ssh kk

    # set up main dir
    cd /cluster/data/hg16/bed
    mkdir blat.gg0.2003-11-19
    ln -s blat.gg0.2003-11-19 blat.gg0
    cd blat.gg0

    # warning: I'm writing this up in a rush -- watch for errors!

    # set up cluster job
    # The here-document delimiter is quoted so the shell leaves the Perl
    # variables ($SIZE, $ARGV[...], $nib, ...) alone and does not run the
    # backquoted "mkdir -p $dir" while writing the script; in tcsh the
    # terminating line must repeat the quotes.  Because nothing is
    # unescaped any more, the printf format ends in a plain \n.
    cat << '_EOF_' > make-joblist.pl
#!/usr/bin/perl

# script to create a job list for translated blat of human
# vs. another species; assumes directory of fa files for the xeno
# species.  Output directories are set up as a side effect.

# USAGE: make-joblist.pl <hg-nibs-dir> <xeno-fa-dir> <hg-chr-lengths-file> <output_root_dir>

$SIZE=10000000;			# partitioning params for human
$OVERLAP=10000;

# read lengths of chromosomes
open(LENF, $ARGV[2]) || die "cannot open $ARGV[2]: $!\n";
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);

@falist = <$ARGV[1]/*.fa>;
foreach $nib (<$ARGV[0]/*.nib>) {
  $nib =~ /.*(chr.*)\.nib/ || die();
  $chr = $1;
  $l = $length{$chr};

  # one window of $SIZE + $OVERLAP bases every $SIZE bases, clipped to
  # the chromosome length
  for ($start = 1; $start <= $l; $start += $SIZE) {
    $end = $start + $SIZE + $OVERLAP - 1;
    if ($end > $l) { $end = $l; }
    $dir = sprintf("%s/%s/%d_%d", $ARGV[3], $chr, $start, $end);
    # one blat job per (human window, xeno fa file) pair
    foreach $fa (@falist) {
      $fa =~ /.*\/([^\/]+)\.fa/ || die();
      $name = $1;
      printf "/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax %s:%d-%d %s {check out line+ %s/%s_%d_%d_%s.psl}\n", $nib, $start, $end, $fa, $dir, $chr, $start, $end, $name;
    }
    `mkdir -p $dir`;		# set up output directories
  }
}
'_EOF_'

    # NOTE: there's a slight error above with indexing.  Next time use
    # something like:

#  for ($start = 0; $start < $l; $start += $SIZE) {
#    $end = $start + $SIZE + $OVERLAP;
#    if ($end >= $l) { $end = $l; }

    # The "make-lift.pl" script below should be changed also to be
    # consistent (should be enough to change exactly the same lines)

    chmod +x make-joblist.pl
    cp /cluster/data/hg16/bed/blastz.gg0/S1.len . # just borrow existing lens
    mkdir -p run
    ./make-joblist.pl /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/split100_with_trf S1.len /cluster/data/hg16/bed/blat.gg0/psl > run/jobs.lst
    # make sure directory structure is created under psl
    cd run
    para create jobs.lst ; para try ; para check ; para push ; # etc...

#33561 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs:   14432527s  240542.12m  4009.04h  167.04d  0.458 y
#IO & Wait Time:                147210s    2453.50m    40.89h    1.70d  0.005 y
#Average job time:                 434s       7.24m     0.12h    0.01d
#Longest job:                    14117s     235.28m     3.92h    0.16d
#Submission to last job:         31483s     524.72m     8.75h    0.36d

    # post process psl files
    cd ..			# back to main blat.gg0 dir
cat << '_EOF_' > make-lift.pl
#!/usr/bin/perl

# create a lift spec to map psl files for windows to chromosome coords
# USAGE: make-lift.pl <hg-nibs-dir> <hg-chr-lengths-file>

# window parameters; these must match the ones in make-joblist.pl
$SIZE=10000000;
$OVERLAP=10000;

open(LENF, $ARGV[1]) || die "cannot open $ARGV[1]: $!\n";
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);

foreach $nib (<$ARGV[0]/*.nib>) {
  $nib =~ /.*(chr.*)\.nib/ || die();
  $chr = $1;
  $l = $length{$chr};
  for ($start = 1; $start <= $l; $start += $SIZE) {
    $end = $start + $SIZE + $OVERLAP - 1;
    if ($end > $l) { $end = $l; }
    # columns written: window start, "chr:start-end" window name,
    # $end-$start, chromosome name, chromosome length
    printf "%d\t%s:%d-%d\t%d\t%s\t%d\n", $start, $chr, $start, $end, $end-$start, $chr, $l;
  }
}
'_EOF_'
    # << the here-doc delimiter above is quoted so that the Perl
    #	variables are written to make-lift.pl literally instead of being
    #	expanded by the shell (in tcsh the terminating line must repeat
    #	the quotes)

    chmod +x make-lift.pl
    ./make-lift.pl /iscratch/i/gs.17/build34/bothMaskedNibs S1.len > psl.lft
    mkdir -p pslChrom
    foreach dir ( psl/* )
	set chrom = $dir:t 
	echo $chrom 
	/cluster/bin/i386/pslCat -dir $dir/* | /cluster/bin/i386/liftUp pslChrom/${chrom}_blatGg0.psl psl.lft warn stdin
    end

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg16/bed/blat.gg0/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*.psl
    #	New entry in human/hg16/trackDb.ra
#	track blatGg0
#	shortLabel Chicken Blat
#	longLabel Chicken Translated Blat (Gg0-contigs, 5.2x coverage)
#	group compGeno
#	priority 145.95
#	visibility hide
#	color 100,50,0
#	altColor 255,240,200
#	spectrum on
#	type psl xeno 

    # look at coverage
    featureBits hg16 blatGg0 knownGene:CDS
#18205137 bases of 2865248791 (0.635%) in intersection
    featureBits hg16 knownGene:CDS
#31268809 bases of 2865248791 (1.091%) in intersection

# RELOAD ENSEMBL GENES WITH VERSION 34a (DONE 2003/12/16 markd)
    # save current tables, just in case.
    rename table  ensGene to ensGene_old;
    rename table  ensGtp  to ensGtp_old;  
    rename table  ensPep  to ensPep_old;  

    mkdir /cluster/data/hg16/bed/ensembl34a
    cd /cluster/data/hg16/bed/ensembl34a
    # Get the ensembl protein data from 
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the output.  choose gzip compression.  hit export.
    # Save as ensemblGene.gtf.gz

    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Finally, get rid of the ".1" or ".2" after the name
    zcat ensemblGene.gtf.gz \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
                 || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/\..\"/\"/g' \
    > ensGene.gtf
    ssh hgwdev
    /cluster/bin/i386/ldHgGene hg16 ensGene \
      /cluster/data/hg16/bed/ensembl34a/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
    gzip ensGtp.txt

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin

    # compare size of old and new tables as a sanity check
    drop table ensGene_old;
    drop table ensGtp_old;  
    drop table ensPep_old;  

    # Create knownToEnsembl column and knownToSuperfamily column
    hgMapToGene hg16 ensGene knownGene knownToEnsembl
    zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin


# LOAD ECgene tables ((redone with existing data) braney, 2004-01-30)
    cd /cluster/data/hg16/bed
    rm -f ECgene
    mkdir ECgene.2003-12-18
    ln -s ECgene.2003-12-18 ECgene
    cd ECgene
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genes.txt.gz"
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genepep.txt.gz"
    gunzip *.gz
    ldHgGene -predTab hg16 ECgene ECgene_hg16_v1.1_25oct2003_genes.txt 
    hgPepPred hg16 tab ECgenePep ECgene_hg16_v1.1_25oct2003_genepep.txt
    rm genePred.tab
    gzip *

# QA NOTE: [ASZ, 2007-10-01] mytouch to ECGenePep table 200401301000.00
# contents were fine.  passed -keys rule.


# MULTIZ HUMAN/MOUSE/RAT/CHIMP (kpollard, 12/16/03)
#  chimp added to human/mouse/rat (HUMOR) alignment described above
#  for now, human referenced and no new BLASTZ runs

    ssh kk
    #fix order in human/chimp BLASTZ MAF files
    #use Kate's new files in humanBestAxt.2
    cd /cluster/data/pt0/bed/blastz-blatHg16
    mkdir humanBestAxt.ord
    mkdir maf.ord
    foreach file (humanBestAxt.2/*.axt)
	set root=$file:t:r
	echo $root
	/cluster/bin/i386/axtSort $file humanBestAxt.ord/${root}.axt
	/cluster/bin/i386/axtToMaf humanBestAxt.ord/${root}.axt ../blastz.hg16/S1.len /cluster/data/pt0/scaffold.sizes maf.ord/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=pt0.
	/cluster/bin/scripts/fixmaf.pl < maf.ord/${root}.maf.unfixed > maf.ord/${root}.maf
    end

    #test on chr11 with HMR
    ssh eieio
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord
    cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord
    logout  # back to kk
    /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hmr/chr11.hmr.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/hmrp/chr11.ord.maf
    #looks good, go ahead with HMRP multiz

    mkdir -p /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0
    mkdir hmrp

    # wrapper script for multiz
    cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
    chmod +x mz

    ssh eieio
    # clean up bluearc
    rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0pt0
    # move MAFS to bluearc
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hp
    cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr
    cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hp
    logout

    # set up joblist (common denominator set: no chr19_random in hmr)
    foreach file (/cluster/bluearc/multiz.hg16mm3rn3pt0/hmr/*.maf) 
	set root=`echo $file:t:r | sed 's/\.hmr//'`
	echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/mz /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/${root}.maf" >> jobList
    end

    #run MULTIZ
    chmod +x jobList
    para create jobList
    #submit 10 jobs
    para try
    #keep an eye on them
    para check
    para finished
    para running
    #once these are done, submit rest
    para push
    para check
    para time
    #ran on cluster: 41 jobs, longest 42 min

    #copy over chr19_random.maf from human/chimp
    cp /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/chr19_random.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/chr19_random.maf

    # clean up bluearc
    ssh eieio
    rm -r /cluster/bluearc/multiz.hg16mm3rn3pt0
    logout

    # setup external files for database reference
    ssh hgwdev
    mkdir -p /gbdb/hg16/multizMm3Rn3Pt0
    ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/*.maf /gbdb/hg16/multizMm3Rn3Pt0

    #load into database
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp
    /cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Pt0
        # 5385226 mafs in 42 files
        # 0-2594 warnings/file
    
    #NOTE: only added track to hgwdev-kpollard (for now).


# LIFTOVER RNAGENE FROM HG15 (DONE CIRCA 12/27/03 schattner)
#	Replaced below by new RNAGENES (2004-03-09)
    cd /cluster/data/hg16/bed/bedOver
    mkdir rnaGene
    cd rnaGene
    hgsql -N hg15 '-e select * from rnaGene' > rnaGeneHg15.bed 
    liftOver rnaGeneHg15.bed ../over.chain rnaGeneLiftGene.bed \
      rnaGeneLiftGeneMiss.bed
    hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/rnaGene.sql hg16 rnaGene \
      /cluster/data/hg16/bed/bedOver/rnaGene/rnaGeneLiftGene.bed


# LOAD RNAGENES (DONE - 2004-03-09 - Hiram)
    #	http://www.genetics.wustl.edu/eddy
    #	Sean Eddy, eddy@genetics.wustl.edu
    #	Dept. of Genetics, Washington University School of Medicine

    ssh hgwdev
    mkdir -p /cluster/data/hg16/bed/rnaGene
    cd /cluster/data/hg16/bed/rnaGene
    mkdir rnaGenePrevious
    #	save previous rnaGene track for reference
    hgsqldump -T rnaGenePrevious hg16 rnaGene
    wget --timestamping \
	ftp://ftp.genetics.wustl.edu/pub/eddy/annotation/human-hg16/*
    grep -v "^#" ncrna-hg16-mito.gff | sed -e "s/^NT_999999/chrM/" > mito.gff
    grep -v "^#" ncrna-hg16-chrom.gff > chrom.gff
    cat chrom.gff mito.gff > all.gff

    hgsql -e 'drop table rnaGene;' hg16
    hgsql hg16 < ~/kent/src/hg/lib/rnaGene.sql
    hgRnaGenes hg16 all.gff

# rmMm3Rn3 3-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
#	Data from: James Taylor james@bx.psu.edu
#	Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
    ssh eieio
    # Right now we are out of space on this /cluster/store4 filesystem,
    #	so send the data to the bluearc
    mkdir /cluster/bluearc/hg16/bed/regPotential3X
    ln -s /cluster/bluearc/hg16/bed/regPotential3X \
    	/cluster/data/hg16/bed/regPotential3X

    cd /cluster/data/hg16/bed/regPotential3X
    mkdir data
    cd data
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
    wget --timestamping \
	"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.truncated.bz2"
    wget --timestamping \
	"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.bz2"
    end
    # The truncated files were a test.  They want to see the raw data.

    ssh eieio
    cd /cluster/data/hg16/bed/regPotential3X
    mkdir wigRawData
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
	bzcat data/chr${c}.hmr.maf.gz_rpscores.txt.bz2 | sort -n | \
	    wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
	    -verbose -wibFile=wigRawData/chr${c}_rpMm3Rn3_Data \
	    -name=${c} stdin > chr${c}.out
	echo chr${c} done
    end

    ssh hgwdev
    cd /cluster/data/hg16/bed/regPotential3X/wigRawData
    hgLoadWiggle hg16 regPotential3X chr*_rpMm3Rn3_Data.wig
    ln -s `pwd`/chr*_rpMm3Rn3_Data.wib /gbdb/hg16/wib

# rmMm4 2-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
#	Data from: James Taylor james@bx.psu.edu
#	Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
    ssh eieio
    # Right now we are out of space on this /cluster/store4 filesystem,
    #	so send the data to the bluearc
    mkdir /cluster/bluearc/hg16/bed/regPotential2X
    ln -s /cluster/bluearc/hg16/bed/regPotential2X \
    	/cluster/data/hg16/bed/regPotential2X

    cd /cluster/data/hg16/bed/regPotential2X

    mkdir data
    cd data
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
    wget --timestamping \
	"http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt.truncated"
    wget --timestamping \
	"http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt"
    end
    gzip *.truncated *.txt
    #	I'll bet you could gzip the .wig files too and zcat them
    #	into hgLoadWiggle ?

    # The truncated files were a test.  It turns out the full scores
    #	are desired to be seen

    #	The data is for every 5 bases.  Doesn't appear to be in order,
    #	so sort it into wigAsciiToBinary
    ssh eieio
    cd /cluster/data/hg16/bed/regPotential2X
    mkdir wigFiles
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
	zcat data/chr${c}.axt_rpscores.txt.gz | sort -n | \
	    wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
	    -wibFile=wigFiles/chr${c}_rpMm4 stdin > chr${c}.limits
	echo chr${c} done
    end

    #	To load the data
    #		(some day in the future the above wigAsciiToBinary function
    #		will be folded into hgLoadWiggle and thus one command)
    ssh hgwdev
    cd /cluster/data/hg16/bed/regPotential2X/wigFiles
    hgLoadWiggle hg16 regPotential2X chr*_rpMm4.wig
    ln -s `pwd`/chr*_rpMm4.wib /gbdb/hg16/wib

    # an optional data load to check a display problem
    #	The relative paths data/ and wigTrunc/ used below are relative to
    #	the regPotential2X directory, so go back to the fileserver and
    #	cd there first (same as for the wigFiles step).
    ssh eieio
    cd /cluster/data/hg16/bed/regPotential2X
    mkdir wigTrunc
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
	zcat data/chr${c}.axt_rpscores.txt.truncated.gz | sort -n | \
	    wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
	    -wibFile=wigTrunc/chr${c}_rpMm4t stdin > chr${c}t.limits
    end
    #	load the truncated-score .wig files and link the .wib files
    ssh hgwdev
    cd /cluster/data/hg16/bed/regPotential2X/wigTrunc
    hgLoadWiggle hg16 regPotential2XTrunc chr*_rpMm4t.wig
    ln -s `pwd`/chr*_rpMm4t.wib /gbdb/hg16/wib


# CREATE chimpSimpleDiff TRACK AND TABLE
    # Convert chimp quality scores from uncompressed contig to compressed
    # supercontig format.  This will take half an hour or so.
    cd /cluster/data/pt0
    zcat contigs.quals.gz | qaToQac stdin stdout | \
        chimpSuperQuals assembly.agp stdin scaffolds.qac

    # Make single base pair high quality differences into a bed file
    # and load into database
    cd /cluster/data/hg16/bed
    mkdir chimpSimpleDiff
    cd chimpSimpleDiff
    chimpHiQualDiffs /cluster/data/pt0/bed/blastz-blatHg16/axtBest \
    	/cluster/data/pt0/scaffolds.qac chimpSimpleDiff.bed
    # table definition: the simpleNucDiff schema with the table renamed
    sed 's/simpleNucDiff/chimpSimpleDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > \
    	chimpSimpleDiff.sql
    # load using the renamed schema written just above; hgLoadBed needs
    # the table name between the database and the bed file
    hgLoadBed -sqlTable=chimpSimpleDiff.sql hg16 chimpSimpleDiff chimpSimpleDiff.bed
    
  ### chimpFixedDiff -- panTro1 (Daryl, July 8, 2005)

    # Convert chimp quality scores from uncompressed to compressed
    # chromosome format.  This took 22 minutes on crow.
    cd /cluster/data/panTro1
    cat */chr*.qa | qaToQac stdin chrom.qac

    # Make single base pair high quality differences into a bed file
    # and load into database
    cd /cluster/data/hg16/bed
    mkdir chimpFixedDiff
    cd chimpFixedDiff
    sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql

    # chimpHiQualDiffs was changed to allow different quality
    # parameters as command line options

    ## FIRST ATTEMPT:  
    # (axtDir is an absolute path so that it still resolves after the
    # "cd chroms" below)
    set axtDir = /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet
    ##  time chimpFixedDiffs $axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
    # This crashed twice at the same place, but ran successfully when
    # each chromosome was run separately.
    mkdir chroms; cd chroms
    # make one working directory per chromosome axt file, then drop the
    # directories for the random chromosomes
    ls -1 $axtDir | grep chr | grep axt | sed 's/\.axt//' | xargs mkdir
    rmdir chr*random
    foreach f (chr*)
	echo -n $f "  "
	# each directory holds a link to just that chromosome's axt file
	ln -s $axtDir/$f.axt $f/$f.axt
	time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
    end
    # combine the per-chromosome results
    cat chr*bed > ../chimpFixedDiffs.bed

    ## The load (sort) ran out of memory on hgwdev, so I sorted the
    ## file first on kolossus (3 minutes) and then loaded it on hgwdev
    ssh kolossus
    hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg16 chimpFixedDiff chimpFixedDiffs.bed
    exit
    ## hgwdev (37 minutes)
    hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg16 chimpFixedDiff bed.tab

    # TODO: need to filter out polymorphic sites (SNPs)


## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
    # Data from Rachel Karchin in the Andrej Sali lab at UCSF
    # /cluster/data/hg16/bed/lssnp
    hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
    hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
    mysql> load data local infile "snp-human2-function-predictions.txt" into table lsSnpFunction;
    Query OK, 7689 rows affected (0.52 sec)
    mysql> load data local infile "snp-human2-structure-predictions.txt" into table lsSnpStructure;
    Query OK, 28144 rows affected (2.39 sec)


#  gc5Base wiggle TRACK (DONE - 2004-03-12 - Hiram)
#	reloaded wib files 2005-05-17 to place them in /gbdb/hg16/wib/gc5Base
    # a demonstration wiggle track.  Perform a gc count with a 5 base
    # window.  Also compute a "zoomed" view for display efficiency.
    mkdir /cluster/data/hg16/bed/gc5Base
    cd /cluster/data/hg16/bed/gc5Base

    #	in the script below, the 'grep -w GC' selects the lines of
    #	output from hgGcPercent that are real data and not just some
    #	information from hgGcPercent.  The awk computes the number
    #	of bases that hgGcPercent claimed it measured, which is not
    #	necessarily always 5 if it ran into gaps, and then the division
    #	by 10.0 scales down the numbers from hgGcPercent to the range
    #	[0-100].  Two columns come out of the awk print statement:
    #	<position> and <value> which are fed into wigAsciiToBinary through
    #	the pipe.  It is set at a dataSpan of 5 because each value
    #	represents the measurement over five bases beginning with
    #	<position>.  The result files end up in ./wigData5.
    cat << '_EOF_' > runGcPercent.sh
#!/bin/sh
# For every chromosome nib in ../../nib, run hgGcPercent with a 5 base
# window, rescale its GC lines with awk into <position> <value> pairs and
# pipe them into wigAsciiToBinary (dataSpan 5).
# Results are written to ./wigData5; wigAsciiToBinary's stderr for each
# chromosome is saved in ./dataLimits5/<chrom>.
mkdir -p wigData5 dataLimits5
for n in ../../nib/*.nib
do
	# c = chromosome name (chrN), C = the same name without the "chr" prefix
	c=$(basename "${n}" .nib)
	C=${c#chr}
	# printf rather than "echo -n", which is not portable under /bin/sh
	printf 'working on %s - %s ... ' "${c}" "${C}"
	hgGcPercent -chr="${c}" -doGaps \
		-file=stdout -win=5 hg16 ../../nib | grep -w GC | \
		awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
	wigAsciiToBinary \
		-dataSpan=5 -chrom="${c}" -wibFile="wigData5/gc5Base_${C}" \
		-name="${C}" stdin 2> "dataLimits5/${c}"
	echo "done"
done
'_EOF_'
    chmod +x runGcPercent.sh
    #	This is going to take perhaps two hours to run.  It is a lot of
    #	data.  make sure you do it on the fileserver:
    ssh eieio
    cd /cluster/data/hg16/bed/gc5Base
    ./runGcPercent.sh
    # load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/hg16/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base hg16 gc5Base wigData5/*.wig
    # and symlink the .wib files into /gbdb
    mkdir /gbdb/hg16/wib/gc5Base
    ln -s `pwd`/wigData5/*.wib /gbdb/hg16/wib/gc5Base

    # to speed up display for whole chromosome views, compute a "zoomed"
    # view and load that on top of the existing table.  The savings
    # comes from the number of data table rows the browser needs to load
    # for a full chromosome view.  Without the zoomed view there are
    # over 43,000 data rows for chrom 1.  With the zoomed view there are
    # only 222 rows needed for the display.  If your original data was
    # at 1 value per base the savings would be even greater.
    #	Pretty much the same data calculation
    # situation as above, although this time note the use of the
    # 'wigZoom -dataSpan=1000 stdin' in the pipeline.  This will average
    # together the data points coming out of the awk print statement over
    # a span of 1000 bases.  Thus each <position> coming out of wigZoom
    # will represent the measurement of GC in the next 1000 bases.  Note
    # the use of -dataSpan=1000 on the wigAsciiToBinary to account for
    # this type of data.  You want your dataSpan here to be an exact
    # multiple of your original dataSpan (5*200=1000) and on the order
    # of at least 1000, doesn't need to go too high.  For data that is
    # originally at 1 base per value, a convenient span is: -dataSpan=1024
    # A new set of result files ends up in ./wigData5_1K/*.wi[gb]
    cat << '_EOF_' > runZoom.sh
#!/bin/sh
# Same GC computation as runGcPercent.sh, but the <position> <value>
# stream is passed through wigZoom -dataSpan=1000 before being handed to
# wigAsciiToBinary (also with dataSpan 1000).
# Results are written to ./wigData5_1K; wigAsciiToBinary's stderr for
# each chromosome is saved in ./dataLimits5_1K/<chrom>.

mkdir -p wigData5_1K dataLimits5_1K

for n in ../../nib/*.nib
do
	# c = chromosome name (chrN), C = the same name without the "chr" prefix
	c=$(basename "${n}" .nib)
	C=${c#chr}
	# printf rather than "echo -n", which is not portable under /bin/sh
	printf 'working on %s - %s ... ' "${c}" "${C}"
	hgGcPercent -chr="${c}" -doGaps \
		-file=stdout -win=5 hg16 ../../nib | grep -w GC | \
		awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
	wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
		-dataSpan=1000 -chrom="${c}" \
		-wibFile="wigData5_1K/gc5Base_${C}_1K" \
		-name="${C}" stdin 2> "dataLimits5_1K/${c}"
	echo "done"
done
'_EOF_'
    chmod +x runZoom.sh
    #	This is going to take even longer than above, certainly do this
    #	on the fileserver
    ssh eieio
    time ./runZoom.sh
real    232m3.265s
user    302m37.050s
sys     16m13.770s
    #	Then load these .wig files into the same database as above
    ssh hgwdev
    hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base -oldTable hg16 gc5Base \
	wigData5_1K/*.wig
    # and symlink these .wib files into /gbdb
    mkdir -p /gbdb/hg16/wib/gc5Base
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg16/wib/gc5Base

# KNOWN GENES TRACK (STARTED - 2004-01-15 - with Gene Sorter complete
#				2004-02-17 Hiram)

    # you will probably need to make the programs in kent/src/hg/protein
    cd ~/kent/src/hg/protein
    make
    # The scripts run below will check for programs and let you know
    # which ones are missing

    # obtain new SwissProt database (should be done about once a month)
    # the swiss prot data is currently living on store5, first step is
    # on the fileserver.  This script was used once as it was created,
    # it may need to be verified and improved as it is used again.  See
    # comments at the top of the script.
    ssh eieio
    cd /cluster/data/swissprot
    ~/kent/src/hg/protein/mkSwissProtDB.sh
    # that obtains the data and unpacks it, second step is on hgwdev
    # to create the database
    ssh hgwdev
    cd /cluster/data/swissprot
    ~/kent/src/hg/protein/mkSwissProtDB.sh
    # Now the proteins database can be created from that.  Must be on hgwdev
    # Again, a script that has been used once upon creation, see
    # comments in it.  For example currently it is assumed these two
    # scripts have been run on the same day.  In this case 031112
    ssh hgwdev
    cd /cluster/data/proteins
    ~/kent/src/hg/protein/mkProteinsDB.sh
    # with those two databases existing, ready for the actual known genes
    # track build.  Must be on hgwdev since it is all mostly database
    # operations.  The {Date} argument is the date stamp created by the
    # above two scripts.  Something of the form YYMMDD, e.g.: 031112
    # Again, a script that has been used only once at creation, see
    # comments at top of script.
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/knownGenes
    cd /cluster/data/hg16/bed/knownGenes
    DateStamp=040115
    ~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
    # that runs to a point where it prepares data and jobList for a
    # cluster run.  Continue with a cluster run on kk
    ssh kk
    cd /cluster/data/hg16/bed/knownGenes/kgBestMrna
    para create jobList
    para try
    para check
    para push
    # this is a quick cluster job.  Less than five minutes. e.g.:
# Completed: 43580 of 43580 jobs
# CPU time in finished jobs:     114636s    1910.60m    31.84h    1.33d  0.004 y
# IO & Wait Time:                111889s    1864.82m    31.08h    1.30d  0.004 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest job:                        9s       0.15m     0.00h    0.00d
# Submission to last job:           282s       4.70m     0.08h    0.00d

    # Continuing back on hgwdev, run the same script again.
    # The date stamp must be the same one used for the first pass above
    # (and it matches the proteins040115 database referenced below).
    ssh hgwdev
    cd /cluster/data/hg16/bed/knownGenes
    DateStamp=040115
    ~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
    # that should run to completion and the known genes track is ready

    # Add the proteins link into gdbPdb.hgcentral:
    hgsql -e 'INSERT INTO gdbPdb (genomeDb, proteomeDb) \
	VALUES ("hg16","proteins040115");' \
	-h genome-testdb hgcentraltest

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/hg16/bed/knownGenes.2004-01-29
     mkdir index
     cd index
     hgKgGetText hg16 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ix /gbdb/hg16/knownGene.ix
     ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ixx /gbdb/hg16/knownGene.ixx


# VEGA GENES UPDATE from 2004/01/15 below  (2004-02-04 - Hiram)
    mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
    mkdir /cluster/data/hg16/bed/vegaUpdate
    cd /cluster/data/hg16/bed/vegaUpdate
    
    wget --timestamping ftp://ftp.sanger.ac.uk/pub/searle/*.gtf.gz

    # Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks.  Just
    # omit snoRNAs, as there are so few of them
    zcat *.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}'  > vegaGene.gtf
    zcat *.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}'  > vegaPseudoGene.gtf

    ldHgGene -gtf hg16 vegaGeneUpdate vegaGene.gtf
    ldHgGene -gtf hg16 vegaPseudoGeneUpdate vegaPseudoGene.gtf

    wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz
    # the download is gzipped; hgPepPred below is given the unpacked .fa
    gunzip vega_pep_dump_ncbi34.fa.gz
    hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa

    vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
    hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
    hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16


# LOAD VEGA GENES AND PSEUDOGENES (reloaded 2004/01/15 markd)
    # reloaded due to bug in creating bogus CDS
    mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
    mkdir ~/hg16/bed/vega
    cd ~/hg16/bed/vega
    
    wget http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_ncbi34.gtf.gz

    # Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks.  Just
    # omit snoRNAs, as there are so few of them
    zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}'  > vegaGene.gtf
    zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}'  > vegaPseudoGene.gtf

    ldHgGene -gtf hg16 vegaGene vegaGene.gtf
    ldHgGene -gtf hg16 vegaPseudoGene vegaPseudoGene.gtf

    wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz
    # the download is gzipped; hgPepPred below is given the unpacked .fa
    gunzip vega_pep_dump_ncbi34.fa.gz
    hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa

    vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
    hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
    hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16


# KNOWN GENES UPDATE (DONE - 2004-01-29 - Hiram)
# RELOADED THE cgapBiocDesc AND cgapAlias TABLES TO REMOVE REPLICATED ROWS 
# (DONE, 2005-07-26, hartera)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
    # update swissProt and proteins databases
    # You want to run these two scripts on the same day to keep
    #	the date stamp consistent.  In this case the date stamp is 040115
    ssh eieio
    cd /cluster/data/swissprot
    ~kent/src/hg/protein/mkSwissProtDB.sh
    # that obtains the data and unpacks it, second step is on hgwdev
    # to create the database
    ssh hgwdev
    cd /cluster/data/swissprot
    ~/kent/src/hg/protein/mkSwissProtDB.sh
    # Now the proteins database can be created from that.  Must be on
    # hgwdev
    ssh hgwdev
    cd /cluster/data/proteins
    # script name spelled as in the KNOWN GENES TRACK section (mkProteinsDB.sh)
    ~/kent/src/hg/protein/mkProteinsDB.sh 040115

    # prepare all the tables in a temporary database, then move
    #	into Hg16.  Leave a link in hg16/bed so it can be found
    mkdir /cluster/data/kgDB/bed/hg16
    ln -s /cluster/data/kgDB/bed/hg16 \
	/cluster/data/hg16/bed/knownGenes.2004-01-29
    cd /cluster/data/kgDB/bed/hg16
    ~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115
    # That runs to a point that prepares a cluster job, continuing on kk
    ssh kk
    cd /cluster/data/kgDB/bed/hg16/kgBestMrna
    para create jobList
    para try
    para push 
    ... etc ...
    # on a busy cluster, takes almost an hour:
# Completed: 46583 of 46583 jobs
# CPU time in finished jobs:     127351s    2122.51m    35.38h    1.47d  0.004 y
# IO & Wait Time:                119182s    1986.37m    33.11h    1.38d  0.004 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest job:                       14s       0.23m     0.00h    0.00d
# Submission to last job:          3513s      58.55m     0.98h    0.04d

     # Continuing back on hgwdev, run the same script again
    ssh hgwdev
    cd /cluster/data/kgDB/bed/hg16
    ~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115
    # should continue to completion, all tables are in kgDB and can be
    # moved if they check out to be similar to existing tables in hg16
    # You can verify table sizes with the script:
    ~kent/src/hg/protein/checkTbls.pl kgDB
    ~kent/src/hg/protein/checkTbls.pl hg16 kg
    # should have similar row counts in each of these outputs
    #	This rename can be done more simply with the 'rename' command
    #	instead of the 'alter table' used here.
    cat << '_EOF_' > renameTables.sh
#!/bin/sh
# Move each known genes table from the SOURCE database into the TARGET
# database: drop the copy in TARGET (if there is one), then rename
# SOURCE.<table> to TARGET.<table>.

SOURCE=kgDB
TARGET=hg16

for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna \
        keggMapDesc keggPathway kgAlias kgProtAlias kgXref \
        knownGene knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
do
    # "if exists" avoids a spurious error when TARGET has no such table yet
    hgsql -e "drop table if exists ${T};" "${TARGET}"
    hgsql -e "alter table ${SOURCE}.${T} rename ${TARGET}.${T}" mysql
    echo "done $T"
done
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x renameTables.sh
    ./renameTables.sh
    # RELOAD THE cgapBiocDesc AND cgapAlias TABLES (hartera, 2005-07-26)
    # Reload the cgapBiocDesc and cgapAlias tables as they have replicated 
    # rows. Need to sort and unique the file before loading into the database.

    cd /cluster/data/kgDB/bed/hg16
    sort -u cgapBIOCARTAdesc.tab > cgapBIOCARTAdescSorted.tab
    # for cgapAlias, the number of rows in the table is different to the
    # tab file here so dump the table first.
    # RELOAD cgapAlias AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
    # OR sort -n | uniq.
    #USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ(hartera, 2005-10-06)
    # hgsql -N -e 'select * from cgapAlias;' hg16 > cgapAliasDump.txt
    # above command used to get alias file from hg16 before sorting
    sort -n cgapAliasDump.txt | uniq > cgapAliasDumpSorted.tab
    hgsql hg16 -e "drop table cgapBiocDesc"
    hgsql hg16 -e "drop table cgapAlias"
    hgsql hg16 < ~/kent/src/hg/lib/cgapBiocDesc.sql
    hgsql hg16 < ~/kent/src/hg/lib/cgapAlias.sql
    hgsql hg16 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" \
          into table cgapBiocDesc'
    hgsql hg16 -e 'load data local infile "cgapAliasDumpSorted.tab" \
          into table cgapAlias'

    # the following extra process will be included in the next version
    # of KGprocess.sh to create the kgProtMap table:
    mkdir /cluster/data/kgDB/bed/hg16/kgProtMap
    cd /cluster/data/kgDB/bed/hg16/kgProtMap
    awk '{print ">" $1;print $2}' ../refMrna.tab > kgMrna.fa
    /scratch/blast/formatdb -i kgMrna.fa -p F
    echo "`date` creating kgPep.fa"
# Dump spID/sequence pairs (kgXref joined to knownGenePep on kgID=name)
# and reformat them as fasta records in kgPep.fa.
# NOTE(review): ${DB} is not set anywhere in this section; presumably it
# should name the database that holds kgXref and knownGenePep (hg16 after
# the renameTables.sh step above) -- confirm before re-running.
hgsql -N -e 'select spID,seq from kgXref,knownGenePep where kgID=name' ${DB} \
        | awk '{print ">" $1;print $2}' >kgPep.fa
    rm -fr kgPep
    rm -f jobList
    mkdir kgPep
    faSplit sequence kgPep.fa 5000 kgPep/kgPep
    for f in kgPep/*.fa
    do
      echo ./kgProtBlast.csh $f >> jobList
    done
    awk '{printf "%s\t%s\n", $3,$2}' ../kgXref.tab > kgProtMrna.pairs
    # run a cluster job
    ssh kk9
    cd /cluster/data/kgDB/bed/hg16/kgProtMap
    para create jobList
    para try
    para push ... etc
# Completed: 4949 of 4949 jobs
# CPU time in finished jobs:    1061454s   17690.90m   294.85h   12.29d  0.034 y
# IO & Wait Time:                 13400s     223.33m     3.72h    0.16d  0.000 y
# Average job time:                 217s       3.62m     0.06h    0.00d
# Longest job:                      996s      16.60m     0.28h    0.01d
# Submission to last job:         12152s     202.53m     3.38h    0.14d

    # back to hgwdev
    ssh hgwdev
    cd /cluster/data/kgDB/bed/hg16/kgProtMap
    find ./psl.tmp -name '*.psl.gz' | xargs zcat | \
        pslReps -nohead stdin psl.tmp/kgProtMrna.psl /dev/null
    cd psl.tmp
    (pslMap kgProtMrna.psl ../../tight_mrna.psl stdout | \
        sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl) > kgProtMap.out 2>&1
    # this table data is ready to load, verify it by comparison with
    # existing kgProtMap data, then load:
    hgLoadPsl hg16 kgProtMap.psl


# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 2/18/04 angie)
    # In an email 2/13/04, Arian said we could treat all human repeats as 
    # lineage-specific for human-chicken blastz.  Scripts expect *.out.spec 
    # filenames, so set that up:
    ssh kkr1u00
    cd /cluster/data/hg16
    mkdir /iscratch/i/gs.17/build34/linSpecRep.Chicken
    foreach f (/scratch/hg/gs.17/build34/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/gs.17/build34/linSpecRep.Chicken/$f:t:r:r.out.spec
    end
    iSync
    # Use these the next time we run human-chicken blastz.


# BLASTZ CHICKEN (GALGAL2) (DONE 2/26/04 angie)
    ssh kk
    # space is awful tight on store4 -- use store7.  
    mkdir -p /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25
    ln -s /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25 \
      /cluster/data/hg16/bed/
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    # Set L=10000 (higher threshold on blastz's outer loop) and abridge 
    # repeats.
    cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/store7/hg16/bed/blastz.galGal2.2004-02-25

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # first cluster run: raw blastz alignments
    ssh kk
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, check, push, check, ....
#Completed: 51189 of 51189 jobs
#Average job time:                 477s       7.95m     0.13h    0.01d
#Longest job:                     2318s      38.63m     0.64h    0.03d
#Submission to last job:         29598s     493.30m     8.22h    0.34d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, check, push, etc ...
#Completed: 339 of 339 jobs
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest job:                       21s       0.35m     0.01h    0.00d
#Submission to last job:           150s       2.50m     0.04h    0.00d

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    mkdir axtChrom pslChrom run.2
    cd run.2
    # do.csh takes one lav/chrN directory as $1: it concatenates that
    # directory's *.lav files (ordered with sort -g), runs lavToAxt against
    # the bothMaskedNibs and galGal2 nib directories, sorts the result with
    # axtSort into axtChrom/chrN.axt, and then converts that axt file to
    # pslChrom/chrN.psl with axtToPsl.
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
    /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/galGal2/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time:                  38s       0.64m     0.01h    0.00d
#Longest job:                      147s       2.45m     0.04h    0.00d
#Submission to last job:           147s       2.45m     0.04h    0.00d


# RUN AXTBEST AND GENERATE MAF FOR MULTIZ (DONE 2/26/04 angie)
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    mkdir axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/$chr.psl
    end
    mkdir mafBest
    foreach f (axtBest/chr*.axt)
      set maf = mafBest/$f:t:r.hg.maf
      axtToMaf $f \
            /cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \
            $maf -tPrefix=hg16. -qPrefix=galGal2.
    end
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    cat pslBest/chr*.psl | hgLoadPsl -table=blastzBestGalGal2 hg16  stdin


# CHAIN CHICKEN BLASTZ (DONE 2/26/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Make our own linear gap file with reduced gap penalties, 
    # in hopes of getting longer chains:
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy

    # doChain arguments (see the gsub template above): $1 = input axt file,
    # $2 = output chain file, $3 = file that receives axtChain's stdout.
    # The axt is first passed through axtFilter -notQ=chrUn, then chained
    # with the HoxD55.q score matrix, the tuned linear gap file written
    # above, and a minimum score of 5000.
    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
                      -linearGap=../../chickenHumanTuned.gap \
                      -minScore=5000 stdin \
    /iscratch/i/gs.17/build34/bothMaskedNibs \
    /iscratch/i/galGal2/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # axtChrom/chr1{8,9}_random.axt are empty, so the {out line +} checks 
    # failed:
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#Average job time:                  28s       0.46m     0.01h    0.00d
#Longest job:                       76s       1.27m     0.02h    0.00d
#Submission to last job:            92s       1.53m     0.03h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain hg16 ${c}_chainGalGal2 $i
    end


# RESCORE CHICKEN BLASTZ (DONE 3/1/04 angie)
    # Webb noticed low scores in latest runs with repeats abridged --
    # PSU's restore_rpts program rescored alignments with default matrix 
    # instead of BLASTZ_Q matrix.  Rescore them here so the chainer sees 
    # the higher scores:
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    mkdir axtChrom.rescore
    foreach f (axtChrom/chr*.axt)
      axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
        $f axtChrom.rescore/$f:t
    end
    mv axtChrom axtChrom.orig
    mv axtChrom.rescore axtChrom


# NET CHICKEN BLASTZ (DONE 2/26/04 angie)
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    netClass noClass.net hg16 galGal2 human.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with netFilter -syn:
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg16 netGalGal2 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyGalGal2 stdin
    # Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to 
    # human/hg16 trackDb


# MAKE VSGALGAL2 DOWNLOADABLES (DONE 3/1/04 angie)
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    # Webb asked for axtChrom/chr22.axt... since axtChrom is rel. small 
    # this time, just put it all out there.
    zip /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom/chr*.axt
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    cp all.chain chicken.chain
    zip /cluster/data/hg16/zip/chicken.chain.zip chicken.chain
    rm chicken.chain
    cp human.net chicken.net
    zip /cluster/data/hg16/zip/chicken.net.zip chicken.net
    rm chicken.net
    cp humanSyn.net chickenSyn.net
    zip /cluster/data/hg16/zip/chickenSyn.net.zip chickenSyn.net
    rm chickenSyn.net

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
    cd /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
    mv /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom.zip
    mv /cluster/data/hg16/zip/chicken*.zip .
    md5sum *.zip > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 (DONE 3/8/04 angie)
# (galGal2 added to human/mouse/rat alignments described above [HUMOR])
    # put the MAFs on bluearc
    ssh eieio
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg
    cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \
      /cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr
    cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
      /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg

    ssh kki
    mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
    mkdir hmrg
    # Wrapper script required because of stdout redirect:
    cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr/*.maf) 
      set root=$file:t:r:r
      echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/${root}.maf" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 41 of 41 jobs
#Average job time:                  88s       1.47m     0.02h    0.00d
#Longest job:                      276s       4.60m     0.08h    0.00d
#Submission to last job:           278s       4.63m     0.08h    0.00d

    # clean up bluearc (these are big files!)
    rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2

    # setup external files for database reference
    ssh hgwdev
    mkdir -p /gbdb/hg16/multizMm3Rn3GalGal2
    ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf \
      /gbdb/hg16/multizMm3Rn3GalGal2
    # load into database
    /cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2


# LOAD SOFTBERRY GENES (DONE - 2004-02-10 - Hiram)
     mkdir -p /cluster/data/hg16/bed/softberry
     cd /cluster/data/hg16/bed/softberry
     set file = Soft_fgenesh_jul03.tar.gz
     wget --timestamping ftp://www.softberry.com/pub/SC_HUM_JUL03/$file
     tar xzvf $file
     ldHgGene hg16 softberryGene fgenesh_jul03/chr*.gff
     hgPepPred hg16 softberry fgenesh_jul03/*.protein
     hgSoftberryHom hg16 fgenesh_jul03/*.protein


# CHIMP (panTro1) ALIGNMENTS (2004-02-12 kate)

    # lift scaffold-based reciprocal best chains to chrom coordinates
    # NOTE(review): the mkdir/cd below use relative paths and no starting
    # directory is given -- presumably /cluster/data/hg16; confirm.
    ssh eieio
    mkdir -p bed/blastz-blat.panTro1
    cd bed/blastz-blat.panTro1
    # copy the scaffold-based best chains and the scaffold lift file locally
    cp /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain \
                                        best.scaffolds.chain
    cp /cluster/data/panTro1/jkStuff/scaffolds.lft scaffolds.lft
    # lift the query (-chainQ) side of the chains using scaffolds.lft
    ~kate/bin/i386/liftUp -chainQ best.chain scaffolds.lft \
                                        warn best.scaffolds.chain

#Make a track from Tarjei's chimp deletions file (2/12/04, kpollard)
#  80-12000 bp indels in human/chimp alignments

    #make .bed files from Tarjei's .fa files
    cd /cluster/data/panTro1/bed/indels
    /cluster/bin/i386/faSimplify indels.human.fa , , temp.fa
    /cluster/bin/i386/faSize detailed=on temp.fa > human.start.txt
    /cluster/bin/i386/faSimplify indels.human.fa ">" , temp.fa
    /cluster/bin/i386/faSize detailed=on temp.fa > human.chr.txt
    R
	#Commands in R
	chr<-read.table("human.chr.txt") #read in chromosome and size
	start<-read.table("human.start.txt") #read in start and size
	both<-cbind(chr,start) #concatinate: chrN size start size
	sum(both[,2]!=both[,4]) #check that the size columns are identical
	#0
	both[,4]<-both[,2]+both[,3] #add start and size to get stop
	both<-both[,c(1,3,4,2)] #reorder columns to get chrN start stop size
	both[,4]<-paste("CD",1:length(both[,4]),"_",both[,4],sep="") #make name like CDN_size
	write(t(both),"indels.human.bed",ncol=4) #write bed file
	q() #quit
    #delimit with tabs
    cat indels.human.bed | gawk '{print $1"\t"$2"\t"$3"\t"$4}' > indels.human.tab.bed
    #load track into browser
    mkdir -p /gbdb/hg16/hg_insert
    ln -s  /cluster/data/panTro1/bed/indels/indels.human.tab.bed /gbdb/hg16/hg_insert
    cd /cluster/data/panTro1/bed/indels
    /cluster/bin/i386/hgLoadBed hg16 hg_insert indels.human.tab.bed
    #change name to chimpDels
    hgsql hg16
	rename table hg_insert to chimpDels;
	exit
    #add description file chimpDels.html 
    # to ~/kent/src/hg/makeDb/trackDb/human/hg16
    #add a track entry to trackDb.ra 
    # in ~/kent/src/hg/makeDb/trackDb/human/hg16


#	FAMILY BROWSER UPDATE	(DONE - 2004-02-17 - Hiram)
#	to be done after knownGene tables are complete from known gene
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg16/bed/famBro.2004-02-17
# famBro is a symbolic link to the dated build directory ("ln -l" is not
# a valid ln option; -s is what is needed here)
ln -s /cluster/data/hg16/bed/famBro.2004-02-17 /cluster/data/hg16/bed/famBro
cd /cluster/data/hg16/bed/famBro
hgClusterGenes hg16 knownGene knownIsoforms knownCanonical

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg16/bed/famBro/blastp
cd /cluster/data/hg16/bed/famBro/blastp
pepPredToFa hg16 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	This command is in /projects/compbio/bin/$MACH/formatdb

# Copy over database to bluearc
rm -fr /cluster/bluearc/hg16/blastp
mkdir -p /cluster/bluearc/hg16/blastp
cp -p /cluster/data/hg16/bed/famBro/blastp/known.* /cluster/bluearc/hg16/blastp

# Load up cluster/bluearc with blastp and related files
# if necessary
if (! -e /cluster/bluearc/blast/blastall) then
    mkdir -p /cluster/bluearc/blast
    cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast
    mkdir -p /cluster/bluearc/blast/data
    cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data
endif

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg16/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory   (this would not work on kk, use kk9 instead)
#	Need to check the difference between the blast in /scratch/blast
#	and this /cluster/bluearc/blast
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/self
cd /cluster/data/hg16/bed/famBro/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data
export BLASTMAT
/cluster/bluearc/blast/blastall -p blastp \
	-d /cluster/bluearc/hg16/blastp/known -i $1 -o $2 \
	-e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.

# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      73213s    1220.22m    20.34h    0.85d  0.002 y
# IO & Wait Time:                 20054s     334.23m     5.57h    0.23d  0.001 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest job:                      118s       1.97m     0.03h    0.00d
# Submission to last job:          1117s      18.62m     0.31h    0.01d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/self/run/out
hgLoadBlastTab hg16 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 11376875 rows

cd /cluster/data/hg16/bed/famBro
# Create table that maps between known genes and RefSeq
hgMapToGene hg16 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene
#	row count changed from 32674 to 35416

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg16 \
	> refToLl.txt
hgMapToGene hg16 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#	row count went from 32845 to 35146

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg16 knownGene name proteinID Pfam knownToPfam
#	row count went from 31201 to 32225
# JK Fixed bug that let multiple identical columns happen in knownToPfam
# on April 15, 2004.  Row count now 30467

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene hg16 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create expression distance table - takes about an hour
# (Regenerated April 16, 2004 in response to knownToGnfAtlas2 update)
    hgExpDistance hg16 hgFixed.gnfHumanAtlas2MedianRatio \
    	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg16 affyUcla knownGene knownToU133
#	row count went from 34148 to 36818

# Create expression distance table.  This will take about an hour.
cd ~/kent/src/hg/near/hgExpDistance
time hgExpDistance hg16 affyUcla affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133
# 42 genes, 42 weights, 26.500000 total wieght
# Got 36818 unique elements in affyUcla
# Made knownExpDistance.tab
# Loaded knownExpDistance
# Made query index
# real    80m50.113s
# user    62m33.290s
# sys     2m15.200s

#	This command should be done elsewhere, /tmp or something like that
#	It makes a temporary .tab file of almost 1 Gb
#	row count went from 34148000 to 36818000

# Create table that maps between known genes and 
# the GNF data.
hgMapToGene hg16 affyU95 knownGene knownToU95
cd /tmp
#	hgFixed.gnfHumanU95Exps argument is unused, no need to exist
hgExpDistance hg16 hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95
#	row count went from 11718000 to 17330000
#  original makeNear.doc had this as:
# hgExpDistance hg16 affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95

# Make sure that GO database is up to date.
# See README in /cluster/store1/geneOntology.
#	I update this GO database very carefully, checking that all
#	structures in it remain the same from release to release and
#	backing up the current go DB in a backup database.  In this case
#	the backup is go040107 - when it was loaded for Mm4, and the new
#	go database is based on data from Dec 17th 2003 and Feb 2004 according
#	to the time stamp on the fetched data.  This build was done in
#	/cluster/store1/geneOntology/20040217

cd /cluster/data/hg16/bed/famBro

# Create knownToEnsembl column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
#	table row count went from previous version: 36068 to 38251

# Make knownToCdsSnp column.  This is a little complicated by
# having to merge data from the snpTsc and the snpNih tracks.
hgMapToGene hg16 snpTsc knownGene knownToCdsSnp -createOnly -all -cds
hgMapToGene hg16 snpTsc knownGene snp1 -noLoad -all -cds
hgMapToGene hg16 snpNih knownGene snp2 -noLoad -all -cds
sort snp1.tab snp2.tab > knownToCdsSnp.tab
rm snp1.tab snp2.tab
hgsql \
    -e 'load data local infile "knownToCdsSnp.tab" into table knownToCdsSnp;' \
	hg16
#	row count went from 87273 to 106199

# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/ce1/blastp should have data

# Create the ceBlastTab  (the blastall binary only works on kk9 for now ...)
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/ce1
cd /cluster/data/hg16/bed/famBro/blastp/ce1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
	-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# This should finish in ~10 minutes if the cluster is free.
# Here's the para time results
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      28869s     481.16m     8.02h    0.33d  0.001 y
# IO & Wait Time:                 20454s     340.89m     5.68h    0.24d  0.001 y
# Average job time:                   6s       0.11m     0.00h    0.00d
# Longest job:                       52s       0.87m     0.01h    0.00d
# Submission to last job:           584s       9.73m     0.16h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/ce1/run/out
hgLoadBlastTab hg16 ceBlastTab -maxPer=1 *.tab
#	row count went from 25599 to 26958

# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeMm4.doc for procedure
#	the directory: /cluster/bluearc/mm4/blastp should have data

# Make parasol run directory 
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/mm4
cd /cluster/data/hg16/bed/famBro/blastp/mm4
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
	-p blastp -d /cluster/bluearc/mm4/blastp/known \
	-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
# (wordLine wouldn't run on kk9:
#	wordLine: /lib/i686/libc.so.6: version `GLIBC_2.3' not found
#	run this echo statement on hgwdev
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls

echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

#	takes about 15 minutes:
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      54179s     902.98m    15.05h    0.63d  0.002 y
# IO & Wait Time:                 20428s     340.47m     5.67h    0.24d  0.001 y
# Average job time:                  10s       0.16m     0.00h    0.00d
# Longest job:                       76s       1.27m     0.02h    0.00d
# Submission to last job:          2031s      33.85m     0.56h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/mm4/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 35611 rows
#	row count went from 33191 to 35611


# REFSEQ HOMOLOGS (DONE 6/18/04 angie)
    # Translate mmBlastTab's knownGene acc's into RefSeq where possible, 
    # since our users frequently ask for help in determining homologs for 
    # human/mouse RefSeq accs...
    ssh hgwdev
    hgsql hg16 -e \
     'create table mmRefSeqHomolog \
      select hg16.knownToRefSeq.value as name, \
             mm3.knownToRefSeq.value as homolog, \
             mmBlastTab.identity, mmBlastTab.aliLength, mmBlastTab.mismatch, \
             mmBlastTab.gapOpen, mmBlastTab.qStart, mmBlastTab.qEnd, \
             mmBlastTab.tStart, mmBlastTab.tEnd, mmBlastTab.eValue , \
             mmBlastTab.bitScore \
      from mmBlastTab, hg16.knownToRefSeq, mm3.knownToRefSeq \
      where hg16.knownToRefSeq.name = mmBlastTab.query and \
            mm3.knownToRefSeq.name = mmBlastTab.target;'



# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dr1/blastp should have data

# Make parasol run directory 
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dr1
cd /cluster/data/hg16/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
	-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
	-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      40575s     676.24m    11.27h    0.47d  0.001 y
# IO & Wait Time:                 19781s     329.69m     5.49h    0.23d  0.001 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest job:                       95s       1.58m     0.03h    0.00d
# Submission to last job:          2036s      33.93m     0.57h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab hg16 drBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 32204 rows
#	row count went from 30339 to 32204

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory 
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/sc1
cd /cluster/data/hg16/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
	-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:       8577s     142.96m     2.38h    0.10d  0.000 y
# IO & Wait Time:                 19756s     329.26m     5.49h    0.23d  0.001 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest job:                       15s       0.25m     0.00h    0.00d
# Submission to last job:          1172s      19.53m     0.33h    0.01d
# Load into database.  
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab hg16 scBlastTab -maxPer=1 *.tab
#	row count went from 17089 to 17886

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dm1/blastp should have data

# Make parasol run directory 
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dm1
cd /cluster/data/hg16/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
	-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      33371s     556.18m     9.27h    0.39d  0.001 y
# IO & Wait Time:                 19546s     325.77m     5.43h    0.23d  0.001 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest job:                       53s       0.88m     0.01h    0.00d
# Submission to last job:          1657s      27.62m     0.46h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab hg16 dmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 28645 rows
#	row count went from 27173 to 28645


# LOAD SNPS (Done.  Daryl Thomas; February 18, 2004)
    # SNP processing has been condensed into a single script,
    # which makes snpNih, snpTsc, and snpMap
    #   ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh

    # snpBuild = 119
    # Run from directory $oo/bed/snp/build$snpBuild/snpMap
    mkdir -p $oo/bed/snp/build$snpBuild/snpMap
    cd       $oo/bed/snp/build$snpBuild/snpMap
    processSnpLocations.csh hg16 human 34_2 119 >& log &

    # check data:
    # wc -l snpTsc.bed; hg16 -e "select count(*) from snpTsc; 
    # wc -l snpNih.bed; hg16 -e "select count(*) from snpNih; 
    # wc -l snpMap.bed; hg16 -e "select count(*) from snpMap; 
    # select * from snpNih limit 5; desc snpNih; show indexes from snpNih"
    # select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc"
    # select * from snpMap limit 5; desc snpMap; show indexes from snpMap"

    # remove temp files
    # rm human* *bed.gz

# LOAD SNP DETAILS (Done.  Daryl Thomas; February 18, 2004)
    # SNP processing has been condensed into a single script,
    # which makes dbSnpRsHg
    #   ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh

    # snpBuild = 119
    # Run from directory $oo/bed/snp/build$snpBuild/details
    # Create the details working directory with its Done and Observed
    # subdirectories, then start the details script in the background with
    # stdout and stderr both captured in "log" (csh >& redirection).
    mkdir -p $oo/bed/snp/build$snpBuild/details/Done
    mkdir -p $oo/bed/snp/build$snpBuild/details/Observed
    cd       $oo/bed/snp/build$snpBuild/details
    processSnpDetails.csh hg16 human 119 >& log &
    # NOTE(review): the next line is an SQL statement rather than a shell
    # command, and $fileBase, $database and $table are not set anywhere in
    # this section -- presumably these two lines record steps performed by
    # the script itself; confirm before running them by hand.
    load data local infile "$fileBase.out" into table $database.$table
    gzip $fileBase.out

    # check data:
    # hgFixed -e "select count(*) from dbSnpRsHg; 
    # select * from dbSnpRsHg limit 5; desc dbSnpRsHg; show indexes from dbSnpRsHg"

    # remove temp files
    # rm dbSnpRs*



# LOAD SNPS ( Daryl Thomas; February ??, 2005)
    set db    = hg16
    set org   = human
    set build = 122
    set dir   = /cluster/bluearc/snp/$db/build$build

    # ssh to some quiet machine with fast access to the bluearc
    # it takes ~4.5 hours to download the data 
    # (build 124 directly to /cluster/bluearc/... from eieio)
    # Check to make sure the chrMT file is included
    mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
    cd $dir
    ln -s /cluster/data/$db/jkStuff/liftAll.lft .

    screen
    ftp ftp.ncbi.nih.gov
    cd snp/$org/XML
    prompt
    mget ds_ch*.xml.gz
    exit # screen
    exit # machine

    # TODO: check chromStart for each locType

    # Install the dbSNP XML parser at the path the cluster jobs call
    # (/cluster/bin/scripts/parseDbSnpXML) and make it executable.
    # ${HOME} is the ordinary variable reference; the former {$HOME} only
    # resolved through csh brace expansion and stays literal in sh.
    cp -f ${HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
    chmod 775 /cluster/bin/scripts/parseDbSnpXML

    ssh kk
    touch jobList
    foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
	    set out = $file:t:r
	    echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
    end

    # para create jobList; para push; para check ... 
# CPU time in finished jobs:      28235s     470.58m     7.84h    0.33d  0.001 y
# IO & Wait Time:                  1986s      33.10m     0.55h    0.02d  0.000 y
# Average job time:                1119s      18.65m     0.31h    0.01d
# Longest job:                     2339s      38.98m     0.65h    0.03d
    exit # kk

    # Move the finished build directory to /cluster/data and continue there.
    # mv relocates a directory as it is; it has no -r option, so the former
    # "mv -r" failed with an invalid-option error.
    mv $dir /cluster/data/$db/bed/snp/build$build
    set dir = /cluster/data/$db/bed/snp/build$build
    cd $dir

    ssh eieio # or wherever data is local

    # concatenate the details files to make it easier to lift (and load)
    time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
    # 16.120u 13.070s 1:35.26 30.6%   0+0k 0+0io 86pf+0w (hgwdev)
    time gzip $db.build$build.contig.bed
    # 102.307u 5.524s 1:48.97 98.9%   0+0k 0+0io 1pf+0w (eieio/store5)

    # some of the NT contigs are not in the liftSpec - this is expected as snps that map to
    # alternate assemblies (Celera) are in the original files, but we disregard their mappings.
    time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz 
    # 190.473u 18.873s 3:52.33 90.1%  0+0k 0+0io 1pf+0w (eieio/store5)

    time gzip $db.build$build.bed
    # 107.476u 5.286s 1:54.25 98.6%   0+0k 0+0io 0pf+0w

    ssh hgwdev # or wherever database is located
    # hgLoadBed is the important step - check to make sure there are no warnings
    time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
    # Loaded 8722437 elements of size 16
    # 206.170u 48.370s 35:59.52 11.7% 0+0k 0+0io 82994pf+0w


    # basic snp table is now loaded, but exception column needs to be updated
    # ~ 3 hours wall clock time from here to end

    # run queries from snpException.query against snp table
    mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
    cd       /usr/local/apache/htdocs/qa/test-results/snpException/build$build
    time snpException $db 0 ${db}snpException > ${db}snpException.log
    chmod o+rx . 
    chmod o+r  * 
    # 24.590u 34.150s 41:04.48 2.3%   0+0k 0+0io 191pf+0w

    # check alignment of flanking sequences
    time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
    # 4688.790u 172.770s 1:28:45.62 91.2%     0+0k 0+0io 23000pf+0w
    # 5205.860u 216.570s 1:55:10.27 78.4%     0+0k 0+0io 72408pf+0w (hgwdev)

    ### NOTE: the pseudoautosomal snps are reported in the chrX files
    ### only, which causes problems for snpValid when checking the
    ### chrY snp mappings.  I got around this by confirming that all
    ### of the 'missing flank' errors (#23) were in pseudoautosomal
    ### regions and ignoring them.  I manually truncated the
    ### hg17snpException.23.bed file before continuing with the next
    ### step.  This could/should be fixed in the next iteration.

    # update snpExceptions table to match the number of exceptions found in the snpValid results
    # these numbers come from counting the numbers of lines in the output files without headers
    mysql> update snpExceptions set num=60797  where exceptionId=21;
    mysql> update snpExceptions set num=5657   where exceptionId=22;
    mysql> update snpExceptions set num=284098 where exceptionId=23;
    mysql> update snpExceptions set num=173    where exceptionId=24;

    # create list of statements to update the snp table and run them
    # Skip the first two lines of every ${db}snpException.* file, keep the
    # lines that contain "rs", and write columns 4, 2 and 5 tab-separated,
    # sorted, to exceptionList.txt.
    # "tail -n +3" is the POSIX spelling of the obsolete "tail +3", which
    # current tail implementations no longer accept with several files.
    time tail -n +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
    # ~10 seconds
    time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
    # 36.270u 1.980s 0:38.27 99.9%    0+0k 0+0io 337pf+0w
    time hgsql $db < updateExceptionList.sql
    # 18.130u 26.680s 58:39.97 1.2%   0+0k 0+0io 413pf+0w build122 (had to optimize table during run)
    # 8.420u 10.370s 11:58.44 2.6%    0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process)
    # 6.550u  9.370s 14:34.17 1.8%    0+0k 0+0io 413pf+0w build124
    # > wc -l build12*/updateExceptionList.sql
    # 1110994 build122/updateExceptionList.sql
    #  387166 build123/updateExceptionList.sql
    #  383759 build124/updateExceptionList.sql

# Add Affy SNPs from new submission

    #!/bin/csh -fe

    set db = hg16
    cd /cluster/data/$db/bed/snp/affy/latest
    touch affy.txt affy.bed Affy.bed bed.tab
    rm -f affy*.txt affy*.bed Affy.bed* bed.tab

    # datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com]
    tar xfz affyhg16maps.tgz
    wc -l affy*txt

    awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n",        $1,$2,$3,$4,$6,$7);}' < affy10K.txt         > affy10K.bed
    awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n",      $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt       > affy10Kv2.bed
    awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed
    awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n",   $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt    > affy50K_XbaI.bed

    # this is a temporary kluge to fix some bad input data.
    cat affy*.bed | sed 's/_par//' > Affy.bed

    # the source enum for 'dbSnp' is 2; all of the affy* values are higher.
    hgsql $db -e "delete from snp where source > 2 "

    hgLoadBed $db snp Affy.bed -oldTable -tab

    rm -f affy*.txt affy*.bed bed.tab
    gzip Affy.bed

    #mysql> select source, count(*) from snp group by source;                                                                                
    #+-----------------+----------+
    #| source          | count(*) |
    #+-----------------+----------+
    #| dbSnp           |  8722437 |
    #| Affy10K         |    11464 |
    #| Affy10Kv2       |    10128 |
    #| Affy50K_HindIII |    56965 |
    #| Affy50K_XbaI    |    58646 |
    #+-----------------+----------+
    #5 rows in set (52.96 sec)


    # March 7, 2005: fixed pseudoautosomal snps:
    #affy10Kv2.txt:chrX_par  1920780 1920781 SNP_A-1606360   0       ?       C/T
    #affy10Kv2.txt:chrX_par  2047561 2047562 SNP_A-1510197   0       ?       G/T
    #affy10Kv2.txt:chrX_par  2047486 2047487 SNP_A-1510243   0       ?       A/G
    #affy10Kv2.txt:chrX_par  2060858 2060859 SNP_A-1606356   0       ?       A/G
    #affy10Kv2.txt:chrX_par  2163964 2163965 SNP_A-1606329   0       ?       C/T
    
    delete from snp where chrom = 'chrY' and name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
    update snp set chrom = 'chrX' where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
    insert into snp 
	    select  bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand, 
		    observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception 
	    from    snp 
	    where   name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
    select chrom, count(*) from snp where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329') group by chrom;


### hapmapRecombRate (Daryl; September 19, 2005)
  # updated coordinates (Daryl; December 8, 2005)
    mkdir -p /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115
    cd       /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115
    wget -N http://www.stats.ox.ac.uk/~cfreeman/HapMap_Phase1/genetic_map_HapMap_Phase1_UCSC.tar.gz
    tar xvfz genetic_map_HapMap_Phase1_UCSC.tar.gz
    tail --lines=+2 -q Gen_map_chr*_COMBINED_UCSC.txt | sed 's/_non_par//;s/_par1//;s/_par2//' | awk '{printf "%s\t%d\t%d\t%0.3f\n",$1,$2,$3,$4}' >! hg16.hapmapRecombRate.bed
    liftOver hg16.hapmapRecombRate.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz hg17.hapmapRecombRate.bed hg16ToHg17.unmapped
    hgLoadBed -bedGraph=4 hg16 hapmapRecombRate hg16.hapmapRecombRate.bed
    hgLoadBed -bedGraph=4 hg17 hapmapRecombRate hg17.hapmapRecombRate.bed
    rm -f bed.tab Gen_map_chr*.txt

### hapmapRecombHotspot (Daryl; September 19, 2005; chr X data update October 21, 2005)
    wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/Genomewidehots16a.txt
    wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_non_par_hotspots.txt
    wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_par1_hotspots.txt
    # this takes about 3 seconds to run
    rm -f hg*.hapmapRecombHotspots.bed 
    # Build hg16.hapmapRecombHotspots.bed from the three downloads: drop the
    # first line of each file, strip the _non_par/_par1 suffix from the chrX
    # names, and print "chr"+column 1, column 3 minus one, column 4.
    # The first command creates the file (>), the other two append (>>).
    # "tail -n +2" replaces the obsolete "tail +2" spelling.
    tail -n +2 Genomewidehots16a.txt     |                    awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >  hg16.hapmapRecombHotspots.bed
    tail -n +2 chrX_non_par_hotspots.txt | sed s/_non_par// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed
    tail -n +2 chrX_par1_hotspots.txt    | sed s/_par1//    | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed
    liftOver hg16.hapmapRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.hapmapRecombHotspots.bed hg16ToHg17.unmapped
    hgLoadBed hg16 hapmapRecombHotspots hg16.hapmapRecombHotspots.bed
    hgLoadBed hg17 hapmapRecombHotspots hg17.hapmapRecombHotspots.bed
    rm -f bed.tab

### encodeRecombHotspot (Daryl; December 8, 2005)
    mkdir -p /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots
    cd       /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots
    wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Hotspots16c1.txt
    wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Readme_rates_hotspots.txt
    # Drop the first line of Hotspots16c1.txt, replace each ENCODE region
    # name with the chromosome it lies on, and print columns 1, 3 and 4.
    # "tail -n +2" replaces the obsolete "tail +2" spelling.
    # NOTE(review): the other hotspot conversions in this file print column 3
    # minus one as the start; here column 3 is used unchanged -- confirm that
    # these coordinates are already 0-based.
    tail -n +2 Hotspots16c1.txt \
	| sed 's/ENm010\.7p15\.2/chr7/;s/ENm013\.7q21\.13/chr7/;s/ENm014\.7q31\.33/chr7/;s/ENr112\.2p16\.3/chr2/;s/ENr113\.4q26/chr4/;s/ENr123\.12q12/chr12/;s/ENr131\.2q37\.1/chr2/;s/ENr213\.18q12\.1/chr18/;s/ENr232\.9q34\.11/chr9/;s/ENr321\.8q24\.11/chr8/' \
	| awk '{printf "%s\t%d\t%d\n", $1, $3, $4}' > hg16.encodeRecombHotspot.bed
    liftOver hg16.encodeRecombHotspot.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.encodeRecombHotspot.bed hg16ToHg17.unmapped
    hgLoadBed hg16 encodeRecombHotspot hg16.encodeRecombHotspot.bed
    hgLoadBed hg17 encodeRecombHotspot hg17.encodeRecombHotspot.bed
    rm -f bed.tab *bed *unmapped



### Perlegen Recombination Rates and Hotspots (Daryl; December 9, 2005)
    # Home page: http://www.stats.ox.ac.uk/mathgen/Recombination.html

    mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen
    cd       /cluster/data/hg16/bed/hapmap/recombination/Perlegen
    wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt

    mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
    cd       /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots

    wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
    wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/hotspots.zip
    unzip hotspots.zip
    # Convert the unzipped hot/coldspot lists to bed: drop the first line,
    # remove the rows matched by the grep -v filters, and print
    # "chr"+column 1, column 3 minus one, column 4.
    # "tail -n +2" replaces the obsolete "tail +2" spelling.
    # NOTE(review): the reason for excluding rows matching 1.51000 (hotspots)
    # and rows containing "-" (coldspots) is not recorded here.
    tail -n +2 hotspots.txt  | grep -v 1.51000 | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombHotspots.bed
    tail -n +2 coldspots.txt | grep -v "-"     | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombColdspots.bed

    liftOver hg16.perlegenRecombHotspots.bed  /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombHotspots.bed  hg16ToHg17.hots.unmapped
    liftOver hg16.perlegenRecombColdspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombColdspots.bed hg16ToHg17.cold.unmapped

    hgLoadBed hg16 perlegenRecombHotspots  hg16.perlegenRecombHotspots.bed
    hgLoadBed hg17 perlegenRecombHotspots  hg17.perlegenRecombHotspots.bed
    hgLoadBed hg16 perlegenRecombColdspots hg16.perlegenRecombColdspots.bed
    hgLoadBed hg17 perlegenRecombColdspots hg17.perlegenRecombColdspots.bed

    rm -f bed.tab hg1*ed *spots*txt

    mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
    cd       /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
    cp ../makeBed.pl .
    chmod ug+x makeBed.pl

    wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
    wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/recombination_rates.zip
    unzip recombination_rates.zip

    rm -f hg16.perlegenRecombRate.bed
    time ./makeBed.pl > hg16.perlegenRecombRate.bed
    cut -f1 hg16.perlegenRecombRate.bed | sort -u
    wc -l hg16.perlegenRecombRate.bed

    liftOver hg16.perlegenRecombRate.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombRate.bed hg16ToHg17.rates.unmapped

    hgLoadBed hg16 perlegenRecombRate  hg16.perlegenRecombRate.bed
    hgLoadBed hg17 perlegenRecombRate  hg17.perlegenRecombRate.bed

    rm -f bed.tab chr*_rates.txt hg1*ed


# HapMap Linkage Disequilibrium (Daryl; January 2006)

    mkdir -p /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
    cd /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
    screen
    ftp www.hapmap.org
    cd ld_data/2005-10
    prompt
    mget ld_chr*.txt.gz

    # look for consistency in max LD distance
    # For every ld_*.txt.gz file, append one line to maxDist.txt holding the
    # file name followed by the largest (column 2 - column 1) found in it.
    set out = maxDist.txt
    rm -f $out
    touch $out
    foreach f (ld_*.txt.gz)
	# echo -n leaves the line open so the awk result lands after the name
	echo -n "$f    " >> $out
	zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
    end
    # most should be 249999
    # list the files whose line does not contain 249999
    grep -v 249999 maxDist.txt

    # look for consistency in line counts
    # ssh eieio; screen
    # For every *.txt.gz file, append one line to wcList.txt: a label derived
    # from the file name, followed by a count taken from the first column.
    set out = wcList.txt
    rm -f $out
    touch $out
    # this takes about 2 hours to run completely on eieio (local disk)
    foreach f (*.txt.gz) 
	# $f:r:r removes the two trailing extensions (.txt.gz); sed then drops
	# "ld_" and "chr" and turns the first remaining "_" into a tab
	echo -n $f:r:r "    " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
	# number of runs of identical values in the first space-separated column
	zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
    end
    # plot the sizes from wcList.txt by population (lines) 
    # with chrom on the X axis and size on the Y axis.
    # look for anomalies


    mkdir ../bed
    cd ../bed
    # from the raw LD values, compute colors and encode
    # Write makeLdBed.pl.  The here-document word is quoted so that the shell
    # copies the Perl text verbatim; left unquoted, every $variable and the
    # `date` call in the script would be substituted by the shell while the
    # file is written.  In csh the terminating line must repeat the word
    # exactly as written, quotes included (the same convention as the
    # '_EOF_' documents used elsewhere in this file).
    cat << 'EOF' > makeLdBed.pl
#!/usr/bin/perl -W
#
# makeLdBed.pl [inDir [outDir]]
#
# Converts the space-separated LD files ld_<chrom>_<pop>.txt.gz found in
# inDir (default "data") into gzipped files <pop>_<chrom>.bed.gz in outDir
# (default "bed").  Input fields are read as: start, end, population, name,
# second marker, D', r^2, LOD.  Consecutive lines sharing the same start are
# collapsed into one tab-separated output line:
#   chrom, start-1, end of the last pair kept, name, number of pairs kept,
#   and one encoded character per pair for each of D', r^2 and LOD.
# Outputs that already exist are skipped, so the script can be re-run.

# Return the smaller of two numbers.
sub min ($$)
{
    my ($x, $y) = @_;
    return ($x < $y) ? $x : $y;
}

# Encode a D' value in [-1,1] as one character: 'a' plus val*9 for values
# >= 0, 'A' plus |val|*9 for negative values.  Dies when out of range.
sub encodeDprime($)
{
    my $val = shift @_;
    my $ret;
    if    ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
    elsif ($val>=0) { $ret = ord('a') + $val*9;}
    else         { $ret = ord('A') - $val*9;}
    return chr($ret);
}

# Encode an r^2 value in [0,1] with the same scheme as D'.  Dies when out
# of range.
sub encodeRsquared($)
{
    my $val = shift @_;
    if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
    return encodeDprime($val);
}

# Encode a LOD score together with its D' value as one character.
sub encodeLod($$)
{
    my $lod    = shift @_;
    my $dPrime = shift @_;
    my $ret    = ord('a');
    if ($lod>=2) # high LOD
	{
	if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D'  -> pink
	else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
	}
    elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD  -> blue
    return chr($ret); 
}

$inDir  = shift||"data"; 
$outDir = shift||"bed";
@pops   = ("CEU", "CHB", "JPT", "YRI");

foreach $pop (@pops)
{
    opendir(DIR, $inDir) || die "can't open $inDir";
    @hmFiles = grep {/^ld_/ && /_${pop}.txt.gz$/} readdir(DIR); #ld_chr22_CEU.txt.gz
    closedir(DIR);
    print "\nPOP:\t$pop\t$#hmFiles\n";
    foreach $hmFile (sort @hmFiles)
    {
    # the chromosome is the second "_"-separated part of the file name
    (undef, $chrom) = split /_/, $hmFile;
    $chrom =~ s/chrx/chrX/;
    $chrom =~ s/chry/chrY/;
    $outfile = "$outDir/${pop}_${chrom}.bed";
    if ((-e $outfile)||(-e "$outfile.gz")) { next; }
    $tmpFile = "/tmp/${pop}_${chrom}.bed";
    print "$inDir/$hmFile => $outfile.gz\t" . `date`;
    open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
    $line = <IN>;
    # an empty input would otherwise produce one meaningless output line
    unless (defined $line)
	{
	warn "$inDir/$hmFile is empty, skipping\n";
	close(IN);
	next;
	}
    open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
    chomp($line);
    # The population and second-marker fields are discarded.  Assigning the
    # population field to $pop, as was done before, overwrote the loop
    # variable and, through foreach aliasing, the matching @pops element.
    ($chromStart, $chromEnd, undef, $name, undef, $dprime, $rsquared, $lod) = split / /, $line;
    # Start the first record from the first line.  Without this the first
    # record of a file was printed with lists that were undefined or left
    # over from the last record of the previous file.
    $ldCount      = 1;
    $dprimeList   = encodeDprime($dprime);
    $rsquaredList = encodeRsquared($rsquared);
    $lodList      = encodeLod($lod, $dprime);
    while (<IN>)
    {
	chomp();
	($chromStartNew, $chromEndNew, undef, $nameNew, undef, $dprime, $rsquared, $lod) = split / /;
	if ($chromStart ne $chromStartNew)
	    {
	    # new start position: emit the finished record, begin the next
	    $chromStart--;
	    print OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n";
	    $chromStart   = $chromStartNew;
	    $chromEnd     = $chromEndNew;
	    $name         = $nameNew;
	    $ldCount      = 1;
	    $dprimeList   = encodeDprime($dprime);
	    $rsquaredList = encodeRsquared($rsquared);
	    $lodList      = encodeLod($lod, $dprime);
	    }
	elsif ($chromEndNew-$chromStartNew<250000)
	    {
	    # same start: extend the record; pairs 250000 or more apart are
	    # left out
	    $chromEnd     = $chromEndNew;
	    $ldCount++;
	    $dprimeList   .= encodeDprime($dprime);
	    $rsquaredList .= encodeRsquared($rsquared);
	    $lodList      .= encodeLod($lod, $dprime);
	    }
    }
    close(IN);
    # emit the last record of the file
    $chromStart--;
    print OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n";
    close(OUT);
    # compress in /tmp, then move the result into place; stop on failure
    # instead of silently leaving an incomplete output directory
    system("gzip $tmpFile") == 0 || die "gzip $tmpFile failed";
    system("mv $tmpFile.gz $outDir") == 0 || die "mv $tmpFile.gz $outDir failed";
    }
}
'EOF'
#
    chmod ug+x ./makeLdBed.pl
    ssh eieio
    screen
    time ./makeLdBed.pl

    # look for consistency in line counts
    # ssh eieio
    set out = wcList.txt
    rm -f $out
    touch $out
    foreach f (*.bed.gz) 
	echo -n $f:r:r "    " | sed 's/chr//g;s/_/\t/g' >> $out
	zcat $f | wc -l >> $out
    end
    # plot the sizes from wcList.txt by population (lines) 
    # with chrom on the X axis and size on the Y axis.
    # look for anomalies


    # load data
    sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
    sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
    sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
    sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16

    # The length of each of the three value vectors (rsquared, dprime,
    # and lod) is the same and is stored in the score field.

    # 30-40 minutes
    # Load every chromosome's bed file for each of the four populations.
    # Each "POP.Tbl" word carries the file prefix as its root (:r) and the
    # table-name suffix as its extension (:e).
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
	foreach popTbl (CEU.Ceu CHB.Chb JPT.Jpt YRI.Yri)
	    set pop = $popTbl:r
	    set tbl = $popTbl:e
	    echo
	    echo -n loading $pop chr${c} 
	    zcat ${pop}_chr${c}.bed.gz | wc -l
	    hgLoadBed -noSort -oldTable -strict hg16 hapmapLd${tbl} ${pop}_chr${c}.bed.gz
	end
    end
    rm -f bed.tab


# Tajima's D (DONE -- 2005-06-04 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]

    # Lift the hg17 Tajima's D SNP (bed4) and tajd (bedGraph) tracks to $db
    # and load them as tajdSnp<pop> and tajd<pop>.
    set db=hg16
    set dir=/cluster/data/$db/bed/tajdpoly/latest
    cd $dir
    
    set chain = "/gbdb/hg17/liftOver/hg17ToHg16.over.chain"
    foreach p (AD ED XD)
	# lift SNP tracks
	set f   = $p.SNP.track
	set in  = /cluster/data/hg17/bed/tajdpoly/latest/$f.bed4
	set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
	# $out already ends in .$db; appending it a second time produced
	# *.hg16.hg16.bed4, which the load step below never found
	liftOver $in $chain $out.bed4 $out.unmapped
	# lift tajd tracks
	set f   = $p.tajd.track
	set in  = /cluster/data/hg17/bed/tajdpoly/latest/$f.bedGraph
	set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
	liftOver $in $chain $out.bedGraph $out.unmapped
	# load SNP tracks
	set f = $p.SNP.track.$db
	echo `date` $f "=>" $f.bed4
	hgLoadBed $db tajdSnp$p $f.bed4
	head -3 $f*
	hgsql -e "select * from tajdSnp$p limit 3" $db
	# load tajd tracks
	set f = $p.tajd.track.$db
	echo `date` $f "=>" $f.bedGraph
	hgLoadBed -bedGraph=4 $db tajd$p $f.bedGraph
	head -3 $f*
	hgsql -e "select * from tajd$p limit 3" $db
    end

    # deleting elements that overlap with gaps -- tajd files have overlaps due to the windowing scheme (snps are not found in gaps)
    # Build delete.sql with one "delete" statement per element whose start or
    # end falls inside a gap, then run it against $db.
    rm -f delete.sql
    touch delete.sql
    # csh assigns with "set name=", not "set $name="
    set where="where t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
    foreach p (AD ED XD SnpAD SnpED SnpXD)
	foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
	    # grep -v pop drops the column-header row printed by hgsql
	    echo "select 'tajd$p' as pop, t.chrom, t.chromStart from tajd${p} t, chr${c}_gap g $where " | \
		hgsql $db | grep -v pop | \
		awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n", $1, $2, $3}' >> delete.sql
	end
    end
    # $db is only the database name; the SQL has to be fed to hgsql
    hgsql $db < delete.sql

    # cleanup elements that didn't get deleted properly
    # cleanup.pl <pop> reads bed lines on stdin and prints one delete
    # statement per line, removing tajd<pop> elements that span the given
    # start position.
    cat << 'EOF' > cleanup.pl
#!/usr/bin/perl -W
$pop=shift;
while (<>)
{
    if (/^(chr..?)\s+(\d+)/)
    { print "delete from tajd$pop where chrom='$1' and chromStart<$2 and chromEnd>$2;\n"; }
}
'EOF'
    # << this line makes emacs coloring happy
    chmod +x cleanup.pl

    foreach p (AD ED XD)
	featureBits $db tajd$p gap -bed=$p.inGaps.bed
	./cleanup.pl $p < $p.inGaps.bed | hgsql $db
	featureBits $db tajd$p gap -bed=$p.inGaps.bed ## should be empty now
    end


# JAX ORTHOLOG (WORKING hiram 2004-02-20 )
    # Add Jackson labs info
    cd /cluster/data/hg16/bed
    mkdir jaxOrtholog
    cd jaxOrtholog
    wget --timestamping ftp://ftp.informatics.jax.org/pub/reports/HMD_Human4.rpt
    # save a little space
    gzip HMD_Human4.rpt

    # this is a tricky one to parse.  This .rpt file is plain text, no
    # tabs, with expected text columns to contain the data.  We need to
    # convert this.  Beware of table changes, you may need to rework
    # this each time if they change the data.  Here is what we have
    # today, an example first line with text columns numbered:
# 1234567 101234567 201234567 301234567 401234567 501234567 601234567 701234567 801234567 90123456 100123456 110123456 120123456 130123456 140123456 150123456 160123456 170123456 180123456 170
# MGI:1918914                    71664                          0610006F02Rik             10           syntenic D3         196410                         MGC17301                  12q13.13
# ^ mgiId
#         ^ mouse chr
#                      ^ mouseCm position
#                               ^ possible Mouse band
#                                            Mouse-Human Symbol ^
#                                                            Human Symbol ^
#                   ^ Human Band(s)

    #	This awk script picks out the correct columns, removes spaces,
    #	picks the first of possibly several human band designations,
    #	and decides if a mouse band has been specified
    cat << '_EOF_' > jaxToUCSC.awk
# jaxToUCSC.awk - convert the fixed-width HMD_Human4.rpt report into the
# tab-separated columns:
#   humanSymbol, Band, mgiId, mouseSym, mouseChr, mouseCm, mouseBand
# Only lines starting with "MGI:" are converted.  Fields are cut out by
# character position, so the offsets below must be re-checked whenever the
# report layout changes.
/^MGI:/ {
    humanSymbol = substr($0,153,26)
    gsub(" ","",humanSymbol)
    # Band is everything from column 179 on, with only the leading and
    # trailing blanks trimmed
    Band = substr($0,179)
    gsub(" *$","",Band)
    gsub("^ *","",Band)
    mgiId = substr($0,1,31)
    gsub(" ","",mgiId)
    mouseSym = substr($0,63,26)
    gsub(" ","",mouseSym)
    mouseChr = substr($0,89,13)
    gsub(" ","",mouseChr)
    mouseCm = substr($0,102,9)
    gsub(" ","",mouseCm)
    mouseBand = substr($0,111,11)
    gsub(" ","",mouseBand)
    # a blank mouse band column means no band was specified
    if (length(mouseBand) < 1) { mouseBand = "N/A" }
    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", humanSymbol,Band,
        mgiId,mouseSym,mouseChr,mouseCm,mouseBand
}
'_EOF_'
    # << this line makes emacs coloring happy
    # then using that script to fix it:
    zcat HMD_Human4.rpt.gz | awk -f jaxToUCSC.awk > jaxOrtholog.tab

    # Drop (just in case), create and load the table.  "if exists" keeps
    # the drop from failing when the table has not been created yet.
    hgsql -e 'drop table if exists jaxOrtholog;' hg16
    hgsql hg16 < ~/kent/src/hg/lib/jaxOrtholog.sql
    hgsql -e \
	'load data local infile "jaxOrtholog.tab" into table jaxOrtholog;' hg16
    # save a little space
    gzip jaxOrtholog.tab

# LOAD ACEMBLY (DONE - 2004-03-30 - Hiram)
    mkdir -p /cluster/data/hg16/bed/acembly
    cd /cluster/data/hg16/bed/acembly

    # Data is obtained from:
    # Danielle et Jean Thierry-Mieg	mieg@ncbi.nlm.nih.gov

    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.proteins.fasta.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.gff.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.mrnas.fasta.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.pfamhits.tar.gz

    tar xvzf acembly.ncbi_34.genes.gff.tar.gz
    tar xvzf acembly.ncbi_34.genes.proteins.fasta.tar.gz
    cd acembly.ncbi_34.genes.gff

    # chrom 6.gff is broken, it has a bogus number in the first column
    #	where a 6 should be.  Fix-up until I hear from the authors:
    mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
    sed -e "s/^28212469/6/" x1.acemblygenes.6.gff.broken > x1.acemblygenes.6.gff 

    #	There are a number of start and end coordinates that are
    #	in reversed order.  Until I hear from the authors, I have
    #	switched those coords:
    cat << '_EOF_' > fixupReversedBlocks
#!/bin/sh
for i in x1*.gff
do
        echo -n "$i working ..."
        awk -F"\t" '
{
if ($4 > $5) {
        printf "%s\t%s\t%s\t%s\t%s", $1, $2, $3, $5, $4
        for ( i = 6; i <= NF; ++i ) {
                printf "\t%s", $i
        }
        printf "\n"
} else
        print
}
' $i > $i.fixed
        echo " done"
done
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x fixupReversedBlocks
    ./fixupReversedBlocks

    # Save just the floating-contig features to different files for lifting 
    # and lift up the floating-contig features to chr*_random coords:
    # NOTE: file prefix (x1) has been added since build 31

    # For each fixed per-chromosome gff: write the floating-contig (NT_*)
    # features to ctg-chr<c>_random.gff, lift them to chr<c>_random
    # coordinates when a random.lft exists, and write the remaining features
    # to chr<c>.gff with a "chr" prefix.
    foreach f (x1.acemblygenes.*.gff.fixed)
      # x1.acemblygenes.<c>.gff.fixed: strip ".fixed" and ".gff", keep <c>
      set c=$f:r:r:e
      egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
        perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
      if (-e ../../../$c/lift/random.lft) then
        liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
          ctg-chr${c}_random.gff
      endif
      grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
	grep -v "^chr//" > chr$c.gff
      echo "done $c"
    end
    # that last pipeline strips the _random or floating contig lines out of
    # the normal chrom gff, and adds the "chr" prefix
    #	Three of them end up empty, check for this and remove them
    #	if necessary
    rm -f chr19_random.gff chr18_random.gff chrUn.gff
    #	There was one error in a coordinate on chr17_random:
    # chr17_random    acembly stop_codon      -2      0       .       +       1      gene_id M17S2; transcript_id M17S2.cDec03;
    #	This line was removed (shows up as first line) from
    #	chr17_random.gff before the database load

    #- Load into database:
    cd ..
    ldHgGene -gtf hg16 acembly acembly.ncbi_34.genes.gff/chr*.gff
    hgPepPred hg16 generic acemblyPep \
	acembly.ncbi_34.genes.proteins.fasta/*.fasta

    #	check that the track is OK
    checkTableCoords hg16 acembly
    #	should display no errors


# MAKE HUMAN-CHIMP OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)

    ssh kolossus
    mkdir /cluster/data/hg16/bed/bedOver/hg16toPt0
    cd /cluster/data/hg16/bed/bedOver/hg16toPt0
    # use the combined blastz-blat best human chain, but assign unique IDs
    # so that netChainSubset doesn't die:
    chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \
    | chainMergeSort stdin \
    | chainSplit chain stdin
    # re-net with the new IDs:
    mkdir net
    foreach f (chain/*.chain)
      echo chaining $f
      chainNet $f /cluster/data/hg16/chrom.sizes \
        /cluster/data/pt0/scaffold.sizes net/$f:t:r.net /dev/null
    end
    # Now get a single-cov subset as usual:
    mkdir subset
    foreach f (chain/*.chain)
      echo subsetting net/$f:t:r.net, $f to subset/$f:t
      netChainSubset net/$f:t:r.net $f subset/$f:t
    end
    cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16Topt0.chain
    # make it available:
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver/
    zip -j hg16Topt0.zip /cluster/data/hg16/bed/bedOver/hg16Topt0.chain
    # update README.txt

    # lift scaffold-based over.chain to chrom-based (2004-07-09 kate)
    ssh kksilo
    cd /cluster/data/hg16/bed/bedOver
    liftUp -chainQ hg16TopanTro1.chain /cluster/data/panTro1/jkStuff/scaffolds.lft warn hg16Topt0.chain

    # NOTE: these chains appear to be broken up -- try using all chains,
    # instead of reciprocal best
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    netChainSubset human.net all.chain over.chain
    # load just for ENCODE dev
    hgLoadChain hg16 liftOverPanTro1Chain over.chain
    # TODO: delete table

    ssh kolossus
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    chainSwap \
        /cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/all.newId.chain \
                all.newId.swp.chain
    chainSplit chain.newId all.newId.swp.chain
    mkdir preNet
    cd chain.newId
cat > preNet.csh << 'EOF'
    foreach i (*.chain)
        echo pre-netting $i
        chainSort $i stdout | \
            chainPreNet stdin /cluster/data/hg16/chrom.sizes \
                        /cluster/data/panTro1/chrom.sizes ../preNet/$i
    end
'EOF'
    csh preNet.csh >&! preNet.log &
    tail -100f preNet.log
    cd ..
# << for emacs

mkdir n1
cd preNet
cat > net.csh << 'EOF'
    foreach i (*.chain)
        set n = $i:r.net
        echo netting $i
        chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
                        /cluster/data/panTro1/chrom.sizes ../n1/$n /dev/null
    end
'EOF'
    csh net.csh >&! net.log &
    tail -100f net.log
    cd ..

    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net

    # GOT HERE

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    netClass hNoClass.net hg16 panTro1 chimp.newId.net

    # chain files from the net
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    netChainSubset chimp.newId.net all.newId.swp.chain over.newId.chain
    cp over.newId.chain \
        /cluster/data/hg16/bed/liftOver/hg16ToPanTro1.newId.over.chain
    mv hg16TopanTro1.chain hg16Topantro1.chain.old
    cd /cluster/data/hg16/bed/liftOver
    ln -s hg16ToPanTro1.newId.over.chain hg16TopanTro1.chain

    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz-blat.panTro1
    hgLoadChain hg16 liftOverPanTro1NewIdChain over.newId.chain


# MAKE HUMAN-CHICKEN OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)
    ssh kolossus
    mkdir /cluster/data/hg16/bed/bedOver/hg16TogalGal2
    cd /cluster/data/hg16/bed/bedOver/hg16TogalGal2
    set chainDir = /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    netSplit $chainDir/human.net net
    mkdir subset
    foreach f ($chainDir/chain/*.chain)
      echo subsetting $f:t:r
      netChainSubset net/$f:t:r.net $f subset/$f:t
    end
    cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16TogalGal2.chain


# HUMAN/MOUSE/RAT/CHICKEN (HMRG) PHYLOHMM CONSERVATION (IN PROGRESS 2004-03-8 kate)

# Set path

    set path = ($path /cluster/bin/woody)

# Obtain phylogenetic model (hmrc_rev_dg.mod)
# from Adam (hand-tuned, instead of fit_model)
# then, create New Hampshire tree for data (.nh file)

    cat hmrc_rev_dg.mod
        #ALPHABET: A C G T
        #ORDER: 0
        #SUBST_MOD: REV
        #NRATECATS: 10
        #ALPHA: 4.4
        #BACKGROUND: 0.286083 0.213573 0.213691 0.286652
        #RATE_MAT:
          #-0.891523    0.166770    0.574850    0.149902
           #0.223389   -1.146311    0.153784    0.769137
           #0.769591    0.153699   -1.147159    0.223869
           #0.149605    0.573055    0.166888   -0.889548
        #TREE: ((1:0.192598,(2:0.076303,3:0.083043):0.192598):0.47,4:0.47);


    /cluster/data/woody/scripts/extract-tree.pl human,mouse,rat,chicken \
                                hmrc_rev_dg.mod
#((human:0.192598,(mouse:0.076303,rat:0.083043):0.192598):0.47,chicken:0.47);

    ssh eieio
    set path = ($path /cluster/bin/woody)
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
    cd phyloHMM

    # now, break up the genome-wide MAFs into pieces; it's worth doing
    # this as a little cluster job
    # NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor run
    # NOTE: next time add "check out" lines to assure files are created
    ssh eieio
    mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
    cp /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
    
    cat << 'EOF' > doSplit
#!/bin/sh
# doSplit <chrom>.maf
# Run msa_split on one chromosome's MAF, writing the SS windows to local
# /scratch, then gzip each window into the WINDOWS directory and remove the
# scratch copies.

WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/WINDOWS

maf=$1
# no blank before "=": "c =..." would run a command named "c" instead of
# assigning, leaving $c empty for everything below
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
    chmod +x doSplit
    mkdir -p WINDOWS
    rm -f WINDOWS/* jobs.lst
    foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2/*.maf) 
	echo "doSplit $file" >> jobs.lst
    end

    ssh kk
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
    para create jobs.lst  
    # para try, para push, etc.

    # now setup and run the cluster job to compute the conservation scores
    # NOTE: need to use gensub2, check out+ facilities to check for
    # failures.  Will want to chunk msa_split output (above) into chr dirs.
    # to make the gensub template reasonable.
    
cat << 'EOF' > doPostProbs
#!/bin/sh

WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons

file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom

mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
    chmod +x doPostProbs

    # Build the job list for the conservation-score run: one doPostProbs
    # job per msa_split window.
    mkdir -p POSTPROBS
    rm -f jobs2.lst 
    foreach file (WINDOWS/chr*.ss.gz) 
       echo "doPostProbs $file" >> jobs2.lst 
    end
    # sanity check on the number of jobs (was "job2.lst", a file that is
    # never created)
    wc -l jobs2.lst
    para create jobs2.lst
    # etc ... (run cluster job)

    # Create wiggle (.wib) files using wigAsciiToBinary and load into database
    ssh eieio
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
    mkdir wibLimits
    mkdir wib
cat > makeWig.csh << 'EOF'
    foreach dir (POSTPROBS/*)
	set chrom = $dir:t
	echo $chrom 
	zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	    wigAsciiToBinary -chrom=$chrom \
		-dataSpan=1 -wibFile=wib/${chrom}_hmrg_phyloHMM -name=hmrg \
		stdin > wibLimits/${chrom}
    end
'EOF'
    csh makeWig.csh >&! makeWig.log &

    ssh hgwdev
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
    hgLoadWiggle hg16 multizMm3Rn3GalGal2_phyloHMM_wig wib/*_hmrg_phyloHMM.wig 
    ln -s `pwd`/wib/chr*_hmrg_phyloHMM.wib /gbdb/hg16/wib 
    chmod 775 . wib
    chmod 664 wib/*.wib

    #	Add zoom records to table to speed display of large regions (>600Kbp)
    # NOTE: this doesn't work -- the rows were dropped
    ssh eieio
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
    mkdir -p wib1K wibLimits1K
cat > wigZoom1K.csh << 'EOF'
    foreach dir (POSTPROBS/*)
	set chrom = $dir:t
	echo $chrom 
	zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	    wigZoom stdin | wigAsciiToBinary -chrom=$chrom \
                -dataSpan=1024 -wibFile=wib1K/${chrom}_hmrg_phyloHMM_1K \
                -name=hmrg stdin > wibLimits1K/${chrom}
    end
'EOF'
    csh wigZoom1K.csh >&! wigZoom1K.log &
    tail -100f wigZoom1K.log
    ssh hgwdev
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/wib1K
    hgLoadWiggle -oldTable hg16 multizMm3Rn3GalGal2_phyloHMM_wig *.wig
    #	create symlinks for .wib files
    ln -s `pwd`/*.wib /gbdb/hg16/wib
    # NOTE: this doesn't work -- the rows were dropped

    # setup external files for database reference
    # reuse mafs loaded in the maf track (just symlink the /gbdb dir before
    # loading)
    ssh hgwdev
    ln -s /gbdb/hg16/multizMm3Rn3GalGal2 /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM
    # load into database
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
    /cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2_phyloHMM
    # create trackDb entry
    # track multizMm3Rn3GalGal2_phyloHMM
    # type wigMaf 0.0 1.0
    # wiggle multizMm3Rn3GalGal2_phyloHMM_wig
    # etc.

    # Load pairwise mafs: symlink each set of maf files into its own
    # /gbdb/hg16/<table> directory, then load the table of the same name.
    # NOTE(review): the 4-way load earlier in this doc spells the option
    # "-warn"; confirm that hgLoadMaf honors the upper-case "-WARN" used here.
    ssh hgwdev
    cd /gbdb/hg16
    mkdir -p mouse_hmrg rat_hmrg chicken_hmrg

    foreach f (/cluster/data/hg16/bed/humor/maf/*.mm3.maf)
        ln -s $f /gbdb/hg16/mouse_hmrg
    end
    cd /tmp
    hgLoadMaf -WARN hg16 mouse_hmrg

    foreach f (/cluster/data/hg16/bed/humor/maf/*.rn3.maf)
        ln -s $f /gbdb/hg16/rat_hmrg
    end
    cd /tmp
    # load the rat table (this step used to reload mouse_hmrg)
    hgLoadMaf -WARN hg16 rat_hmrg

    foreach f (/cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf)
        ln -s $f /gbdb/hg16/chicken_hmrg
    end
    cd /tmp
    hgLoadMaf -WARN hg16 chicken_hmrg

    # copy files to download area 
    set dir = /usr/local/apache/htdocs/goldenPath/hg16/multizMm3Rn3GalGal2
    mkdir $dir
    ln -s $dir multiz
    cp -p /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM/*.maf $dir
    cd $dir
    gzip *

# As the 5-way alignment is imminent, this wasn't completed
    
    # edit downloads page to add links
    # add pairwise mafs to downloads page 
    mkdir $dir/{rn3,mm3}
    cd /cluster/data/hg16/bed/humor/maf
    cp *.mm3.maf $dir/mm3
    cp *.rn3.maf $dir/rn3
    gzip $dir/mm3/*
    gzip $dir/rn3/*
    # also add human/chicken maf's

    # Create upstream files
    ssh hgwdev
    echo hg16 mm3 rn3 galGal2> org.txt
    foreach i (1000 2000 5000)
    featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
    awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
    rm up.bad
    mafFrags hg16 multizMm3Rn3GalGal2 up.bed upstream$i.maf -orgs=org.txt
    rm up.bed
    end


#  miRNA track (DONE - 2004-05-04 - Hiram)
    #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
    #	and Michel.Weber@ibcg.biotoul.fr
    #	notify them if this assembly updates to renew this track
    ssh hgwdev
    mkdir /cluster/data/rn3/bed/miRNA
    cd /cluster/data/rn3/bed/miRNA
    # NOTE(review): the URL argument of this wget, and the step that turns
    # the download into rn3.bed, are missing from this doc.  The dangling
    # "wget --timestamping \" continuation made the hgLoadBed line below part
    # of the wget command, so the wget is commented out until the URL is
    # restored.  Presumably the rat file lives in the same Rfam miRNA
    # genomes directory as the hsa_ncbi34 file used for hg16 -- TODO confirm.
    # wget --timestamping "<URL of the Rfam miRNA file for rn3>"
    hgLoadBed rn3 miRNA rn3.bed
    # entry in trackDb/trackDb.ra already there

#  miRNA track (UPDATED - 2004-05-04 - Hiram)
    #	(first version done 2004-03-02)
    #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
    #	and Michel.Weber@ibcg.biotoul.fr
    #	notify them if this assembly updates to renew this track
    cd /cluster/data/hg16/bed
    mv miRNA miRNA.2004_03_02
    mkdir miRNA
    cd miRNA
    wget --timestamping \
    "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/hsa_ncbi34.*"
    grep -v "^track " hsa_ncbi34.bed | sed -e "s/ /\t/g" > hg16.bed
    #	check existing track for comparison after update load
    #	featureBits hg16 miRNA
    #	15385 bases of 2865248791 (0.001%) in intersection
    hgLoadBed hg16 miRNA hg16.bed
    #	featureBits hg16 miRNA
    #	16923 bases of 2865248791 (0.001%) in intersection

    # added an entry to trackDb/trackDb.ra:  (good for Mm4 and Ce1 too)
track miRNA
shortLabel miRNA
longLabel MicroRNAs from the miRNA Registry
group genes
priority 63
visibility hide
useScore 1
color 255,64,64
type bed 8
url http://www.sanger.ac.uk/cgi-bin/Rfam/mirna/mirna_entry.pl?id=$$
    #	Note the useScore item.  This colors plus strand items in black
    #	and minus strand items in gray.  A rarely used option.
    #	This same track is in Rn3, Mm4 and Ce2 too.  Added
    #	findBedPos(query, hgp, "miRNA");
    #	to lib/hgFind.c to allow searching for these items.


#5-WAY MULTIZ & PHYLO-HMM HUMAN/CHIMP/MOUSE/RAT/CHICKEN (3/19/04, kpollard)

    # UPDATE WOODY BINARIES
    ssh hgwdev
    cd /cluster/data/woody   
    cvs update -dP
    cd src
    make
    # make sure Makefile has INSTALLDIR = /cluster/bin/woody
    make install 

    #MULTIZ to add chimp, then chicken to HUMOR (see above)
    ssh kk
    set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    mkdir -p $fiveDir/hmrp
    mkdir -p $fiveDir/hmrpg
    cd $fiveDir
	
    #wrapper script for multiz
    cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
    chmod +x mz

    #CHIMP
    # put the MAFs on bluearc
    ssh eieio
    set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
    mkdir -p $clustDir/hp
    mkdir -p $clustDir/hmr
    cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf $clustDir/hmr
    cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf $clustDir/hp
    logout  # back to kk

    #set up joblist (common denominator set: no chr19_random in hmr)
    set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
    cd $fiveDir
    rm -f jobList
    foreach file ($clustDir/hmr/*.maf) 
	set root=`echo $file:t:r | sed 's/\.hmr//'`		
       	echo "mz $clustDir/hp/${root}.maf $file $fiveDir/hmrp/${root}.maf" >> jobList
    end

    #run on kk	
    chmod +x jobList
    para create jobList
    #para try, para check, para push, etc.

    #add chr19_random from hp to hmrp
    cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr19_random.maf $fiveDir/hmrp

    #clean up bluearc
    ssh eieio
    set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
    rm -r $clustDir/hp
    rm -r $clustDir/hmr

    #CHICKEN
    # put the MAFs on bluearc
    ssh eieio
    set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    # shell variables do not carry over into a new ssh session, so
    # clustDir has to be set again here
    set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
    mkdir -p $clustDir/hmrp
    mkdir -p $clustDir/hg
    cp $fiveDir/hmrp/*.maf $clustDir/hmrp
    # one command: the destination used to sit on its own line with no
    # continuation backslash
    cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
        $clustDir/hg
    logout  # back to kk
    logout #move to kki

    #set up job list 2
    ssh kki
    set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
    cd $fiveDir
    rm -f jobList.2
    foreach file ($clustDir/hg/*.maf) 
	set root=`echo $file:t:r | sed 's/\.hg//'`		
	echo "mz $file $clustDir/hmrp/${root}.maf $fiveDir/hmrpg/${root}.maf" >> jobList.2
    end
    
    #run on kki	
    chmod +x jobList.2
    para create jobList.2
    #para try, para check, para push, etc.

    # clean up bluearc
    ssh eieio
    rm -r /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
    logout

    #PHYLO-HMM CONSERVATION
    #Set path
    set path = ($path /cluster/bin/woody)

    #Create "sufficient statistics" (SS) file from maf
    ssh eieio
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    mkdir phyloHMM
    cd phyloHMM

    # create script to run msa_view.
    # For every chr*.maf in the hmrpg directory, makeSS.csh writes an SS
    # (sufficient statistics) file of the same root name into phyloHMM.
    cat > makeSS.csh << 'EOF'
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
foreach f (chr*.maf)
    set c = $f:r
    echo "$c"
    msa_view $f -i MAF -o SS -s 1 -r 1 -O hg16,mm3,rn3,panTro1,galGal2 > \
       ../phyloHMM/$c.ss
end
'EOF'
    # << this line makes emacs coloring happy
    csh makeSS.csh >&! makeSS.log &
    tail -100f makeSS.log
    # spot-check the output; the current directory is already phyloHMM, so
    # the .ss files are addressed without a "phyloHMM/" prefix
    head chr1.ss
    head chrY.ss
    
    #model hpmrc_rev_dg.mod (from Adam)
    set path = ($path /cluster/bin/woody)
    cat hpmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
#  -0.891523    0.166770    0.574850    0.149902
#   0.223389   -1.146311    0.153784    0.769137
#   0.769591    0.153699   -1.147159    0.223869
#   0.149605    0.573055    0.166888   -0.889548
#TREE: ((1:0.0056,2:0.0057):0.1043,(3:0.076303,4:0.083043):0.2753):0.47,5:0.47); 
    /cluster/data/woody/scripts/extract-tree.pl human,chimp,mouse,rat,chicken \
        hpmrc_rev_dg.mod
#((human:0.0056,chimp:0.0057):0.1043,(mouse:0.076303,rat:0.083043):0.2753):0.47,chicken:0.47);
    #order is human-chimp-mouse-rat-chicken, so fix maf order in next step

    #break up the genome-wide MAFs into pieces
    # NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor
    mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    cp /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    cat << 'EOF' > doSplit
#!/bin/sh

WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM/WINDOWS

maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,panTro1,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
    chmod +x doSplit
    mkdir -p WINDOWS
    rm -f WINDOWS/* jobs.lst
    foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/*.maf) 
	echo "doSplit $file" >> jobs.lst
    end
    
    #run on kki
    ssh kki
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    para create jobs.lst  
    # para try, para check, para push, etc.
    logout

    #compute the conservation scores
    # NOTE: need to use gensub2, check out+ facilities to check for
    # failures.  Will want to chunk msa_split output (above) into chr dirs.
    # to make the gensub template reasonable.    
    ssh kk
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    cat << 'EOF' > doPostProbs
#!/bin/sh

WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons

file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom

mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hpmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
    # << this line makes emacs coloring happy

    chmod +x doPostProbs
    mkdir -p POSTPROBS
    rm -f jobs2.lst 
    foreach file (WINDOWS/chr*.ss.gz) 
       echo "doPostProbs $file" >> jobs2.lst 
    end
    wc -l jobs2.lst
    para create jobs2.lst
    #para try, para check, para push, etc.
    #1 problem: chr19_random crashed - due to no alignments in HMR. Leave out.

    # Create wiggle (.wib) file and load into database
    ssh eieio
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    mkdir wibLimits
    mkdir wib
    cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
    set chrom = $dir:t
    echo $chrom 
    zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
	wigAsciiToBinary -chrom=$chrom \
	-dataSpan=1 -wibFile=wib/${chrom}_hpmrg_phyloHMM -name=hpmrg \
	stdin > wibLimits/${chrom}
end
'EOF'
    # << this line makes emacs coloring happy
    csh makeWig.csh >&! makeWig.log &

    #load tables
    ssh hgwdev
    # Use the same phyloHMM directory in which makeWig.csh created wib/
    # (multiz.hg16mm3rn3panTro1galGal2, including the panTro1 component).
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    hgLoadWiggle hg16 mzPt1Mm3Rn3Gg2_pHMM_wig wib/*_hpmrg_phyloHMM.wig 
    # link the binary wiggle files into /gbdb and make them readable
    ln -s `pwd`/wib/chr*_hpmrg_phyloHMM.wib /gbdb/hg16/wib 
    chmod 775 . wib 
    chmod 664 wib/*.wib

    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
    mkdir -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
    ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
    hgLoadMaf hg16 -warn mzPt1Mm3Rn3Gg2_pHMM   

    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
    mkdir -p /gbdb/hg16/chimp_hmrg
    ln -s /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf /gbdb/hg16/chimp_hmrg 
    hgLoadMaf hg16 -warn chimp_hmrg   

    #cleanup bluearc
    ssh eieio
    rm -r /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
    logout

    #Add description file: mzPt1Mm3Rn3Gg2_pHMM.html
    #Add track to trackDb.ra: mzPt1Mm3Rn3Gg2_pHMM

    #Copy files to download area
    cd /gbdb/hg16
    set dir = /usr/local/apache/htdocs/goldenPath/hg16/mzPt1Mm3Rn3Gg2
    mkdir $dir
    ln -s $dir multiz
    cp -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM/*.maf $dir
    cd $dir
    gzip *

    # edit downloads page to add links
    # add pairwise mafs to downloads page 
    mkdir $dir/{rn3,mm3,pt1,gg2}
    cd /cluster/data/hg16/bed/humor/maf
    cp *.mm3.maf $dir/mm3
    cp *.rn3.maf $dir/rn3
    gzip $dir/mm3/*
    gzip $dir/rn3/*
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest
    cp *.maf $dir/gg2
    gzip $dir/gg2/*
    cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
    cp *.maf $dir/pt1
    gzip $dir/pt1/*

# EXONIPHY HMR
# (started, acs, 2004-03-23)
# (redone 2004-07-01, with new version of software; have revised 
# docs accordingly)
# Warning: some commands here require bash shell
    ssh hgwdev
    # (make sure /cluster/bin/phast is in path)
    mkdir /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
    cd /cluster/data/hg16/bed
    ln -s /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
    ln -s exoniphy.hg16mm3rn3.2004-03-23 exoniphy.hg16mm3rn3

    # first, break up the genome-wide MAFs into pieces; it's worth doing
    # this as a little cluster job
    ssh eieio
    mkdir -p /cluster/bluearc/hg16/bed/humor
    cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
    logout
    ssh kk
    cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
    cat << '_EOF_' > doSplit
#!/bin/sh

PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/exoniphy.hg16mm3rn3/WINDOWS

maf=$1
prefix=`basename $maf .hmr.maf`
chr=`echo $prefix | sed 's/chr//g ; s/_random//g'`
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf --in-format MAF --refseq ${FA_SRC}/$prefix.fa --order hg16,mm3,rn3 --windows 50000,2000 --out-root /scratch/msa_split/$prefix --out-format SS --min-informative 1000 --between-blocks 1000 --tuple-size 3
mkdir -p ${WINDOWS}/$chr
cd /scratch/msa_split
for file in `ls | egrep -w ${prefix}` ; do gzip -c $file > ${WINDOWS}/$chr/$file.gz ; rm $file ; done
_EOF_
    # << this line makes emacs coloring happy

    chmod +x doSplit
    mkdir -p WINDOWS
    rm -rf WINDOWS/* jobs.lst
    for file in /cluster/bluearc/hg16/bed/humor/*.maf ; do echo "doSplit $file" >> jobs.lst ; done

    para create jobs.lst  
    # etc ...   (run cluster job)


    # now set up cluster job for exoniphy.  
    cat << '_EOF_' > doExoniphy
#!/bin/bash
zcat $1 | /cluster/bin/phast/exoniphy - ${*:3} > $2
_EOF_
    # << this line makes emacs coloring happy
    chmod +x doExoniphy

    rm -f jobs.lst
    for dir in WINDOWS/* ; do
	chrNo=`basename $dir`
	mkdir -p OUTPUT/$chrNo
	for file in $dir/* ; do 
	    base=`basename $file .ss.gz`
	    chrStr=`echo $base | awk -F\. '{print $1}'`
	    echo "doExoniphy $file OUTPUT/$chrNo/$base.gff --seqname $chrStr --idpref $base --score --indels --quiet " >> jobs.lst 
	done 
    done

#[acs@kk exoniphy.hg16mm3rn3]$ wc jobs.lst
#  59175  591750 7179445 jobs.lst
    
    para create jobs.lst
    # etc... (run cluster job)

#Completed: 59175 of 59175 jobs
#CPU time in finished jobs:   49361849s  822697.48m 13711.62h  571.32d  1.565 y
#IO & Wait Time:                258451s    4307.52m    71.79h    2.99d  0.008 y
#Average job time:                 839s      13.98m     0.23h    0.01d
#Longest job:                     1868s      31.13m     0.52h    0.02d
#Submission to last job:         75584s    1259.73m    21.00h    0.87d

    # create track
    logout
    ssh hgwdev
    cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
    for dir in OUTPUT/* ; do 
	    chrNo=`basename $dir`
	    echo $chrNo
	    find $dir -name "*.gff" | grep -v random > files
	    if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr$chrNo.gff ; fi
	    find $dir -name "*.gff" | grep random > files
	    if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr${chrNo}_random.gff ; fi
    done

    ldHgGene -gtf -frame hg16 exoniphy chr*.gff
#track exoniphy
#shortLabel Exoniphy 
#longLabel Exoniphy: Conserved Exon Predictions (Human/Mouse/Rat)
#group genes
#priority 50.9
#visibility hide
#color 173,17,162
#type genePred

#
# Load tfbsCons track DONE 2004-03-31 braney
#

ssh hgwdev
# csh variables are local to the shell that sets them, so they have to be
# set after logging in to hgwdev, where they are used.
set humordir=/gbdb/hg16/humorMm3Rn3
set transfacdir=/projects/compbio/data/transfac
set outdir=hg16_tfbsCons

mkdir /cluster/data/hg16/bed/tfbsCons
cd /cluster/data/hg16/bed/tfbsCons
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts  weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg15/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
# the following takes days (says Matt)
nice getTfbsConsData.pl `pwd` $humordir $transfacdir ./IDS.txt $outdir -over &
# The job above runs in the background: let it finish before going on
# to the clean-up and load steps below.
cd $outdir
rm chr*.bed
hgLoadBed -noSort hg16 tfbsCons -sqlTable=$HOME/kent/src/hg/lib/tfbsCons.sql tfbsCons.bed -tab

# Get mapping of ID's from Matt so we can link into the TRANSFAC database
set idmap=/cluster/data/hg16/bed/tfbsCons/tfbsConsMap
hgsql hg16 < ~/kent/src/hg/lib/tfbsConsMap.sql
echo "load data local infile '$idmap' into table tfbsConsMap;" | hgsql hg16


# PREP FOR LIFTOVER CHAINS TO HG16 (2004-04-12 kate)

# split into 3K chunks
ssh eieio
set tempDir = /cluster/bluearc/hg/gs.17/build34/liftOver
cd $tempDir
mkdir lift 
cat > split.csh << 'EOF'
set scratch = /iscratch/i/gs.17/build34/liftOver/split
mkdir -p $scratch
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
    echo chr$i
    faSplit -lift=lift/chr$i.lft size /cluster/data/hg16/$i/chr$i.fa -oneFile 3000 $scratch/chr$i
end
'EOF'
csh split.csh >&! split.log &
tail -100f split.log
/cluster/bin/iSync



# ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31]
# download data from  http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores
# ecotigHF - ecores on Human, genome conserved with Fugu, Fr1
# ecotigHT - ecores on Human, genome conserved with Tetraodon (March 2004)

    ssh hgwdev
    mkdir /cluster/data/hg16/bed/ecores/
    # add parse_ecotig.pl to this directory

# FUGU
    mkdir /cluster/data/hg16/bed/ecores/fr1
    cd /cluster/data/hg16/bed/ecores/fr1/

    # download data for ecotigHF to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigHF > ecotigHF.bed

    # change from upper to lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigHF.bed

    hgLoadBed -tab hg16 ecoresFr1 ecotigHF.bed

    # clean up
    rm *.bak

# TETRAODON
    mkdir /cluster/data/hg16/bed/ecores/tetraodon
    cd /cluster/data/hg16/bed/ecores/tetraodon/

    # download data for ecotigHT to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigHT > ecotigHT.bed
    # change from upper to lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigHT.bed

    hgLoadBed -tab hg16 ecoresTetraodon ecotigHT.bed

    # clean up
    rm *.bak

    # add entries in kent/src/hg/makeDb/trackDb/human/hg16/trackDb.ra
    # add html for details pages to this directory:
    # ecoresFr1.html and ecoresTetraodon.html


# VNTR MICROSATELLITE REPEATS FROM GEROME BREEN (DONE 4/28/04 angie)
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/vntr
    cd /cluster/data/hg16/bed/vntr
    # saved email attachment from Gerome Breen <g.breen@iop.kcl.ac.uk>
    # as HumJuly2003microsats_finished_for_angieH.txt
    # Skip the header line (tail -n +2; the bare "+2" form is obsolete),
    # replace 1-based start coords with 0-based, tweak n/a distance values:
    tail -n +2 HumJuly2003microsats_finished_for_angieH.txt \
    | perl -wpe 's/(first|last) in chromosome\/sequence/-1/i' \
    | awk '{printf "%s\t%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6, $7, $8, $9, $10;}'  \
      > vntr.bed
    hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/vntr.sql hg16 \
      vntr vntr.bed


# WEBB'S PUTATIVE NON-EXONIC CONSERVED REGIONS (DONE 4/6/04 angie)
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/webbNonExonic
    cd /cluster/data/hg16/bed/webbNonExonic
    wget http://bio.cse.psu.edu/~webb/nonexonic.tar.gz
    tar xvzf nonexonic.tar.gz 
    # Score should really be scaled from 5k..276k --> 200-1000
    cat chr* \
    | awk '{printf "%s\t%d\t%d\t%s:%d-%d\t%d\t%c\n", $2, $3-1, $4, $5, $6, $7, $9, $8;}' \
    > webbNonExonic.bed
    hgLoadBed hg16 webbNonExonic webbNonExonic.bed


# phylo HMM data quintile calculation
    ssh eieio
    cat << '_EOF_' > /tmp/allpHMMdata.sh
#!/bin/sh
#       there is only an empty file in chr13_random, it causes all
#       files following it on the xargs zcat line to be missed.
#       Eliminate it from the processing
find ./POSTPROBS -type f | grep -v chr13_random | sort -t\. -k2,2n | \
        xargs zcat | awk '{print $2}' > /tmp/pHMM.data
'_EOF_'
    chmod +x /tmp/allpHMMdata.sh
    cd  /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    time /tmp/allpHMMdata.sh

#  Create top 5 % set of data for phyloHMMcons.hg16mm3rn3.2003-11-11
#	(DONE - 2004-05-15 - Hiram)
    cd  /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    # Write top5.sh: for every POSTPROBS/chr* directory it concatenates the
    # postprob files and keeps the lines whose second column is > 0.450,
    # leaving the result in top5_data/<chrom>.ascii.gz.
    cat << '_EOF_' > top5.sh
#!/bin/sh
#
#       Do not work on chr13_random, it has no data
#	this for loop should have been:
#	ls POSTPROBS/chr* | sort -t\. -k2,2n | while read i
#	to get the data in properly sorted order.  With this as is,
#	we will need to sort the coords later to make any wiggle
#	track out of this data
#
#	-p: the script skips chroms that are already done, so it must
#	tolerate being re-run with top5_data already present
mkdir -p top5_data
for i in POSTPROBS/chr*
do
        # strip the leading directory (POSIX form, works under /bin/sh)
        c=${i#POSTPROBS/}
        echo "$i $c"
        if [ "$c" != "chr13_random" ]; then
            if [ ! -f "top5_data/$c.ascii.gz" ]; then
                find "${i}" -type f | sort -t\. -k2,2n | while read FN
                do
                    zcat "${FN}"
                done | awk '{if ($2 > 0.450) print}' > "top5_data/$c.ascii"
                rm -f "top5_data/$c.ascii.gz"
                # compress in the background while the next chrom is read
                gzip "top5_data/$c.ascii" &
            else
                ls -og "top5_data/$c.ascii.gz"
            fi
        fi
done
#	do not return until every background gzip has finished, so that
#	later steps never see a partially written .gz
wait
'_EOF_'
    # make the script itself executable (top5_data is its output directory)
    chmod +x top5.sh
    #	running this script takes several hours, make sure you do it
    #	on the file server
    ssh eieio
    cd  /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
    ./top5.sh
    #	Then, to make the histogram data:
    cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/top5_data
    # Write mkHisto.sh, which streams every chr*.ascii.gz in this directory
    # into textHistogram (column 2, bin size 0.001).  The opening and
    # closing here-document words must be identical ('_EOF_' in both
    # places), otherwise the here-document never terminates.
    cat << '_EOF_' > mkHisto.sh
#!/bin/sh

for f in chr*.ascii.gz
do
    zcat $f
done | textHistogram -real -col=2 -binSize=0.001 -maxBinCount=1000 stdin
'_EOF_'
    chmod +x mkHisto.sh
    ./mkHisto.sh > histoGram.data
    
# BLASTZ FUGU (FR1) (DONE 4/19/04 angie)
    ssh kk
    # space is awful tight on store4 -- use store7.  
    mkdir -p /cluster/store7/hg16/bed/blastz.fr1.2004-04-19
    ln -s /cluster/store7/hg16/bed/blastz.fr1.2004-04-19 \
      /cluster/data/hg16/bed/
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
    # Set L=6000 (more relaxed than chicken) and abridge repeats.
    # Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
    cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/store7/hg16/bed/blastz.fr1.2004-04-19

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    bash # if a csh/tcsh user
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, check, push, check, ....
#Completed: 11865 of 11865 jobs
#Average job time:                 414s       6.90m     0.11h    0.00d
#Longest job:                      709s      11.82m     0.20h    0.01d
#Submission to last job:          5678s      94.63m     1.58h    0.07d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
    bash # if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, check, push, etc ...
#Completed: 339 of 339 jobs
#Average job time:                   4s       0.07m     0.00h    0.00d
#Longest job:                       19s       0.32m     0.01h    0.00d
#Submission to last job:            91s       1.52m     0.03h    0.00d

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
    mkdir axtChrom pslChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
    /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/fr1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time:                  16s       0.26m     0.00h    0.00d
#Longest job:                       75s       1.25m     0.02h    0.00d
#Submission to last job:            80s       1.33m     0.02h    0.00d


# CHAIN FUGU BLASTZ (REDONE 10/1/04 angie)
    # NOTE: originally done 4/19, but with a buggy axtChain.
    # axtChain dir moved aside to axtChain.orig before rebuilding.
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    # Check size>0 for .axt files (empty inputs cause out line+ check to fail):
    cp /dev/null input.lst
    foreach f (`ls -1S /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChrom/*.axt`)
      if (-s $f) then
        echo $f >> input.lst
      endif
    end
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Reuse gap penalties from chicken run.
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
         -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
         -minScore=5000 $1 \
    /iscratch/i/gs.17/build34/bothMaskedNibs \
    /iscratch/i/fr1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 41 of 41 jobs
#Average job time:                  26s       0.44m     0.01h    0.00d
#Longest job:                      121s       2.02m     0.03h    0.00d
#Submission to last job:           121s       2.02m     0.03h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain hg16 ${c}_chainFr1 $i
    end


# NET FUGU BLASTZ (REDONE 10/1/04 angie)
    # NOTE: originally done 4/19, but with results of a buggy axtChain.
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
    netClass noClass.net hg16 fr1 fugu.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with netFilter -syn:
    netFilter -syn fugu.net > fuguSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
    netFilter -minGap=10 fugu.net |  hgLoadNet hg16 netFr1 stdin
    netFilter -minGap=10 fuguSyn.net | hgLoadNet hg16 netSyntenyFr1 stdin

# LIFTOVER CHAIN TO FUGU FR1 (DONE 2004-09-28 kate)
    # Writes /cluster/data/hg16/bed/liftOver/hg16ToFr1.chain from the net
    # and all.chain using netChainSubset.
    # NOTE(review): the NET FUGU BLASTZ steps in this file write fugu.net
    # and fuguSyn.net (not human.net) in blastz.fr1.2004-04-19/axtChain, and
    # no step visible here creates an undated blastz.fr1 directory or link
    # -- confirm both names before re-running.
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.fr1/axtChain
    time netChainSubset human.net all.chain \
        /cluster/data/hg16/bed/liftOver/hg16ToFr1.chain


# RUN AXTBEST (DONE 4/20/04 angie)
    # Webb asked for axtBest too...
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
    mkdir axtBest
    foreach f (axtChrom/*.axt)
      set chr=$f:t:r
      echo axtBesting $chr
      axtBest $f $chr axtBest/$chr.axt -minScore=300
    end


# H-INVITATIONAL GENE ANNOTATION DATABASE (2004-04-29 kate)
    # https://www.jbirc.aist.go.jp/hinv/top.html
    # Create knownGene table to reference HINV gene ID's
    #  for link on knownGenes details page
    # Also, create an HINV gene track, just to look at 
    # (probably not publish, as these are just mRNA alignments
    # already visible on browser).

    # download CDNA file (release 1.0)
    ssh kksilo
    mkdir /cluster/data/hinv
    cd /cluster/data/hinv
    wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
    gunzip FCDNA.gz
    mv FCDNA FCDNA.1.0

    # set up assembly work area
    ssh eieio
    cd /cluster/data/hg16
    mkdir -p bed/hinv
    cd bed/hinv

    # extract H-INV ID's and Genbank accessions of mRNAs
    awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \
                                                        > accessions.txt
    awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \
                                                        > ids.txt
    paste accessions.txt ids.txt > queries.txt

    # create PSL file from alignments for these mRNA's, extracted from the 
    #       table of all aligned mRNA's
    hgsql hg16 -s -e "SELECT * FROM all_mrna"  | cut -f 2- > all_mrna.tab

    pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
        # using pslReps to generate the PSL file header
    pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl

    # load track of mrna alignments (database loads are done from hgwdev)
    ssh hgwdev
    cd /cluster/data/hg16/bed/hinv
    hgLoadPsl hg16 -table=HInvGeneMrna hinv_mrna.psl

    # also make a gene track using the genomic exon coordinates for build34
    # in the FCDNA file.  NOTE: not all of the genes have these
    ssh kksilo
    cd /cluster/data/hg16/bed/hinv
    /cluster/data/hinv/hinvToGff.pl < /cluster/data/hinv/FCDNA.1.0 > hinv.gff
    ssh hgwdev
    cd /cluster/data/hg16/bed/hinv
    ldHgGene hg16 HInvGene hinv.gff
        # Read 40140 transcripts
    # TrackDb for this
        # track HInvGene
        # shortLabel H-INV Gene
        # longLabel H-Invitational Genes
        # group genes
        # priority 37
        # visibility hide
        # color 0,100,180
        # type genePred .

    # also make a table with various useful items for each gene
    ssh hgwdev
    hgsql hg16 < ~/kent/src/hg/lib/HInv.sql
    cd /cluster/data/hg16/bed/hinv
    /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.0 > HInv.tab
    echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg16

    # create table for knownGenes detail page
    ssh hgwdev
    cd /cluster/data/hg16/bed/hinv
    hgMapToGene hg16 HInvGeneMrna knownGene knownToHInv


# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 5/10/04 angie)
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
    netSplit human.net net
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg16/nib \
        /cluster/data/galGal2/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.hg.maf
      axtToMaf $f \
            /cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \
            $maf -tPrefix=hg16. -qPrefix=galGal2.
    end


# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 WITH NET MAF FOR ALL (DONE 5/10/04 angie)
# (galGal2 net maf added to human/mouse/rat alignments described above [HUMOR])
    # put the MAFs on bluearc
    ssh eieio
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr
    mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg
    cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \
      /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr
    cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafNet/*.maf \
      /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg

    ssh kki
    mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet
    mkdir hmrg
    # Wrapper script required because of stdout redirect:
    cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr/*.maf) 
      set root=$file:t:r:r
      echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/${root}.maf" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time:                  84s       1.40m     0.02h    0.00d
#Longest job:                      267s       4.45m     0.07h    0.00d
#Submission to last job:           290s       4.83m     0.08h    0.00d
    # The crash was due to empty hg/chr18_random.hg.maf -- OK.

    # clean up bluearc (these are big files!)
    rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet

    # put this out there for Glenn Tesler (not a browser track!)
    ssh eieio
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg
    gzip *
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet
    foreach f (/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/*)
      ln -s $f /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet
    end


# EPONINE TSS PREDICTION (DONE 5/21/04 angie)
    # Overview: split the NT_* contig sequences at gaps into ~2.5Mb chunks,
    # run eponine-scan.jar on every chunk as a parasol job, lift the per-chunk
    # GFF back to contig and then chromosome coordinates, convert to bed and
    # load the result as table eponine in hg16.
    # The loops below use csh syntax (foreach/set/end and $var:t:r modifiers).
    # Eponine runs fine on 2.5Mb contig, but barfs on much larger contig;
    # chop up sequence at gaps into ~2.5Mb chunks for cluster run.
    ssh eieio
    mkdir /cluster/bluearc/hg16/chunks
    cd /cluster/data/hg16
    # One faSplit per contig; each also writes a $ctg.lft lift file that is
    # used further down to map chunk coordinates back to the contig.
    # Note: faSplit seems to ignore the ".chunk_" suffix below:
    foreach f (?{,?}/NT_*/NT_??????.fa)
      set ctg = $f:t:r
      faSplit -minGapSize=10 -lift=/cluster/bluearc/hg16/chunks/$ctg.lft \
        gap $f 2500000 /cluster/bluearc/hg16/chunks/$ctg.chunk_
    end
    mkdir /cluster/data/hg16/bed/eponine
    cd /cluster/data/hg16/bed/eponine
    wget http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
    # doEpo: wrapper that puts the Java runtime first on the path and runs
    # the scanner with threshold 0.999 on sequence file $1, writing to $2.
    cat << '_EOF_' > doEpo
#!/bin/csh
set path=(/usr/java/j2re1.4.1_01/bin $path)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doEpo
    # Start from an empty jobList, then append one job line per chunk;
    # each job writes out/<chunk name>.gff.
    cp /dev/null jobList
    foreach f (/cluster/bluearc/hg16/chunks/NT*.fa)
      echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
      >> jobList
    end
    mkdir out
    ssh kk9
    cd /cluster/data/hg16/bed/eponine
    para create jobList
    para try, check, push, check, ...
#Completed: 1588 of 1588 jobs
#Average job time:                 208s       3.47m     0.06h    0.00d
#Longest job:                      447s       7.45m     0.12h    0.01d
#Submission to last job:          3591s      59.85m     1.00h    0.04d

    # lift chunks -> contigs
    # (one liftUp per .lft file, combining all out/<contig>*.gff chunk results)
    mkdir contigs/
    foreach l (/cluster/bluearc/hg16/chunks/*.lft)
      set ctg = $l:t:r
      liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
    end
    # lift contigs -> chrom
    liftUp eponine.gff ../../jkStuff/liftAll.lft warn contigs/NT_*.gff
    # Translate to bed 4 + float-score -- it would be a shame to lose 
    # those scores in genePred or bed 5 (int score)
    # Output columns: $1, $4-1 (start), $5 (end), "<$1>.<running index>"
    # as the item name, then GFF columns 6 and 7.
    awk 'BEGIN {i=0;} \
         {printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
          i = i + 1;}' \
      eponine.gff > eponine.bed
    # load up
    ssh hgwdev
    cd /cluster/data/hg16/bed/eponine
    # Derive the table definition from the generic bed6FloatScore schema by
    # substituting the table name.
    sed -e 's/bed6FloatScore/eponine/g' \
      $HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
    hgLoadBed hg16 eponine eponine.bed -tab -sqlTable=eponine.sql


# RELOAD ENSEMBL GENES WITH VERSION 34d (DONE 2004/05/20 baertsch)
    # Overview: keep the old ensGene/ensGtp/ensPep tables under *_old names,
    # rebuild all three from fresh EnsMart exports, drop the *_old copies and
    # regenerate knownToEnsembl.
    # NOTE(review): the rename/drop statements in this section are SQL, not
    # shell; presumably they were typed at an hgsql prompt on hg16 -- confirm.
    # save current tables, just in case.
    rename table  ensGene to ensGene_old;
    rename table  ensGtp  to ensGtp_old;  
    rename table  ensPep  to ensPep_old;  

    mkdir /cluster/data/hg16/bed/ensembl34d
    cd /cluster/data/hg16/bed/ensembl34d
    # Get the ensembl protein data from 
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the output.  choose gzip compression.  hit export.
    # Save as ensemblGene.gtf.gz

    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Finally, get rid of the ".1" or ".2" after the name
    # NOTE(review): the instructions above say to save the export as
    # ensemblGene.gtf.gz, but the pipeline below reads ensbuild34d.gff.gz --
    # confirm which file name was actually used.
    # The perl step aborts (die) on the first line that does not start with
    # a digit, X, Y or Un.
    zcat ensbuild34d.gff.gz \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
                 || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/\..\"/\"/g' \
    > ensGene.gtf
    ssh hgwdev
    /cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \
      /cluster/data/hg16/bed/ensembl34d/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    # The file is uncompressed only for the duration of the load and then
    # compressed again.
    gunzip ensGtp.txt.gz
    hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
    gzip ensGtp.txt

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin

    # compare size of old and new tables as a sanity check
    # (only once the new tables look right are the saved copies dropped)
    drop table ensGene_old;
    drop table ensGtp_old;  
    drop table ensPep_old;  

    # Create knownToEnsembl column 
    hgMapToGene hg16 ensGene knownGene knownToEnsembl

#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-05-24 - Fan)

    # Overview: export the gene/transcript/protein/SwissProt cross-reference
    # from EnsMart as tab-separated text, reformat it, then recreate and
    # load table ensemblXref3 in hg16.

    # Get the ensembl gene/protein cross-reference data from
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Feature" box, select gene, transcript, protein,
    #         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
    # Page 4) Choose "Text, tab separated".  choose gzip compression.  hit export.
    # Save as ensXref.txt

    # Replace every "." in the export with a tab (presumably this splits
    # versioned IDs of the form ID.version into two columns -- TODO confirm).
    sed -e 's/\./\t/g' ensXref.txt > ensemblXref3.tab

    # Recreate the table from its schema file, then load the data.
    hgsql hg16 -e "drop table ensemblXref3"
    hgsql hg16 < ~/src/hg/lib/ensemblXref3.sql

    # "ignore 1 lines" skips the first line of the .tab file on load.
    hgsql hg16 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'


#### REBUILD SUPERFAMILY RELATED TABLES (DONE - 2004-05-21 - Fan)

# Overview: load the 16-May-2004 Superfamily release into its own database
# (superfam040516), then rebuild the hg16 tables sfAssign, sfDes,
# sfDescription, superfamily and knownToSuper from it.

# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk

        mkdir /cluster/store1/superFamily/040516
        cd /cluster/store1/superFamily/040516

# ftp over the following two files:

        ass_16-May-2004.tab.gz
        supfam_16-May-2004.sql.gz

# NOTE(review): the two files are listed with a .gz suffix but every command
# below reads the un-suffixed names, so they were presumably gunzipped first
# -- confirm.

# This may take about an hour.

     hgsql hg16 -e "create database superfam040516"
     hgsql superfam040516 < supfam_16-May-2004.sql

# Make sure to add an index on id of the des table of superfam040516.

     hgsql superfam040516 < ~/src/hg/lib/sfAssign.sql
     hgsql superfam040516 -e 'load data local infile "ass_16-May-2004.tab" into table superfam040516.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB

   hgsql hg16 < ~/src/hg/lib/sfAssign.sql

   cd /cluster/store1/superFamily/040516
  
   hgsql hg16 -e 'load data local infile "ass_16-May-2004.tab" into table hg16.sfAssign;'

# If hg16.sfDes already exists, drop it.
# sfDes is filled from a dump of the des table of the Superfamily database;
# "ignore 1 lines" skips the first line of that dump on load.

   hgsql superfam040516 -e "select * from des" >sfDes.tab

   hgsql hg16 < ~/src/hg/lib/sfDes.sql

   hgsql hg16 -e 'load data local infile "sfDes.tab" into table hg16.sfDes ignore 1 lines;'

# If hg16.superfamily already exists, drop it.
# NOTE(review): hgSuperfam presumably writes the superfamily.tab and
# sfDescription.tab files that are loaded below -- confirm.

   hgSuperfam hg16 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If hg16.sfDescription exists, drop it.

   hgsql hg16 < ~/src/hg/lib/sfDescription.sql
   hgsql hg16 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table hg16.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed hg16 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
   
   cat /cluster/store1/superFamily/040516/ass_16-May-2004.tab \
   | hgKnownToSuper hg16 hs stdin
# creates 32542 rows in knownToSuper

# seq table acc field is too small; up the max to match new hgLoadSeq 
# schema (2004/05/22 markd)
# SQL statement (run inside hgsql).  ALTER TABLE needs the table name; the
# table being changed is seq, as stated above.

    alter table seq modify column `acc` varchar(128) NOT NULL default '';


####  Blat knownGene proteins to determine exons (braney 2004-06-02)
    # Overview: dump the knownGene peptides, blat them (protein query against
    # translated DNA) versus every masked hg16 nib as a parasol run, filter
    # the alignments, and load the result as blastKGRef00.
    # All relative paths below (human.lst, kg.lst, kgfa/, psl/, blatSpec)
    # are relative to the blat.hg16KG run directory.
    ssh kk
    mkdir -p /cluster/data/hg16/bed/blat.hg16KG.2004-05-27
    cd /cluster/data/hg16/bed
    # Repoint the generic symlink at the dated run directory
    # (-f: the link may not exist yet).
    rm -f blat.hg16KG
    ln -s  blat.hg16KG.2004-05-27 blat.hg16KG
    cd blat.hg16KG
    pepPredToFa hg16 knownGenePep known.fa
    hgPepPred hg16 generic blastKGPep00 known.fa
    # blatSome: job wrapper; $1 = target nib, $2 = query fa, $3 = output psl.
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # the job spec invokes blatSome directly, so it must be executable
    chmod a+x blatSome
    ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst
    # Split the peptides into ~300 files under kgfa/, then return to the run
    # directory so that kgfa/*.fa and human.lst resolve.
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 300 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    # gensub2 template: one job per (nib, peptide file) pair, output in
    # psl/<nib root>/<peptide file root>.psl
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 human.lst kg.lst blatGsub blatSpec
    # Pre-create one output directory per nib, then return to the run
    # directory, where blatSpec lives and against which the psl/... output
    # paths in the spec are written.
    mkdir psl
    cd psl
    foreach i (`cat ../human.lst`)
      mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push
# Completed: 12222 of 12222 jobs
# CPU time in finished jobs:   23286365s  388106.09m  6468.43h  269.52d  0.738 y
# IO & Wait Time:                710342s   11839.03m   197.32h    8.22d  0.023 y
# Average job time:                1963s      32.72m     0.55h    0.02d
# Longest job:                   106239s    1770.65m    29.51h    1.23d
# Submission to last job:        106248s    1770.80m    29.51h    1.23d

    # Merge the per-job psl files, keep alignments passing the 0.9 coverage
    # and 0.9 identity thresholds, then reduce with pslUniq.
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    sort -rn cooked.psl | pslUniq stdin hg16KG.psl
    pslxToFa hg16KG.psl hg16KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
    kgName hg16 hg16KG.psl blastKGRef00
    ssh hgwdev
    cd /cluster/data/hg16/bed/blat.hg16KG
    # Schema files live under kent/src/hg/lib, as used elsewhere in this doc.
    # The table is created under its schema name and then renamed.
    hgsql hg16 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastKGRef00" | hgsql hg16
    echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg16

### RUN BLASTZ VS. MACACA MULATTA

# Overview of this step: download the 14 macaca trace fasta files, split
# them into chunks under split/, list the chunks in mac.lst, write the hg16
# chromosome lengths to S1.len, and copy the chunks to /iscratch/i.
# The one-line loops here use bash/sh "for ... do ... done" syntax.

#get sequence from trace repository
cd /cluster/bluearc/macaca
for i in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 ; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/macaca_mulatta/fasta.macaca_mulatta.0$i.gz ; done

# distribute contigs to bluearc and /iscratch/i for cluster run
#split the sequence into 1mb chunks (about 13k reads per file)
# NOTE(review): the size given to faSplit below is 10000000, not 1000000 --
# confirm the intended chunk size.
# NOTE(review): the downloads above are named fasta.macaca_mulatta.0NN.gz,
# while faSplit reads macacca_mulatta.NNN.fa; no gunzip/rename step is
# recorded here -- confirm how the input files were produced.
ssh kksilo
mkdir -p /cluster/bluearc/macaca/split
for i in 001 002 003 004 005 006 007 008 009 010 011 012 013 014 ; do faSplit about macacca_mulatta.$i.fa 10000000 split/$i/mac ; done
find split -name \*.fa > mac.lst
hgsql hg16 -N < chromLen.sql > S1.len
#flatten directory structure for Angie's scripts
# (each file <dir>/<file> is moved up one level as <dir>.<file>)
# NOTE(review): this loop works on /iscratch/i/macaca/<dir>, but the copy to
# /iscratch/i/macaca is recorded after it -- the order in which these were
# actually run should be confirmed.
for i in `ls` ; do cd /iscratch/i/macaca/$i ; for j in `ls` ; do mv $j ../$i.$j ; done ; done

    ssh kkr1u00
    mkdir -p /iscratch/i/macaca/
    df /iscratch/i
    cp /cluster/bluearc/macaca/split/* /iscratch/i/macaca
    /cluster/bin/scripts/iSync

# make DEF file for blastz
# DEF is a file of shell variable assignments (plus one PATH export) that is
# later read with "source"; it names the blastz parameters, the two sequence
# directories (SEQ1 = hg16 nibs, SEQ2 = macaca chunks) and the output base
# directory.  The quoted '_EOF_' delimiter keeps the $BASE references
# unexpanded in the written file.
    ssh kksilo
    cd /cluster/bluearc/macaca

# NOTE: need schwartzbin below for utils still not in penn bin

cat << '_EOF_' > DEF
# human vs. macaca mulatta
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386


ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_mulatta.q


BLASTZ_ABRIDGE_REPEATS=0

SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs/
SEQ1_RMSK=/scratch/hg/gs.17/build34/rmsk/
SEQ1_SMSK=
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

SEQ2_DIR=/iscratch/i/macaca/
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-primate
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/bluearc/macaca
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

DEBUG=0

'_EOF_'

    # << this line makes emacs coloring happy

# Save the DEF file in the current standard place
# (the copy is suffixed with today's date as printed by "date -I")
    cp DEF ~angie/hummus/DEF.hg16-rm0.`date -I`

    # First cluster run: BlastZ_run0.sh (copied from the mm4 build) is run
    # after sourcing DEF; the parasol batch is then driven from run.1.
    ssh kk
    cd /cluster/bluearc/macaca
    # source the DEF file to establish environment for following commands
    bash
    source ./DEF
    cp /cluster/data/mm4/jkStuff/BlastZ_run0.sh .
    ./BlastZ_run0.sh
    cd run.1
    para try
    para check
    para push

    #   Second cluster run to convert the .out's to .lav's
    # NOTE(review): the directory entered below is under /cluster/data/pt0,
    # whereas every other step of this section works in
    # /cluster/bluearc/macaca (where BlastZ_run1.sh was just copied and where
    # DEF lives) -- confirm which directory was actually used.
    cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh . 
    ssh kk
    cd /cluster/data/pt0/bed/blastz.hg16
    bash
    source DEF
    ./BlastZ_run1.sh
    cd run.2
    para try
    para check
    para push

    #   Prepare third cluster run script to convert lav's to axt's
    # The heredoc writes BlastZ_run2.sh verbatim (quoted delimiter, so
    # nothing is expanded here).  When run, the script refuses to continue
    # unless "uname -n" reports kk, sources DEF, builds a gensub2 template
    # (gsub), lists ${BASE}/lav into chrom.list, and creates a parasol batch
    # from chrom.list x ../mac.lst.
    # NOTE(review): from /cluster/bluearc/macaca the relative path
    # ../../jkStuff resolves to /cluster/jkStuff -- confirm this is the
    # intended location.
    # NOTE(review): the script makes and enters run.2, the same directory
    # name used by the second cluster run; confirm whether run.3 was meant.
    # NOTE(review): on the gsub "echo" line, ${path2} sits outside the single
    # quotes, so it is expanded as a shell variable when the script runs and
    # no assignment to path2 is visible here; the other placeholders use the
    # $(root1) form -- confirm whether $(path2) was intended.
    cd /cluster/bluearc/macaca/
cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh
#!/bin/sh
#       prepare third cluster run for blastz processing
# NOTE: should run this on iservers (4G), 
# with chr19 and chr1 on kolossus (8G)
M=`uname -n`
if [ "$M" != "kk" ]; then
    echo "ERROR: you are on machine: '$M'"
    echo -e "\tthis script expects machine kk"
    exit 255
fi
source DEF
mkdir axtChrom
mkdir run.2
cd run.2
# usage:  blastz-contiglav2axt lav-dir axt-file seq1-dir seq2-file
echo '#LOOP' > gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/macaca/split/'${path2} >> gsub
echo '#ENDLOOP' >> gsub
ls -1S ${BASE}/lav > chrom.list
gensub2 chrom.list ../mac.lst gsub jobList
wc -l jobList
echo "running 'para create'"
para create jobList
echo "Ready for cluster run.  para try, check, push, etc ..."
'_EOF_'
    chmod +x ../../jkStuff/BlastZ_run2.sh
    #   Third cluster run to convert lav's to axt's
    source DEF
    ../../jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
    # NOTE: ran this on kolossus and mini-cluster
    # 30 min. to 2 hrs. per chrom
    # Wrapper script required because of stdout redirect:
    # doMultiz runs multiz on $1 and $2 (third argument "-") and captures
    # its standard output in $3.

    cd /cluster/bluearc/macaca
    cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    # One job per maf file in the pt0 blastz-blatHg16 maf directory; output
    # goes to /cluster/bluearc/macaca/blastz.hg16/<root>.maf.
    # NOTE(review): both input arguments are taken from the same pt0 maf
    # directory ($file and <root>.maf, where root is $file's basename with up
    # to two extensions removed) -- it looks like one of them was meant to be
    # the macaca alignment; confirm.
    foreach file (/cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/*.maf) 
      set root=$file:t:r:r
      echo "doMultiz /cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/${root}.maf $file /cluster/bluearc/macaca/blastz.hg16/${root}.maf" >> jobList
    end
    para create jobList
    para try, check, push, check

# seq table acc field is too small; up the max to match new hgLoadSeq 
# schema (2004/05/22 markd)
# SQL statement (run inside hgsql).  ALTER TABLE needs the table name; the
# table being changed is seq, as stated above.

    alter table seq modify column `acc` varchar(128) NOT NULL default '';


####  Blat knownGene proteins to determine exons (braney 2004-06-02)
    # NOTE(review): this section repeats most of the commands of the other
    # "Blat knownGene proteins" section in this file (same date, same job
    # statistics) and looks like an earlier draft of it -- confirm whether it
    # should be kept.
    # Overview: dump the knownGene peptides, blat them (protein query against
    # translated DNA) versus every masked hg16 nib as a parasol run, and
    # filter the alignments.
    ssh kk
    mkdir blat.hg16KG.2004-05-27
    # Repoint the generic symlink at the dated run directory
    # (-f: the link may not exist yet).
    rm -f blat.hg16KG
    ln -s  blat.hg16KG.2004-05-27 blat.hg16KG
    pepPredToFa hg16 knownGenePep known.fa
    # kgName.lst: the FASTA header lines of known.fa with the leading ">"
    # removed.
    grep ">" known.fa | sed "s/>//" > kgName.lst
    kgName hg16 kgName.lst kg.mapNames
    # blatSome: job wrapper; $1 = target nib, $2 = query fa, $3 = output psl.
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # the job spec invokes blatSome directly, so it must be executable
    chmod a+x blatSome
    ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst
    # Split the peptides into ~300 files under kgfa/, then return to the run
    # directory so that kgfa/*.fa and human.lst resolve.
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 300 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    # gensub2 template: one job per (nib, peptide file) pair, output in
    # psl/<nib root>/<peptide file root>.psl
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 human.lst kg.lst blatGsub blatSpec
    # Pre-create one output directory per nib, then return to the run
    # directory, where blatSpec lives and against which the psl/... output
    # paths in the spec are written.
    mkdir psl
    cd psl
    foreach i (`cat ../human.lst`)
      mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push
# Completed: 12222 of 12222 jobs
# CPU time in finished jobs:   23286365s  388106.09m  6468.43h  269.52d  0.738 y
# IO & Wait Time:                710342s   11839.03m   197.32h    8.22d  0.023 y
# Average job time:                1963s      32.72m     0.55h    0.02d
# Longest job:                   106239s    1770.65m    29.51h    1.23d
# Submission to last job:        106248s    1770.80m    29.51h    1.23d

    # Merge the per-job psl files and keep alignments passing the 0.9
    # coverage and 0.9 identity thresholds.
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    # NOTE(review): no command recorded in this section produces uniq.psl;
    # presumably a pslUniq step on cooked.psl is missing -- confirm.
    pslxToFa uniq.psl uniq_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft


# LIFTOVER CHAINS TO HG17 (DONE 2004-07-14 kate)

    # Overview: the makeLoChain-* scripts are run in sequence -- align (blat
    # cluster run), lift, chain (cluster run), net, load -- and the resulting
    # over.chain is copied to the liftOver directory as hg16ToHg17.chain.

    # run alignment
    # NOTE: split hg17 to /iscratch/i is doc'ed in makeHg17.doc
    ssh kk
    cd /cluster/data/hg16
    makeLoChain-align hg16 /scratch/hg/gs.17/build34/bothMaskedNibs \
                    hg17 /iscratch/i/hg17/liftOver/split
        # Created parasol job in bed/blat.hg17.2004-07-14/run
    # Point the generic blat.hg17 symlink at the dated run directory; the
    # later steps refer to bed/blat.hg17.
    cd bed
    rm blat.hg17
    ln -s blat.hg17.2004-07-14 blat.hg17
    cd blat.hg17/run
    para try
    para check
    para push

    # lift results
    # the lift directory was defined in makeHg17.doc when split was performed
    # this expects data in bed/blat.hg17, so symlink must be there 
    # use kolossus for speed
    ssh kolossus
    cd /cluster/data/hg16/bed/blat.hg17
    # Runs in the background with stdout+stderr sent to lift.log (csh ">&!"
    # redirection); progress is followed with tail.
    makeLoChain-lift hg16 hg17 /cluster/data/hg17/bed/liftOver/liftSplit \
                        >&! lift.log &
    tail -100f lift.log
        # 25 minutes 
    
    # chain alignments
    ssh kk
    makeLoChain-chain hg16 /cluster/data/hg16/nib hg17 /cluster/data/hg17/nib
        # Created parasol job in /cluster/data/hg16/bed/blat.hg17/chainRun
    cd /cluster/data/hg16/bed/blat.hg17/chainRun
    para try
        # 46 jobs
    para check
    para push

    # make alignment net
    ssh kolossus
    makeLoChain-net hg16 hg17

    # load into database and copy to download directory
    ssh hgwdev
    makeLoChain-load hg16 hg17
    cp /cluster/data/hg16/bed/blat.hg17/over.chain \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.chain
        # Finished loading hg16ToHg17.over.chain
        # Now, add download link for /usr/local/apache/htdocs/goldenPath/hg16/liftOver/hg16ToHg17.over.chain.gz


# LIFTOVER CHAIN FROM HG17 TO HG16 (IN PROGRESS 2005-01-03 kate)

    # This section is an unfinished outline: several commands below are
    # recorded without their full argument lists (chainMergeSort, chainSwap,
    # netChainSubset) and need to be completed before they can be run.
    # The working directory is the hg16->hg17 liftOver run directory,
    # bed/blat.hg17 (the name used by the hg16->hg17 liftOver steps in this
    # doc).

    ssh kolossus
    cd /cluster/data/hg16/bed/blat.hg17
    mkdir net.hg17
    cd chain
    chainMergeSort
    # net the chains read from stdin; the hg16 net goes to /dev/null and the
    # hg17 net is written to ../net.hg17
    chainNet stdin /cluster/data/hg16/chrom.sizes \
                   /cluster/data/hg17/chrom.sizes \
                        /dev/null ../net.hg17
    time chainSwap 
    netChainSubset net.hg17



# ENCODE Regions        (kate)
#       NOTE: these instructions are not yet complete (scripts and datafiles 
#               are currently in ~kate/encode)
    # Overview: build and load the encodeRegions bed table for hg16, fill
    # hgFixed.encodeRegionInfo with name/description pairs, and publish
    # per-assembly region sequences with counts, sizes and md5 sums.
    mkRegionsBed.pl build34_regions.txt > encodeRegionsHg16.bed
    hgLoadBed hg16 encodeRegions encodeRegionsHg16.bed -noBin
    mkdir -p /cluster/data/hg16/bed/encodeRegions
    cp encodeRegionsHg16.bed /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed

    # Create hgFixed table for name+description
    # (schema file is named after the table, encodeRegionInfo)
    hgsql -D hgFixed < ${HOME}/kent/src/hg/lib/encodeRegionInfo.sql
    # Each "name|description" line of regionInfo.txt is rewritten by sed into
    # an INSERT statement (only the first "|" on a line is treated as the
    # separator) and piped to hgsql.
    sed -e 's/^/INSERT INTO encodeRegionInfo (name, descr) VALUES (\"/' \
            -e 's/|/\",\"/' \
            -e 's/$/\");/' < regionInfo.txt | hgsql -D hgFixed

    # create frameset for region display
    make

    # create sequence downloads
    set dir = /usr/local/apache/htdocs/ENCODE/sequences
    # start a fresh sizes.txt (-f: it may not exist on a first run)
    rm -f sizes.txt
    # For each assembly: extract the region sequences, publish the fasta and
    # a two-column faCount summary, and append the faSize report to
    # sizes.txt.
    foreach b (hg12 hg13 hg15 hg16)
        encodeSequence.pl regions.$b.txt /cluster/data/$b/nib > $b.fa
        cp $b.fa $dir
        faCount $b.fa | awk '{print $1, $2}' > $dir/${b}_count.txt
        echo $b >> sizes.txt
        faSize $b.fa >> sizes.txt
        echo "" >> sizes.txt
    end
    cp sizes.txt $dir
    cd $dir
    md5sum *.fa > md5sum.txt

    # QA
    checkEncodeRegions.pl regions.hg12.txt /cluster/data/hg12/nib > hg12.check
    cp sizes.txt $dir
       # etc.
    csh printRegionDiffs.csh > regionDiffs.out



## end of blastz macaca mulatta alignment



# UN-ANNOTATED (EXCEPT FOR CROSS-SPECIES) REGIONS (DONE 6/8/04 angie)
    # Anton Nekrutenko asked for this... easy to do with featureBits!
    # NOTE: excluding mRNAs this time because of the controversial 
    # just-submitted-to-GenBank intronic BV* "mRNA" seqs.  
    # The featureBits call below intersects the complements ("!table") of
    # the gap table and of every listed annotation table, and writes the
    # result to unAnnotated.bed.  The "!" is backslash-escaped so that csh
    # does not treat it as history substitution.
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/unAnnotated
    cd /cluster/data/hg16/bed/unAnnotated
    nice featureBits hg16 -minSize=12 \
      \!gap \
      \!knownGene \!refGene \!mgcGenes \
      \!vegaGene \!vegaPseudoGene \!ensGene \!acembly \!ECgene \
      \!geneid \!genscan \!twinscan \!slamMouse \!sgpGene \!softberryGene \
      \!rnaGene \!superfamily \
      \!est \!xenoMrna \!HInvGene \!tigrGeneIndex \
      \!uniGene_2 \
      \!cpgIsland \!rmsk \!simpleRepeat \
      -bed=unAnnotated.bed
#905732944 bases of 2865248791 (31.611%) in intersection
    hgLoadBed hg16 unAnnotated unAnnotated.bed
    # not much of a drop in coverage with the -minSize:
    # (coverage of the loaded table, to compare with the figure above)
    nice featureBits hg16 unAnnotated
#903585585 bases of 2865248791 (31.536%) in intersection


# ANDY LAW CPG ISLANDS (DONE 6/15/04 angie)
    # See notes about this in makeGalGal2.doc.
    # Overview: for every chromosome fasta, run preProcGgfAndy once and feed
    # its output to two island finders -- the original andy-cpg-island.pl
    # and the modified ggf-andy-cpg-island.pl -- collecting the results in
    # cpgIslandAndy.bed (bed 4) and cpgIslandGgfAndy.bed (cpgIslandExt
    # columns), then load both and compare them with the existing cpgIsland
    # track.
    ssh eieio
    mkdir /cluster/data/hg16/bed/cpgIslandGgfAndy
    cd /cluster/data/hg16/bed/cpgIslandGgfAndy
    # start with empty output files; the loop appends to them
    cp /dev/null cpgIslandAndy.bed
    cp /dev/null cpgIslandGgfAndy.bed
    # Inside the loop, the first perl filter turns each (start, end) line
    # into "<chr> <start-1> <end> cpg<i>" with a running counter i.  The
    # second splits each line into seven fields ($s,$e,$cpg,$n,$c,$g,$oE),
    # decrements the start, and emits the bed columns followed by
    # $pCpG = 100*2*$cpg/$n, $pGc = 100*($c+$g)/$n and $oE.  '$chr' is
    # spliced in by csh from outside the single-quoted perl text.
    foreach f (../../?{,?}/chr*.fa)
      set chr = $f:t:r
      echo preproc $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc
      echo running original on $chr
      awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \
      | /cluster/home/angie/andy-cpg-island.pl \
      | perl -wpe '$i=0 if (not defined $i); \
                   chomp; ($s,$e) = split("\t"); $s--; \
                   $_ = "'$chr'\t$s\t$e\tcpg$i\n";  $i++' \
      >> cpgIslandAndy.bed
      echo running modified on $chr
      /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndy.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/hg16/bed/cpgIslandGgfAndy
    # this one is a bed 4:
    hgLoadBed hg16 cpgIAndy -tab -noBin cpgIslandAndy.bed
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
    hgLoadBed hg16 cpgIslandGgfAndy -tab -noBin \
      -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
    # WOW, even masking out repeat bases from the results, there's a huge 
    # increase in reported islands!!
    featureBits hg16 cpgIsland
#21077002 bases of 2865248791 (0.736%) in intersection
    featureBits hg16 cpgIslandGgfAndy
#135249416 bases of 2865248791 (4.720%) in intersection
    featureBits hg16 cpgIslandGgfAndy \!rmsk
#68714633 bases of 2865248791 (2.398%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
#  27596 ../cpgIsland/cpgIsland.bed
# 376478 cpgIslandAndy.bed
# 260761 cpgIslandGgfAndy.bed
    # http://www.pnas.org/cgi/content/full/99/6/3740
    # Takai D Jones PA
    # Comprehensive analysis of CpG islands in human chromosomes 21 and 22
    #
    # Regions of DNA of greater than 500 bp with a G+C equal to or
    # greater than 55% and observed CpG/expected CpG of 0.65 were more
    # likely to be associated with the 5' regions of genes and this
    # definition excluded most Alu-repetitive elements.
    #
    # Also, our description reduced the number of CpG islands located
    # on these chromosomes from 14,062 to 1,101, which is more
    # consistent with the expected number of genes (750) located on
    # these two chromosomes.
    #
    # To exclude "mathematical CpG islands" (for example, a 300-bp
    # sequence containing one G, 150 Cs, and only one CpG, which would
    # meet the criteria of a CpG island), we added one more condition:
    # that there are at least seven CpGs in these 200 bp. This number
    # was selected on the basis that there would be 200/16 (i.e.,
    # 12.5) CpGs in a random DNA fragment containing no suppression of
    # CpG. Because Gardiner-Garden and Frommer's criterion (1) of
    # ObsCpG/ExpCpG of 0.6 would accommodate (0.6 x 12.5) CpGs (i.e.,
    # 7.5), we selected seven CpGs as being a reasonable cutoff for
    # the initial analysis.
    #
    # Island counts on chr21 and chr22 only, for comparison with the
    # figures quoted above:
    egrep -w '^chr2[12]' ../cpgIsland/cpgIsland.bed | wc -l
#   1033
    egrep -w '^chr2[12]' cpgIslandAndy.bed | wc -l
#  16462
    # Hmm, how did I find fewer with looser params??  Better run Takai and 
    # Jones's script on chr21 and chr22 for comparison...
    egrep -w '^chr2[12]' cpgIslandGgfAndy.bed |wc -l
#  10680
    # OK, I just have to try again with masked sequence:
    # Same two island finders as the unmasked run, but the input is each
    # chromosome's chr*.fa.masked.gz, streamed through zcat into
    # preProcGgfAndy; results go to the cpgIslandMasked*.bed files and are
    # loaded as the *Masked tables.
    ssh eieio
    cd /cluster/data/hg16/bed/cpgIslandGgfAndy
    # start with empty output files; the loop appends to them
    cp /dev/null cpgIslandMaskedAndy.bed
    cp /dev/null cpgIslandMaskedGgfAndy.bed
    # $f:t:r:r:r takes the file name and strips three extensions
    # (.gz, .masked, .fa), leaving the chromosome name.
    foreach f (../../?{,?}/chr*.fa.masked.gz)
      set chr = $f:t:r:r:r
      echo preproc $chr
      zcat $f \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      > $chr.masked.preproc
      echo running original on $chr
      awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \
      | /cluster/home/angie/andy-cpg-island.pl \
      | perl -wpe '$i=0 if (not defined $i); \
                   chomp; ($s,$e) = split("\t"); $s--; \
                   $_ = "'$chr'\t$s\t$e\tcpg$i\n";  $i++' \
      >> cpgIslandMaskedAndy.bed
      echo running modified on $chr
      /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandMaskedGgfAndy.bed
    end
    ssh hgwdev
    cd /cluster/data/hg16/bed/cpgIslandGgfAndy
    hgLoadBed hg16 cpgIAndyMasked -tab -noBin cpgIslandMaskedAndy.bed
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandMaskedGgfAndy.sql
    hgLoadBed hg16 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandMaskedGgfAndy.sql cpgIslandMaskedGgfAndy.bed
    featureBits hg16 cpgIAndyMasked
#93307698 bases of 2865248791 (3.257%) in intersection
    featureBits hg16 cpgIslandGgfAndyMasked
#56180461 bases of 2865248791 (1.961%) in intersection
    # line counts of all four result files (unmasked and masked)
    wc -l *ed
# 376478 cpgIslandAndy.bed
# 260761 cpgIslandGgfAndy.bed
# 125851 cpgIslandMaskedAndy.bed
#  80350 cpgIslandMaskedGgfAndy.bed
    # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's,
    # might not be the right thing to do (?).  Give it a try with less-masked 
    # sequence.
    # Two variants per chromosome, both using only ggf-andy-cpg-island.pl:
    #   onlyRM    - sequence masked with the chromosome's full $f.out.gz
    #   onlyRMAlu - sequence masked with only the "SINE/Alu" rows of that
    #               file (column 11), preceded by its first three lines
    # /tmp/tmp.fa.out and /tmp/tmp2.fa.out are scratch files that are
    # overwritten on every iteration.
    ssh eieio
    cd /cluster/data/hg16/bed/cpgIslandGgfAndy
    # start with empty output files; the loop appends to them
    cp /dev/null cpgIslandGgfAndyOnlyRM.bed
    cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed
    foreach f (../../?{,?}/chr*.fa)
      set chr = $f:t:r
      echo preproc, ggf-andy $chr onlyRM
      zcat $f.out.gz > /tmp/tmp.fa.out
      maskOutFa $f /tmp/tmp.fa.out stdout \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyOnlyRM.bed
      echo preproc, ggf-andy $chr onlyRMAlu
      head -3 /tmp/tmp.fa.out > /tmp/tmp2.fa.out
      awk '$11 == "SINE/Alu" {print;}' /tmp/tmp.fa.out >> /tmp/tmp2.fa.out
      maskOutFa $f /tmp/tmp2.fa.out stdout \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyOnlyRMAlu.bed
    end
#  80314 cpgIslandGgfAndyOnlyRM.bed
# 110598 cpgIslandGgfAndyOnlyRMAlu.bed
    ssh hgwdev
    cd /cluster/data/hg16/bed/cpgIslandGgfAndy
    # /tmp/c.sql is reused: it is regenerated with the right table name
    # before each of the two loads.
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
    hgLoadBed hg16 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \
      cpgIslandGgfAndyOnlyRM.bed
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
    hgLoadBed hg16 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \
      cpgIslandGgfAndyOnlyRMAlu.bed
    featureBits hg16 cpgIslandGgfAndyOnlyRM
#56275308 bases of 2865248791 (1.964%) in intersection
    featureBits hg16 cpgIslandGgfAndyOnlyRMAlu
#78743130 bases of 2865248791 (2.748%) in intersection


#### mrnaBlastz track - all mrnas aligned using blastz  Robert 2/20/2004
# Overview: fetch the native hg16 mRNAs, trim poly-A, split them for a
# blastz cluster run, convert lav -> psl, then sort, de-duplicate, chain,
# filter and pre-net the alignments, convert the chains back to psl and load
# one <chrom>_mrnaBlastz table per chromosome.
# This section mixes bash "for ... do ... done" one-liners with tcsh
# "foreach ... end" loops; each was typed in the matching shell.
mkdir /cluster/data/hg16/bed/mrnaBlastz
cd /cluster/data/hg16/bed/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg16 -native
faTrimPolyA mrna.fa hg16Mrna.fa
faSize hg16Mrna.fa -detailed=on > S2.len
mkdir /cluster/bluearc/hg/mrnaHg16
faSplit sequence hg16Mrna.fa 100 /cluster/bluearc/hg/mrnaHg16/mrna
# NOTE(review): the split files were written under /cluster/bluearc/hg/mrnaHg16
# but the listing below reads /cluster/bluearc/scratch/hg/mrnaHg16 -- confirm
# the path.
ls -1 /cluster/bluearc/scratch/hg/mrnaHg16/ > mrna.lst
hgsql hg16 < chromInfo.sql > S1.len
awk '{print $1}' S1.len |grep -v random >  S1.lst
cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz
make-joblist
para create spec
para push
~angie/hummus/do.out2lav DEF > j
para create j
para push
# The following tcsh fragment concatenates each chromosome's lav files (in
# numeric order), converts them to psl and strips the nib directory prefix
# and the ".nib:start-end" suffix from the names.
#!/bin/tcsh
set base="/cluster/bluearc/hg/gs.17/build34/mrnaBlastz"

    cd $base
    mkdir -p pslRaw
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/pslRaw/$chr.psl
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToPsl stdin stdout \
        | sed -e 's@scratch/hg/gs.17/build34/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out 
      popd
    end

# Each of the next four steps writes one job line per input file into a spec
# file and then creates a parasol batch from it.
for i in `ls pslRaw/` ; do echo sortIt.sh pslRaw/$i pslSort/$i >> spec.sort ; done
para create spec.sort     # sorts pslRaw to pslSort

for i in `awk '{print $1}' S1.len` ; do echo pslFilterDups pslSort/$i.psl pslFilter/$i.psl  >> spec.dup ; done
para create spec.dup      # filters pslSort to pslFilter using pslFilterDups

for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.17/build34/bothMaskedNibs/ -faQ /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa chain/$i.chain >> spec.chain ; done
para create spec.chain    # chains pslFilter to chain

mkdir chainFilter
for i in `awk '{print $1}' S1.len` ; do echo doFilter ../chain/$i.chain ../chainFilter/$i.chain >> spec.filter ; done
para create spec.filter   # filters chain to chainFilter using doFilter

mkdir -p preNet

cd chainFilter
foreach i ( *.chain)
chainPreNet $i ../S1.len ../S2.len ../preNet/$i
end

ls /cluster/data/hg16/nib/*.nib > S1.lst

# NOTE(review): unlike the spec-building loops above, this loop has no
# "echo" in front of chainToPsl, so chainToPsl is run directly and only its
# standard output is appended to spec.chain2psl.new, which is then passed to
# "para create" -- confirm whether an "echo" is missing.
for i in `awk '{print $1}' S1.len`; do chainToPsl ../preNet/$i.chain ../S1.len ../S2.len ../S1.lst /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa ../psl/$i.psl >> spec.chain2psl.new ; echo $i done chainToPsl ; done
ssh kk9-10
para create spec.chain2psl.new

# load one table per chromosome: <chrom>_mrnaBlastz
for i in `awk '{print $1}' S1.len`; do hgLoadPsl -noTNameIx hg16 -table=${i}_mrnaBlastz psl/$i.psl ; echo $i done ; done

## end of blastz Mrna track

#### BUILD RETROGENE TRACK ( done Robert 6/15/2004)
# Overview: load the native mRNA blat alignments as mrnaBlat, merge the
# all_mrna alignments with the mrnaBlastz psl files into blatBlastz.psl,
# stage the inputs on /iscratch/i for a cluster run (spec.kk), and
# post-process with buildSort.sh.
# NOTE(review): the commands are not recorded in strict execution order --
# e.g. all_mrna.psl is read by the pslCat step before the line that creates
# it -- confirm the order before re-running.
cp  /cluster/data/genbank/data/aligned/genbank.137.0/hg16/full/mrna.native.rawPsl.gz .
gunzip mrna.native.rawPsl.gz
# Rewrite column 10 to the text before its first "." (drops the ".N"
# suffix); all other columns of the 23-column rows are passed through.
awk  '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl  > mrnaBlat.psl
hgLoadPsl hg16 mrnaBlat.psl

hgsql hg16 -N -B < refGene.sql > refGene.tab
cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/
netToBed /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseSynNet.net mouseSyn.bed

ssh eieio
# Append a sort key ($1*3-$2) as an extra column, sort by column 10 and then
# by that key in descending numeric order, and finally drop the key again by
# printing only the 21 psl columns.
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' >  blatBlastz.psl 

awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' /scratch/blatBlastz.psl  > /scratch/x.psl
# all_mrna.psl: the output of mrna.sql with header lines (containing
# "matches") removed and the first column dropped ($2..$22 are printed).
hgsql hg16 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
# keep rows with 1 < $12 < 9999999 and swap columns 11 and 12
tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}'  /cluster/data/kgDB/bed/hg16/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
ssh kkr1u00
cd /cluster/data/hg16/bed/pseudo
# stage the inputs on /iscratch/i and sync them out with iSync
cp refGene.tab /iscratch/i/hg/gs.17/build34/pseudo
cp /cluster/data/hg16/bed/simpleRepeat.bed /iscratch/i/hg/gs.17/build34/pseudo
cp mrnaHg16.fa /iscratch/i/hg/gs.17/build34/pseudo
cp mouseSyn.bed /iscratch/i/hg/gs.17/build34/pseudo
cp sortedKnownGene.tab /iscratch/i/hg/gs.17/build34/pseudo
pslSplit nohead -chunkSize=121 /iscratch/i/hg/gs.17/build34/pseudo blatBlastz.psl
cd /iscratch/i/hg/gs.17/build34/pseudo
iSync

ssh kk
cd /cluster/data/hg16/bed/pseudo
para create spec.kk
para push

#post process and load track
./buildSort.sh


### PHASTCONS HUMAN/CHIMP/MOUSE/RAT/CHICKEN (6/20/04, acs)

    # this is an addendum to Katie's '5-WAY MULTIZ & PHYLO-HMM' (see above)
    # just redoing the 'label' step with the new 'phastCons' program
    # picking up where it says "compute the conservation scores" 

    ssh hgwdev
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    
    # set up wrapper for phastCons
    cat << '_EOF_' > doPhastCons
#!/bin/sh

PHAST=/cluster/bin/phast
TMP=/tmp/phastCons

file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`

mkdir -p $TMP PREDICTIONS/$chrom PHASTCONS/$chrom
zcat $file | $PHAST/phastCons - hpmrc_rev_dg.mod --nrates 20 --transitions 0.018,0.002 --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom --quiet > ${TMP}/$root.pp
gzip -c $TMP/$root.pp > PHASTCONS/$chrom/$root.pp.gz
rm $TMP/$root.pp
'_EOF_'
    chmod u+x doPhastCons

    # the --transitions arguments are approximate maximum likelihood
    # estimates obtained by running the program *without* --transitions
    # (causes estimation by EM) on five randomly selected 1M bp
    # windows.  All estimates were in the same ballpark (took a rough average)

    # set up cluster job
    # copy the per-window .ss.gz files to the bluearc directory that the
    # job list below reads from
    ssh eieio
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    cp WINDOWS/*.ss.gz /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/
    logout
    # one doPhastCons job per window file; the single '>' redirect
    # replaces any stale jobs.lst from a previous attempt
    for file in /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/*.ss.gz ; do echo doPhastCons $file ; done > jobs.lst
    ssh kk
    cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
    # para create needs the job list file as its argument
    para create jobs.lst
    para try
    para push
    # ... then para check, etc. until all jobs are done
    
    # now create tracks
    mkdir -p PHASTCONS/wib
    for dir in PHASTCONS/chr* ; do \
	echo $dir ;\
	chr=`basename $dir` ;\
	zcat `ls $dir/*.pp.gz | sort -t\. -k2,2n` | \
	    wigAsciiToBinary -chrom=$chr \
	    -wibFile=PHASTCONS/wib/${chr}_phastCons stdin ;\
    done
    hgLoadWiggle hg16 phastCons PHASTCONS/wib/chr*_phastCons.wig
    mkdir -p /gbdb/hg16/wib
    rm -f /gbdb/hg16/wib/chr*phastCons.wib
    ln -s `pwd`/PHASTCONS/wib/*.wib /gbdb/hg16/wib
    chmod 775 . PHASTCONS PHASTCONS/wib
    chmod 664 PHASTCONS/wib/*.wib
    
    # tweak scores and names of predictions
    cat PREDICTIONS/*/*.bed | sed 's/id //' | \
	awk '{printf "%s\t%s\t%s\tlod=%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \
	    $1, $2, $3, $5, 147.49 * log($5) - 240.34, $6, $7, $8, $9, \
	    $10, $11, $12}' > all.bed
    hgLoadBed hg16 phastConsElements all.bed

    # Scores are transformed as follows, for a reasonable-looking 
    # "spectrum".  Let x_max be the maximum score (here
    # x_max  = 4490) and let x_med be the median score (here x_med =
    # 39).  The  scores are transformed via the function f(x) = a *
    # log x + b, s.t. f(x_med) = 300 and f(x_max) = 1000.  Solving
    # for a and b, you get b = (300 log x_max - 1000 log x_med) /
    # (log x_max - log x_med), a = (1000 - b) / log x_max.  Here a =
    # 147.49, b = -240.34

#track phastCons
#shortLabel phastCons 
#longLabel phastCons Conservation Score, Human/Chimp/Mouse/Rat/Chicken
#group compGeno
#priority 103
#visibility hide
#color 0,10,100
#maxHeightPixels 40
#type wig 0.0 1.0
#autoScale off

#track phastConsElements
#shortLabel phastConsElements
#longLabel phastCons Conserved Elements, Human/Chimp/Mouse/Rat/Chicken
#group compGeno
#priority 104
#visibility hide
#spectrum on
#color 0,60,120
#altColor 200,220,255
#exonArrows off
#type bed 12 .


# Ensembl 34d GENE PREDICTIONS (2004-07-13 baertsch)
## reloaded ensGene to add frame info, no change to data
    /cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \
      /cluster/data/hg16/bed/ensembl34d/ensGene.gtf
# TWINSCAN 1.3 GENE PREDICTIONS (2004-07-13 baertsch)
## reloaded twinscan to add frame info, no change to data
    ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt


#### AFFYTRANSFRAG AND AFFYTRANSCRIPTION TRACKS - (2004-07-21 sugnet)
# tracks covering about 1/3 of genome with probes
# every 5bp and hybridized to RNA from SK-N-AS cell line.
# Lifted from genome version hg15.

# affyTransfrag track: lift transfrags to hg16
   cd /cluster/store6/weber/affy/transfrags/transfragsLabeled/
   mkdir hg16
   cd hg16
   # liftOver arguments: oldFile map.chain newFile unMapped
   # (the backslash must be the very last character on the line, otherwise
   # the continuation is lost and the output file names are dropped)
   liftOver ../SK_phase2_tfgs_final.biggerThan50bp.tab /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
      SK_phase2_tfgs_final.hg16.bed SK_phase2_tfgs_final.err.bed
   # check to make sure that most lifted...
   wc *.bed
   #  12      49     346 SK_phase2_tfgs_final.err.bed
   #  170749  853745 6936780 SK_phase2_tfgs_final.hg16.bed
   #  170761  853794 6937126 total
   hgLoadBed hg16 affyTransfrags SK_phase2_tfgs_final.hg16.bed 
   # Reading SK_phase2_tfgs_final.hg16.bed
   # Loaded 170749 elements of size 5
   # Sorted
   # Creating table definition for 
   # Saving bed.tab
   # Loading hg16

# affyTranscription track: 
   cd /cluster/store6/weber/affy/graph/hg15/gz
   gunzip *.gz
   mkdir hg16
   cd hg16
   ln -s ../*.signal ./
   # remapGraphs.pl just makes a quick bed file for each signal file with 1bp spans
   # and then lifts via liftOver to new genome.
   remapGraphs.pl -liftChain /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
      -oldGenome hg15 -newGenome hg16 *.signal
   # Lifting chr13.hg16.signal.
   # Lifting chr13.sk.signal.
   # Lifting chr14.sk.signal.
   # Lifting chr19.sk.signal.
   # Lifting chr20.sk.signal.
   # Lifting chr21.sk.signal.
   # Lifting chr22.hg16.signal.
   # Lifting chr22.sk.signal.
   # Lifting chr6.sk.signal.
   # Lifting chr7.sk.signal.
   # Lifting chrX.sk.signal.
   # Lifting chrY.sk.signal.
   # runWiggles.sh just calls wigAsciiToBinary for each signal file.
   cat ../runWiggles.sh | sed -e 's/hg15/hg16/g' | sed -e 's/sk/hg16/g' > runWiggles.sh
   # the redirected copy is a new file without the execute bit
   chmod +x runWiggles.sh
   ./runWiggles.sh 
   hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/affyTranscription hg16 affyTranscription *.wig
   # Connected to database hg16 for track affyTranscription
   # Creating table definition with 13 columns in hg16.affyTranscription
   # Saving wiggle.tab
   # Loading hg16
   mkdir -p /cluster/data/hg16/bed/affyTranscription/wib
   cp *.wib /cluster/data/hg16/bed/affyTranscription/wib/
   # The symlinks have to live under the -pathPrefix given to hgLoadWiggle
   # above.  NOTE(review): these notes originally said /gbdb/hg15/... here,
   # presumably a typo for hg16 -- confirm against the live /gbdb layout.
   mkdir -p /gbdb/hg16/wib/affyTranscription
   cd  /gbdb/hg16/wib/affyTranscription/
   ln -s /cluster/data/hg16/bed/affyTranscription/wib/*.wib ./
   # cd to the directory itself (not to the *.wib glob) before the chmod
   cd /cluster/data/hg16/bed/affyTranscription/wib/
   chmod 664 *.wib
   # clean up the working directory and re-compress the lifted signal files
   cd /cluster/store6/weber/affy/graph/hg15/gz/hg16
   rm *.wib *.wig *.bed
   gzip *hg16.signal &


# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 2004/08/11 markd)
    cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk

    # Run Arian's DateRepsinRMoutput.pl to add extra columns telling 
    # whether repeats in -query are also expected in -comp species.  
    # Even though we already have the human-mouse linSpecReps,
    # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
    # additions.  So add mouse, then ignore it.  
    # Dog in extra column 1, Mouse in extra column 2
    foreach outfl ( *.out )
        echo "$outfl"
        /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
          ${outfl} -query human -comp dog -comp mouse
    end
    # Now extract dog (extra column 1), ignore mouse.
    cd /cluster/bluearc/scratch/hg/gs.17/build34
    mkdir linSpecRep.notInDog
    foreach f (rmsk/*.out_dog_mus)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                        linSpecRep.notInDog/$base.out.spec
    end
    # Clean up.
    rm /cluster/bluearc/scratch/hg/gs.17/build34/rmsk/*.out_dog_mus

    # copy to iservers
    ssh kkr1u00
    cp -r /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInDog  /iserver/kkr1u00/i/gs.17/build34/
    iSync

# BLASTZ DOG (CANFAM1) (DONE 2004/08/12 markd)

    ssh kk
    # store4 low on disk space; symlink to store7
    mkdir -p /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10
    ln -s  /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10 /cluster/data/hg16/bed
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
    # Use default (Human-Mouse) settings for starters.
    cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg16/bed/blastz.canFam1.2004-08-10

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # first cluster run: raw blastz alignments
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
    source DEF
    mkdir -p $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j 2>log
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    # edit jobList to do chr19 first; hg17 run notes indicated
    # this might save around 4 hours
    para create jobList
    para try, check, push, check, ....

#Completed: 93225 of 93225 jobs
#CPU time in finished jobs:   18459718s  307661.97m  5127.70h  213.65d
#IO & Wait Time:                429193s    7153.21m   119.22h    4.97d
#Average job time:                 203s       3.38m     0.06h    0.00d
#Longest job:                    18951s     315.85m     5.26h    0.22d
#Submission to last job:         58889s     981.48m    16.36h    0.68d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, check, push, etc ...
#Completed: 339 of 339 jobs
#CPU time in finished jobs:       3771s      62.85m     1.05h    0.04d  0.000 y
#IO & Wait Time:                  6671s     111.18m     1.85h    0.08d  0.000 y
#Average job time:                  31s       0.51m     0.01h    0.00d
#Longest job:                      334s       5.57m     0.09h    0.00d
#Submission to last job:          1464s      24.40m     0.41h    0.02d

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
    mkdir axtChrom pslChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
|  /cluster/bin/x86_64/lavToAxt stdin \
    /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 42 of 42 jobs
#CPU time in finished jobs:       1297s      21.62m     0.36h    0.02d  0.000 y
#IO & Wait Time:                 15428s     257.13m     4.29h    0.18d  0.000 y
#Average job time:                 398s       6.64m     0.11h    0.00d
#Longest job:                     1714s      28.57m     0.48h    0.02d
#Submission to last job:          1723s      28.72m     0.48h    0.02d

   # axtChrom/chr19_random.axt is empty, probably ok

# CHAIN DOG BLASTZ (DONE)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
    /iscratch/i/gs.17/build34/bothMaskedNibs \
    /iscratch/i/canFam1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    # edit to remove chr19_random
    para create jobList
    para try, check, push, check...
#Completed: 41 of 41 jobs
#CPU time in finished jobs:       8233s     137.22m     2.29h    0.10d  0.000 y
#IO & Wait Time:                 11718s     195.29m     3.25h    0.14d  0.000 y
#Average job time:                 487s       8.11m     0.14h    0.01d
#Longest job:                     4623s      77.05m     1.28h    0.05d
#Submission to last job:          4971s      82.85m     1.38h    0.06d


    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # hg17 said:
    # Lots of chaff with scores in the 3000's.  Many very-high-scoring 
    # chains.  So filter the chain down somewhat...
    # didn't bother rechecking, just filtered.
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg16 ${c}_chainCanFam1 $i
    end
    # Coverage is significantly higher than mouse:
    featureBits hg16 -chrom=chr1 chainCanFam1Link
# 123343602 bases of 221562941 (55.670%) in intersection

# NET DOG BLASTZ (DONE 2004/08/15)
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
    netClass noClass.net hg16 canFam1 dog.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn dog.net > dogSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
    netFilter -minGap=10 dog.net |  hgLoadNet hg16 netCanFam1 stdin
    netFilter -minGap=10 dogSyn.net | hgLoadNet hg16 syntenyNetCanFam1 stdin
    # Add entries for chainCanFam1, netCanFam1 to human/hg16 trackDb


# LIFTOVER CHAIN TO DOG CANFAM1 (DONE 2004-09-16 kate)
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.canFam1/axtChain
    time netChainSubset dog.net all.chain \
        /cluster/data/hg16/bed/liftOver/hg16ToCanFam1.chain


# LOAD ENSEMBL ESTS (DONE 2004-09-07 braney)
     cd /cluster/data/hg16/bed
     mkdir ensEst
     cd ensEst

#        Get the ensembl EST data from http://www.ensembl.org/
#        Go to the Martview link
#        Choose Homo sapiens as the organism
#        Follow this sequence through the pages:
#        Page 1) Choose the Ensembl ESTs choice. Hit next.
#        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.        
#        Page 3) Choose the "Structures" box. 
#        Page 4) Choose GTF as the output, choose gzip compression and then hit Export.
#	        Name file ensEst.gff.gz

# Ensembl handles random chromosomes differently than us.  They give the
# contig name.  We can lift these up to our chrN_random chromosomes
    gunzip ensEst.gff.gz
    sed "/^[0-9XY]*\t/d" ensEst.gff | sed "s/^.*_NT/NT/" > random.gff 
    liftUp -type=".gff" liftRandom.gff /cluster/data/hg16/jkStuff/liftAll.lft warn random.gff
    sed "/_NT_/d" ensEst.gff | sed "s/^/chr/" > unrandom.gff 
    cat liftRandom.gff unrandom.gff > fixed.gff
    ldHgGene hg16 ensESTGene fixed.gff

#        Get the ensembl protein data from http://www.ensembl.org/
#        Go to the Martview link
#        Choose Homo sapiens as the organism
#        Follow this sequence through the pages:
#        Page 1) Choose the Ensembl ESTs choice. Hit next.
#        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
#        Page 3) Choose the "Sequences" box. 
#        Page 4) Choose Transcripts/Proteins and Gene sequence Only as the output, 
#	    choose text/fasta and gzip compression and then hit export. Name to ensEstPep.fasta

    gunzip ensEstPep.fasta.gz
    sed "s/|.*//" ensEstPep.fasta > fixedPep.fa
    hgPepPred hg16 generic ensESTPep fixedPep.fa

    # ensGtp associates geneId/transcriptId/proteinId for name searches
    # Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.
    # Save file as ensGtp.tsv.gz
    gunzip ensGtp.tsv.gz
    # create the ensESTGtp table from the ensGtp definition, renamed by sed
    sed "s/ensGtp/ensESTGtp/"  ~/kent/src/hg/lib/ensGtp.sql | hgsql hg16
    # load the file gunzipped above (ensGtp.tsv) into the ensESTGtp table;
    # "ignore 1 lines" skips the header row of the export
    echo "load data local infile 'ensGtp.tsv' into table ensESTGtp ignore 1 lines" | hgsql hg16

# QA Note - table ensGtp was updated on 2004-08-18 to remove a header line that
# was included in the actual table data. This was not ever pushed out to the rr.
# Table fix (push) done on 2006-01-31 (Jen). Original push on 2004-06. No other
# pushQ entries exist for table change on 2004-08.

# BLASTZ MOUSE MM5 (DONE 2004-09-10 kate)
    
    ssh kk
    # use store7 (lots of space)
    mkdir -p /cluster/store7/hg16/bed/blastz.mm5.2004-09-10
    ln -s /cluster/store7/hg16/bed/blastz.mm5.2004-09-10 \
                /cluster/data/hg16/bed
    cd /cluster/data/hg16/bed
    ln -s  blastz.mm5.2004-09-10 blastz.mm5
    cd blastz.mm5

    cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg16/bed/blastz.mm5.2004-09-10

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # first cluster run: blastz alignments
    ssh kk
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
        # 44060 jobs
    para try, check, push, check, ....
# Average job time:                 382s       6.37m     0.11h    0.00d
# Longest job:                     4510s      75.17m     1.25h    0.05d
# Submission to last job:         26324s     438.73m     7.31h    0.30d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
        # 339 jobs
    para try, check, push, etc ...
# Average job time:                  16s       0.27m     0.00h    0.00d
# Longest job:                      112s       1.87m     0.03h    0.00d
# Submission to last job:           401s       6.68m     0.11h    0.00d

    # convert lav files to axt
    ssh kki
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
    mkdir axtChrom pslChrom
    # a new run directory
    mkdir run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
|  /cluster/bin/x86_64/lavToAxt -dropSelf stdin \
    /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mus/mm5/softNib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/pslChrom/$(root1).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 42 jobs
    head jobList
    para create jobList
    para try, check, push, check,...

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm5/pslChrom
    foreach f (*.psl)
        set c = $f:r
        hgLoadPsl -noTNameIx hg16 -table=${c}_blastzMm5 $f
    end
    # takes 30-60 min


# CHAIN MOUSE MM5 BLASTZ (DONE 2004-09-15 kate)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
    /iscratch/i/gs.17/build34/bothMaskedNibs \
    /iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    # edit to remove chr19_random
    para create jobList
        # 41 jobs
    para try, check, push, check...

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
        # 5 min -- 230.070u 58.980s 5:07.13 94.1%  0+0k 0+0io 117pf+0w
    time chainSplit chain all.chain
        # 5 min -- 208.490u 56.360s 4:48.81 91.7%  0+0k 0+0io 125pf+0w
    rm run1/chain/*.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo $c
        hgLoadChain hg16 ${c}_chainMm5 $i
    end
    # compare with previous mouse, and with this assembly on later human
    featureBits hg16 -chrom=chr1 chainMm5
    featureBits hg17 -chrom=chr1 chainMm5
    featureBits hg16 -chrom=chr1 chainMm3

    featureBits hg16 -chrom=chr1 chainMm5Link
        # 83288228 bases of 221562941 (37.591%) in intersection
    featureBits hg17 -chrom=chr1 chainMm5Link
        # 83773012 bases of 222827847 (37.595%) in intersection
    featureBits hg16 -chrom=chr1 chainMm3Link
        # 82665800 bases of 221562941 (37.310%) in intersection


# NET MOUSE MM5 BLASTZ (DONE 2004-09-16 kate)
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
    # < 10 minutes

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
    time netClass noClass.net hg16 mm5 human.net
        # 15 minutes

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg16 netMm5 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyMm5 stdin

    # GOT HERE

    # Add entries for chainMm5, netMm5, netSyntenyMm5
    # human/hg16 trackDb


# LIFTOVER CHAIN TO MOUSE MM5 (DONE 2004-09-16 kate)
    ssh kolossus
    cd /cluster/data/hg16/bed/blastz.mm5/axtChain
    time netChainSubset human.net all.chain \
        /cluster/data/hg16/bed/liftOver/hg16ToMm5.chain
        # 7 mins.

# TIGHT FOR MOUSE MM5 (TBD kate)
# BEST FOR MOUSE MM5 (TBD kate)
# SYNTENIC NET FOR MOUSE MM5 (TBD kate)

# DOWNLOADS FOR MOUSE MM5 (TBD kate)


# BLASTZ FOR ZEBRAFISH DANRER1 (WORKING 2004-09-29 kate)

    # Treat all repeats as lineage-specific
    ssh kkr1u00
    mkdir /iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
    foreach f (/iscratch/i/gs.17/build34/rmsk/chr*.fa.out)
        cp -p $f  \
           /iscratch/i/gs.17/build34/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end
    iSync
  
    ssh kk
    # use store7 (lots of space)
    mkdir -p /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29
    ln -s /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29 \
                /cluster/data/hg16/bed
    cd /cluster/data/hg16/bed
    ln -s  blastz.danRer1.2004-09-29 blastz.danRer1
    cd blastz.danRer1

    cat << '_EOF_' > DEF
# human vs zebrafish (danRer1)
# params for zebrafish --  L=6000 (threshold for gapped alignments)
# (same params as used for Fugu)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# Target: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# Query: Zebrafish (danRer1)
SEQ2_DIR=/iscratch/i/danRer1/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg16/bed/blastz.danRer1.2004-09-29

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy
    # Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.hg16-danRer1.2004-09-29

    # prepare first cluster run
    ssh kk
    bash # if a csh/tcsh user
    cd /cluster/data/hg16/bed/blastz.danRer1
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
        # 57630 jobs
    para try, check, push, check, ....
# Average job time:                 477s       7.95m     0.13h    0.01d
# Longest job:                    12147s     202.45m     3.37h    0.14d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/hg16/bed/blastz.danRer1
    bash # if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
        # 339 jobs
    para try
    para check 
    para push
        # GOT HERE

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/hg16/bed/blastz.danRer1
    mkdir axtChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/danRer1/nib stdout \
| axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.danRer1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
        # 42 jobs

    # GOT HERE

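# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# NOTE(review): the paths in this section point at hg17 (/cluster/data/hg17,
# gs.18/build35) rather than hg16 -- presumably pasted from the hg17 build
# notes; confirm before reusing.
# Make chains with rescored blastz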
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.tetNig1
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
        > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Make our own linear gap file with reduced gap penalties, 
    # in hopes of getting longer chains - works well for species at 
    # chicken-human distance or greater
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy

cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    # make the wrapper executable and expand the gsub template into one
    # job line per entry in input.lst
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
        # 29 jobs
    # (a leftover merge-conflict marker and a mangled duplicate of the
    # following line were removed here)
    para try, check, push, check,...


# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/genomicSuperDups
    cd /cluster/data/hg16/bed/genomicSuperDups
    wget http://humanparalogy.gs.washington.edu/segDupDb.tar
    # This tar file contains files for both hg16 and hg17.  A note 
    # from Xinwei She about the contents:
#Build34 contains 4 tables: 3 of them are already in the genome browser source code:
#genomicSuperDups, celeraCoverage and celeraDupPositive. A new table, vanillaTrack,
#which displays the Celera assembly overlay in the public assembly build34, is added.
#Their trackDb entries can be found in the file trackDb.add.
#
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
    tar xvf segDupDb.tar
    cd bd34
    # use tail -n +2 to skip past the header line (the bare "tail +2" form
    # is obsolete and is read as a file name by current tail):
    zcat celeraCoverage.tab.gz | tail -n +2 \
    | hgLoadBed -tab hg16 celeraCoverage stdin
    zcat celeraDupPositive.tab.gz | tail -n +2 \
    | hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
        hg16 celeraDupPositive stdin
    zcat genomicSuperDups.tab.gz | tail -n +2 \
    | hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
        hg16 genomicSuperDups stdin
    # Change the name of "vanillaTrack" to celeraOverlay:
    zcat vanillaTrack.mysqldump.gz | sed -e 's/vanillaTrack/celeraOverlay/g' \
    | hgsql hg16
    # It needs a new index, and it needs a bin field, so dump out its 
    # contents and load them back in using hgLoadBed and an edited 
    # SQL definition:
    hgsql hg16 -N -e 'select * from celeraOverlay' > celeraOverlay.bed
    # Make a ~/kent/src/hg/lib/celeraOverlay.as and run autoSql.  
    # Add bin and indices to celeraOverlay.sql, and reload with hgLoadBed:
    hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraOverlay.sql \
      hg16 celeraOverlay celeraOverlay.bed
    # clean up (bed.tab is assumed to be hgLoadBed's scratch file)
    rm celeraOverlay.bed bed.tab

# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
    ssh hgwdev
    cd /cluster/data/hg16/bed
    mkdir pseudoYale
    cd pseudoYale
    # Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
    ldHgGene hg16 pseudoYale pseudoYale.gtf
    # Note - I'm guessing how this goes.  Robert left no record. -jk

## refresh vega tracks with vega build30        (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
# make the download directory reachable from the hg17 bed tree
ln -s /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30

# "tables" lists the table dump names (without the .gz suffix) to fetch
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
# NOTE(review): file name changed from "...compatible..sql.gz" so that, once
# gunzipped, it matches the "source" line below -- confirm against the FTP site
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible.sql.gz
gunzip *.gz
##create mysql database
# (the lines from "mysql" down to "exit" are typed at the mysql prompt;
#  the SQL statement needs its terminating semicolon there)
mysql 
create database vega30;
use vega30
source homo_sapiens_vega_30_35c_mysql40_compatible.sql
source dropMt.sql
source load.sql
exit


hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp     
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt 
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp     
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt 

#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt

#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab

hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B

#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred

ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf 
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf 
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B

# QA note - table vegaPep dropped during this update. Not dropped from rr at 
#  time of initial push, creating a -times error in joinerCheck. Table vegaPep 
#  dropped from hgwbeta and rr/mgc on 2006-01-31.

#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

##########################################################################
# CNPs from University of Washington (Done, Heather and Daryl, June/July 2005)
# data from http://humanparalogy.gs.washington.edu/structuralvariation

ssh hgwdev
cd /cluster/data/hg16/bed
mkdir cnp
cd cnp

# Sharp data
cp dupArray.txt cnpSharp.bed.orig
# change CNP type to match Iafrate data (with permission from Andy)
# Each sed rewrites only the first match on a line (no g flag); the third
# pass reorders the combined label produced by the first two.
sed -e "s/dup/Gain/" cnpSharp.bed.orig > cnpSharp.bed.2
sed -e "s/del/Loss/" cnpSharp.bed.2 > cnpSharp.bed.3
sed -e "s/Both Loss and Gain/Gain and Loss/" cnpSharp.bed.3 > cnpSharp.bed
hgLoadBed hg16 cnpSharp -tab -sqlTable=cnpSharp.sql cnpSharp.bed
# Loaded 160 elements of size 14
# note: 11 names with special characters: CTD-2183E4*, RP11-111A4?, RP11-325E8#, RP11-1000I9*, RP11-159F11*,
# RP11-177L24*, RP11-136P13*, RP11-1151C19*, RP11-1008M3*, RP11-379N11?, CTD-3185D7#
# no apparent problems with these
hgsql hgFixed < cnpSharpCutoff.sql
echo 'load data local infile "sampleCUTOFF.txt" into table cnpSharpCutoff' | hgsql hgFixed
hgsql hg16 < cnpSharpSamples.sql
echo 'load data local infile "andyArraySample.txt" into table cnpSharpSamples' | hgsql hg16
hgsql hg16 < cnpSharpSampleCount.sql
hgsql hg16 < sampleCount.sql


# fosmid discordants
# don't need the id column
cp fosmidDiscordant.txt fosmidDiscordant.bed
hgLoadBed hg16 fosmidDiscordantPrelim -tab -sqlTable=fosmidDiscordantPrelim.sql fosmidDiscordant.bed
hgsql hg16 < fosmidDiscordant.sql
echo 'insert into fosmidDiscordant select bin, chrom, chromStart, chromEnd, name from fosmidDiscordantPrelim' | hgsql hg16
echo 'drop table fosmidDiscordantPrelim' | hgsql hg16

# Iafrate data
cp Iafrate.txt cnpIafrate.bed
hgLoadBed hg16 cnpIafrate -tab -sqlTable=cnpIafrate.sql cnpIafrate.bed

# Sebat data
cp Sebat.txt cnpSebat.bed
hgLoadBed hg16 cnpSebat -tab -sqlTable=cnpSebat.sql cnpSebat.bed

# deletions added May 2006
# From mccarroll@molbio.mgh.harvard.edu
genId.pl < mcCarrolldels.txt > mcCarrolldels.bed
hgLoadBed hg16 -noBin -tab delMccarroll mcCarrolldels.bed
# Hinds data via Andy Sharp
sort -n hindsDels.txt > hindsDels.sort
genId.pl < hindsDels.sort > hindsDels.bed
hgLoadBed hg16 -noBin -tab delHinds hindsDels.bed
# From conrad@uchicago.edu
conrad.pl < conradDels.txt > conradDels.bed
hgLoadBed hg16 -noBin -tab delConrad conradDels.bed



##########################################################################
# sno/miRNA track from Michel Weber (DONE - 2005-06-16 - Hiram)
#	received the data file UCSC_snotrack_hg16.txt via email
    ssh hgwdev
    cd /cluster/data/hg16/bed/wgRna
    #	As a quick first pass at classification, take a look at the
    #	items in the hg17.wgRna table and use those as a guide
    hgsql -N -e "select * from wgRna;" hg17 > hg17.wgRna.txt
    awk '{print $5,$10}' hg17.wgRna.txt > name.type.hg17
    #	combine this new sno data with the existing miRNA data
    hgsql -N -e "select * from miRNA;" hg16 > hg16.miRNA.txt
    cat << '_EOF_' > addTypes.pl
#!/usr/bin/env perl
# addTypes.pl - print the sno data from UCSC_snotrack_hg16.txt as nine
# tab-separated columns, taking the last column (type) from the
# name/type pairs listed in name.type.hg17.
# Reads both input files from the current directory; writes to stdout.

use warnings;
use strict;

my %types;              #       key is name, value is the type

# Load the name -> type lookup; each line is split on whitespace into
# name and type.
open (FH, "name.type.hg17") or die "Can not open name.type.hg17";
while (my $line=<FH>)
{
    chomp $line;
    my ($name, $type) = split('\s+',$line);
    $types{$name} = $type;
}
close (FH);

# Read only the lines starting with "chr", sorted by column 1 and then
# numerically by column 2.
open (FH,"grep ^chr UCSC_snotrack_hg16.txt | sort -k1,1 -k2,2n|") or
        die "can not open UCSC_snotrack_hg16.txt";
while (my $line=<FH>)
{
    chomp $line;
    # Type defaults to "unknown"; a name found in the lookup takes its
    # recorded type, otherwise names beginning with HBII become "CDBox".
    my $type="unknown";
    my ($chrom, $start, $end, $name, $score, $strand) = split('\s+',$line);
    if (exists($types{$name})) { $type = $types{$name}; }
    else { if ($name =~ m/^HBII/) { $type = "CDBox"; } }
    # Columns 7 and 8 are written as literal 0.
    print "$chrom\t$start\t$end\t$name\t$score\t$strand\t0\t0\t$type\n";
}
close (FH);
'_EOF_'
    #	happy emacs
    chmod +x addTypes.pl 
    awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"}' \
	hg16.miRNA.txt > hg16.wgRna.tab
    ./addTypes.pl >> hg16.wgRna.tab
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/wgRna.sql hg16 wgRna \
	hg16.wgRna.tab
    #	this leaves 16 items classified as unknown, request to
    #	Michel Weber for proper classification

################################################################################
# Build hg17Kg table for KG II for hg16, using hg17 KG data (DONE 2005-07-11 Fan).

   ssh hgwdev
   cd /cluster/data/mm6/bed
   mkdir hg17Kg
   cd hg17Kg
   hgsql hg16 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   hgsql hg16 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

   hgsql hg16 -e 'drop table mrnaGp'
   hgsql hg16 < ~/src/hg/lib/mrnaGp.sql
   hgsql hg16 -e 'load data local infile "all_mrna.gp" into table mrnaGp'

   hgsql hg16 -N -e \
   'select mrnaGp.* from mrnaGp,hg17.knownGene where mrnaGp.name = knownGene.name and mrnaGp.chrom=knownGene.chrom' \
   |sort -u > mrnaGp2.tab
   hgsql hg16 -e 'drop table mrnaGp2'
   hgsql hg16 < ~/src/hg/lib/mrnaGp2.sql
   hgsql hg16 -e 'load data local infile "mrnaGp2.tab" into table mrnaGp2'

# Create hg16Kg table in hg17 to get over a hurdle that we can not do join
# between mySQL DBs

   hgsql hg17 -e 'drop table hg16Kg'
   hgsql hg17 < ~/src/hg/lib/hg16Kg.sql
   hgsql hg16 -N -e 'select * from knownGene' >hg16Kg.tab
   hgsql hg17 -e 'load data local infile "hg16Kg.tab" into table hg16Kg'

   hgsql hg17 -N -e \
   'select hg16Kg.* from hg16Kg, knownGene where hg16Kg.name=knownGene.name and knownGene.name not like "NM_%" and hg16Kg.chrom=knownGene.chrom '\
   >j

   cut -f 1-10 j >j1

# j1 are mRNA records through old KG process.
# j2 are RefSeq records based on hg17 KG
# mrnaGp2 are mRNA records based on hg17 KG non-Refseq entries and GenBank CDS data (which is incomplete).

   hgsql hg16 -N -e \
   'select refGene.* from refGene, hg17.knownGene where hg17.knownGene.name=refGene.name' >j2

   cat j1 j2 mrnaGp2.tab |sort -u >j.tab

   ~/kent/src/hg/protein/sortKg.pl j.tab >hg17Kg.tab

   wc hg17Kg.tab

   hgsql hg16 -e "delete from hg17Kg"

   hgsql hg16 -e 'load data local infile "hg17Kg.tab" into table hg17Kg'

####################################################################
# Make mouse ortholog column using blastp on mm6 known genes. (DONE 7/12/05, Fan).

# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already
#	This already exists.  See makeMm6.doc for procedure

# Make parasol run directory 
ssh kk
mkdir -p /cluster/data/hg16/bed/blastp/mm6
cd /cluster/data/hg16/bed/blastp/mm6
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 5812 of 5812 jobs
# CPU time in finished jobs:      96031s    1600.52m    26.68h    1.11d  0.003 y
# IO & Wait Time:                 15641s     260.68m     4.34h    0.18d  0.000 y
# Average job time:                  19s       0.32m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             168s       2.80m     0.05h    0.00d
# Submission to last job:           766s      12.77m     0.21h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg16/bed/blastp/mm6/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 5812 files
# Loading database with 35707 rows

# Update otherOrg.ra under hg/hgGene/hgGeneData/Human/hg16 to mm6 instead of
# mm4.


##########################################################################
# EVOFOLD  - RNA secondary structure predictions lifted from hg17 (Jakob Skou Pedersen)
# Jakob Skou Pedersen, July 12, 2005
  ssh -C hgwdev
  mkdir -p /cluster/data/hg16/bed/evofold
  cd /cluster/data/hg16/bed/evofold
  # lifting folds from hg17 to hg16
  echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
  liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg16.over.chain tmp.bed unmapped.bed
  # remove elements which are wrong size after lifting
  awk '$3-$2 == $7' tmp.bed > foldsHg16.bed
  hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg16 evofold foldsHg16.bed
  # clean up
  rm foldsHg17.bed unmapped.bed tmp.bed

# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]

    # lifted down from hg17.  See makeHg17.doc for details

# AFFYHUEX1 track (sugnet Wed Oct  5 12:18:18 PDT 2005)

mkdir hg16
cd hg16
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg16
mkdir gff beds annot
cd gff
# download gff design files
cp ../../hg17/gff/parseGff.pl .
# parse gff script...
#!/usr/bin/perl -w
if(scalar(@ARGV) == 0) {
    print STDERR "parseGff.pl - Parse out affymetrixes gff annotation 
probesets for human all exon design.
usage:
   parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
    exit(1);
}

# Return the second single-space-separated token of the given string
# (undef when there is no second token).
sub splitField($) {
    my ($text) = @_;
    return (split / /, $text)[1];
}

# Process each .gff design file named on the command line.  Every
# probeset record yields one line in ../beds/<prefix>.pset.bed and one
# row in ../annot/<prefix>.tab.
while($file = shift(@ARGV)) {
    if(!($file =~ /(.+)\.gff/)) {
	die "$file doesn't have .gff suffix\n";
    }
    # $1 is everything before the last ".gff" in the file name.
    $prefix = $1;
    print STDERR "Doing file $file.\n";
    open(IN, $file) or die "Can't open $file to read.";
    open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
    open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
    while($line = <IN>) {
	# Only want the probeset records.
	if($line =~ /\tprobeset\t/) {
	    $score = 0;
	    $cds = 0;
	    $bounded = 0;
	    chomp($line);
	    # pop off any microsoft line endings.
	    $line =~ s/\r$//;
	    @words = split /\t/, $line;
	    # This makes the evidence be comma separated.
	    $words[8] =~ s/\" \"/,/g;
	    # This gets rid of pesky quotes.
	    $words[8] =~ s/\"//g;

	    # Set the score based on the annotation type:
	    # full = 200, extended = 500, core = 900 (tested in that
	    # order, first match wins).
	    if($words[8] =~ /full/) {
		$score = 200;
	    }
	    elsif($words[8] =~ /extended/) {
		$score = 500;
	    }
	    elsif($words[8] =~ /core/) {
		$score = 900;
	    }
	    # "bounded" lowers the score by 200 and "cds" raises it by
	    # 100; a result of zero or less is replaced by 100.
	    if($words[8] =~ /bounded/) {
		$score -= 200;
	    }
	    if($words[8] =~ /cds/) {
		$score += 100;
	    }
	    if($score <= 0) {
		$score = 100;
	    }
		
	    # Print out the annotation fields.
	    # Column 9 is split on "; "; splitField keeps the second
	    # space-separated token of each piece (presumably the value
	    # of a "key value" pair -- confirm against the gff files).
	    @fields = split /; /,$words[8];
	    # The value of the second attribute becomes the BED name.
	    $id = splitField($fields[1]);
	    $f = shift(@fields);
	    $f = splitField($f);
	    print ANNOT "$f";
	    # Remaining attributes are appended tab-separated, except
	    # those starting with "bounded" or "cds", which only set the
	    # two 0/1 flag columns written at the end of the row.
	    # NOTE: the loop condition tests the truth of $f, so an empty
	    # or "0" attribute would end the loop early.
	    while($f = shift(@fields)) {
		if($f =~ /^bounded/) {
		    $bounded = 1;
		}
		if($f =~ /^cds/) {
		    $cds = 1;
		}
		if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
		    $f = splitField($f);
		    print ANNOT "\t$f";
		}
	    }
	    print ANNOT "\t$bounded\t$cds";
	    print ANNOT "\n";
	    # BED line: gff columns 1, 4, 5 and 7 plus the id and score.
	    # NOTE(review): the gff start (column 4) is copied unchanged;
	    # if it is 1-based the BED start is off by one -- compare the
	    # later "AFFY HUEX1 OFF-BY-ONE FIX" section of this file.
	    print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
	}
    }
    close(IN);
    close(BED);
    close(ANNOT);
}

./parseGff.pl *.gff
# parseGff.pl is run from gff/ and writes to ../beds and ../annot, so go
# back up to the hg16 directory before collecting its output.
cd ..
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
cp ../hg17/affyHuEx1Annot.sql ./
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
  numIndependentProbes smallint not null,
  probesetId int(11) not null,
  exonClustId int(11) not null,
  numNonOverlapProbes smallint not null,
  probeCount smallint not null,
  transcriptClustId int(11) not null,
  probesetType smallint not null,
  numXHybeProbe smallint not null,
  psrId int(11) not null,
  level varchar(10) not null,
  evidence varchar(255) not null,
  bounded smallint not null,
  cds smallint not null,
  PRIMARY KEY (probesetId)
);
hg16S -A < affyHuEx1Annot.sql 
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg16S -A

# end AFFYHUEX1 track

##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
#  Submitted by Greg Crawford via web site,
#  http://research.nhgri.nih.gov/DNaseHS/May2005/
#  In addition, a file containing the 'randoms' was FTP'ed by Greg
#     NOTE: bad chr8_random entry removed, as per G. Crawford

    # Same display as ENCODE track by Angie...
    # Jim asked to add scores for grayscale-coloring:
    #   clusters of 2 drawn in 50%,  clusters of 3 drawn in 75%,
    #   and clusters of 4 or more drawn in 100% black.

    mkdir /cluster/data/hg16/bed/nhgri/lab
    cd /cluster/data/hg16/bed/nhgri/lab
    foreach c (`cut -f 1 /cluster/data/hg16/chrom.sizes`)
        echo $c
        wget -nd http://research.nhgri.nih.gov/DNaseHS/May2005/clusters/$c.LynxClusters.bed 
    end
    cd ..

    # special handling for ID's on chrM (they are preceded by 'M_')
    ls lab/chr*.bed lab/randoms.txt \
        | grep -v chrM | xargs cat | grep '^chr'  \
        | perl -wpe 'if (/500bp_(\d+)_(\d+)/) { \
                   $id = $1 . "_" . $2; \
                   $score = ($2 >= 4) ? 1000 : $2 * 250; \
                   s/500bp.+/$id\t$score/; } else { die "parse"; }' > hs.bed

    cat lab/chrM*.bed | grep '^chr'  \
        | perl -wpe 'if (/500bp_(M_\d+)_(\d+)/) { \
                   $id = $1 . "_" . $2; \
                   $score = ($2 >= 4) ? 1000 : $2 * 250; \
                   s/500bp.*/$id\t$score/; } else { die "parse"; }' >> hs.bed
    hgLoadBed hg16 nhgriDnaseHs hs.bed
        # Loaded 14224 elements of size 5
    checkTableCoords hg16 nhgriDnaseHs

# MYTOUCH FIX - jen - 2006-01-24
  sudo mytouch hg16 superfamily 0407141100.00
  sudo mytouch hg16 acemblyPep 0406151200.00
  sudo mytouch hg16 twinscanPep 0407141200.00
  sudo mytouch hg16 superfamily 0407141100.00
  sudo mytouch hg16 ensPep 0407141100.00
  sudo mytouch hg16 knownToEnsembl 0407141100.00
  sudo mytouch hg16 sfDescription 0407141100.00
  sudo mytouch hg16 ensEstGtp 0409081800.00
  sudo mytouch hg16 ensEstPep 0409081800.00


##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
   ssh hgwdev
   cd /cluster/data/hg16/bed/affyHumanExon
   liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed \
      /gbdb/hg17/liftOver/hg17ToHg16.over.chain.gz affyHuEx1.fixed.bed affyHuEx1.unmapped
# List the longest lifted items (name, length), longest first:
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.fixed.bed | sort -k2,2nr | head
#2325773 204918
#2402134 204802
#3645108 60419
#2366900 52086
#3016074 9552
#3641787 8061
#2321649 8054
   # So there's 4 of them with problems this time:
     egrep -v "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed > alreadyok.bed
     # Pull the original hg17 records for those 4.  Only the hg17 file is
     # searched: with more than one input file egrep would prefix every
     # line with the file name and mix in the badly lifted hg16 records.
     egrep "\b(2325773|2402134|3645108|2366900)\b" \
      /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed > good.hg17.bed
     bedToFa /cluster/data/hg17/hg17.2bit good.hg17.bed good.hg17.fa
     gfClient blat6 17785 /cluster/data/hg16/nib good.hg17.fa bad.hg16.psl
     # skip the psl header lines ("tail +6" is the obsolete spelling)
     tail -n +6 bad.hg16.psl | awk '$11==$13{print}' > good.hg16.psl
     pslToBed good.hg16.psl good.hg16.bed
     # Scores were lost in the transformations.  Put em back in.
     egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed
#chr1    24924744        25129662        2325773 500     +
#chr1    24924872        25129674        2402134 900     -
#chr1    168139941       168192027       2366900 1000    +
#chr16   2600606 2661025 3645108 200     +
     # One rule per problem probeset, using the scores listed just above
     # (2366900 was previously missing, so it took whatever score the
     # preceding row had set).
     awk 'BEGIN{OFS="\t"} 
         $4=="2325773"{score="500";} 
         $4=="2402134"{score="900";} 
         $4=="2366900"{score="1000";}
         $4=="3645108"{score="200";}
     {print $1,$2,$3,$4,score,$6}' good.hg16.bed > good.bed
     cat alreadyok.bed good.bed > affyHuEx1.fixed.bed
     # sort, and actually keep the sorted result
     bedSort affyHuEx1.fixed.bed tmp.bed
     mv tmp.bed affyHuEx1.fixed.bed
     rm good.* bad.* alreadyok.bed
   hgLoadBed hg16 affyHuEx1 affyHuEx1.fixed.bed
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg16

################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)

echo "insert into grp (name, label, priority) values ('expression', 'Expression', 4.5)" | hgsql hg16
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg16

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)

echo hg16 panTro1 mm3 rn3 galGal2> /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst
# update genbank.conf with these two lines:
#   hg16.upstreamGeneTbl = refGene
#   hg16.upstreamMaf = mzPt1Mm3Rn3Gg2_pHMM /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst


#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (REGENERATED BUT NOT RELOADED 4/2/10 angie)
# Originally done 11/4/08 (with file from not quite the right UCSC Genes build directory,
# resulting in some missing genes in kgTargetAli)
    ssh hgwdev
    mkdir /cluster/data/hg16/bed/mrnaPcr
    cd /cluster/data/hg16/bed/mrnaPcr
    # First, get consistent FA and PSL for UCSC Genes.
    hgsql hg16 -NBe 'select * from knownGene' | cut -f 1-10 > knownGene.gp
    genePredToBed knownGene.gp \
      > ucscGenes.bed
    hgsql hg16 -NBe 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
      > idSub.txt
    subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
    sequenceForBed -keepName -db=hg16 -bedIn=ucscGenesIdSubbed.bed \
      -fastaOut=stdout \
    | faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
    cut -f 1-10 knownGene.gp \
    | genePredToFakePsl hg16 stdin kgTargetAli.psl /dev/null

# NOT DONE -- not worth QA effort:
    # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
    hgLoadPsl hg16 -table=kgTargetAliMar10 kgTargetAli.psl
    mkdir /gbdb/hg16/targetDb
    ln -s /cluster/data/hg16/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg16/targetDb/kgTargetSeqMar10.2bit

    # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
    # /gbdb/hg16/targetDb/kgTargetSeqMar10.2bit .

    ssh hgwdev
    # Add records to hgcentraltest blatServers and targetDb:
    hgsql hgcentraltest -e \
      'INSERT into blatServers values ("hg16KgMar10", "blat13", 17795, 0, 1);'
    hgsql hgcentraltest -e \
      'INSERT into targetDb values("hg16KgMar10", "UCSC Genes", \
         "hg16", "kgTargetAliMar10", "", "", \
         "/gbdb/hg16/targetDb/kgTargetSeqMar10.2bit", 1, now(), "");'


#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-04-24 - Hiram )
    mkdir /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
    cd /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -buildDir=`pwd` -debug hg16 hg19
    # Real run:
    time nice -n +19 \
	$HOME/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
	-buildDir=`pwd` -verbose=2 \
	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
	 hg16 hg19 > do.log 2>&1 &
    #	real    93m11.093s

#############################################################################
# CPG Islands Unmasked track (DONE - 2014-04-24 - Hiram)
    mkdir /hive/data/genomes/hg16/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/hg16/bed/cpgIslandsUnmasked
    time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
     -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \
        -workhorse=hgwdev \
          -maskedSeq=/hive/data/genomes/hg16/hg16.unmasked.2bit \
            hg16 > do.log 2>&1
    # real    1m30.968s

    cat fb.hg16.cpgIslandExtUnmasked.txt
    # 61507517 bases of 2192103426 (2.806%) in intersection

##############################################################################
# CPG Islands (DONE - 2014-04-24 - Hiram)
    mkdir /hive/data/genomes/hg16/bed/cpgIslandsExt
    cd /hive/data/genomes/hg16/bed/cpgIslandsExt
    time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \
     -tableName=cpgIslandExt -dbHost=hgwdev -smallClusterHub=ku \
        -workhorse=hgwdev hg16 > do.log 2>&1
    # real     2m3.684s

    cat fb.hg16.cpgIslandExt.txt
    # 21191456 bases of 2865248791 (0.740%) in intersection

##############################################################################
# LIFTOVER TO HG38 (DONE 10/10/14 angie)
    mkdir /hive/data/genomes/hg16/bed/blat.hg38.2014-10-08
    cd /hive/data/genomes/hg16/bed/blat.hg38.2014-10-08
    # -debug run to create run dir and scripts to verify this command
    # is OK
    doSameSpeciesLiftOver.pl -debug -buildDir=`pwd` \
          -ooc=/hive/data/genomes/hg16/11.ooc hg16 hg38
    # then running for real
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` \
            -ooc=/hive/data/genomes/hg16/11.ooc hg16 hg38 >&! do.log & tail -f do.log
#0.240u 0.230s 37:09.66 0.0%     0+0k 0+0io 0pf+0w

    # verify the convert link on the browser is now active from hg16 to hg38

##############################################################################
