# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on 
# NCBI build 35 (May 2004 freeze)

# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES 
# ---------------------------------------

    # Make gs.18 directory, and its build35, agp, and ffa subdirectories.
    ssh eieio
    mkdir /cluster/store5/gs.18
    mkdir /cluster/store5/gs.18/build35
    mkdir /cluster/store5/gs.18/agp
    mkdir /cluster/store5/gs.18/ffa

    #    Make a symbolic link from /cluster/store1 to this location
    #	(I assume there is some use for this later ?)
	
    cd /cluster/store1
    ln -s /cluster/store5/gs.18 ./gs.18
    ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17

    #    Make a symbolic link from your home directory to the build dir:
    #	(Investigate what this is used for, may no longer be necessary)

    ln -s /cluster/store5/gs.18/build35 ~/oo

# NCBI download site, fetch everything into this one directory:

    #	with the machine and password in your $HOME/.netrc file, this
    #	wget command will require no login.  Your $HOME/.netrc file
    #	is set to 'chmod 600 .netrc' to prevent anyone from finding
    #	the data.  (There were some early files that later moved
    #		into an OLD subdirectory.  They were broken.)
    mkdir /cluster/store5/gs.18/ncbi
    cd /cluster/store5/gs.18/ncbi
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/build_35/*

    # FYI: agp file format documented at:
    #	http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html

    # fixup a couple of names for our own purposes here
    cd /cluster/store5/gs.18/agp
    ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz .
    sed -e "s#MT/NC_001807.4#NC_001807#" ../ncbi/chrMT.agp > chrM.agp
    sed -e "s/NG_002392.2/NG_002392/" ../ncbi/DR52.agp > chr6_hla_hap1.agp
    sed -e "s/NG_002433.1/NG_002433/" ../ncbi/DR53.agp > chr6_hla_hap2.agp
    zcat ../ncbi/DR52.fa.gz | \
	sed -e "s/gi|29124352|ref|NG_002392.2/ref|NG_002392/" | \
	gzip > chr6_hla_hap1.fa.gz
    zcat ../ncbi/DR53.fa.gz | \
	sed -e "s/gi|28212470|ref|NG_002433.1/ref|NG_002433/" | \
	gzip > chr6_hla_hap2.fa.gz
    zcat ../ncbi/chrMT.fa.gz | \
	sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \
	gzip > chrM.fa.gz
    

    #  Put all the agp files together into one.
    cd /cluster/store5/gs.18/build35
    #	The chrM sequence now has its own agp, remove it from
    #	ref_placed.agp
    sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp
    cat ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
	../agp/chr6_hla_hap1.agp ../agp/chr6_hla_hap2.agp \
	../ncbi/PAR.agp > ncbi_build35.agp

    #	and into ffa
    cd /cluster/store5/gs.18/ffa
    #	There is a single bogus line at the end of ref_placed.fa.gz
    #	declaring the NC_001807 MT sequence, this was later replaced by
    #	chrMT.fa.gz, so remove that one line:
    zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \
	gzip > ref_placed.fa.gz
    #	(That's a 40 minute job)

    #	sequence.inf is usually here, symlink it
    ln -s ../ncbi/sequence.inf
    #	put all the fa.gz files together in one big fa.gz
    time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \
	../agp/chr6_hla_hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \
	> ncbi_build35.fa.gz
    #	real    37m42.208s
    #	user    37m3.490s
    #	sys     0m31.430s

    #	Make a listing of all the fasta record headers, just FYI.
    #	ncbi_build35.fa.gz was written into this ffa directory by the
    #	zcat | gzip step above, so it is read here by its bare name (a
    #	leading ffa/ would not resolve from inside ffa).  The listing is
    #	left next to the fa.gz file.
    cd /cluster/store5/gs.18/ffa
    zcat ncbi_build35.fa.gz | grep "^>" > ncbi.fa.headers


    #	New to this build is the sequence: NC_001807 which is the
    #	mitochondria sequence.  This prefix NC_ is new to the process
    #	and will have to be accounted for below.  The other two special
    #	prefixes are similar to what was seen before:
    #	from DR52.agp NG_002392
    #	Homo sapiens major histocompatibility complex, class II,
    #		DR52 haplotype (DR52) on chromosome 6
    #	and from DR53.agp NG_002433
    #	Homo sapiens major histocompatibility complex, class II,
    #		DR53 haplotype (DR53) on chromosome 6

    #	Fixup seq_contig.md
    #
    #	It has a bunch of stuff belonging to the Celera
    #	genome assembly.  Filter those out.  I don't know what the
    #	NT_07959[0-7] items are, but there are no definitions for them
    #	in the agp files and no sequence in any fa.gz file.
    #	Fixup the names for the NG_ items, and change chrom MT to be M
    cd /cluster/store5/gs.18/build35
    egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md | \
	sed -e "s/6|NG_002392/6_hla_hap1/" \
	-e "s/6|NG_002433/6_hla_hap2/" \
	-e "s/^9606\tMT|NC_001807/9606\tM/" \
	> temp_contig.md

    #	get the randoms sorted in proper order.  The createNcbiLifts
    #	does not work correctly if the randoms are not grouped together
    #	by chromosome
    grep -v "|" temp_contig.md  > seq_contig.md
    #	This pulls out all the randoms and groups them within the
    #	same chrom but leaving them in the same order as they originally
    #	were  (warning this is BASH code ...)
    grep "|" temp_contig.md | awk -F"|" '{print $1}' | \
        awk '{print $2}' | sort -n -u | while read CHR
do
        grep "[^0-9]${CHR}|" temp_contig.md
done >> seq_contig.md


    # Sanity check, checkYbr was updated to handle the NC_ identifier
    time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/checkYbr \
	ncbi_build35.agp stdin seq_contig.md > check.seq_contig
    #	real    2m34.143s
    #	user    2m24.970s
    #	sys     0m8.900s
    #	result should be clean:
    cat check.seq_contig
    #	Read 380 contigs from ncbi_build35.agp
    #	Verifying sequence sizes in stdin
    #	0 problems detected


    # Convert fa files into UCSC style fa files and place in "contigs"
    # directory inside the gs.18/build35 directory 
    #	(a check that can be done here is make a list of the contigs
    #	in this ./contigs directory before and compare it with the
    #	list of distributed contigs created after they have been
    #	disbursed.)
    #	faNcbiToUcsc was fixed to handle the NC_ identifier

    cd /cluster/store5/gs.18/build35
    #	We've been through this often
    mv contigs contigs.0
    time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/faNcbiToUcsc \
	-split -ntLast stdin contigs
    #	real    5m10.938s
    #	user    2m20.070s
    #	sys     0m51.020s
    #	If you want to compare anything to previous work, check now, then:
    rm -fr contigs.0

    # Determine the chromosome sizes from agps
    #	Watch carefully how chrY gets constructed.  I'm not sure
    #	this chrom_sizes represents the whole length of chrY with
    #	the PAR added.  We will see about that.
    #	Script updated to handle new chrom names:
    #	my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2');

    cd /cluster/store5/gs.18/build35
    /cluster/bin/scripts/getChromSizes ../agp
    #	Create chrom.lst list for use in foreach() loops
    awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst

    # Create lift files (this will create chromosome directory structure) and
    #	inserts file
  
    /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .

    # Create contig agp files (will create contig directory structure)
	
    /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build35.agp .

    # Create chromosome random agp files.

    /cluster/bin/scripts/createNcbiChrAgp -randomonly .

    # Copy the original chrN.agp files from the gs.18/agp directory 
    #    into each of the chromosome directories since they contain better 
    #    gap information. Delete the comments at top from these.
    cd /cluster/store5/gs.18/build35
    foreach c ( `cat chrom.lst` )
	sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp
    end
    #	chrM needs a name fixup
    sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp

    # Distribute contig .fa to appropriate directory (assumes all files
    # are in "contigs" directory).

    # Create inserts file from agp and lift files (new - added by Terry, 2004-07-12)
    /cluster/bin/scripts/createInserts /cluster/data/hg17 > /cluster/data/hg17/inserts

    # create global data link for everyone.  No more home directory
    # links required.
    #	The same link is already made in the directory setup at the top
    #	of this document.  Repeating "ln -s" while /cluster/data/hg17
    #	exists would follow the link and leave a stray build35 link
    #	inside the build directory, so only create it when it is missing.
    #	(test ... || ... works the same in csh and in sh/bash.)
    test -e /cluster/data/hg17 || ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17
    cd /cluster/data/hg17
    #	Move each contig .fa out of ./contigs into its place in the
    #	chrom/contig directory structure.
    /cluster/bin/scripts/distNcbiCtgFa contigs .
    #	Verify that everything was moved properly, the contigs directory
    #	should be empty:
    ls contigs
    #	Nothing there, then remove it
    rmdir  contigs

    #	Make a list of the contigs for use later.  Each line is the
    #	relative path <chrom>/<contig>/<contig>.fa, where the contig
    #	directories are the NC_, NG_ and NT_ names (six characters after
    #	the underscore) found under each chrom listed in chrom.lst.
    #	rm -f is used so that a first run, when contig.lst does not exist
    #	yet, does not report an error (same as done for RMRun/RMJobs).
    rm -f contig.lst
    touch contig.lst
    foreach chrom ( `cat chrom.lst` )
	foreach c ( $chrom/N{C,G,T}_?????? )
	    #	$c:t is the last path component, i.e. the contig name
	    set contig = $c:t
	    echo "${chrom}/${contig}/${contig}.fa" >> contig.lst
	end
    end
    #   For later comparisons, this is how many contigs we have:
    wc -l contig.lst
    #	380

    #	Note 2004-06-30 - there are some clone numbers left in some of
    #	the NCBI files that are incorrect.  Due to version number
    #	changes, more than one version is listed.  Namely for accession
    #	numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654
    #	The AGP files are correct, the sequence.inf file lists these
    #	twice: AC004491.1 AC004491.2
    #	AC004921.1 AC004921.2 AC004983.2 AC004983.3
    #	AC005088.2 AC005088.3 AC006014.2 AC006014.3
    #	AC099654.4 AC099654.5 

    # FILES ARE NOW READY FOR REPEAT MASKING - start that process as
    #	other steps here can proceed in parallel.

    #	Previous practice used to copy everything over for jkStuff from a
    #	previous build.  Rather than do that, pick up whatever is needed
    #	at the time it is needed and verify that it is going to do what
    #	you expect.

    cd /cluster/data/hg17
    mkdir jkStuff

    # Create the contig.gl files - XXX - NCBI doesn't deliver
    # contig_overlaps.agp - 2004-06-18 - this is beginning to come
    # together and there is now a contig_overlaps.agp file

    #	This is properly done below with a combination of psLayout
    #	alignments to create the contig_overlaps.agp file
    # /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
    # Create chromosome gl files
    # jkStuff/liftGl.csh contig.gl

# CREATING DATABASE  (DONE - 2004-05-20 - Hiram)
    #	RE-DONE for new NIBS - 2004-06-03
    ssh hgwdev
    # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
#	Filesystem            Size  Used Avail Use% Mounted on
#	/dev/sdc1             1.8T  303G  1.4T  19% /var/lib/mysql

    # Create the database.
    hgsql -e 'create database hg17' mysql
    # Copy over grp table (for track grouping) from another database:
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" hg17

    # ENCODE groups
    # Added 2005-08-16 kate
    echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg17
    echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg17
    echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg17
    

# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS
#	(DONE - 2004-05-21 - Hiram)
    #	RE-DONE with new NIBS - 2004-06-03
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so that the chromInfo table will exist and thus the
    #	trackDb tables can be built in the next step.
    #	These unmasked nibs will be replaced by the masked nibs after
    #	repeat mask and trf are done.
    ssh eieio
    cd /cluster/data/hg17
    # Make chr*.fa from contig .fa
    #  Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
    time ./jkStuff/chrFa.csh
    #	real    13m24.710s
    #	user    9m0.360s
    #	sys     1m15.820s

    mkdir nib
    foreach c (`cat chrom.lst`)
      foreach f ($c/chr${c}{,_random}.fa)
        if (-e $f) then
          echo "nibbing $f"
          /cluster/bin/i386/faToNib $f nib/$f:t:r.nib
        endif
      end
    end

    # Make symbolic links from /gbdb/hg17/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/hg17/nib
    ln -s /cluster/data/hg17/nib/chr*.nib /gbdb/hg17/nib
    # Load /gbdb/hg17/nib paths into database and save size info.
    cd /cluster/data/hg17
    hgsql hg17  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
    hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
	> chrom.sizes
    # You can compare this chrom.sizes with the previously created
    # chrom_sizes.  Should be no difference
    sort chrom_sizes > s0
    sort chrom.sizes | grep -v random > s1
    diff s0 s1
    rm s0 s1

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2004-05-21 - Hiram)
    #	dbDb orderKey updated 2004-06-08 - Hiram
    ssh hgwdev
    #	reset dbDb orderKey - these have never been ordered properly
    #	before, this will get them on the program.
    hgsql -e 'update dbDb set orderKey=11 where name = "hg16";' \
	-h genome-testdb hgcentraltest
    hgsql -e 'update dbDb set orderKey=12 where name = "hg15";' \
	-h genome-testdb hgcentraltest
    hgsql -e 'update dbDb set orderKey=13 where name = "hg13";' \
	-h genome-testdb hgcentraltest

    # Enter hg17 into hgcentraltest.dbDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
	defaultPos, active, orderKey, genome, scientificName, \
	htmlPath, hgNearOk, hgPbOk, sourceName) \
	VALUES("hg17", "May 2004", "/gbdb/hg17/nib", "Human", \
	"chr4:56214201-56291736", 1, 10, "Human", "Homo sapiens", \
	"/gbdb/hg17/html/description.html", 0, 0, "NCBI Build 35");' \
	-h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P .
    # Edit the makefile to add hg17 in all the right places and do
    make update
    make alpha
    cvs commit makefile

# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2004-05-21 - Hiram)
    #	(Re-DONE with new randoms - 2004-06-03 - Hiram)
    cd /cluster/data/hg17
    mkdir -p jkStuff
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
    # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
    # Note: this ncbi.lift will not lift floating contigs to chr_random coords,
    # but it will show the strand orientation of the floating contigs 
    # (grep for '|').
    #   mdToNcbiLift seq_contig.md jkStuff/ncbi.lft 
    #	XXXX - appears to be unused, not done - Hiram

# REPEAT MASKING (DONE - 2004-05-24 - Hiram)
    #	(The randoms were rearranged after this was first done,
    #	they are re-made below 2004-06-02)
    #	Record the RM version here:
    #	RepBase Update 8.12, RM database version 20040130
    #	as this changes over time and there is no record in the results

    # Split contigs, run RepeatMasker, lift results

    #	This split takes about 8 minutes
    ssh eieio
    cd /cluster/data/hg17
    foreach chrom ( `cat chrom.lst` )
	foreach c ( $chrom/N{C,G,T}_?????? )
	    set contig = $c:t
	    echo "splitting ${chrom}/${contig}/${contig}.fa"
	    faSplit size ${chrom}/${contig}/$contig.fa 500000 \
		${chrom}/${contig}/${contig}_ \
		-lift=${chrom}/${contig}/$contig.lft -maxN=500000
	end
    end

    #- Make the run directory and job list:
    cd /cluster/data/hg17
    mkdir -p jkStuff
    #  According to RepeatMasker help file, no arguments are required to
    #	specify species because its default is set for primate (human)
    #  This run script saves the .tbl file to be sent to Arian.  He uses
    # those for his analysis.  Sometimes he needs the .cat and .align files for
    # checking problems.  Krish needs the .align files, they are large.

    #	jkStuff/RMHuman - job script, run once per split contig piece:
    #	    $1 - directory holding the piece (the script cd's there)
    #	    $2 - file name of the piece within that directory
    #	The piece is copied to /tmp/hg17/$2/ and RepeatMasker is run on it
    #	there with -ali -s.  The resulting .out file is copied back into
    #	$1, along with the .align and .tbl files when they exist (the
    #	.cat copy is commented out).  The /tmp work area is then removed.
    #	The quoted '_EOF_' delimiter keeps $1 and $2 from being expanded
    #	at the time the script file is written.
    cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/hg17/$2
/bin/cp $2 /tmp/hg17/$2/
cd /tmp/hg17/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg17/$2/$2.out ./
 if (-e /tmp/hg17/$2/$2.align) /bin/cp /tmp/hg17/$2/$2.align ./
if (-e /tmp/hg17/$2/$2.tbl) /bin/cp /tmp/hg17/$2/$2.tbl ./
# if (-e /tmp/hg17/$2/$2.cat) /bin/cp /tmp/hg17/$2/$2.cat ./
/bin/rm -fr /tmp/hg17/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMHuman

    ssh eieio
    cd /cluster/data/hg17
    mkdir RMRun
    rm -f RMRun/RMJobs
    touch RMRun/RMJobs
    foreach d ( `cat chrom.lst` )
     foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa )
        set f = $c:t
        set cc = $c:h
        set contig = $cc:t
        echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
   		/cluster/store5/gs.18/build35/${d}/${contig} $f \
   '{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
          >> RMRun/RMJobs
      end
    end

    # We have 5970 jobs in RMJobs:
    wc RMRun/RMJobs
    #	5970   41790 1105804 RMRun/RMJobs

    #- Do the run
    ssh kk
    cd /cluster/data/hg17/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...

    #- While that is running, you can run TRF (simpleRepeat) on the small
    # cluster.  See SIMPLE REPEAT section below
# Completed: 5970 of 5970 jobs
# CPU time in finished jobs:   45189516s  753158.60m 12552.64h  523.03d  1.433 y
# IO & Wait Time:                141333s    2355.55m    39.26h    1.64d  0.004 y
# Average job time:                7593s     126.55m     2.11h    0.09d
# Longest job:                    10268s     171.13m     2.85h    0.12d
# Submission to last job:         81484s    1358.07m    22.63h    0.94d

    #	Lift up the split-contig .out's to contig-level .out's
    #
    #	If a mistake is made in the following it would be possible to
    #	destroy all the RM output.  So, just to be paranoid, save all
    #	the RM output in bluearc for the time being:
    ssh eieio

    cd /cluster/data/hg17
    mkdir /cluster/bluearc/hg17/RMOutput
    foreach c ( `cat chrom.lst` )
     foreach d ( ${c}/N{C,G,T}_* )
	set T = /cluster/bluearc/hg17/RMOutput/${d}
	mkdir -p ${T}
        cd ${d}
        set contig = $d:t
        cp -p ${contig}_?{,?,??}.fa.out ${T}
        cd ../..
	echo "${d} done"
     end
    end
    #	Make sure we got them all:
    #	(This doesn't work later since there are more *.fa.out files
    #	after the lifting.  To find just these more explicitly:
    #		find . -name "N?_*_*.fa.out" -print | wc -l )
    find . -name "*.fa.out" -print | wc -l
    #	5970
    find /cluster/bluearc/hg17/RMOutput -type f | wc -l
    #	5970
    #	same count

    #	OK, now you can try this operation, do it in a script like this
    #	and save the output of the script for a record of what happened.

    #	jkStuff/liftRM.csh - meant to be run from the build directory: it
    #	reads chrom.lst and uses paths relative to it.  For every contig
    #	directory (NC_*, NG_*, NT_*) under each chrom it runs liftUp with
    #	that contig's .lft file (written by the faSplit step) over the
    #	split-piece .fa.out files, producing the contig-level
    #	$contig.fa.out.  The glob ${contig}_?{,?,??}.fa.out matches piece
    #	suffixes of one, two or three characters.
    cat << '_EOF_' > jkStuff/liftRM.csh
#!/bin/csh -fe
foreach c ( `cat chrom.lst` )
 foreach d ( ${c}/N{C,G,T}_* )
    cd $d
    set contig = $d:t
    liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out 
    cd ../..
 end
end
'_EOF_'
    chmod +x jkStuff/liftRM.csh
    mkdir scriptsOutput
    time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1
    #	real    4m37.572s
    #	user    1m19.130s
    #	sys     0m32.950s
    #	Check that they all were done:
    grep "fa.out" scriptsOutput/liftRM.1 | wc -l
    #	5959
    #	same count as above

    #- Lift up RepeatMasker .out files to chromosome coordinates.
    # Picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
    # liftOut2.csh, and changed the line that does the chrom listing

    time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1
    #	real    9m46.780s
    #	user    1m18.900s
    #	sys     7m33.990s

    #- By this point, the database should have been created (above):
    ssh hgwdev
    cd /cluster/data/hg17
    time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
	scriptsOutput/hgLoadOut 2>&1
    #	real    5m59.137s
    #	user    1m47.550s
    #	sys     0m15.410s

    # errors during this load:  (there are always a couple of these)
    #	Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
    #	Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
    #	Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
    #	Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
    #	Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
    #	Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
    #	Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
    #	Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
    #	Strange perc. field -18.6 line 77034 of 19/chr19.fa.out

    #	Verify we have similar results to previous assembly:
    #	featureBits hg17 rmsk
    #	1391378842 bases of 2867328468 (48.525%) in intersection
    #	featureBits hg16 rmsk
    #	1388770568 bases of 2865248791 (48.469%) in intersection
    #	Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
    #	following the SIMPLE REPEAT sections below

# Re-Running REPEAT_MASKER on the new Randoms (DONE - 2004-06-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17
    grep "|" seq_contig.md | awk '{print $2}' | sed -e "s#|#/#" > randoms.lst

    #	Build the RepeatMasker job list for the random contigs only.
    #	Each entry of randoms.lst is <chrom>/<contig>; $r:h is the chrom
    #	part and $r:t the contig name.  One RMHuman job is written per
    #	split piece (N?_*_*.fa) found in that contig directory.
    mkdir /cluster/data/hg17/RMRandoms
    #	Jobs are appended with >> below, so start from an empty list;
    #	otherwise re-running this loop would duplicate every job (same
    #	reset as done for RMRun/RMJobs).
    rm -f RMRandoms/RMJobs
    touch RMRandoms/RMJobs
    foreach r ( `cat randoms.lst` )
	set d = $r:h
	set contig = $r:t
	foreach c ( ${r}/N{C,G,T}_*_*.fa )
	    set f = $c:t
	    echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
   		/cluster/store5/gs.18/build35/${d}/${contig} $f \
   '{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
          >> RMRandoms/RMJobs
	end
    end

    ssh kk
    cd /cluster/data/hg17/RMRandoms
    para create RMJobs
    para try, para check, para check, para push, para check,...
# Completed: 94 of 94 jobs
# CPU time in finished jobs:     221454s    3690.91m    61.52h    2.56d  0.007 y
# IO & Wait Time:                   866s      14.43m     0.24h    0.01d  0.000 y
# Average job time:                2365s      39.42m     0.66h    0.03d
# Longest job:                     9062s     151.03m     2.52h    0.10d
# Submission to last job:          9106s     151.77m     2.53h    0.11d

    #	Continuing with the paranoia theme, let's backup all the RM output
    #
    ssh eieio

    cd /cluster/data/hg17
    mkdir /cluster/bluearc/hg17/RMRandoms
    foreach c ( `cat chrom.lst` )
     foreach d ( ${c}/N{C,G,T}_* )
	set T = /cluster/bluearc/hg17/RMRandoms/${d}
	mkdir -p ${T}
        cd ${d}
        set contig = $d:t
        cp -p ${contig}_?{,?,??}.fa.out ${T}
        cd ../..
	echo "${d} done"
     end
    end
    #	Make sure we got them all:
    find . -name "N?_*_*.fa.out" -print | wc -l
    #	5959
    find /cluster/bluearc/hg17/RMRandoms -type f | wc -l
    #	5959
    #	same count


    time jkStuff/liftRM.csh > scriptsOutput/liftRM2.1 2>&1
    #	real    4m46.302s
    #	user    1m18.260s
    #	sys     0m18.000s
    #	Check that they all were done:
    grep "fa.out" scriptsOutput/liftRM2.1 | wc -l
    #	5959
    #	same count as above

    #- Lift up RepeatMasker .out files to chromosome coordinates.
    # Picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
    # liftOut2.csh, and changed the line that does the chrom listing

    time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2.1 2>&1
    #	real    2m46.347s
    #	user    1m18.650s
    #	sys     0m15.990s

    #- By this point, the database should have been created (above):
    ssh hgwdev
    cd /cluster/data/hg17
    time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
	scriptsOutput/hgLoadOut 2>&1
    #	real    5m59.137s
    #	user    1m47.550s
    #	sys     0m15.410s

    # errors during this load:  (there are always a couple of these)
    #	Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
    #	Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
    #	Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
    #	Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
    #	Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
    #	Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
    #	Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
    #	Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
    #	Strange perc. field -18.6 line 77034 of 19/chr19.fa.out

    #	Verify we have similar results to previous assembly:
    #	featureBits hg17 rmsk
    #	1390952984 bases of 2866216770 (48.529%) in intersection
    #	featureBits hg17 rmsk  #with previous randoms:
    #	1391378842 bases of 2867328468 (48.525%) in intersection
    #	featureBits hg16 rmsk
    #	1388770568 bases of 2865248791 (48.469%) in intersection
    #	Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
    #	following the SIMPLE REPEAT sections below

# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-05-21 - Hiram)
    #	Re-done with new randoms, 2004-06-02 - Hiram
    #	Copy the contigs, first to the bluearc, then to /iscratch/i
    ssh eieio
    mkdir /cluster/bluearc/hg17
    mkdir /cluster/bluearc/hg17/contigs

    cd /cluster/data/hg17
    foreach ctg ( `cat contig.lst` )
	set c = $ctg:t
 	echo "$ctg > /cluster/bluearc/hg17/contigs/$c"
	cp -p $ctg /cluster/bluearc/hg17/contigs/$c
    end
    #	Check how much is there:
    #	du -hsc /cluster/bluearc/hg17/contigs
    #	2.8G    /cluster/bluearc/hg17/contigs

    # Distribute contigs to /iscratch/i
    ssh kkr1u00
    mkdir /iscratch/i/gs.18/build35/unmaskedContigs
    cd /iscratch/i/gs.18/build35/unmaskedContigs
    cp -p /cluster/bluearc/hg17/contigs/* .

    # Verify same amount made it there:
    #	du -hsc /iscratch/i/gs.18/build35/unmaskedContigs
    #	2.8G    /iscratch/i/gs.18/build35/unmaskedContigs
    #	Then send them to the other 7 Iservers
    /cluster/bin/iSync

    #	Go to the small cluster for this business:
    ssh kki

    mkdir -p /cluster/data/hg17/bed/simpleRepeat
    cd /cluster/data/hg17/bed/simpleRepeat
    mkdir trf
    #	runTrf - job script, run once per contig:
    #	    $1 - path to the input contig .fa file
    #	    $2 - path of the .bed file to produce
    #	The input is copied into /tmp/<name of output file>/ and trfBig is
    #	run there, writing its bed output (-bedAt) to a file of that same
    #	name.  Any previous $2 is removed, the new result is copied to $2,
    #	and the /tmp work area is cleaned up.
    #	The quoted '_EOF_' delimiter keeps $1 and $2 from being expanded
    #	at the time the script file is written.
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runTrf

    #	gsub - job template handed to the gensub2 command below, which
    #	expands it against genome.lst to build jobList: one ./runTrf
    #	line per input file, with the bed output going into trf/.
    #	NOTE(review): $(path1) and $(root1) are presumably gensub2's full
    #	input path and its extension-less base name - confirm against the
    #	gensub2 usage message.
    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    ls -1S /iscratch/i/gs.18/build35/unmaskedContigs/*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try
    para check
    para push
    para check
# Completed: 380 of 380 jobs
# CPU time in finished jobs:      13230s     220.49m     3.67h    0.15d  0.000 y
# IO & Wait Time:                  2078s      34.64m     0.58h    0.02d  0.000 y
# Average job time:                  40s       0.67m     0.01h    0.00d
# Longest job:                     1590s      26.50m     0.44h    0.02d
# Submission to last job:          2504s      41.73m     0.70h    0.03d

    liftUp simpleRepeat.bed /cluster/data/hg17/jkStuff/liftAll.lft \
	warn trf/*.bed  > lu.out 2>&1

    # Load into the database:
    ssh hgwdev
    cd /cluster/data/hg17/bed/simpleRepeat
    /cluster/bin/i386/hgLoadBed hg17 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 629076 elements of size 16
    #	Compare with previous assembly
    featureBits hg17 simpleRepeat
    #	54952425 bases of 2866216770 (1.917%) in intersection

    #	with previous randoms
    featureBits hg17 simpleRepeat
    #	54964044 bases of 3096628158 (1.775%) in intersection
    featureBits hg16 simpleRepeat
    #	54320136 bases of 2865248791 (1.896%) in intersection
    #	GAPS weren't in hg17 yet at this point, after gaps added:
    #	featureBits hg17 simpleRepeat
    #	54964044 bases of 2867328468 (1.917%) in intersection
    #	featureBits -countGaps hg17 simpleRepeat
    #	54964044 bases of 3096628158 (1.775%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
     ssh hgwdev
     cd /cluster/data/hg17/bed
     mkdir microsat
     cd microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
    /cluster/bin/i386/hgLoadBed hg17 microsat microsat.bed


# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-05-21 - Hiram)
    #	re-done with new randoms - 2004-06-03 - Hiram
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd /cluster/data/hg17/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end

    #	EXPERIMENT, at a filter of <= 12, we have coverage:
    #	20904399 bases of 2867328468 (0.729%) in intersection
    #	at a filter of <= 9, we have coverage:
    #	19271270 bases of 2867328468 (0.672%) in intersection


    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/hg17
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c ( `cat chrom.lst` )
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end

# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2004-05-25)
#							 -Hiram
    #	re-done with new randoms - 2004-06-03 - Hiram
    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above,
    #	and after Repeat Masker is complete.
    ssh eieio
    cd /cluster/data/hg17

    # copied these scripts from hg16 - reset the lines that make
    # the chrom list to work on, reset the wild cards that find all the
    # contig .fa's

    # Make chr*.fa from contig .fa
    #  Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
    time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 &
    #	real    13m18.512s
    #	user    9m1.670s
    #	sys     1m7.290s

    #- Soft-mask (lower-case) the contig and chr .fa's
    time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1
    #	real    29m31.623s
    #	user    13m49.700s
    #	sys     5m58.750s
    #- Make hard-masked .fa.masked files as well:
    time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1

    #- Create the bothMasksNib/ directory
    time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1
    #	real    14m41.694s
    #	user    6m28.000s
    #	sys     1m42.500s

    # Make symbolic links from /gbdb/hg17/nib to the real nibs.
    ssh hgwdev
    #	The relative nib paths and `pwd` below assume the build directory,
    #	so enter it explicitly after changing hosts.
    cd /cluster/data/hg17
    mv nib nib.raw
    mv bothMasksNib nib
    rm /gbdb/hg17/nib/*.nib
    ln -s `pwd`/nib/* /gbdb/hg17/nib

    # Load /gbdb/hg17/nib paths into database and save size info.
    hgsql hg17  < ~/kent/src/hg/lib/chromInfo.sql
    cd /cluster/data/hg17
    hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
    #	3096628158 total bases

    #	Should be the same size as before
    hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
	> chrom.sizes.masked
    diff chrom.sizes chrom.sizes.masked
    #	should be no output at all, thus:
    rm chrom.sizes.masked

    # Copy the masked contig fa to /scratch and /iscratch
    #	And everything else we will need for blastz runs, etc ...
    #	Best to do this sequence first to /cluster/bluearc/scratch,
    #	which is going to be the source for the /scratch copy.
    #	And then from there to the /iscratch
    #	Make sure you are on the fileserver for the original source:
    ssh eieio
    mkdir -p /cluster/bluearc/scratch/hg/gs.18/build35
    cd /cluster/bluearc/scratch/hg/gs.18/build35

    #	these copies take less than 2 minutes each
    mkdir bothMaskedNibs
    cp -p /cluster/data/hg17/nib/*.nib ./bothMaskedNibs
    mkdir maskedContigs
    foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
	cp -p /cluster/data/hg17/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \
		./maskedContigs
	echo "done ${chrom}"
    end
    #	make sure you have them all:
    ls maskedContigs | wc -l
    #	380
    wc -l /cluster/data/hg17/contig.lst
    #	380
    mkdir rmsk
    foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
	cp -p /cluster/data/hg17/${chrom}/*.out ./rmsk
	echo "done ${chrom}"
    end

    #	Now, go to the destination for /iscratch and copy from the
    #	bluearc
    ssh kkr1u00
    mkdir -p /iscratch/i/gs.18/build35
    cd /iscratch/i/gs.18/build35
    #	This takes about 5 minutes
    rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .

    time /cluster/bin/iSync
    #	real    7m27.649s

    # request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch

# LOAD ctgPos table - Contig position track (DONE - 2004-06-08 - Hiram)
    #	After fixing up hgCtgPos to accept the -chromLst argument, simply:
    cd /cluster/data/hg17
    hgCtgPos -chromLst=chrom.lst hg17 .

# GOLD AND GAP TRACKS (DONE - 2004-05-21 - Hiram)
    #	RE-DONE with new randoms - 2004-06-03 - Hiram
    ssh hgwdev
    cd /cluster/data/hg17
    hgGoldGapGl -noGl -chromLst=chrom.lst hg17 /cluster/data/hg17 .
    #	Disappointing to see this create so many tables ...
    #	_gap and _gold for each chrom

    # Create the contig.gl files - XXX - NCBI doesn't deliver
    # contig_overlaps.agp - 2004-06-18 - this is beginning to come
    # together and there is now a contig_overlaps.agp file
    cd /cluster/store5/gs.18/build35
    #	combine the various psLayout attempts on different sections of
    #	clones
    ./combineContigOverlaps.sh
    #	Turn contig_overlaps.agp into gl files
    ~hiram/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
    # Create chromosome gl files  (had to fix liftUp to do the NC_ properly)
    jkStuff/liftGl.csh contig.gl
    #
    #	Need to remove these PAR clone names from chrY.gl
    rm -f /tmp/chrY.gl
    sed -e "/^AL954722.18/d; /^BX537334.4/d; /^BX000483.7/d; \
	/^BX908402.3/d; /^BX649635.3/d; /^BX119919.5/d; \
	/^AC079176.15/d; /^AC097314.27/d; /^AC006209.25/d; \
	/^AJ271735.1/d; /^AJ271736.1/d" Y/chrY.gl > /tmp/chrY.gl
    rm -f Y/chrY.gl
    mv /tmp/chrY.gl Y/chrY.gl

#  After contig.gl files have been made from contig_overlaps.agp
#	The sed fixes the Celera clones that are marked phase W
#	Call that phase 3 instead,
#	Delete the Celera AACC clones, they are not in this assembly,
#	And fix the line of AC018743 to add it to the assembly, it was a
#	last minute addition by Terry that didn't get carried into the
#	NCBI sequence.inf file.  And remove the older versions of six
#	clones that got left in by mistake at NCBI
    #	AC004491.1=AC004491.2 AC004921.1=AC004921.2 AC004983.2=AC004983.3
    #	AC005088.2=AC005088.3 AC006014.2=AC006014.3 AC099654.4=AC099654.5 
#	And finally the grep selects only those things for_assembly
    cd /cluster/data/hg17
    egrep "for_assembly|AC018743" /cluster/store5/gs.18/ncbi/sequence.inf | \
    sed -e "s/\tW\t/\t3\t/; /^AACC010000.*/d; /^AC004491.1.*/d; \
	/^AC004921.1.*/d; /^AC004983.2.*/d; /^AC005088.2.*/d; \
	/^AC006014.2.*/d; /^AC099654.4.*/d; \
	s/AC018743.27\t31791062\t466818\t1\tD\tUn\t-\tBCM\tRP11-289M22\tSIZE:2big/AC018743.27\t31791062\t466818\t1\t-\t(12)\t-\tBCM\tRP11-289M22\tfor_assembly/" \
	> sequence.inf
    cd /cluster/data/hg17
    hgGoldGapGl -chromLst=chrom.lst hg17 /cluster/store5/gs.18 build35
    $HOME/bin/i386/hgClonePos -chromLst=chrom.lst hg17 \
	/cluster/data/hg17 ./sequence.inf /cluster/store5/gs.18 -maxErr=3 \
	-maxWarn=2000 2> clone.pos.errors
    #	Extract all the PAR clones for chrX from clonePos, change the X
    #	to Y, fixup the coordinates on the last three, and load this
    #	data in on the clonePos table in addition to what is there
    #	already.
    cat << '_EOF_' > chrY.par.clonePos
BX640545.2      34821   3       chrY    0       34250   F
AL954722.18     37771   3       chrY    84821   122592  F
BX537334.4      36872   3       chrY    120592  157464  F
BX000483.7      15918   3       chrY    155466  171384  F
AL954664.17     39200   3       chrY    251384  290307  F
BX000476.5      33340   3       chrY    282188  315528  F
AL732314.18     218723  3       chrY    313528  532251  F
BX004827.18     119555  3       chrY    479050  600112  F
AL683871.15     175765  3       chrY    598112  773877  F
AL672311.26     115998  3       chrY    771877  887875  F
AL672277.20     131682  3       chrY    885875  1017557 F
BX908402.3      36556   3       chrY    1067557 1104113 F
BX649635.3      43709   3       chrY    1154113 1197822 F
BX649553.5      90286   3       chrY    1347822 1438108 F
BX296563.3      21008   3       chrY    1488108 1509117 F
BX119906.16     35666   3       chrY    1507116 1542782 F
AL683870.15     162377  3       chrY    1541782 1704175 F
AL691415.17     45085   3       chrY    1702175 1747265 F
AL683807.22     189825  3       chrY    1745260 1935086 F
AL672040.10     117297  3       chrY    1933086 2050383 F
BX004859.8      63432   3       chrY    2048380 2111815 F
BX119919.5      55442   3       chrY    2261815 2317257 F
AC079176.15     186278  3       chrY    2311674 2497952 F
AC097314.27     80501   3       chrY    2495948 2576449 F
AC006209.25     141759  3       chrY    2551122 2692881 F
AJ271735.1      240000  3       chrY    57302979        57543030        F
AJ271736.1      158661  3       chrY    57543030        57701691        F
'_EOF_'

# The heredoc above appears to be space-aligned, while LOAD DATA splits
# fields on tabs by default, so squeeze each run of spaces into one tab
# before loading.  (A file that is already tab-separated passes through
# unchanged.)
tr -s ' ' '\t' < chrY.par.clonePos > chrY.par.clonePos.tab
hgsql -e 'load data local infile "chrY.par.clonePos.tab" into table clonePos;' hg17

    #	We have the following errors
# Processing /cluster/data/hg17/Y/chrY.gl
# Clone BX640545 is on chromosomes chrX and chrY.  Ignoring chrY
# Clone AL954722 is on chromosomes chrX and chrY.  Ignoring chrY
# ... etc for all the PAR clones
# ... And there are an unknown number of these:
# AB000359 is in ./sequence.inf but not in ooDir/*/*.gl
# AB000360 is in ./sequence.inf but not in ooDir/*/*.gl

#  gc5Base wiggle TRACK (DONE - 2004-05-22 - Hiram)
    #	This previously was a script that ran through each nib
    #	Recently transformed into a mini cluster run.
    #	Re-DONE with the new randoms - 2004-06-04
    ssh kki
    mkdir /cluster/data/hg17/bed/gc5Base
    cd /cluster/data/hg17/bed/gc5Base

    mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K

    cat << '_EOF_' > kkRun.sh
#!/bin/sh
# kkRun.sh - cluster job: build the 5-base gc5Base wiggle data for one
# chromosome.
#   $1 - nib file name as listed in nibList, e.g. chr1.nib
# Writes wigData5/gc5Base_<chrom> via wigAsciiToBinary and captures that
# tool's stderr in dataLimits5/<chr>.
NIB=$1

# Strip the .nib suffix with the POSIX %-expansion; the ${NIB/.nib/} form
# is a bash extension that a strict /bin/sh rejects.
chr=${NIB%.nib}
chrom=${chr#chr}

# The awk stage keeps only windows at least 5 bases wide and prints the
# start plus one together with column 5 divided by 10.
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
        /iscratch/i/gs.18/build35/bothMaskedNibs | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
        -wibFile=wigData5/gc5Base_${chrom} \
            -name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRun.sh

    ls /iscratch/i/gs.18/build35/bothMaskedNibs > nibList
    cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 nibList single gsub jobList
    para create jobList
    para try, check, ... etc
# Completed: 46 of 46 jobs
# CPU time in finished jobs:       5251s      87.51m     1.46h    0.06d  0.000 y
# IO & Wait Time:                   130s       2.17m     0.04h    0.00d  0.000 y
# Average job time:                 117s       1.95m     0.03h    0.00d
# Longest job:                      413s       6.88m     0.11h    0.00d
# Submission to last job:           475s       7.92m     0.13h    0.01d

    # load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/hg17/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base hg17 gc5Base wigData5/*.wig
    # and symlink the .wib files into /gbdb
    mkdir /gbdb/hg17/wib/gc5Base
    ln -s `pwd`/wigData5/*.wib /gbdb/hg17/wib/gc5Base

    #	And then the zoomed data view
    ssh kki
    cd /cluster/data/hg17/bed/gc5Base
    #	-p: both directories were already created alongside wigData5 above,
    #	so a plain mkdir would fail with "File exists"
    mkdir -p wigData5_1K dataLimits5_1K

    cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
# kkRunZoom.sh - cluster job: build the 1000-base zoomed gc5Base wiggle
# data for one chromosome.
#   $1 - nib file name as listed in nibList, e.g. chr1.nib
# Writes wigData5_1K/gc5Base_<chrom>_1K via wigAsciiToBinary and captures
# that tool's stderr in dataLimits5_1K/<chr>.
NIB=$1

# Strip the .nib suffix with the POSIX %-expansion; the ${NIB/.nib/} form
# is a bash extension that a strict /bin/sh rejects.
chr=${NIB%.nib}
chrom=${chr#chr}

# Same 5-base input stream as the unzoomed run, passed through wigZoom
# with -dataSpan=1000 before the binary conversion.
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
        /iscratch/i/gs.18/build35/bothMaskedNibs | \
    grep -w GC | \
    awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
    wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
        -chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
            -name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x kkRunZoom.sh

    cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 nibList single gsubZoom jobListZoom
    para create jobListZoom
    para try ... check ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:       5216s      86.93m     1.45h    0.06d  0.000 y
# IO & Wait Time:                    34s       0.57m     0.01h    0.00d  0.000 y
# Average job time:                 114s       1.90m     0.03h    0.00d
# Longest job:                      415s       6.92m     0.12h    0.00d
# Submission to last job:           469s       7.82m     0.13h    0.01d

    #	Then load these .wig files into the same database as above
    ssh hgwdev
    #	the relative wigData5_1K paths and `pwd` below need the gc5Base
    #	directory, same as for the unzoomed load
    cd /cluster/data/hg17/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base \
	-oldTable hg17 gc5Base wigData5_1K/*.wig
    # and symlink these .wib files into /gbdb
    mkdir -p /gbdb/hg17/wib/gc5Base
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg17/wib/gc5Base

# AUTO UPDATE GENBANK MRNA RUN  (DONE - 2004-06-08 - Hiram)
    ssh eieio
    cd /cluster/data/genbank
    # This is a new organism, edit the etc/genbank.conf file and add:
	# hg17
	hg17.genome = /scratch/hg/gs.18/build35/bothMaskedNibs/chr*.nib
	hg17.lift = /cluster/store5/gs.18/build35/jkStuff/liftAll.lft
	hg17.genbank.est.xeno.load = yes
	hg17.mgcTables.default = full
	hg17.mgcTables.mgc = all
	hg17.downloadDir = hg17

    #	Do the refseq's first, they are the quick ones
    ssh eieio
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial hg17
    #	 logFile: var/build/logs/2004.05.25-13:41:07.hg17.initalign.log
    #	checking that log, or watching the batch on kk, you can find
    #	where the batch is running and after it is done get the time:
    cd /cluster/store6/genbank/work/initial.hg17/align
    para time > time
    cat time
# Completed: 9500 of 9500 jobs
# CPU time in finished jobs:      62241s    1037.35m    17.29h    0.72d  0.002 y
# IO & Wait Time:                 33719s     561.98m     9.37h    0.39d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest job:                     1062s      17.70m     0.29h    0.01d
# Submission to last job:          1063s      17.72m     0.30h    0.01d

    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17

#	To get the genbank started, the above results need to be
#	moved out of the way.  These things can be removed if there are
#	no problems to debug
    ssh eieio
    cd /cluster/data/genbank/work
    mv initial.hg17 initial.hg17.refseq.mrna

    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial hg17
    #	logFile: var/build/logs/2004.06.04-10:47:21.hg17.initalign.log
    #	One job was hung up, after killing it on its node, the batch
    #	finished in a few minutes.
# Completed: 35720 of 35720 jobs
# CPU time in finished jobs:    5161424s   86023.74m  1433.73h   59.74d  0.164 y
# IO & Wait Time:                144149s    2402.48m    40.04h    1.67d  0.005 y
# Average job time:                 149s       2.48m     0.04h    0.00d
# Longest job:                    18306s     305.10m     5.08h    0.21d
# Submission to last job:         35061s     584.35m     9.74h    0.41d

    ssh hgwdev
    cd /cluster/data/genbank
    #	some kind of error happened here, had to remove a lock file to
    #	get this to proceed  (this same thing happened again the second
    #	time around)
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17

    ssh eieio
    cd /cluster/data/genbank/work
    mv initial.hg17 initial.hg17.genbank.mrna
    cd /cluster/data/genbank
    nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg17
# Completed: 189240 of 189240 jobs
# CPU time in finished jobs:   97172120s 1619535.33m 26992.26h 1124.68d  3.081 y
# IO & Wait Time:               1507789s   25129.82m   418.83h   17.45d  0.048 y
# Average job time:                 521s       8.69m     0.14h    0.01d
# Longest job:                    33165s     552.75m     9.21h    0.38d
# Submission to last job:        126988s    2116.47m    35.27h    1.47d

    ssh hgwdev
    cd /cluster/data/genbank
    time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
    #	real    440m42.750s
    #	user    69m7.810s
    #	sys     23m18.640s
    #	This is ~7.5 hours

    #	If the above is all OK, ask Mark to put this assembly on
    #	the daily updates.

# CPGISLANDS (DONE - 2004-05-25 - Hiram)
    #	Re-DONE with new randoms - 2004-06-04 - Hiram
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/cpgIsland
    cd /cluster/data/hg17/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    mv cpglh.exe /cluster/data/hg17/bed/cpgIsland/
    
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    ssh eieio
    cd /cluster/data/hg17/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end
    #	the warnings:
    # Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random
    # Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3
    # Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
    # Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
    #	real    21m47.823s
    #	user    18m30.810s
    #	sys     1m13.420s

    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
# Transform cpglh output lines into cpgIslandExt bed+ rows.
# (These notes are '#' comments: awk has no /* */ comment syntax, and such
# lines are parsed as regular-expression pattern rules instead.)
# Input columns:
#   chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected
#   chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94
# Output columns:
#   chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp
#   chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94
{
# Decrement the start by one; width is the end minus the adjusted start.
$2 = $2 - 1;
width = $3 - $2;
# name is "CpG: <cpgNum>"; gcNum is width*perGc/100 and perCpg is
# 100*2*cpgNum/width.
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/hg17/bed/cpgIsland
    hgLoadBed hg17 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 27801 elements of size 10
    #	Sorted
    #	Saving bed.tab
    #	Loading hg17

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2004-05-25 - Heather)
    ssh hgwdev
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans) \
	VALUES("hg17", "blat12", "17778", "1"); \
	INSERT INTo blatServers (db, host, port, isTrans) \
	VALUES("hg17", "blat12", "17779", "0");' \
	-h genome-testdb hgcentraltest

# PREPARE CLUSTER FOR BLASTZ RUNS (DONE - 2004-05-26 - Hiram)
    #	Re-DONE with new randoms - 2004-06-03 - Hiram

    ssh eieio
    mkdir /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    ln -s ../rmsk/*.out .
    #	This takes 40 minutes run as a script, to hurry it up it has
    #	been converted to a mini cluster run
    cat << '_EOF_' > runArian.sh
#!/bin/sh
# Print one cluster job command per .out file found in the current
# directory; the caller redirects this listing into jobList.
dateReps=/cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl
for rmOut in *.out
do
    echo ${dateReps} ${rmOut} -query human -comp rat -comp mouse
done
'_EOF_'
    # << emacs
    chmod +x runArian.sh
    ssh kki
    cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
    ./runArian.sh > jobList
    para create jobList
    para try, ... check ... push ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        668s      11.14m     0.19h    0.01d  0.000 y
# IO & Wait Time:                   514s       8.56m     0.14h    0.01d  0.000 y
# Average job time:                  26s       0.43m     0.01h    0.00d
# Longest job:                       86s       1.43m     0.02h    0.00d
# Submission to last job:           108s       1.80m     0.03h    0.00d

    #	Now extract each one, 1 = Rat, 2 = Mouse
    ssh eieio
    cd /cluster/bluearc/scratch/hg/gs.18/build35

    mkdir linSpecRep.notInRat linSpecRep.notInMouse
    foreach f (rmsk.spec/*.out_rat_mus)
        set base = $f:t:r:r
        echo "$f -> $base.out.spec"
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                        linSpecRep.notInRat/$base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 2 $f > \
                        linSpecRep.notInMouse/$base.out.spec
    end
    #	There is actually no difference at all between these two results.
    #	copy to iscratch
    ssh kkr1u00
    cd /iscratch/i/gs.18/build35
    rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
    /cluster/bin/iSync
    # request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch

# COPY DATA TO GOLDEN PATH LOCATIONS (DONE - 2004-06-04 - Hiram)
    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
    cd /cluster/data/hg17
    #	Copy everything first, then compress once at the end.  Backgrounded
    #	per-chromosome gzips are hard on hgwdev, their chr${chrom}*.fa glob
    #	also matches other chromosomes (chr1* covers chr10 ...), and they
    #	can still be running when the final gzip starts on the same files.
    foreach chrom ( `cat chrom.lst` )
	cp -p ${chrom}/*.fa /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
	echo "done ${chrom}"
    end
    cd /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
    gzip *.fa


# FOSMID END PAIRS TRACK (2004-06-09 kate)
# Corrected upper size limit to 50Kbp, reran pslPairs, 
# and reloaded (2004-07-15 kate)

    # Use latest fosmid ends data prepared by Terry Furey.
    # He says there is no on-going work on fosmid ends, so this
    # should suffice indefinitely ?  Move/link this stuff into
    # central data area.
    ssh eieio
    cd /cluster/data/ncbi
    mkdir -p fosends/human
    ln -s /cluster/store1/fosends.3 fosends/human
    cd fosends/human/fosends.3
    faSize fosEnds.fa
       # 579735181 bases (369769 N's 579365412 real) in 1087670 sequences 
       # 580M bases in 1M sequences
    # create link in /gbdb/ncbi/fosends/human ?

    # use pre-split fosend files, and associated list for cluster run
    # Sequences are in /cluster/bluearc/hg/fosEnds
    cp /cluster/bluearc/booch/fosends/fosEnds.lst /cluster/bluearc/hg/fosEnds
    
    # run on rack9 since kilokluster is busy
    ssh kk9
    cd /cluster/data/hg17
    mkdir -p bed/fosends
    cd bed/fosends
    mkdir -p run
    cd run
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa \
                > contigs.lst
    cp /cluster/bluearc/hg/fosEnds/fosEnds.lst fosEnds.lst
        # 380 contigs vs 97 fosEnd files -> 40K jobs
    # send output to kksilo, as it can better handle the NFS load
    mkdir -p /cluster/store7/kate/hg17/fosends/out
    ln -s /cluster/store7/kate/hg17/fosends/out ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/fosends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
    # << emacs
    gensub2 contigs.lst fosEnds.lst gsub jobList
    foreach f (`cat fosEnds.lst`)
        set d = $f:r:t
        echo $d
        mkdir -p /cluster/data/hg17/bed/fosends/out/$d
    end

    para create jobList
        # 36860 jobs
    para try
    para check
    para push
# CPU time in finished jobs:    1655943s   27599.05m   459.98h   19.17d  0.053 y
# IO & Wait Time:                101145s    1685.75m    28.10h    1.17d  0.003 y
# Average job time:                  48s       0.79m     0.01h    0.00d
# Longest job:                     1294s      21.57m     0.36h    0.01d
# Submission to last job:         19269s     321.15m     5.35h    0.22d

    # sort, filter, and lift alignments
    ssh eieio
    cd /cluster/data/hg17/bed/fosends
    pslSort dirs raw.psl temp out/fosEnds*
    pslReps  -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons raw.psl \
                        fosEnds.psl /dev/null
        # Processed 84096767 alignments

    # cleanup
    rm -r temp
    rm raw.psl
    rm -fr out /cluster/store7/kate/hg17/fosends

    mkdir lifted
    liftUp lifted/fosEnds.lifted.psl \
                /cluster/data/hg17/jkStuff/liftAll.lft warn fosEnds.psl
    pslSort dirs fosEnds.sorted.psl temp lifted
    rmdir temp
    wc -l *.sorted.psl
        # 1693693 fosEnds.sorted.psl
 
    set ncbiDir = /cluster/data/ncbi/fosends/human/fosends.3
    ~/bin/i386/pslPairs -tInsert=5000 -minId=0.94 -noBin -min=30000 -max=50000 -slop -short -long -orphan -mismatch -verbose fosEnds.sorted.psl $ncbiDir/fosEnds.pairs all_fosends fosEnds

    # create header required by "rdb" tools
    # TODO: replace w/ awk & sort
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    cat header fosEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > fosEndPairs.bed
    cat header fosEnds.slop fosEnds.short fosEnds.long fosEnds.mismatch \
         fosEnds.orphan \
    | row score ge 300 | sorttbl chr start | headchg -del > fosEndPairsBad.bed

    extractPslLoad -noBin fosEnds.sorted.psl fosEndPairs.bed \
                fosEndPairsBad.bed | \
                        sorttbl tname tstart | headchg -del > fosEnds.load.psl

    # load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/fosends
    hgLoadBed hg17 fosEndPairs fosEndPairs.bed \
                 -sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairs.sql 
        # Loaded 384558 elements

    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed hg17 fosEndPairsBad fosEndPairsBad.bed \
                 -sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairsBad.sql
        # Loaded  30830 elements

    #hgLoadPsl hg17 -nobin -table=all_fosends fosEnds.load.psl
    # NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg17 -table=all_fosends fosEnds.load.psl
        # load of all_fosends did not go as planned: 1526991 record(s), 0 row(s) skipped, 156 warning(s) loading psl.tab

    # load sequences
    mkdir -p /gbdb/hg17/fosends
    ln -s /cluster/data/ncbi/fosends/human/fosends.3/fosEnds.fa \
                                /gbdb/hg17/fosends/fosEnds.fa
    hgLoadSeq hg17 /gbdb/hg17/fosends/fosEnds.fa
        # 1087670 sequences
       # NOTE: extFile ID is 832625 (shouldn't be so large ??) 
       # may want to reset this.


# BAC END PAIRS TRACK (DONE - 2004-06-09 kate)
    # Re-ran pslPairs with updated pairs file (2004-10-04 booch)

    # Use latest BAC ends data from NCBI
    # Checked  ftp.ncbi.nih.gov/genomes/BACENDS/homo_sapiens,
    #  and files were unchanged from Terry's last download
    #  (to /cluster/store1/bacends.4)
    # Link this stuff into central data area.
    ssh eieio
    cd /cluster/data/ncbi
    mkdir -p bacends/human
    ln -s /cluster/store1/bacends.4 bacends/human
    cd bacends/human/bacends.4
    faSize BACends.fa
        # 400230494 bases (2743171 N's 397487323 real) in 832614 sequences
        # 400M bases in 800K sequences

    # use pre-split bacends files, and associated list for cluster run
    ssh kk
    cd /cluster/data/hg17
    mkdir -p bed/bacends
    cd bed/bacends
    mkdir run
    cd run
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
    ls -1S /cluster/bluearc/hg/bacEnds/hs/*.fa > bacends.lst
        # 380 contigs vs 98 bacends files -> 40K jobs

    # send output to kksilo, as it can better handle the NFS load
    # (these are quick jobs)
    mkdir -p /cluster/store7/kate/hg17/bacends/out
    ln -s /cluster/store7/kate/hg17/bacends/out ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/bacends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << emacs
    gensub2 contigs.lst bacends.lst gsub jobList
    foreach f (`cat bacends.lst`)
        set d = $f:r:t
        echo $d
        mkdir -p /cluster/data/hg17/bed/bacends/out/$d
    end

    para create jobList
        # 37240 jobs written to batch
    para try
    para check
    para push
# CPU time in finished jobs:    1573932s   26232.19m   437.20h   18.22d  0.050 y
# IO & Wait Time:                122751s    2045.86m    34.10h    1.42d  0.004 y
# Average job time:                  46s       0.76m     0.01h    0.00d
# Longest job:                     3312s      55.20m     0.92h    0.04d
# Submission to last job:          7148s     119.13m     1.99h    0.08d

    cd ../out/BACends000
    pslCheck *.psl
#Error: invalid PSL: AZ519021:1-575 NT_004559:1306426-1608347 - NT_004559.BACends000.psl:1101
#AZ519021 query block 3 start 283 < previous block end 575
    # NOTE: inquired with JK regarding these results

    # lift alignments
    ssh eieio
    cd /cluster/data/hg17/bed/bacends
    pslSort dirs raw.psl temp out/BACends*
    # takes hours ?

        # 37240 files in 98 dirs
        # Got 37240 files 193 files per mid file
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
                raw.psl  bacEnds.psl /dev/null
        # Processed 52291246 alignments
    mkdir lifted
    liftUp lifted/bacEnds.lifted.psl \
                /cluster/data/hg17/jkStuff/liftAll.lft warn bacEnds.psl
    pslSort dirs bacEnds.sorted.psl temp lifted

    # cleanup
    rmdir temp
    rm -fr out /cluster/store7/kate/hg17/bacends

    wc -l *.sorted.psl
        # 2497227 bacEnds.sorted.psl

    set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.4
    ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds

    # create header required by "rdb" tools
    # TODO: replace w/ awk & sort
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
    cat header  bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
        | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed

    extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
                bacEndPairsBad.bed | \
                        sorttbl tname tstart | headchg -del > bacEnds.load.psl

    # load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/bacends
    hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
                 -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql 
        # Loaded 201380  
    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
                 -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
        # Loaded 81773
    #hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
    # NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
        #load of all_bacends did not go as planned: 441072 record(s), 0 row(s) skipped, 30 warning(s) loading psl.tab
    # Reloaded table, 2004-07-21, and got more rows:
    # load of all_bacends did not go as planned: 1698790 record(s), 
    # 0 row(s) skipped, 63 warning(s) loading psl.tab

    # load BAC end sequences

    mkdir -p /gbdb/hg17/bacends
    ln -s /cluster/data/ncbi/bacends/human/bacends.4/BACends.fa \
                                /gbdb/hg17/bacends/BACends.fa
    hgLoadSeq hg17 /gbdb/hg17/bacends/BACends.fa
        # 158588 sequences

    # Re-ran pslPairs with an updated pairs file to take advantage of the
    # new feature allowing comma-separated lists of end accessions for
    # each end of a clone
    
    # First, create new pairs file (bacEndPairs.txt, bacEndSingles.txt)
    mkdir /cluster/data/ncbi/bacends/human/bacends.5
    cd /cluster/data/ncbi/bacends/human/bacends.5
    cp ../bacends.4/cl_ac_gi_len .
    /cluster/bin/scripts/convertEndPairInfo cl_ac_gi_len

    # Next, re-create the bed file
    mkdir /cluster/data/hg17/bed/bacends.update
    cd /cluster/data/hg17/bed/bacends.update
    ln -s /cluster/data/hg17/bed/bacends/bacEnds.sorted.psl ./bacEnds.sorted.psl
    set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.5
    ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds

    # create header required by "rdb" tools
    # TODO: replace w/ awk & sort
    echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
    echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
    cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
    cat header  bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
        | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed

    # wc *.bed
    # 204884 2253724 20612402 bacEndPairs.bed
    # 79401  873411 6527559 bacEndPairsBad.bed

    # previous
    # wc ../bacends/*.bed
    # 201380 2215180 20280578 ../bacends/bacEndPairs.bed
    # 81773  899503 6712402 ../bacends/bacEndPairsBad.bed

    extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
                bacEndPairsBad.bed | \
                        sorttbl tname tstart | headchg -del > bacEnds.load.psl

    # load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/bacends.update
    hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
                 -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql 
        # Loaded 204884  
    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
                 -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
        # Loaded 79401
    #hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
    # NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
    # load of all_bacends did not go as planned: 1729146 record(s), 0 row(s) skipped, 70 warning(s) loading psl.tab


# PLACE ASSEMBLY CLONES - misc instructions, only somewhat relevant
#	See PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE below

######	 A second attempt at clone alignment###
    #	Split the clones into 3K pieces into about 1000 fa files

    #	Example:
zcat Z99916.1.fa.gz Z99774.1.fa.gz Z99756.7.fa.gz | faSplit size stdin 3000 /tmp/name.fa -lift=/tmp/name.lft -oneFile

    #	Trying this idea in unPlacedBatch
    ssh kk0
    mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs > nibList
    ls -1S /cluster/data/hg17/bed/contig_overlaps/blatClones > cloneList
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fastMap -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/bothMaskedNibs/$(path1)} {check in exists+ /cluster/data/hg17/bed/contig_overlaps/blatClones/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    mkdir psl
    #	one psl output directory per entry in nibList (name minus .nib)
    sed -e "s/.nib//" nibList | while read D; do mkdir psl/$D; done

    gensub2 nibList cloneList gsub jobList
    para create jobList

# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE (DONE - 2004-07-12 - Hiram)
    ssh eieio
    mkdir /cluster/data/hg17/bed/contig_overlaps
    cd /cluster/data/hg17/bed/contig_overlaps
    #	find all the clones that were used in the assembly
    sed -e "/^#.*/d" /cluster/data/hg17/ncbi_build35.agp | \
        awk '{if (!match($5,"N")) {print $6}}' | \
        sort -u > placed_in_assembly.list
    wc -l placed_in_assembly.list
    #	26872 placed_in_assembly.list
    #	These may be available from the phases files at:
    #	ftp://ftp.ncbi.nih.gov/genbank/genomes/H_sapiens
    #	Which are easily fetched with wget.  However I took a look
    #	at those and could not find all the clones in them.  There may
    #	be a versioning problem because these phases files are often
    #	updated.
    #	Fetch them from Genbank with the following three PERL scripts:
    #	[hiram@hgwdev /cluster/data/hg17/bed/contig_overlaps] ls -og *.pl
    #	-rwxrwxr-x    1     3047 May 24 18:43 bioPerlFetch.pl
    #	-rwxrwxr-x    1     2370 Jun  4 15:21 fetchGenbank.pl
    #	-rwxrwxr-x    1      700 May 24 21:47 foldEm.pl

    #	Which takes about 4 days ...
    #	Example, 
    cat << '_EOF_' > terrys.list
AC011841.7
AC018692.9
AC018743.27
AC037482.14
AL163540.11
'_EOF_'
    # << this line makes emacs coloring happy
    #	only works on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed/contig_overlaps
    mkdir fasta
    time ./fetchGenbank.pl terrys.list > fetchResult.out 2>&1

    #	There is a bit of behind the scenes hocus pocus going on here.
    #	This is a tedious task of comparing various lists with each
    #	other and making sure everything matches.  Manual fixups are
    #	done for the newly named 6_hla_hap* items, copies of the PAR
    #	business were duplicated so that X and Y both have the same set
    #	of clones for that.  The end result should be a directory hierarchy
    #	here with a directory for each chrom, each random, the 6_hla_hap?
    #	items and each directory contains the clones that belong to that
    #	chromosome.  The leftovers are the unplaced clones which end up
    #	in the directory called: unPlaced.  The instructions here are
    #	merely a guideline of possibilities.  Care should be taken to
    #	make sure all listings are correct and everything gets in the
    #	right place.
    ssh eieio
    #	And then make a list of all clones considered for assembly:
    sed -e "/^#.*/d" /cluster/store5/gs.18/ncbi/sequence.inf | \
	grep for_assembly | awk '{print $1}' | sort -u > sequence.list
    wc -l sequence.list
    #	46733 sequence.list
    #	Verify overlaps are correct:
    comm -12 placed_in_assembly.list sequence.list > inBoth
    comm -23 placed_in_assembly.list sequence.list > inAssemblyNotSequence
    comm -13 placed_in_assembly.list sequence.list > inSequenceNotAssembly
    wc in*
    #	    1       1      12 inAssemblyNotSequence
    #	26871   26871  301709 inBoth
    #	19862   19862  219050 inSequenceNotAssembly
    #	46734   46734  520771 total
    #	This stray one is from Terry's five additions in the final fixup
    #	phase with Greg:
    cat inAssemblyNotSequence
    #	AC018743.27
    #	Terry added: AC011841.7 AC018692.9 AC018743.27 AC037482.14 AL163540.11
    #
    #	Generate a listing that relates clones to their contigs
    sed -e "/^#.*/d" /cluster/store5/gs.18/build35/ncbi_build35.agp | \
	./contigAcc.pl > disburseEm.list
    #
    #	Using that list, sort the downloaded clones into their
    #	respective chrom directories:
    ./disburse.sh

    #	Check the number of sequences obtained:
    find ./? ./?? ./*_random ./6_hla* -type f | wc -l
    #	26872
    #	So, why is this number one more than the inBoth list ?
    #	Because, the official NCBI sequence.inf file is missing one of
    #	the clones that Terry added: AC018743.27
    #	And it shows up in our check list above as inAssemblyNotSequence
    #	It isn't exactly missing, it just isn't marked "for_assembly"

    #	OK, with everything in place, we are ready to try and find
    #	all these items in the assembly.  To run a Kluster job on one of
    #	the chroms, matching the items that are supposed to be included
    #	in that chrom.  We need to get things set up on the Iservers,
    #	psLayout is heavy into disk I/O and it brings everything down if
    #	allowed to work on any NFS filesystems for input.

    #	It appears that psLayout wants an ooc file of tile size 10
    #	I tried making one for the whole assembly but it seemed to
    #	include too much for some contigs and it caused a lot of
    #	alignments to be missed.  Thus, create an ooc file for each
    #	contig

    ssh eieio
    mkdir /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
    cd /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
    #	for every masked contig, align it against itself solely to
    #	produce its own tileSize 10 ooc file (alignment output discarded)
    ls ../maskedContigs | sed -e "s/.fa//" | while read CONTIG; do
	FA=../maskedContigs/${CONTIG}.fa
	blat -repMatch=256 -makeOoc=${CONTIG}.10.ooc -tileSize=10 \
	    ${FA} ${FA} /dev/null
	echo "done: ${CONTIG}"
    done

    #	Copy that result to the Iservers:
    ssh kkr1u00
    mkdir /iscratch/i/gs.18/build35/contigOoc10
    cd /iscratch/i/gs.18/build35/contigOoc10
    rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10/ .
    #	And, copy the clone sequences:
    mkdir /iscratch/i/gs.18/build35/clones
    cd /cluster/store5/gs.18/build35/bed/contig_overlaps
    for D in ? ?? *_random 6_hla_hap?
    do
	rsync -arlv `pwd`/${D} /iscratch/i/gs.18/build35/clones
    done
    
    /cluster/bin/iSync

    ssh kk
    cd /cluster/data/hg17/bed/contig_overlaps
    mkdir psl
    cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
#       runPsLayout.sh <chrom> <clone> <contig>
#       Align one placed clone to its contig with psLayout.
#       where <chrom> is the chrom this contig is on
#       <clone> is one of the .fa.gz files in
#               /cluster/data/hg17/bed/contig_overlaps/*/<clone>.fa.gz
#               without the .fa.gz extension
#               This stuff has been mirrored to:
#               /iscratch/i/gs.18/build35/clones/*/<clone>.fa.gz
#       <contig> is one of the contigs found in:
#               /cluster/store5/gs.18/build35/<chrom>/<contig>/<contig>.fa
#               (the copy actually read by this script is
#               /iscratch/i/gs.18/build35/maskedContigs/<contig>.fa)
#       The result is written to psl/<contig>/<clone>.psl under the
#       current directory.  Exits 255 when an input file is missing or
#       empty or the scratch directory can not be made; otherwise exits
#       with the status of psLayout.
#
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET="/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa"
FAZ="/iscratch/i/gs.18/build35/clones/${CHROM}/${CLONE}.fa.gz"
OOC="/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc"
mkdir -p "psl/${CONTIG}"
#       every input must exist and be non-empty before any work starts;
#       complaints go to stderr so they never mix with normal output
for INPUT in "${FAZ}" "${TARGET}" "${OOC}"
do
        if [ ! -s "${INPUT}" ]; then
                echo "Can not find: ${INPUT}" 1>&2
                exit 255
        fi
done
#       private scratch directory: the same clone can be in more than one
#       job (the PAR clones are present under both X and Y), so a fixed
#       /tmp/<clone>.fa could be overwritten or removed by another job
WORK=$(mktemp -d /tmp/psLayout.XXXXXX)
if [ -z "${WORK}" ]; then
        echo "Can not make scratch directory in /tmp" 1>&2
        exit 255
fi
zcat "${FAZ}" > "${WORK}/${CLONE}.fa"
"$HOME/bin/i386/psLayout" "${TARGET}" \
        "${WORK}/${CLONE}.fa" genomic "${OOC}" "psl/${CONTIG}/${CLONE}.psl"
RET=$?
rm -rf "${WORK}"
exit ${RET}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runPsLayout.sh

    #	make up a listing of chrom, clone, contig from:
    grep -v "^#" disburseEm.list | sed -e "s/.fa.gz//" > chr.clone.contig.list
    wc -l chr.clone.contig.list
    #	26872 chr.clone.contig.list
    awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n",
        $1, $2, $3, $3, $2
}' chr.clone.contig.list > jobList
    # << this line makes emacs coloring happy
    #	To do a quick test, run just chr22:
    #	(keep only the rows whose chrom column is exactly 22; a
    #	grep -v "^22" here would instead drop the chr22 rows and keep
    #	every other chrom)
    awk '$1 == "22" {
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n",
        $1, $2, $3, $3, $2
}' chr.clone.contig.list > jobList
    para create jobList
    para try ... check ... etc ...
    #	One run on chr22 took:
# Completed: 561 of 561 jobs
# CPU time in finished jobs:     927068s   15451.14m   257.52h   10.73d  0.029 y
# IO & Wait Time:                  6295s     104.91m     1.75h    0.07d  0.000 y
# Average job time:                1664s      27.73m     0.46h    0.02d
# Longest job:                    69745s    1162.42m    19.37h    0.81d
# Submission to last job:         69780s    1163.00m    19.38h    0.81d


    #	put the results together, filter, lift and load:
    cd /cluster/data/hg17/bed/contig_overlaps/psl
    pslSort dirs raw.psl tmp N*
    pslReps -singleHit raw.psl repsSingle.psl /dev/null
    liftUp chr22.psl /cluster/data/hg17/jkStuff/liftAll.lft \
	warn repsSingle.psl
    hgLoadPsl -table=cloneTest hg17 chr22.psl

    #	There are a number of clones listed in the sequence.inf file
    #	as status W with names beginning AACC AADB AADC AADD
    #	These are the Whole shotgun assemblies for the Celera genome.
    #	A few of them were used in the assembly of the NCBI genome, namely:
./11/AADB01066164.1.fa.gz
./11/AADC01095577.1.fa.gz
./11/AADD01116830.1.fa.gz
./11/AADD01118406.1.fa.gz
./11/AADD01116787.1.fa.gz
./11/AADD01112371.1.fa.gz
./11/AADD01116788.1.fa.gz
./11/AADD01115518.1.fa.gz
./11/AADD01118410.1.fa.gz
./11/AADD01117999.1.fa.gz
./21/AADD01172789.1.fa.gz
./21/AADD01172788.1.fa.gz
./21/AADD01209098.1.fa.gz
./21/AADD01172902.1.fa.gz
    #	And these have been distributed properly in their corresponding
    #	chromosome.  The rest of them, 26, all with names starting AACC are in
    #	the directory here: celeraOnly

    #	To run the unPlaced alignments.
    #	Prepare scratch and iscratch
    ssh eieio
    mkdir /cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
    rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
	/cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
    #	request scratch sync to cluster admins

    ssh kkr1u00
    mkdir /iscratch/i/gs.18/build35/clones/unPlaced
    rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
	/iscratch/i/gs.18/build35/clones/unPlaced
    /cluster/bin/iSync

    ssh hgwdev
    cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    #	There are too many to try them all, obtain guidelines from hg16
    #	of clone to contig mapping:
    hgsql -N -e "select name,chrom from clonePos;" hg16 > hg16.clone.chrom
    hgsql -N -e "select contig,chrom from ctgPos;" hg16 > hg16.contig.chrom

    ssh kk
    mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
    ls ../unPlaced | sed -e "s/.fa.gz//" > unPlaced.clone.list
    wc -l unPlaced.clone.list
    #	19836 unPlaced.clone.list
    ls -1S /scratch/hg/gs.18/build35/maskedContigs > contig.list
    wc -l contig.list
    #	380 contig.list

    cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
#       runPsLayout.sh <clone> <contig>
#       Align one unPlaced clone to one contig with psLayout.
#       <clone> is one of the .fa.gz files in
#               /scratch/hg/gs.18/build35/clones/unPlaced
#               without the .fa.gz extension
#       <contig> is one of the contigs found in:
#               /iscratch/i/gs.18/build35/maskedContigs
#               without the .fa extension
#       The result is written to psl/<contig>/<clone>.psl under the
#       current directory.  Exits 255 when an input file is missing or
#       empty or the scratch directory can not be made; otherwise exits
#       with the status of psLayout.
#
CLONE=$1
CONTIG=$2
TARGET="/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa"
FAZ="/scratch/hg/gs.18/build35/clones/unPlaced/${CLONE}.fa.gz"
OOC="/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc"
mkdir -p "psl/${CONTIG}"
#       every input must exist and be non-empty before any work starts;
#       complaints go to stderr so they never mix with normal output
for INPUT in "${FAZ}" "${TARGET}" "${OOC}"
do
        if [ ! -s "${INPUT}" ]; then
                echo "Can not find: ${INPUT}" 1>&2
                exit 255
        fi
done
#       private scratch directory: every clone is paired with every
#       contig, so many jobs share the same <clone>; with a fixed
#       /tmp/<clone>.fa one job could overwrite or remove the file
#       while another job on the same node is still reading it
WORK=$(mktemp -d /tmp/psLayout.XXXXXX)
if [ -z "${WORK}" ]; then
        echo "Can not make scratch directory in /tmp" 1>&2
        exit 255
fi
zcat "${FAZ}" > "${WORK}/${CLONE}.fa"
"$HOME/bin/i386/psLayout" "${TARGET}" \
        "${WORK}/${CLONE}.fa" genomic "${OOC}" "psl/${CONTIG}/${CLONE}.psl"
RET=$?
rm -rf "${WORK}"
exit ${RET}
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x runPsLayout.sh

    #	contig.list holds file names ending in .fa while runPsLayout.sh
    #	appends .fa (and .10.ooc) to its <contig> argument itself, so
    #	pass the contig as $(root2) (name without extension).  The clone
    #	names keep their version suffix, hence $(path1) for those.
    cat << '_EOF_' > gsub
#LOOP
./runPsLayout.sh $(path1) $(root2) {check out line+ psl/$(root2)/$(path1).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 unPlaced.clone.list contig.list gsub jobList

   # XXXX - some time later ... 2004-07-12
    # Bringing this sequence to a close.  Difficulties encountered:
    #	Placed clones that did not survive the psLayout filter:
    #	AC006040.3 AC006328.5 AC007039.6 AC007241.3 AC007965.3
    #	AC009947.2 AC010682.2 AC012005.4 AC016707.2 AC016728.4
    #	AC016752.2 AC017005.7 AC025226.4 AC025246.6 AC055713.29
    #	AC068541.7 AC068601.8 AC068704.4 AC073649.3 AC073962.5
    #	AC091175.11 AC095381.1 AC104597.3 AC130223.2 AC130814.3
    #	AC133883.6 AC139103.3 AF003627.3 AF135405.3 AL021878.2
    #	AL137064.6 AL356803.2 AL390801.4 AL591480.8 AL901608.1
    #	AP005814.2 BX322790.2 Z84489.1 Z84814.1
    #	And placed clones that were broken into two pieces during their
    #	psLayout alignment:
    #	AC006982.3 AC007742.4 AC023342.3 AC024183.4 AC025735.4
    #	AC095380.1 AL646104.4 BX293536.4
    #	For the above clones, their assignments in ref_placed.agp were
    #	used instead of trying to adjust the psLayout process.

    #	The PAR clones are a problem.  They were placed properly, but
    #	during their load with hgClonePos there was a warning issued
    #	about their dual existence.  hgClonePos said they were only
    #	going to be placed on chrX and not on chrY.  However in the
    #	browser when chrY is viewed it issues errors about these not
    #	having proper coordinates in the clonePos table.  These were
    #	removed from the coverage track to eliminate that error.
    #	AL954722.18 BX537334.4 BX000483.7 BX908402.3 BX649635.3 BX119919.5
    #	AC079176.15 AC097314.27 AC006209.25 AJ271735.1 AJ271736.1
    #
    #	And finally, after many different types of alignment attempts,
    #	there remain 1489 un-placed clones that could not be located.

    #	While trying to figure out which contigs many clones belonged
    #	to, the following cluster run script was used to take a survey
    #	using blat:
#!/bin/bash
#       runBlat.sh <clone> <contig>
#       (needs bash rather than plain sh: the ${VAR/pattern/}
#       substitutions used below are a bash extension)
#       <clone> is one of the .fa.gz files in
#               /scratch/hg/gs.18/build35/clones/
#               without the .fa.gz extension
#               (read from the mirror /iscratch/i/gs.18/build35/clones/)
#       <contig> is one of the contigs found in:
#               /iscratch/i/gs.18/build35/maskedContigs
#       The combined result is written to
#               psl/<contig without .fa>/<clone basename>.psl
#       under the current directory.  Exits 255 on a missing input, a
#       failed cd, or a non-empty error.convert; otherwise exits with the
#       status of the last blat that was run.
#   
# ./runBlat.sh unPlaced/AB000876.1.fa.gz NT_005612.fa {check out line+
# psl/NT_005612.fa/unPlaced/AB000876.1.fa.gz.psl}
#
HERE=$(pwd)
CLONE=$1
CLONEDIR=$(dirname "${CLONE}")
CLONENAME=$(basename "${CLONE}")
CLONESRC="/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz"
CONTIG=$2
CONTIGBASE=${CONTIG/.fa/}
TARGET="/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}"
if [ ! -s "${CLONESRC}" ]; then
        echo "Can not find: ${CLONESRC}" 1>&2
        exit 255
fi
if [ ! -s "${TARGET}" ]; then
        echo "Can not find: ${TARGET}" 1>&2
        exit 255
fi
mkdir -p "/tmp/${CLONEDIR}/${CLONENAME}"
zcat "${CLONESRC}" > "/tmp/${CLONEDIR}/${CLONENAME}/${CLONENAME}.fa"
#       a failed cd must stop the job: the rm -f commands further down
#       use globs relative to the working directory
cd "/tmp/${CLONEDIR}" || exit 255
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa "${CLONENAME}"
ECOUNT=$(cat error.convert | wc -l)
if [ "${ECOUNT}" -ne 0 ]; then
        echo "Error during faToFfa, error.convert not empty" 1>&2
        exit 255
fi
rm -f error.convert
#       B: clone name with everything from the first dot removed
B=${CLONENAME/\.*/}
cd "/tmp/${CLONEDIR}/${CLONENAME}" || exit 255
faSplit byname "${CLONENAME}.fa" .
RET=0
export RET
#       one pass per <clone>_<n>.fa file left by the byname split
for F in "${CLONENAME}"_*.fa
do
    #   FA: file name up to the first "_"; A: FA without its .<digits>
    #   version suffix; N: the text after the last "_" (extension removed)
    FA=${F/_*.fa/}
    A=${FA/.[0-9]*/}
    P=${F/.fa/}
    N=${P##*_}
    rm -f t.fa
    mv "${F}" t.fa
    #   cut the piece into 3000 base chunks, all kept in one file
    cat t.fa | faSplit -oneFile size stdin 3000 "${A}_${N}"
    rm -f t.fa
    blat "${TARGET}" "${A}_${N}.fa" -ooc=/scratch/hg/h/11.ooc "${A}_${N}.psl" \
        -t=dna -q=dna -fastMap -noHead
    RET=$?
    if [ "$RET" -ne 0 ]; then
        echo "Error during blat ${TARGET} ${A}_${N}.fa" 1>&2
        break
    fi
done
rm -f "${CLONENAME}.fa"
rm -f "${B}"_*.fa
cd "${HERE}" || exit 255
mkdir -p "psl/${CONTIGBASE}"
#       put the full clone name back in place of the version-less
#       accession while gathering all the pieces into one psl file
sed -e "s/${A}/${CLONENAME}/" "/tmp/${CLONEDIR}/${CLONENAME}"/*.psl > \
        "psl/${CONTIGBASE}/${CLONENAME}.psl"
rm -f "/tmp/${CLONEDIR}/${CLONENAME}"/*.psl
rmdir --ignore-fail-on-non-empty "/tmp/${CLONEDIR}/${CLONENAME}"
rmdir --ignore-fail-on-non-empty "/tmp/${CLONEDIR}"
exit ${RET}

    #	The alignments with psLayout were done with the following cluster
    #	run script:

#!/bin/sh
#       runPsLayout.sh <clone> <contig>
#       Align one clone to one contig with psLayout, writing
#       psl/<contig>/<clone basename>.psl under the current directory.
#       <clone> is <dir>/<name> for one of the .fa.gz files under
#               /iscratch/i/gs.18/build35/clones/
#               (e.g. unPlaced/<name>) without the .fa.gz extension
#       <contig> is one of the contigs found in:
#               /iscratch/i/gs.18/build35/maskedContigs
#               without the .fa extension
#       Exits 255 when an input file is missing or empty, otherwise with
#       the exit status of psLayout.
#
# ./runPsLayout.sh unPlaced/AP001966.2 NT_016354 {check out exists
# psl/NT_016354/AP001966.2.psl}
#
HERE=`pwd`
CLONE=$1
CONTIG=$2
# split <clone> into its directory part and its file name part
CLONEDIR=`dirname ${CLONE}`
CLONENAME=`basename ${CLONE}`
RESULT=psl/${CONTIG}/${CLONENAME}.psl
CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
# all three inputs must exist and be non-empty before any work is done
if [ ! -s ${CLONESRC} ]; then
        echo "Can not find: ${CLONESRC}" 1>/dev/stderr
        exit 255
fi
if [ ! -s ${TARGET} ]; then
        echo "Can not find: ${TARGET}" 1>/dev/stderr
        exit 255
fi
if [ ! -s ${OOC} ]; then
        echo "Can not find: ${OOC}" 1>/dev/stderr
        exit 255
fi
# scratch space is keyed by contig so jobs for the same clone against
# different contigs use different paths.  Note the fasta is written next
# to the <name> directory made here (as <name>.fa), not inside it.
mkdir -p /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}
zcat ${CLONESRC} > /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa
cd /tmp/${CONTIG}
# NOTE(review): faToFfa is given the clone directory name; presumably it
# converts the fasta under that directory in place and logs problems to
# error.convert in the current directory -- confirm against faToFfa
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONEDIR}
cd ${HERE}
mkdir -p psl/${CONTIG}
$HOME/bin/i386/psLayout ${TARGET} /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa genomic ${OOC} ${RESULT}
# remember the psLayout status; the cleanup below must not change it
RET=$?
rm -f /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa /tmp/${CONTIG}/error.convert
# remove the scratch directories innermost first; a directory that is
# not empty is left in place without an error
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}
exit ${RET}

# BUILD KNOWN GENES TABLES (DONE 6/8/04 Fan)

  Build sp040515 and proteins040515 DBs first.
  
  hgsql hg17 -e "create database kgHg17"
  
  cd /cluster/store6/kgDB/bed
  mkdir kgHg17
  cd /cluster/store6/kgDB/bed/kgHg17

  ~/src/hg/protein/KGprocess.sh kgHg17 hg17 040515
  
  The script was run successfully with the last message:

  	Tue Jun  8 15:36:52 PDT 2004 DONE 

  After initial inspection of tables in kgHg17, do the following
  from mySql prompt:

  alter table kgHg17.cgapAlias rename as hg17.cgapAlias;
  alter table kgHg17.cgapBiocDesc rename as hg17.cgapBiocDesc;
  alter table kgHg17.cgapBiocPathway rename as hg17.cgapBiocPathway;
  alter table kgHg17.dupSpMrna rename as hg17.dupSpMrna;
  alter table kgHg17.keggMapDesc rename as hg17.keggMapDesc;
  alter table kgHg17.keggPathway rename as hg17.keggPathway;
  alter table kgHg17.kgAlias rename as hg17.kgAlias;
  alter table kgHg17.kgProtAlias rename as hg17.kgProtAlias;
  alter table kgHg17.kgXref rename as hg17.kgXref;
  alter table kgHg17.knownGene rename as hg17.knownGene;
  alter table kgHg17.knownGeneLink rename as hg17.knownGeneLink;
  alter table kgHg17.knownGeneMrna rename as hg17.knownGeneMrna;
  alter table kgHg17.knownGenePep rename as hg17.knownGenePep;
  alter table kgHg17.mrnaRefseq rename as hg17.mrnaRefseq;
  alter table kgHg17.spMrna rename as hg17.spMrna;

  hg17.knownGene has 43,401 entries and hg16.knownGene has 43,232 entries.
  and running featureBits shows:
  
   	featureBits hg17 knownGene
   	63983072 bases of 2866216770 (2.232%) in intersection
   
   	featureBits hg16 knownGene
   	63781799 bases of 2865248791 (2.226%) in intersection
  
  Connect to genome-testdb and use hgcentraltest DB.
  Add a new entry in gdbPdb table:
 
        insert into gdbPdb values('hg17', 'proteins040515');


# CREATE LINEAGE-SPECIFIC REPEATS FOR BLASTZ WITH ZEBRAFISH
# (DONE, 2004-06-08, hartera)
    # Treat all repeats as lineage-specific
    mkdir /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
    foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
 cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
    end
    iSync
  

# PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-06-10 kate)

    # split into 3K chunks
    ssh eieio
    set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
    mkdir -p $liftDir
    cd $liftDir
cat > split.csh << 'EOF'
    # Split the fasta of every chrom that has a nib into 3000 base
    # pieces (one output file per chrom) and write the matching lift file.
    set splitDir = /iscratch/i/hg17/liftOver/split
    mkdir -p $splitDir
    set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
    foreach n (`ls /cluster/data/hg17/nib`)
        set c = $n:r
        # d is the directory under /cluster/data/hg17 that holds $c.fa;
        # it was never set before, which stops csh with an undefined
        # variable error.  Assumes chrN and chrN_random both live in
        # directory N (other names just lose the chr prefix) -- TODO
        # confirm against the directory layout
        set d = `echo $c | sed -e 's/^chr//' -e 's/_random//'`
        echo $c
        faSplit -lift=$liftDir/$c.lft size \
            /cluster/data/hg17/$d/$c.fa -oneFile 3000 $splitDir/$c
    end
'EOF'
# << for emacs
    csh split.csh >&! split.log &
    tail -100f split.log

    ssh kkr1u00
    iSync


# STS MARKERS (DONE 2004-07-21 kate)
# MANUAL UPDATE OF D21S168 and D21S167 (DONE, 2005-02-11, hartera)
# FILTERED OUT noOoc ALIGNMENTS WITH tBaseInsert >=1000 
# (DONE, 2005-02-17, hartera) AND RELOADED stsMap, stsInfo2 and all_sts_seq
# DATABASE TABLES AFTER ADDING FILTERED ALIGNMENTS TO all_sts_seq AND 
# REMOVING DATA FROM stsMap and stsInfo2 FOR THE MARKERS REMOVED FROM THE 
# FILTERED SET (DONE, 2005-02-18, hartera)
# UPDATE PSL ALIGNMENTS FOR D21S167 and D21S168 AND RELOAD INTO all_sts_seq
# (DONE, 2005-02-23, hartera)
# UPDATED stsAlias TABLE REMOVING OF FILTERED ALIGNMENTS (2005-02-24, hartera)

    # Terry's sts.9 dir is in /cluster/store5/sts.2004-07.old
    # remove this after verifying the newer version

   # update from NCBI (booch)
    ssh eieio
    # use store5 for space
    mkdir -p /cluster/store5/sts.2004-07
    ln -s /cluster/store5/sts.2004-07 /cluster/data/ncbi
    # the sts.9 alias has to live in /cluster/data/ncbi (later steps cd
    # to /cluster/data/ncbi/sts.9), so give the link location in full
    # instead of relying on the current directory
    ln -s /cluster/data/ncbi/sts.2004-07 /cluster/data/ncbi/sts.9
    cd /cluster/data/ncbi/sts.2004-07
    wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
    wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
    wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
    gunzip sts.gz
    mv sts dbSTS.fa

    # incremental update from previous build
    # NOTE: could mysql dump this, unless hand-updated (like hg16)
    # First - copy from Terry's dir
    ssh eieio
    ln -s /cluster/store1/sts.8 /cluster/data/ncbi
    cd /cluster/data/ncbi/sts.9

    # this time, snag from Terry's dir
    cd /cluster/data/ncbi/sts.9
    cp -p ~booch/tracks/update/all.STS.fa.prev .
    cp -p ~booch/tracks/update/stsInfo2.bed stsInfo2.bed.prev

    # Convert dbSTS.fa file to easier reading format, and get accessions
    /cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
    grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc

    # NOTE: updateStsInfo creates new stsInfo2.bed, all.primers, 
    #   all.STS.fa, stsAlias.bed files 
    updateStsInfo -verbose=1 -gb=dbSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
                    dbSTS.sts dbSTS.aliases dbSTS.convert.fa new
    # 129991  SWXD2599        99622   (0) not in dbSTS anymore
    # 166473  D3S3812 154523  (0) not in dbSTS anymore
    # 185776  RH83562 209614  (0) not in dbSTS anymore

    mv new.info stsInfo2.bed
    mv new.primers all.primers
    mv new.alias stsAlias.bed
    mv new.fa all.STS.fa

    # get list of all STS id's in the fasta file
    sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n >  all.STS.id
    wc -l all.STS.id
        # 92674 total sequences
    /cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa

    # Copy stsInfo2.bed and stsAlias.bed to data directory because
    # these will be loaded into the database later
    mkdir -p /cluster/data/hg17/bed/sts
    cp stsInfo2.bed /cluster/data/hg17/bed/sts/
    cp stsAlias.bed /cluster/data/hg17/bed/sts/

    # Create sts sequence alignments
    mkdir -p /cluster/bluearc/sts.9/sts.split
    faSplit sequence all.STS.fa 50 /cluster/bluearc/sts.9/sts.split/sts
    cp /cluster/data/ncbi/sts.9/all.STS.fa /cluster/bluearc/sts.9

    # create small ooc file to use with alignments (if not existing)
    # NOTE: these were just used for experimenting; weren't used in
    # final runs
    ssh kolossus
    cd /cluster/data/hg17/bed/sts
    ls /cluster/bluearc/hg17/bothMaskedNibs/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null \
        -tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.4096.ooc -repMatch=4096
    blat nib.lst /dev/null /dev/null \
        -tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.16384.ooc -repMatch=16384

    ssh kk
    cd /cluster/data/hg17/bed/sts
    mkdir run
    cd run
    ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
    ls -1S /cluster/bluearc/sts.9/sts.split/sts*.fa > sts.lst
    mkdir -p /cluster/bluearc/hg17/sts/sts/out
    foreach f (`cat sts.lst`)
        set d = $f:t:r
        mkdir /cluster/bluearc/hg17/sts/sts/out/$d
    end

    # create alignments
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/cluster/bluearc/hg/h/11.ooc -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs

    gensub2 contigs.lst sts.lst template jobList
    para create jobList
        # 17860 jobs
    para try
    para check
    para push
# CPU time in finished jobs:     216985s    3616.41m    60.27h    2.51d  0.007 y
# IO & Wait Time:                 48790s     813.17m    13.55h    0.56d  0.002 y
# Average job time:                  15s       0.25m     0.00h    0.00d
# Longest job:                      267s       4.45m     0.07h    0.00d
# Submission to last job:          2228s      37.13m     0.62h    0.03d

    # Compile sts sequence results
    ssh kolossus
    cd /cluster/bluearc/hg17/sts/sts
    pslSort dirs raw.psl temp out/*
    rm -rf temp
    pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
	stsMarkers.psl /dev/null
            # Processed 7121016 alignments
    #cp stsMarkers.psl /cluster/data/hg17/bed/sts/run

    # Lift them and get them ready to combine with primer alignments
    #cd  /cluster/data/hg17/bed/sts/run
    #liftUp -nohead /cluster/data/hg17/bed/sts/run/stsMarkers.lifted.psl \
    liftUp -nohead stsMarkers.lifted.psl \
        /cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.psl

    # missing some utilities for kolossus, so switch to fileserver
    # NOTE: probably no longer true -- try on kolossus next time
    ssh kksilo
    cd /cluster/bluearc/hg17/sts/sts
    /cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
        # creates <file>.initial
    /cluster/bin/scripts/findAccession -agp stsMarkers.lifted.psl.initial \
	/cluster/data/hg17
            # "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
            # Looks like it tries all _randoms (even ones that don't
            # exist/aren't needed)
            # creates <file>.acc
    #rm stsMarkers.lifted.psl.initial
    sort -k 4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
    #rm stsMarkers.lifted.psl.initial.acc
    #cp stsMarkers.final stsMarkers.lifted.psl.initial /cluster/data/hg17/bed/sts
    # determine found markers (4th field in file)
    cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
    wc -l stsMarkers.found
        #   89532 stsMarkers.found
        # out of 92674 total sequences

    # extract sequences for markers not yet found, and
    # blat w/o ooc to try to place more
    comm -1 -3  stsMarkers.found /cluster/data/ncbi/sts.9/all.STS.id \
                > stsMarkers.notFound
    wc -l stsMarkers.notFound
        # 3142 stsMarkers.notFound
    faSomeRecords /cluster/data/ncbi/sts.9/all.STS.fa stsMarkers.notFound \
                notFound.STS.fa
    mkdir /cluster/bluearc/sts.9/sts.splitNotFound
    faSplit sequence notFound.STS.fa 20 \
                /cluster/bluearc/sts.9/sts.splitNotFound/sts

    # blat with 11.ooc misses alignments, so reblat the sequences
    # that aren't found, this time w/o the ooc file
    # NOTE: filtering produces yield of only 149 markers placed (out of 3142).
    # not enough to justify this step next time
    ssh kk
    cd /cluster/data/hg17/bed/sts
    mkdir run.noOoc
    cd run.noOoc
    ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
    ls -1S /cluster/bluearc/sts.9/sts.splitNotFound/sts*.fa > sts.lst
    mkdir -p /cluster/bluearc/hg17/sts/sts/out.noOoc

    foreach f (`cat sts.lst`)
        set d = $f:t:r
        mkdir /cluster/bluearc/hg17/sts/sts/out.noOoc/$d
    end

cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out.noOoc/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs

    # expand the template over every contig/sts-split pair and run the batch
    gensub2 contigs.lst sts.lst template jobList
    para create jobList
        # 7220 jobs written to batch
    para try
    para check
    # 'para try' only runs a small sample of the jobs; push the rest of the
    # batch so that out.noOoc is complete before it is processed
    para push

    # process this set of alignments
    ssh kolossus
    cd /cluster/bluearc/hg17/sts/sts
    pslSort dirs raw.noOoc.psl temp out.noOoc/*

    rm -rf temp
    pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \
        raw.noOoc.psl stsMarkers.noOoc.psl /dev/null
            # Processed 4254094 alignments
    #cp stsMarkers.psl /cluster/data/hg17/bed/sts/run

    # Lift them and get them ready to combine with primer alignments
    liftUp -nohead stsMarkers.noOoc.lifted.psl \
        /cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.noOoc.psl

    /cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl
        # creates <file>.initial
    /cluster/bin/scripts/findAccession -agp \
        stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg17
            # "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
            # Looks like it tries all _randoms (even ones that don't
            # exist/aren't needed)
            # creates <file>.acc
    #rm stsMarkers.lifted.psl.initial
    mv stsMarkers.final stsMarkers.ooc.final
    sort -k 4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
    sort -k 4n stsMarkers.lifted.psl.initial.acc \
                stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final

    # determine found markers (4th field in file)
    cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
    wc -l stsMarkers.found
        #  89681 stsMarkers.found
    cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found
    wc -l stsMarkers.extra.found
        #   149 out of 3142 attempted
        # out of 92674 total sequences
    cp stsMarkers.final stsMarkers.lifted.psl stsMarkers.*lifted.psl.initial* stsMarkers.found \
                /cluster/data/hg17/bed/sts
    # Alignments from noOoc set were not added to all_sts_seq but info for the markers
    # is in stsMap and stsInfo2. Some of the alignments are bad so filter by removing
    # all alignments from noOoc psl file where tBaseInsert >=1000. Add the remaining 
    # alignments to the set of final alignments for stsMarkers. The information for the  
    # removed markers from the filtered set was also removed from stsMap and stsInfo2.
    # (DONE, 2005-02-17, hartera)
    ssh eieio
    cd /cluster/data/hg17/bed/sts/fix
    cp /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl .
    awk '{if ($8 < 1000) print;}' stsMarkers.noOoc.lifted.psl > stsMarkers.noOoc.lifted.filt1000.psl
    wc -l *.filt*.psl
    # 254    5334   26384 stsMarkers.noOoc.lifted.filt1000.psl
    sort -k 4n /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
    awk '{print $4;}' stsMarkers.extra | sort -n | uniq >  extra.ids
    # in psl file, the ids are the 10th field
    awk '{print $10;}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
        > noOoc.ids
    diff extra.ids noOoc.ids
    # there is no difference as expected
    # get list of IDs from filtered file, filter < 1000
    awk '{print $10;}' stsMarkers.noOoc.lifted.filt1000.psl \
        | sort -n | uniq > filt1000.ids
    foreach i (`cat filt1000.ids`)
     awk 'BEGIN {OFS="\t"} \
         {if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
         stsMarkers.extra >> stsMarkers.extra.filt1000
    end
    cp ../stsMarkers.final stsMarkers.final
  #  cat stsMarkers.extra.filt1000 >> stsMarkers.final2 
     # need to filter stsMarkers.final not just cat this on the end
    # get list of alignments with tBaseInsert >= 1000 and remove these
    cd /cluster/data/hg17/bed/sts/fix
    awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl > stsMarkers.noOoc.lifted.filtToRemove.psl
    wc -l *.filt*.psl
    # 254 stsMarkers.noOoc.lifted.filt1000.psl
    # 249 stsMarkers.noOoc.lifted.filt500.psl
    # 448 stsMarkers.noOoc.lifted.filtToRemove.psl
    # get list of IDs that need to be removed
    awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
        | uniq  > noOoc.IdsToRemove.txt
    # get chrom and co-ordinates for IDs to be removed 
    awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
        stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
        > sts.noOoc.filtToRemove.coords
    # checked that the stsMarkers.final contain the noOoc alignments
    # wrote perl script to remove lines with these IDs from stsMarkers.final
cat << '_EOF_' > removeIds.pl
#!/usr/bin/perl -w
# removeIds.pl - remove listed alignments from a marker position file.
#
# usage: removeIds.pl <coords file> <marker file> > <filtered marker file>
#   <coords file>: tab-separated chrom, start, end, id of each alignment
#                  to remove
#   <marker file>: tab-separated; first four fields are chrom, start, end, id
# A marker line is dropped only when its id, chrom, start and end all match
# an entry in <coords file>. Dropped entries are logged to removed.txt
# (chrom, start, end, id); all other lines are printed to stdout.
use strict;

my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";

# removal entries indexed by id, then by "chrom<TAB>start<TAB>end"
my %idsHash;

while (<IDS>) {
   chomp;
   my @a = split(/\t/);

   my $chr = $a[0];
   my $st = $a[1];
   my $end = $a[2];
   my $id = $a[3];
   $idsHash{$id}->{"$chr\t$st\t$end"} = [$chr, $st, $end];
}
close IDS;

while (<FILE>) {
   chomp;
   my $l = $_;
   my $found = "FALSE";
   my @f = split(/\t/, $l);
   # Look the id up exactly. Using the id as a regex prefix (/^$id/) would
   # also match longer ids, e.g. id 12 against the entry for id 123.
   if (defined($f[3]) && exists($idsHash{$f[3]})) {
      foreach my $coords (values(%{$idsHash{$f[3]}})) {
         my ($c, $s, $e) = @{$coords};
         if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
             print OUT "$c\t$s\t$e\t$f[3]\n";
             $found = "TRUE";
         }
      }
   }
   if ($found eq "FALSE") {
      print "$l\n";
   }
}
close FILE;
close OUT;
'_EOF_'
    chmod +x removeIds.pl
    perl removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
         > stsMarkers.final.new
    wc -l stsMarkers.final*
    # 92338 stsMarkers.final
    # 91890 stsMarkers.final.new
    # There are 448 ids and sets of co-ordinates in list of Ids to remove
    # check that stsMarkers.final.new contains all the alignments that
    # are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
    awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
        stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
        > sts.noOoc.filt1000.coords
    awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
        stsMarkers.final.new | sort | uniq \
        > sts.finalnew.coords
    diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
    grep '>' finalnewvsfilt1000
    # there is nothing in sts.noOoc.filt1000.coords not found in the 
    # sts.finalnew.coords file therefore this contains all the alignments
    # from the filtered noOoc file.
    cp ../primers/primers.final .
    awk '{print $4}' primers.final | sort | uniq > primers.ids
    awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids


    # primers
    ssh eieio
    cd /cluster/data/ncbi/sts.9
    # strip out N's and wobbles (KS) from primers, as isPcr
    # can't currently handle them
    # strip out primers of length 10 or less as isPcr can't handle them
    awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
                all.primers > all.primers.ispcr
    mkdir -p /cluster/bluearc/sts.9/primers
    cd /cluster/bluearc/sts.9/primers
    split -l 2000 /cluster/data/ncbi/sts.9/all.primers.ispcr primers_

    ssh kk
    cd /cluster/data/hg17/bed/sts
    mkdir primers
    cd primers
    mkdir run
    cd run
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
    ls -1S /cluster/bluearc/sts.9/primers/primers_* > primers.lst
    mkdir -p /cluster/bluearc/hg17/sts/primers/out

cat > template << 'EOF'
#LOOP
/cluster/home/kate/bin/i386/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/scratch/hg/h/10.ooc  -stepSize=5 $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs

    gensub2 contigs.lst primers.lst template jobList
    para create jobList
	# 26980 jobs
    para try
    para check
    para push

    #Completed: 26953 of 26980 jobs
    #Crashed: 27 jobs
    #CPU time in finished jobs:    1130353s   18839.22m   313.99h   13.08d  0.036 y
    #IO & Wait Time:                 86067s    1434.44m    23.91h    1.00d  0.003 y
    #Average job time:                  45s       0.75m     0.01h    0.00d
    #Longest job:                     1255s      20.92m     0.35h    0.01d
    #Submission to last job:          2762s      46.03m     0.77h    0.03d

    # 27 jobs seg faulted due to -minPerfect=2.
    # Looks like a bug in isPcr -- till it's fixed,
    # we'll rerun with -minPerfect=5. Terry determined they
    # all complete with this (he used 3, 4, or 5, tuned individually
    # for each job, but just using 5 should be adequate and
    # less labor-intensive).
    # NOTE: isPcr bug is fixed -- this shouldn't be necessary for
    # next run

    para crashed | grep isPcr | sed 's/minPerfect=2/minPerfect=5/' \
        > jobList.minPerfect5
    para create jobList.minPerfect5
        # 28 jobs
    # repeat with increasing minPerfect, till all complete successfully

    # Filter output file quickly based on simple parameters
    ssh kolossus
    cd /cluster/bluearc/hg17/sts/primers/
    mkdir -p filter
    pslQuickFilter -minMatch=26 -maxMismatch=5 -maxTinsert=5000 -verbose out/ filter/
	# Note: there will be many messages saying files are empty - this is OK
    pslSort dirs primers.psl.unlifted temp filter

    # filter primer alignments and create not found primer file for ePCR run (booch)
    pslFilterPrimers /cluster/bluearc/hg17/sts/primers/primers.psl.unlifted  \
	/cluster/data/ncbi/sts.9/all.primers primers.filter.unlifted.psl
        # creates $3.notfound.primers
    wc -l primers.filter.unlifted.psl.notfound.primers                   
    # 21919  primers.filter.unlifted.psl.notfound.primers

    # use Greg Schuler's ePCR to attempt alignment of primers missed
    # by isPcr
    mkdir -p /cluster/data/hg17/bed/sts/primers/run.epcr
    mkdir -p /cluster/bluearc/hg17/sts/primers/epcr
    cd /cluster/bluearc/hg17/sts/primers/epcr
    split -l 2500 /cluster/data/hg17/bed/sts/primers/primers.filter.unlifted.psl.notfound.primers  primers_
    cd /cluster/data/hg17/bed/sts/primers/run.epcr
    ls -1S /cluster/bluearc/hg17/sts/primers/epcr/primers_* > primers.lst
    # create contig.lst based on split in build dir
    # NOTE: should probably replace this with something more standard
    # and faster.  Also, this appears to cause load spikes on fileservers.
    # Should get contigs from bluearc, iservers, or cluster local disk
    # At least it's over pretty quick!
    ssh eieio
    cd /cluster/data/hg17/bed/sts/primers/run.epcr
    /cluster/bin/scripts/splitContigList -ncbi /cluster/data/hg17 1
    # next time... ls -1S /cluster/bluearc/hg17/contigs/* > contig.lst (?)
    mkdir -p /cluster/bluearc/hg17/sts/primers/epcr/out

    ssh kk
    cd /cluster/data/hg17/bed/sts/primers/run.epcr

cat > template << 'EOF'
#LOOP
/cluster/bin/scripts/runEpcr $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/epcr/out/$(root1).$(root2).epcr}
#ENDLOOP
'EOF'
# << for emacs
    gensub2 primers.lst contig.lst template jobList
    para create jobList
	# 3420 jobs
    para try
    para check
    para push

# CPU time in finished jobs:      78897s    1314.95m    21.92h    0.91d  0.003 y
# IO & Wait Time:                254582s    4243.03m    70.72h    2.95d  0.008 y
# Average job time:                  98s       1.63m     0.03h    0.00d
# Longest job:                      647s      10.78m     0.18h    0.01d
# Submission to last job:          1112s      18.53m     0.31h    0.01d

    # merge output
    ssh eieio
    cd /cluster/bluearc/hg17/sts/primers/epcr
    cat out/*.epcr > all.epcr
    wc -l all.epcr
    # 3573 

    # use all.epcr file to re-filter alignments and determine which
    # ePCR records to keep
    cp all.epcr /cluster/data/hg17/bed/sts/primers
    cd /cluster/data/hg17/bed/sts/primers
    pslFilterPrimers -epcr=all.epcr -verbose=1 \
        /cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
	/cluster/data/ncbi/sts.9/all.primers primers.unlifted.epcr.psl

    # convert to PSL and combine with other psl file (this takes a couple hours)
    /cluster/bin/scripts/epcrToHgPsl epcr.not.found \
        /cluster/data/ncbi/sts.9/all.primers /cluster/data/hg17
    cat primers.unlifted.epcr.psl epcr.not.found.psl \
                | sort -k 10n > primers.final.unlifted.psl

    # Fix the query gap lengths so that they match the all.primers.fa 
    #   file lengths
    /cluster/bin/scripts/fixPrimersQueryGaps \
        /cluster/data/ncbi/sts.9/all.primers primers.final.unlifted.psl \
                > primers.final.unlifted.fix.psl

    # lift results from contigs to chrom coordinates, and create final file
    liftUp -nohead /cluster/data/hg17/bed/sts/primers/primers.psl \
            /cluster/data/hg17/jkStuff/liftAll.lft warn \
            primers.final.unlifted.fix.psl
    # Extract relevant info, make alignments unique, and create final file to be merged
    # with full sequence alignments
    /cluster/bin/scripts/extractPslInfo primers.psl
    /cluster/bin/scripts/findAccession -agp primers.psl.initial \
                /cluster/data/hg17
    #rm primers.psl.initial

    /cluster/bin/scripts/getStsId /cluster/data/ncbi/sts.9/stsInfo2.bed \
	primers.psl.initial.acc \
        | sort -k 4n > primers.final
    #rm primers.psl.initial.acc
    wc -l primers.final
    # 314713 primers.final

    # Merge primer and sequence files to create final bed file
    # Merge (combineSeqPrimerPos) takes about an hour to run
    ssh kolossus
    cd /cluster/data/hg17/bed/sts
    
    /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final primers/primers.final
        # creates *_pos.rdb
    /cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
                stsMarkers_pos.rdb > stsMap.bed

    # Set up sequence files
    ssh hgwdev
    mkdir -p /gbdb/hg17/sts.9/
    ln -s /cluster/data/ncbi/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.STS.fa
    ln -s /cluster/data/ncbi/sts.9/all.primers.fa \
        /gbdb/hg17/sts.9/all.primers.fa

    # Load all files
    cd /cluster/data/hg17/bed/sts
    hgLoadSeq hg17 /gbdb/hg17/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.primers.fa
    hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
    hgsql hg17 < ~kent/src/hg/lib/stsAlias.sql
    cp /cluster/data/ncbi/sts.9/{stsInfo2.bed,stsAlias.bed} .
    hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
    hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'
    hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
	hg17 stsMap stsMap.bed
    hgLoadPsl -nobin -table=all_sts_primer hg17 primers/primers.psl
    hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl

    # update of information for D21S167 and D21S168 (2005-02-11, hartera)
    # currently X52289 associated with D21S168 
    # and X53367 associated with D21S167 - these need to be switched as they
    # are causing incorrect positioning
    # On Terry's advice, 
    # first manually update the accession field stsInfo2.bed so that the 
    # corrected version is carried through to the next version
    cd /cluster/data/hg17/bed/sts
    # manually change accessions in this file so now X52289 is associated
    # with D21S167 and X53367 is now associated with D21S168 
    # manually update the chromStart and chromEnd fields for these
    # records in stsMap.bed
    # this change was not carried through after filtering, so change stsMap.bed
    # again and reload this table (DONE, 2005-02-18, hartera)
    # chr21   39867340        39867513        D21S167 1000    7888    AF064860
    # becomes
    # chr21   37117635        37117858        D21S167 1000    7888    AF064860

    # chr21   37117635        37117858        D21S168 1000    103256  AP000699
    # becomes
    # chr21   39867340        39867513        D21S168 1000    103256  AP000699
    
    # then reload the stsMap.bed and stsInfo2.bed files
    # copy this updated bed file back to ncbi directory
    cp stsInfo2.bed /cluster/data/ncbi/sts.9/
    # delete previous data before reloading tables
    hgsql hg17 -e 'delete from stsInfo2'
    hgsql hg17 -e 'drop table stsMap'
    hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
    hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
	hg17 stsMap stsMap.bed
    # (2005-02-19, hartera)
    # also need to update the psl alignment file and reload into all_sts_seq
    # for D21S168, the id is 103256, this is qName in the psl file
    # for D21S167, the id is 7888
    cd /cluster/data/hg17/bed/sts
    # manually update the stsMarkers.lifted.psl file with the new
    # co-ordinates as above.
    # (2005-02-23) Correct alignments.
    # need to swap the names for the alignments not just the start and end
    # coords as before as now the rest of the alignment data fields in the 
    # table are incorrect. Change the start and end co-ordinates and just swap 
    # the names for D21S167 and D21S168 in the psl file then reload the table. 
    # sort on the ID field (qName)
    sort -k 10n stsMarkers.lifted.psl > sts.lifted.sort
    mv sts.lifted.sort stsMarkers.lifted.psl 
    hgsql hg17 -e 'drop table all_sts_seq'
    hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
    
    # Add new information after filtering the noOoc files 
    # (DONE, 2005-02-17, hartera)
    # latest psl file: stsMarkers.lifted.new.psl is in fix dir
    # Merge primer and sequence files to create final bed file
    ssh kolossus
    cd /cluster/data/hg17/bed/sts/fix
    nice /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final.new \
                                             ../primers/primers.final
    # creates *_pos.rdb
    /cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
                stsMarkers_pos.rdb > stsMap.bed
    awk '{print $6;}' stsMap.bed | sort -n | uniq > stsMap.ids
    diff stsMap.ids filt1000.ids 
    # There is only 1 id that does not make it into this set (109375)
    # There are 38 of the IDs to remove that do not appear in stsMap.ids
    # there are 65 therefore that appear in stsMap.bed: noOoctoremoveinStsMap
    # for each ID listed in noOoctoremoveinStsMap, collect chrom, start, end
    # and ID of its alignments with tBaseInsert >= 1000 from the to-remove psl
    foreach i (`cat noOoctoremoveinStsMap`)
       awk 'BEGIN {OFS = "\t"} {if ($10 == "'$i'" && $8 >= 1000) \
        print $14, $16, $17, $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl \
        >> stsMap.noOoc.toRemove.coords
    end
    sort stsMap.noOoc.toRemove.coords > stsMap.noOoc.toRemove.coords.sort
    wc -l stsMap.noOoc.toRemove.coords.sort
    # 122
    # get the equivalent co-ordinates from stsMap.bed
    foreach i (`cat noOoctoremoveinStsMap`)
       awk 'BEGIN {OFS = "\t"} {if ($6 == "'$i'") print $1,$2,$3,$6;}' \
           stsMap.bed >> stsMap.toRemove.coords
    end
    sort stsMap.toRemove.coords > stsMap.toRemove.coords.sort
    wc -l stsMap.toRemove.coords.sort
    # 68
    # compare the two sorted lists (diff needs both inputs in the same order)
    diff stsMap.noOoc.toRemove.coords.sort stsMap.toRemove.coords.sort
    # They are different co-ordinates in each set although the same ID
    # is represented.
    # get the IDs in stsMarkers.lifted.psl to check for noOoc alignments
    cp ../stsMarkers.lifted.psl stsMarkers.lifted.psl
    awk '{print $10}' stsMarkers.lifted.psl | sort -n | uniq > sts.liftedpsl.ids
    # none of the noOoc alignments are in stsMarkers.lifted.psl so add
    # the filtered version
    cp stsMarkers.lifted.psl stsMarkers.lifted.new.psl
    cat stsMarkers.noOoc.lifted.filt1000.psl >> stsMarkers.lifted.new.psl
    wc -l stsMarkers.lifted.new.psl
    # 91890 
    awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo2.ids
    # diff with filt1000.ids and noOoc.IdsToRemove.txt
    # all of these are in stsInfo2.bed
    # need to remove info for the filtered out set but only for the 38 that
    # were removed from stsMap.bed - noOocnotinstsMap 
    # removeById.pl has to exist before it can be run, so create it first
cat << '_EOF_' > removeById.pl
#!/usr/bin/perl -w
# removeById.pl - drop records whose first field is a listed ID.
#
# usage: removeById.pl <ids file> <tab-separated file> > <filtered output>
# IDs are taken from the first tab-separated field of <ids file>. Lines of
# the second file whose first field is one of those IDs are logged (ID only)
# to removedIds.txt; all other lines are printed unchanged to stdout.
use strict;

my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file from which records with those IDs are removed
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removedIds.txt") || die "Can not create removedIds.txt: $!\n";

my %idsHash;
while (<IDS>) {
   chomp;
   my @a = split(/\t/);
   my $id = $a[0];
   $idsHash{$id} = 1;
}
close IDS;

while (<FILE>) {
   my $l = $_;
   my @f = split(/\t/, $l);
   # direct hash lookup rather than comparing against every key per line
   if (exists($idsHash{$f[0]})) {
      print OUT "$f[0]\n";
   }
   else {
      print $l;
   }
}
close FILE;
close OUT;
'_EOF_'
    # << emacs
    chmod +x removeById.pl
    perl removeById.pl noOocnotinstsMap stsInfo2.bed > stsInfo2.new.bed
    # this removed data for all 38 of these Ids from stsInfo2.bed
    
    # need to reload database tables (2005-02-18, hartera)
    ssh hgwdev
    cd /cluster/data/hg17/bed/sts/fix
    hgsql hg17 -e 'drop table stsMap'
    hgsql hg17 -e 'drop table all_sts_seq'
    hgsql hg17 -e 'drop table stsInfo2'
    mv stsInfo2.new.bed stsInfo2.bed
    cp stsInfo2.bed /cluster/data/ncbi/sts.9/stsInfo2.bed
    mv stsMap.new.bed stsMap.bed
    mv stsMarkers.lifted.new.psl stsMarkers.lifted.psl
    hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
	hg17 stsMap stsMap.bed
    hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
    hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
    hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
    cd ..
    mkdir old
    mv stsMap.bed stsInfo2.bed stsMarkers.lifted.psl ./old
    mv ./fix/stsMap.bed ./fix/stsInfo2.bed ./fix/stsMarkers.lifted.psl .
   
    # Update of stsAlias table (DONE, 2005-02-24, hartera)
    # stsAlias filtered IDs removed
    # should have same IDs as in stsInfo2
    ssh eieio
    cd /cluster/data/hg17/bed/sts/fix
    awk '{print $2;}' ../stsAlias.bed | sort -n | uniq > alias.ids
    # 145985 alias.ids
    awk '{print $6;}' ../stsMap.bed | sort -n | uniq > stsMap.new.ids.sort
    awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo.new.ids.sort
    # 16678 ids in stsInfo2 that are not in stsMap
    # 16717 ids in stsAlias that are not in stsMap
    # 38 ids in stsAlias that are not in stsInfo2
    cat stsMap.new.ids.sort stsInfo.new.ids.sort | sort -n | uniq \
        > stsMapandInfo.ids.sort
    diff stsMapandInfo.ids.sort alias.ids | grep '>' > idstoremoveAlias
    # there are 38 of these IDs to remove
    perl -pi.bak -e 's/> //' idstoremoveAlias
    cp ../stsAlias.bed .
    foreach i (`cat idstoremoveAlias`)
      awk '{if ($2 != "'$i'") print;}' stsAlias.bed > stsAlias.tmp
      mv stsAlias.tmp stsAlias.bed
    end
    # check that ids are removed from file and that they are the correct ones
    # all looks good
    cd /cluster/data/hg17/bed/sts
    # save old stsAlias file and copy new one to sts dir and to ncbi sts dir
    mv stsAlias.bed ./old
    cp ./fix/stsAlias.bed .
    cp stsAlias.bed /cluster/data/ncbi/sts.9/stsAlias.bed
    ssh hgwdev
    # the load below names stsAlias.bed by relative path, so move to the
    # directory that holds the new file after logging in
    cd /cluster/data/hg17/bed/sts
    # remove old table data and reload
    hgsql hg17 -e 'delete from stsAlias'
    hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'

# PRUNE stsMap RECORDS (DONE 3/3/06)

  hgsql hg17 -e 'delete from stsMap where chromEnd-chromStart > 5000'
  
# RECOMBINATION RATES (2004-07-13 Terry)
#                       (2004-07-21 kate)

# The STS Markers track must be completed prior to creating this track

    ssh eieio
    cd /cluster/data/hg17/bed
    mv recombRate recombRate.terry
    mkdir -p recombRate
	cd recombRate

# Copy other necessary files here (in future, can take from previous version)
# NOTE: these are stable, and could be saved in a permanent spot
    cp /projects/hg2/booch/psl/info/decode_all .
    cp /projects/hg2/booch/psl/info/marshfield_all .
    cp /projects/hg2/booch/psl/info/genethon_all .
	
# Determine maximum concordant set of markers for each of the maps
    /cluster/bin/scripts/assignGPsts -full -maxcon \
        /cluster/data/ncbi/sts.9/stsAlias.bed \
        /cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
        decode_all > decode.marker.rdb
    /cluster/bin/scripts/assignGPsts -full -maxcon \
        /cluster/data/ncbi/sts.9/stsAlias.bed \
        /cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
        marshfield_all > marshfield.marker.rdb
    /cluster/bin/scripts/assignGPsts -full -maxcon \
        /cluster/data/ncbi/sts.9/stsAlias.bed \
        /cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
        genethon_all > genethon.marker.rdb

# Determine the rates for each of the maps
    /cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
            /cluster/data/hg17/chrom.sizes 1000000 1000000 \
                > decode_1mb_slide_1mb
    /cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
            /cluster/data/hg17/chrom.sizes 1000000 1000000 \
                > genethon_1mb_slide_1mb
        # Marker number 2 at position 120005974 on chr9 is out of genetic distance order. DISCARDING
    /cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
            /cluster/data/hg17/chrom.sizes 1000000 1000000 \
                > marshfield_1mb_slide_1mb
        # Marker number 1 at position 124276104 on chr9 is out of genetic distance order. DISCARDING

# Convert files to proper format
    /cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
        /cluster/data/hg17/inserts \
        /cluster/data/hg17 1000 > decode_1mb_slide_1mb_conv
    /cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
        /cluster/data/hg17/inserts \
         /cluster/data/hg17 1000 > marshfield_1mb_slide_1mb_conv
    /cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
        /cluster/data/hg17/inserts \
	    /cluster/data/hg17 1000 > genethon_1mb_slide_1mb_conv

# Create bed file and load
    /cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
        marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
                > recombRate.bed
    hgLoadBed -noBin -tab \
        -sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
	    hg17 recombRate recombRate.bed
	   

# FISH CLONES (DONE 2004-07-22 Kate)
#  Reloaded 2004-09-36 after Terry Furey reworked fishClones.c
#       to improve scoring

# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to 
# creating this track

    ssh eieio
    mkdir -p /cluster/data/ncbi/fishClones/fishClones.2004-07/
    cd /cluster/data/ncbi/fishClones/fishClones.2004-07/

# Download information from NCBI
        # point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
        # change "Show details on sequence-tag" to "yes"
        # change "Download or Display" to "Download table for UNIX"
        # press Submit - save as /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
    chmod 664 /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt

# Get current clone/accession information
    wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out

# Create initial Fish Clones bed file
    mkdir -p /cluster/data/hg17/bed/fishClones
    cd /cluster/data/hg17/bed/fishClones

# Copy previous sts info from fhcrc (take from previous build in future)
    cp ~booch/tracks/fish/fhcrc.sts .
    fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin hg17 \
         /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
         /cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
         /cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
         /cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl \
            fishClones_initial
	     
# Get sequences for accessions not in genome
    ssh eieio
    mkdir -p /cluster/bluearc/hg17/fishClones/
    cd /cluster/bluearc/hg17/fishClones/
	# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
	# select file "/cluster/data/hg17/bed/fishClones/fishClones_initial.acc"
	# change output to FASTA format
	# download results to "/cluster/bluearc/hg17/fishClones/notFound.fa"

# Align these using blat
    cp ~booch/tracks/gs.17/build34/fish/convert.pl .
    cp ~booch/tracks/gs.17/build34/fish/blatAll.pl .
        # edited to use ooc file on bluearc, so can run on kolossus
    convert.pl  < notFound.fa > notFound.convert.fa
    mkdir out
    blatAll.pl /cluster/data/hg17 notFound.convert.fa out
        # creates raw.psl, not.found.psl

# Make final fishClones file with this new clone placement info
    cd /cluster/data/hg17/bed/fishClones
    fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin \
         -psl=/cluster/bluearc/hg17/fishClones/not.found.psl hg17 \
         /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
         /cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
         /cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
         /cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl fishClones

# Load the track
    ssh hgwdev
    cd /cluster/data/hg17/bed/fishClones
    hgLoadBed -noBin -tab \
        -sqlTable=/cluster/home/kent/src/hg/lib/fishClones.sql \
	hg17 fishClones fishClones.bed
            # Loaded 10601 elements of size 16

# fixed bad table entry (2004-08-12 kate)
# NOTE: this won't be necessary in the future, as the fishClones program
# will now accommodate more bad input data.
    hgsql hg17 -e "update fishClones set bandEnds='1q43,Yp' where  name='RP11-188A4' and placeCount=2"


# CHROMOSOME BANDS TRACK (2004-07-13 Terry)

# This must wait until the Fish Clones track is done
    mkdir -p /cluster/data/hg17/bed/cytoband
    cd /cluster/data/hg17/bed/cytoband

# Copy in some necessary files (usually from previous version)
    cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
    cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .

# Create some preliminary information files
    /cluster/bin/scripts/createSetBands pctSetBands.txt \
	/cluster/data/hg17/inserts /cluster/data/hg17  100 > setBands.txt
    /cluster/bin/scripts/makeBands ISCN800.txt /cluster/data/hg17 > cytobands.pct.bed
    /cluster/bin/scripts/makeBandRanges cytobands.pct.bed > cytobands.pct.ranges

# Reformat fishClones file
    /cluster/bin/scripts/createBanderMarkers \
	/cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt

# Create bed file
    /cluster/bin/scripts/runBander fishClones.txt \
	ISCN800.txt setBands.txt /cluster/data/hg17

    # Should be 862 bands
    wc cytobands.bed
    # 862    4310   30748 cytobands.bed

# Load track
    hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
	hg17 cytoBand cytobands.bed
	    
# Load ideogram table
    hgLoadBed -noBin -tab -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
	hg17 cytoBandIdeo cytobands.bed


# CHROMOSOME BANDS TRACK REDO (2004-07-22 Kate)
#       Just to make sure we know the proper steps.
#       The tables were not reloaded, as Terry has already
#       sent the data to NCBI

# This must wait until the Fish Clones track is done
    ssh kolossus
    mkdir -p /cluster/data/hg17/bed/cytoband.kate
    cd /cluster/data/hg17/bed/cytoband.kate

# Copy in some necessary files (usually from previous version)
    cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
    cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .

# Create some preliminary information files
    /cluster/bin/scripts/createSetBands pctSetBands.txt \
	/cluster/data/hg17/inserts /cluster/data/hg17  100 > setBands.txt
    /cluster/bin/scripts/makeBands ISCN800.txt \
        /cluster/data/hg17 > cytobands.pct.bed
    /cluster/bin/scripts/makeBandRanges cytobands.pct.bed \
        > cytobands.pct.ranges

# Reformat fishClones file
    /cluster/bin/scripts/createBanderMarkers \
	/cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt

# Create bed file
    ssh eieio
    cd /cluster/data/hg17/bed/cytoband.kate
    /cluster/bin/scripts/runBander fishClones.txt \
	ISCN800.txt setBands.txt /cluster/data/hg17
    # NOTE: fails on kolossus (C++ compiler different ??)

    # Should be 862 bands
    wc  -l cytobands.bed
    # 862    cytobands.bed

# NOTE - don't load tracks, as Terry has already sent his
#       versions to NCBI

# Load track
    #hgLoadBed -noBin -tab \
        # -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
	# hg17 cytoBand cytobands.bed
	    
# Load ideogram table
    #hgLoadBed -noBin -tab \
        # -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
	# hg17 cytoBandIdeo cytobands.bed


# LOAD AFFYRATIO (DONE - 2004-07-14 - Hiram)
#	Copied from Hg16 doc
    # Set up cluster job to align consensus/exemplars to hg17
    ssh eieio
    mkdir /cluster/bluearc/hg17/affyGnf
    cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /cluster/bluearc/hg17/affyGnf

    ssh kkr1u00
    mkdir -p /iscratch/i/affyGnf
    cp -p /cluster/bluearc/hg17/affyGnf/* /iscratch/i/affyGnf
    /cluster/bin/iSync

    ssh kki
    mkdir /cluster/data/hg17/bed/affyGnf.2004-06-09
    cd /cluster/data/hg17/bed/affyGnf.2004-06-09
    ls -1 /iscratch/i/affyGnf/* > affy.lst
    ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
    cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Expand template.sub over every (contig, probe file) pair into jobList,
    # create the psl/ directory the template writes into, then set up the
    # parasol batch.
    gensub2 allctg.lst affy.lst template.sub jobList
    mkdir psl
    para create jobList
    # 'para create' only sets up the batch; the jobs still have to be
    # test-run and pushed (same sequence as the other cluster runs in
    # this doc) before the "Completed" statistics below can be obtained.
    para try
    para check
    para push
# Completed: 380 of 380 jobs
# CPU time in finished jobs:       2922s      48.70m     0.81h    0.03d  0.000 y
# IO & Wait Time:                  1146s      19.10m     0.32h    0.01d  0.000 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest job:                       80s       1.33m     0.02h    0.00d
# Submission to last job:           333s       5.55m     0.09h    0.00d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyU95.psl
    ssh eieio
    cd /cluster/data/hg17/bed/affyGnf.2004-06-09
    pslSort dirs raw.psl tmp psl

    # change filter parameters for these sequences. only use alignments that
    # cover 30% of sequence and have at least 95% identity in aligned
    # region. 
    # minAli = 0.97 too high. low minCover as a lot of n's in these
    # sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
	raw.psl contig.psl /dev/null
    liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
    #   Eliminate the long names
    sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl

    # Merge with spot data and load into database. added -chip flag to 
    # affyPslAndAtlasToBed to allow correct parsing
    ssh hgwdev
    cd /cluster/data/hg17/bed/affyGnf.2004-06-09
    
    # Combine the short-name psl with the GNF atlas table, writing
    # affyRatio.bed and affyRatio.exr; stdout and stderr are captured in
    # affyPslAndAtlasToBed.log.
    # NOTE(review): the atlas path below starts with /projects/compbiodata/
    # while the other microarray inputs in this doc live under
    # /projects/compbio/data/ -- confirm this is a real path and not a
    # dropped '/' before re-running.
    /cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \
	affyU95shortQname.psl \
	/projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \
	affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1

    hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg17 \
	affyRatio affyRatio.bed
    #	Loaded 12740 elements of size 15

    mkdir affyU95
    hgLoadPsl hg17 -table=affyU95 affyU95shortQname.psl
    # sequences loaded 2004-08-06
    hgLoadSeq -abbr=U95Av2: hg17 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
    #	Advisory lock created
    #	Creating .tab file
    #	Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
    #	12386 sequences
    #	Updating seq table
    #	Advisory lock has been released
    #	All done

# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set.		(DONE - 2004-07-14 - Hiram)
    ssh kk
    mkdir /cluster/data/hg17/bed/affyUclaNorm
    cd /cluster/data/hg17/bed/affyUclaNorm

    cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
    ls -1 /scratch/hg/gs.18/build35/maskedContigs/* > contig.lst

    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

    mkdir psl
    ls HG-U133AB_all.fa > affy.lst
    gensub2 contig.lst affy.lst gsub jobList
    para create jobList
    para try
    para check
    para push ... etc
# Completed: 380 of 380 jobs
# CPU time in finished jobs:      20070s     334.51m     5.58h    0.23d  0.001 y
# IO & Wait Time:                162784s    2713.06m    45.22h    1.88d  0.005 y
# Average job time:                 481s       8.02m     0.13h    0.01d
# Longest job:                      735s      12.25m     0.20h    0.01d
# Submission to last job:           771s      12.85m     0.21h    0.01d

    ssh eieio
    cd /cluster/data/hg17/bed/affyUclaNorm
    pslSort dirs hg17.affyU133AB_all.psl tmp psl
    wc hg17.affyU133AB_all.psl
    #	61022 1281401 12934919 hg17.affyU133AB_all.psl
    liftUp hg17.affyU133AB_all.lifted.psl \
	/cluster/data/hg17/jkStuff/liftAll.lft warn hg17.affyU133AB_all.psl 
    pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \
	-nearTop=0.005  hg17.affyU133AB_all.lifted.psl \
	hg17.affyU133AB_all.lifted.pslReps.psl out.psr
    #	Processed 61017 alignments
    affyUclaMergePslData -pslFile=hg17.affyU133AB_all.lifted.pslReps.psl \
	-affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \
	-bedOut=hg17.affyUcla.bed \
	-expRecordOut=hg17.affyUcla.expRecords \
	-expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt

    ~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg17.affyUcla.expRecords \
	/projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg17.affyUcla.annotations.expRecords

    # Load the databases
    ssh hgwdev
    cd /cluster/data/hg17/bed/affyUclaNorm
    sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql \
	> affyUclaNorm.sql
    hgLoadBed hg17 affyUclaNorm hg17.affyUcla.bed -sqlTable=affyUclaNorm.sql

# MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2004-07-15 - Hiram)
    #	Someday the names can be fixed.
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/affyU133
    cd /cluster/data/hg17/bed/affyU133
    ln -s ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl affyU133.psl
    hgLoadPsl hg17 affyU133.psl
    #	hgsql -e "select count(*) from affyU133;" hg17
    #	row count in hg16: 45693, in hg17: 44620
    hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
    #	44792 sequences

# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & FUGU (DONE 2004-06-10 kate)
    # In an email 2/13/04 to Angie, Arian said we could treat all 
    # human repeats as 
    # lineage-specific for human-chicken blastz.  
    # and Angie did the same for fugu.
    # Lacking input from Arian, and using blastzSelf as a model,
    # I'm also using all human repeats for the human/chimp blastz.
    # Scripts expect *.out.spec filenames.
    ssh kkr1u00
    cd /cluster/data/hg17
    mkdir /iscratch/i/hg17/linSpecRep.chicken
    foreach f (/iscratch/i/hg17/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/hg17/linSpecRep.chicken/$f:t:r:r.out.spec
    end
    ln -s /iscratch/i/hg17/linSpecRep.chicken \
          /iscratch/i/hg17/linSpecRep.fugu
    ln -s /iscratch/i/hg17/linSpecRep.chicken \
          /iscratch/i/hg17/linSpecRep.chimp
    iSync


# BLASTZ FUGU (FR1) (DONE 2004-06-24 kate)
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.fr1.2004-06-10
    ln -s /cluster/data/hg17/bed/blastz.fr1.2004-06-10 \
            /cluster/data/hg17/bed/blastz.fr1
    cd /cluster/data/hg17/bed/blastz.fr1
    # Set L=6000 (more relaxed than chicken) and abridge repeats.
    # Treat all repeats as lineage-specific (reuse linSpecRep.chicken).
    cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.fugu
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.fr1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    bash # if a csh/tcsh user
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    # GOT HERE
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
        # 11935 jobs
    para try
    para check
    para push
# Completed: 11935 of 11935 jobs
# CPU time in finished jobs:    4673316s   77888.60m  1298.14h   54.09d  0.148 y
# IO & Wait Time:                329249s    5487.48m    91.46h    3.81d  0.010 y
# Average job time:                 419s       6.99m     0.12h    0.00d
# Longest job:                      714s      11.90m     0.20h    0.01d
# Submission to last job:          5575s      92.92m     1.55h    0.06d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/hg17/bed/blastz.fr1
    bash # if a csh/tcsh user
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
        # 341 jobs
    para try
    para check 
    para push
# CPU time in finished jobs:        315s       5.26m     0.09h    0.00d  0.000 y
# IO & Wait Time:                  4451s      74.18m     1.24h    0.05d  0.000 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest job:                      107s       1.78m     0.03h    0.00d
# Submission to last job:           368s       6.13m     0.10h    0.00d

    # third run: lav -> axt
    # Builds do.csh, which is run once per ../lav/chr* directory.
    ssh kki
    cd /cluster/data/hg17/bed/blastz.fr1
    mkdir axtChrom pslChrom run.2
    cd run.2
    # do.csh takes one argument, a lav directory.  It changes into that
    # directory, takes the last path component as the chromosome name,
    # concatenates the *.lav files in 'sort -g' order, and pipes them
    # through lavToAxt (human and fugu nib dirs) and axtSort to produce
    # ../../axtChrom/$chr.axt.  axtToPsl then converts that axt to
    # ../../pslChrom/$chr.psl using the S1.len and S2.len size files.
    # The terminator is written 'EOF' with quotes because the here-doc
    # word was quoted (csh/tcsh convention used throughout this doc).
    cat << 'EOF' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
        /iscratch/i/hg17/bothMaskedNibs /iscratch/i/fr1/nib stdout \
| axtSort stdin ../../axtChrom/$chr.axt 
axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
        ../../pslChrom/$chr.psl
'EOF'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
        # 41 jobs
    para try
    para check
    para push


# CHAIN FUGU BLASTZ (2004-06-11 kate)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.fr1
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.fr1/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Reuse gap penalties from chicken run.
    cat << '_EOF_' > temp.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy
    sed 's/  */\t/g' temp.gap > ../../fuguHumanTuned.gap
    rm -f temp.gap

    # doChain wrapper used by the gsub template above.  Arguments:
    #   $1  input axt file
    #   $2  chain file written by axtChain
    #   $3  file that receives axtChain's standard output
    # It runs axtChain with the HoxD55.q score matrix, the tuned
    # linear gap file written just above, and a minimum score of 5000,
    # against the human and fugu nib directories.
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
                      -linearGap=../../fuguHumanTuned.gap \
                      -minScore=5000 $1 \
    /iscratch/i/hg17/bothMaskedNibs \
    /iscratch/i/fr1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
        # 46 jobs
    para try
    para check
    para push
        # 1 crashed job -- chr6_hla_hap1.chain is empty
# CPU time in finished jobs:        610s      10.16m     0.17h    0.01d  0.000 y
# IO & Wait Time:                  1644s      27.40m     0.46h    0.02d  0.000 y
# Average job time:                  50s       0.83m     0.01h    0.00d
# Longest job:                      233s       3.88m     0.06h    0.00d
# Submission to last job:           339s       5.65m     0.09h    0.00d

    # now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database: one ${c}_chainFr1 table for each
    # per-chromosome file written by chainSplit.
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain hg17 ${c}_chainFr1 $i
    end
    # Coverage check on the tables just loaded.  The notes originally ran
    # this against hg16 (2865248791 is the hg16 base count quoted by the
    # other hg16 featureBits results in this doc), which says nothing
    # about the hg17 load.
    featureBits hg17 chainFr1Link
        # hg17 result was not recorded; hg16 reference value:
        #   featureBits hg16 chainFr1Link
        #   50709290 bases of 2865248791 (1.770%) in intersection


# ANCIENT REPEAT TABLE (2004-06-11 kate)

    # The netClass operation requires an "ancientRepeat" table in one 
    # of the databases.
    # This is a hand curated table obtained from Arian.

    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/ancientRepeat
    cd /cluster/data/hg17/bed/ancientRepeat
    # mysqldump needs write permission to this directory
    chmod 777 .
    hgsqldump --all --tab=. hg15 ancientRepeat
    chmod 775 .
    hgsql hg17 < ancientRepeat.sql
    echo "LOAD DATA LOCAL INFILE 'ancientRepeat.txt' into table ancientRepeat"\
                | hgsql hg17


# NET FUGU BLASTZ (2004-06-11 kate)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    netClass noClass.net hg17 fr1 human.net

    # Make a 'syntenic' subset:
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with netFilter -syn
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg17 netFr1 stdin
    #netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyFr1 stdin


# EXTRACT AXT'S AND MAF'S FROM THE NET (kate)
    # NOTE: Redo 2005-08-16 to fix overlap problem (use 8/05 netToAxt)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate) 
    ssh kkstore2
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    netSplit human.net humanNet
    mkdir -p ../axtNet ../mafNet
# Write makeMaf.csh.  For every humanNet/chr*.net file the script runs
# netToAxt on the net plus the matching chain/$c.chain (hg17 and fr1 nib
# dirs), sorts the result with axtSort into ../axtNet/$c.axt, and then
# converts that axt to ../mafNet/$c.maf with axtToMaf, prefixing the
# sequence names with "hg17." and "fr1.".
cat > makeMaf.csh << 'EOF'
    foreach f (humanNet/chr*.net)
        set c = $f:t:r
        echo "axtNet on $c"
        netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/fr1/nib stdout | axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/fr1/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=fr1.
    end
'EOF'
    # Run it in the background under csh, sending stdout and stderr to
    # makeMaf.log (overwriting any existing log), and follow the log.
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/fr1


# FUGU FR1 DOWNLOADS (DONE 2004-09-17 kate)
#    REDO axtNet downloads for fix, above (2005-09-12 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate) 

    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.fr1/axtChain
    ln -s all.chain fugu.chain
    mkdir gz
    gzip -c fugu.chain > gz/fugu.chain.gz
    gzip -c human.net > gz/fugu.net.gz
    cd ../axtNet
    nice gzip *.axt

    # NOTE(review): the kksilo step just above already runs 'gzip *.axt'
    # in this same axtNet directory, so on a single pass there would be
    # no *.axt left to compress here.  Presumably this second pass
    # belongs to the later axtNet redo mentioned in this section's
    # header -- confirm before re-running both.
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.fr1/axtNet
    gzip *.axt
    # Checksums for the compressed axtNet files.
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p vsFr1
    cd vsFr1
    # Copy and edit README
    cp /cluster/data/hg17/bed/blastz.fr1/axtChain/gz/*.gz .
    md5sum *.gz > md5sum.txt

    mv axtNet axtNet.old
    ln -s /cluster/data/hg17/bed/blastz.fr1/axtNet .


# PRODUCE FUGU BLAT ALIGNMENT (DONE - 2004-07-07 - Hiram)

    # Use masked scaffolds from fr1 assembly (same sequence as
    # previous BlatFugu, however it's repeat and TRF-masked).

    ssh kk
    mkdir /cluster/data/hg17/bed/blatFr1
    cd /cluster/data/hg17/bed/blatFr1
    mkdir psl 
    # next time, use N?_?????? (to pick up NG_ contigs)
    foreach f ( `cat /cluster/data/hg17/contig.lst` )
      set c=$f:t:r
      echo $c
      mkdir psl/$c
    end

    # create cluster job
    mkdir run
    cd run
    ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg17/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
    # << keep emacs happy
    # gensub2 writes the batch to jobList, so that is the file to hand to
    # para.  (These notes previously said 'para create spec'; nothing in
    # this run directory creates a file named spec.)
    gensub2 human.lst fugu.lst gsub jobList
    para create jobList
       # 219640 jobs  
    para try
    para check
    para push -maxQueue=300000 -maxPush=220000
    para check
# Completed: 219640 of 219640 jobs
# CPU time in finished jobs:    5206945s   86782.41m  1446.37h   60.27d  0.165 y
# IO & Wait Time:                797791s   13296.52m   221.61h    9.23d  0.025 y
# Average job time:                  27s       0.46m     0.01h    0.00d
# Longest job:                      951s      15.85m     0.26h    0.01d
# Submission to last job:          7553s     125.88m     2.10h    0.09d
        # cd psl
        # count files with alignments
        # find . -not -size 427c | wc -l
        # 44558
        # count files with no alignments
        # find . -size 427c | wc -l
        # 175463

   # When cluster run is done, sort alignments
   # into chrom directory
    ssh eieio
    cd /cluster/data/hg17/bed/blatFr1
    pslCat -dir psl/N?_?????? | \
      liftUp -type=.psl stdout \
        /cluster/data/hg17/jkStuff/liftAll.lft warn stdin | \
      pslSortAcc nohead chrom temp stdin
        # 65 minutes ?
        # Processed 216595 lines into 1 temp files

    # Give each per-chromosome psl its table-style name
    # (chrN.psl -> chrN_blatFr1.psl) ahead of the database load:
    ssh hgwdev
    cd /cluster/data/hg17/bed/blatFr1/chrom
    foreach pslFile (chr*.psl)
        set newName = ${pslFile:r}_blatFr1.psl
        echo mv $pslFile $newName
        mv $pslFile $newName
    end

    # lift fugu scaffolds to Fugu browser chrUn,
    # so you can link to other browser.  And don't need to load sequence
    cd /cluster/data/hg17/bed/blatFr1
    liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl

    hgLoadPsl -table=blatFr1 hg17 all.psl
    #	load of blatFr1 did not go as planned: 216595 record(s),
    #	0 row(s) skipped, 3 warning(s) loading psl.tab
    #	featureBits hg17 blatFr1 refGene:CDS
    #	13563544 bases of 2866216770 (0.473%) in intersection
    #	featureBits hg16 blatFr1 refGene:CDS
    #	13547219 bases of 2865248791 (0.473%) in intersection
    #	featureBits hg15 blatFugu refGene:CDS
    #	12427544 bases of 2866466359 (0.434%) in intersection

#  BLASTZ RAT RN3 (DONE - 2004-06-14 - Hiram)

    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.rn3.2004-06-11
    cd /cluster/data/hg17/bed
    ln -s  blastz.rn3.2004-06-11 blastz.rn3
    cd blastz.rn3

    cat << '_EOF_' > DEF
# rat vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store5/gs.18/build35/bed/blastz.rn3

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastz.rn3
    # DEF is written in Bourne-shell syntax (export, VAR=value), so it
    # cannot be sourced from csh/tcsh; start bash first, as the fugu
    # blastz run in this doc does.
    bash # if a csh/tcsh user
    source DEF
    # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
    #	it is a generic script and works for any assembly
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# Completed: 41943 of 41943 jobs
# CPU time in finished jobs:   15330421s  255507.02m  4258.45h  177.44d  0.486 y
# IO & Wait Time:                673809s   11230.15m   187.17h    7.80d  0.021 y
# Average job time:                 382s       6.36m     0.11h    0.00d
# Longest job:                     4651s      77.52m     1.29h    0.05d
# Submission to last job:        169197s    2819.95m    47.00h    1.96d

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/hg17/bed/blastz.rn3
    # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
    #	fixup machine check, should be kki, not kk
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs:       1894s      31.56m     0.53h    0.02d  0.000 y
# IO & Wait Time:                  6271s     104.52m     1.74h    0.07d  0.000 y
# Average job time:                  24s       0.40m     0.01h    0.00d
# Longest job:                      131s       2.18m     0.04h    0.00d
# Submission to last job:           590s       9.83m     0.16h    0.01d

    #	Third cluster run to convert lav's to axt's
    cd /cluster/data/hg17/bed/blastz.rn3
    #	The copy of this in mm4 was broken, fixed here
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        426s       7.09m     0.12h    0.00d  0.000 y
# IO & Wait Time:                  7283s     121.39m     2.02h    0.08d  0.000 y
# Average job time:                 168s       2.79m     0.05h    0.00d
# Longest job:                      642s      10.70m     0.18h    0.01d
# Submission to last job:           642s      10.70m     0.18h    0.01d

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3
    mkdir pslChrom
    set tbl = "blastzRn3"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	That takes about 30 minutes

    # Load database tables: one hgLoadPsl call per chrN_blastzRn3.psl
    # file.  Written as a csh foreach like the other loops in this doc
    # (the earlier Bourne-style for/do/done form does not run under
    # csh/tcsh).
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/pslChrom
    foreach I (*.psl)
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done: ${I}"
    end
    # this is a 55 minute job
    #	Check results
    #	featureBits hg16 blastzRn3
    #	1013603401 bases of 2865248791 (35.376%) in intersection
    #	featureBits hg17 blastzRn3
    #	1013003285 bases of 2866216770 (35.343%) in intersection

# CHAIN RN3 BLASTZ (DONE - 2004-06-14 - Hiram)
#  re-worked with no 'axtFilter -notQ_random' on the axtChain step - 2004-06-23
#    used to be: axtFilter -notQ_random $1 | axtChain stdin

# The axtChain is best run on the small kluster, or the kk9 kluster
#	Layout of the run directory created here:
#	    run1/chain  - one .chain result per input axt
#	    run1/out    - stdout of each axtChain job
    ssh kki
    mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtChain/run1
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain/run1
    mkdir out chain

    #	input.lst: the per-chromosome axt files, one per line
    #	(ls -S lists them largest first)
    ls -1S /cluster/data/hg17/bed/blastz.rn3/axtChrom/*.axt > input.lst
    #	gsub is the template handed to gensub2 below; it produces one
    #	doChain command line per input.lst entry.
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    #	doChain arguments: $1 = input axt, $2 = output chain,
    #	$3 = file receiving axtChain's stdout.
    #	No axtFilter step here - see the re-work note in the section header.
    cat << '_EOF_' > doChain
#!/bin/csh
    axtChain $1 \
	/iscratch/i/gs.18/build35/bothMaskedNibs \
	/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain

    # 46 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:       4645s      77.41m     1.29h    0.05d  0.000 y
# IO & Wait Time:                  6840s     114.00m     1.90h    0.08d  0.000 y
# Average job time:                 250s       4.16m     0.07h    0.00d
# Longest job:                     1539s      25.65m     0.43h    0.02d
# Submission to last job:          3761s      62.68m     1.04h    0.04d


    # now on the file server, sort chains
    #	chainMergeSort combines the per-chromosome chains written by the
    #	cluster run into the single file all.chain
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    #	real    36m42.170s
    #	user    4m55.970s
    #	sys     1m49.840s

    #	chainSplit writes into the chain/ directory; the *.chain files
    #	found there are what gets loaded into the database next
    time chainSplit chain all.chain
    #	real    13m54.860s
    #	user    4m50.370s
    #	sys     1m3.260s

    # optionally: rm run1/chain/*.chain

    # Load each split chain file into hg17 as table <chrom>_chainRn3
    # (tcsh syntax, on the database machine hgwdev)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain/chain
    foreach chainFile (*.chain)
        # :r drops the .chain extension, leaving the chromosome name
        set chrom = $chainFile:r
        hgLoadChain hg17 ${chrom}_chainRn3 $chainFile
        echo done $chrom
    end
    #	featureBits hg17 chainRn3
    #	2827052992 bases of 2866216770 (98.634%) in intersection
    #	(with filter:) 2826192649 bases of 2866216770 (98.604%) in intersection
    #	featureBits hg16 chainRn3
    #	2830563493 bases of 2865248791 (98.789%) in intersection

# NET RN3 (DONE - 2004-06-15 - Hiram)
#	Re-done due to Chain being re-done 2004-06-23
#       NOTE: Redo net axt's and net maf's to fix overlaps,
#       (using 8/05 netToAxt). (2005-08-16 kate)

    # Build the rat net from the split chains (tcsh syntax, on eieio).
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain

    # Step 1: chainPreNet every per-chromosome chain into preNet/
    mkdir preNet
    cd chain
    foreach chainFile (*.chain)
      echo preNetting $chainFile
      /cluster/bin/i386/chainPreNet $chainFile \
          /cluster/data/hg17/chrom.sizes /cluster/data/rn3/chrom.sizes \
          ../preNet/$chainFile
    end
    cd ..

    # Step 2: chainNet every pre-netted chain.  The first output goes to
    # n1/<chrom>.net, the second output argument is sent to /dev/null.
    mkdir n1
    cd preNet
    foreach chainFile (*.chain)
      set netFile = $chainFile:r.net
      echo primary netting $chainFile
      /cluster/bin/i386/chainNet $chainFile -minSpace=1 \
          /cluster/data/hg17/chrom.sizes /cluster/data/rn3/chrom.sizes \
          ../n1/$netFile /dev/null
    end
    cd ..

    # Step 3: concatenate all nets and run netSyntenic -> hNoClass.net
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    #	memory usage 2510467072, utime 19307 s/100, stime 3181

    #	netClass is run on hgwdev; it is given both database names
    #	(hg17 rn3) and turns hNoClass.net into rat.net.  The -tNewR/-qNewR
    #	arguments point at the lineage-specific repeat directories.
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    time netClass hNoClass.net hg17 rn3 rat.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInRat \
	-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
    #	real    34m29.829s
    #	user    11m30.440s
    #	sys     1m52.730s

    # If things look good do
    #	(removes the intermediate nets; rat.net is the file kept)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    time netFilter -syn rat.net > ratSyn.net
    #	real    16m25.640s
    #	user    7m41.330s
    #	sys     1m1.150s

    # Load the nets into database
    #	both nets are passed through netFilter -minGap=10 on the way in;
    #	tables created: netRn3 (from rat.net), syntenyNetRn3 (from ratSyn.net)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    netFilter -minGap=10 rat.net |  hgLoadNet hg17 netRn3 stdin
    netFilter -minGap=10 ratSyn.net | hgLoadNet hg17 syntenyNetRn3 stdin
    #	real    37m0.199s
    #	user    15m13.770s
    #	sys     1m41.540s

    # check results
    # featureBits hg17 netRn3
    # 2817656275 bases of 2866216770 (98.306%) in intersection
    # (with axtFilter) 2816623107 bases of 2866216770 (98.270%) in intersection
    # featureBits hg16 netRn3
    # 2820958389 bases of 2865248791 (98.454%) in intersection

    # featureBits hg17 syntenyNetRn3
    # 2781748096 bases of 2866216770 (97.053%) in intersection
    # (with axtFilter) 2780883450 bases of 2866216770 (97.023%) in intersection
    # featureBits hg16 syntenyNetRn3
    # 2784011730 bases of 2865248791 (97.165%) in intersection

    # Add entries for net and chain to rat/hg17 trackDb

    # make net
    #	netSplit writes rat.net out into the ratNet directory; the
    #	makeMaf.csh loop below reads ratNet/chr*.net from there.
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    mkdir ratNet
    time netSplit rat.net ratNet
    #	real    12m1.478s
    #	user    8m35.050s
    #	sys     1m7.230s

    # extract axts from net
    #	makeMaf.csh (csh script), for every ratNet/chrN.net:
    #	  netToAxt (net + chain/chrN.chain + both nib dirs) | axtSort
    #		-> ../axtNet/chrN.axt
    #	  axtToMaf with sequence name prefixes hg17. and rn3.
    #		-> ../mafNet/chrN.maf
    mkdir ../axtNet ../mafNet
cat << 'EOF' > makeMaf.csh
    foreach n (ratNet/chr*.net)
	set c=$n:t:r
        echo $c
	netToAxt ratNet/$c.net chain/$c.chain \
		/cluster/data/hg17/nib  /cluster/data/rn3/nib stdout | \
                    axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/rn3/chrom.sizes \
                ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=rn3.
    end
'EOF'
    #	run in the background with all output going to makeMaf.log,
    #	then follow the log to watch progress
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    #	keep a copy of the mafs on the bluearc under mafNet/rn3
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/rn3

    #	axtBest is only a directory of symlinks to the axtNet files; the
    #	psl conversion further down reads axtBest/chr*.axt
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtBest
    cd /cluster/data/hg17/bed/blastz.rn3/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area
    #	the copies are gzipped in the download directory; the originals
    #	in axtNet stay uncompressed
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/axtNet
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)
    # add README.txt file to dir (use previous assembly's copy as template)

    # Convert the axtBest files to psl, one
    # pslBest/<chrom>_blastzBestRn3.psl per chromosome (tcsh loop on eieio)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3
    mkdir pslBest
    foreach axt (axtBest/chr*.axt)
      set chrom = $axt:t:r
      echo "processing $chrom.axt -> ${chrom}_blastzBestRn3.psl"
      /cluster/bin/i386/axtToPsl axtBest/${chrom}.axt S1.len S2.len \
          pslBest/${chrom}_blastzBestRn3.psl
      echo "Done: ${chrom}_blastzBestRn3.psl"
    end

    # Load the resulting psl files into hg17 (bash loop on hgwdev)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/pslBest
    for pslFile in chr*BestRn3.psl; do
      /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${pslFile}
      echo "done ${pslFile}"
    done

     # check results
    # featureBits hg17 blastzBestRn3
    #	975533772 bases of 2866216770 (34.036%) in intersection
    # (with axtFilter) 970005525 bases of 2866216770 (33.843%) in intersection
    # featureBits hg16 blastzBestRn3
    #	976121391 bases of 2865248791 (34.068%) in intersection

    # Make /gbdb links and add them to the axtInfo table:
    #	The /gbdb links point at the axtNet files.  One INSERT statement
    #	per linked file is accumulated in axtInfoInserts.sql (written in
    #	the axtNet directory), with chrom taken from the file name and
    #	fileName set to the /gbdb path.
     mkdir -p /gbdb/hg17/axtBest/Rn3
     cd /gbdb/hg17/axtBest/Rn3
     ln -s /cluster/data/hg17/bed/blastz.rn3/axtNet/chr*.axt .
     cd /cluster/data/hg17/bed/blastz.rn3/axtNet
     #	start from an empty file, the loop below appends with >>
     rm -f axtInfoInserts.sql
     foreach f (/gbdb/hg17/axtBest/Rn3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
    #	create the table from its schema file, then run the inserts
    hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
    #	table axtInfo may already exist, ignore create error.
    hgsql hg17 < axtInfoInserts.sql


# MAKING RAT SYNTENY (DONE - 2004-06-30 - Hiram)
#	Re-Done after above done without the axtFilter

# Work directory for the rat synteny track; the pipeline reads the
# blastzBestRn3 table loaded in the NET RN3 section.
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyRn3
cd /cluster/data/hg17/bed/syntenyRn3

# Copy all the needed scripts from /cluster/data/hg16/bed/syntenyMm3
cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .
cp -p /cluster/data/hg16/bed/syntenyMm3/*.sh .

# The scripts are run in this fixed order; only syntenicBest.pl and
# fillgap.pl take the database and table name as arguments.
./syntenicBest.pl -db=hg17 -table=blastzBestRn3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestRn3
./synteny2bed.pl
#	The five commands above
#	real    196m2.565s
#	user    0m21.170s
#	sys     0m4.690s

#	Used to load this in syntenyRn3, but that type is misleading to
#	the table browser and fails the checkTableCoords check.
#	Better to use this ensRatMusHom type:
#	The table definition is derived from ensPhusionBlast.sql by
#	renaming ensPhusionBlast to ensRn3MusHom; ucsc100k.bed is
#	presumably the output of synteny2bed.pl above - TODO confirm.
sed -e 's/ensPhusionBlast/ensRn3MusHom/g' \
      $HOME/kent/src/hg/lib/ensPhusionBlast.sql \
      > ensRn3MusHom.sql
hgLoadBed hg17 ensRn3MusHom ucsc100k.bed -sqlTable=ensRn3MusHom.sql

    #	featureBits hg17 ensRn3MusHom
    #	2592164486 bases of 2866216770 (90.439%) in intersection
    #	featureBits hg16 syntenyRn3
    #	2595919851 bases of 2865248791 (90.600%) in intersection


# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2004-06-15 - Hiram)
    # axtTight is derived from the net axt files: subsetAxt is run on each
    # one with the coding.mat matrix and a threshold of 3400 (tcsh, eieio).
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3/axtNet
    mkdir -p ../axtTight
    foreach axt (*.axt)
      echo $axt
      subsetAxt  $axt ../axtTight/$axt \
          ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end

    # Convert every tight axt to pslTight/<chrom>_blastzTightRn3.psl
    cd ../axtTight
    mkdir ../pslTight
    foreach axt (*.axt)
      set chrom = $axt:r
      axtToPsl $axt ../S1.len ../S2.len \
          ../pslTight/${chrom}_blastzTightRn3.psl
      echo "Done: $axt"
    end

    # Load the tight psl files into hg17 (bash loop, on hgwdev)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/pslTight
    for pslFile in chr*TightRn3.psl; do
      /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${pslFile}
      echo "done ${pslFile}"
    done

    #	Compare results with previous assembly
    #	featureBits hg17 blastzTightRn3
    #	153936720 bases of 2866216770 (5.371%) in intersection
    #	featureBits hg16 blastzTightRn3
    #	153151903 bases of 2865248791 (5.345%) in intersection

    # copy axtTight axt's to download area (gzipped there, originals kept)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.rn3/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)

    # REDO downloads with fixed axtNet's (2005-09-13 kate)
    #	The previous download directory is kept as axtNet.old.
    #	NOTE(review): cp -rp copies the whole axtNet directory, which
    #	also held axtInfoInserts.sql when the axtInfo step was run -
    #	check that only the .axt.gz files and md5sum.txt are published.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
    mv axtNet axtNet.old
    nice cp -rp /cluster/data/hg17/bed/blastz.rn3/axtNet .
    cd axtNet
    nice gzip *.axt
    md5sum *.axt.gz > md5sum.txt


# BLASTZ RN3 CLEAN UP (DONE - 2004-07-02 - Hiram)
    #	Remove intermediate blastz/chain files and compress what is kept.
    #	Every command is niced and put in the background, so they all
    #	run concurrently.  n1 and hNoClass.net were already removed in
    #	the NET RN3 section; the -f keeps that rm from complaining.
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.rn3
    nice rm -rf raw &
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &

# BLASTZ CHICKEN (GALGAL2) (DONE - 2004-06-14 - Fan)

    #	The dated directory is the real work area; blastz.galGal2 is a
    #	symlink to it, and all later steps use the symlink name.
    ssh kk
    mkdir /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
    cd /cluster/data/hg17/bed
    ln -s /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 blastz.galGal2
    cd blastz.galGal2
    # Set L=10000 (higher threshold on blastz's outer loop) and abridge
    # repeats.
    #	The DEF file written here holds the blastz parameters and the
    #	target/query sequence locations for this run.
    #	NOTE(review): SEQ1_DIR is under /iscratch/i/hg17 while SEQ1_SMSK
    #	is under /iscratch/i/gs.18/build35 (the rn3 doChain script also
    #	uses /iscratch/i/gs.18/build35/bothMaskedNibs) - confirm both
    #	locations exist on the cluster nodes.
    cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/store5/gs.18/build35/bed/blastz.galGal2

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # prepare first cluster run
    #	(a bash shell is started before running the setup script)
    ssh kk
    cd /cluster/data/hg17/bed/blastz.galGal2
    bash
    # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
    #	it is a generic script and works for any assembly
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# Completed: 41943 of 41943 jobs
# CPU time in finished jobs:   15330421s  255507.02m  4258.45h  177.44d  0.486 y
# IO & Wait Time:                673809s   11230.15m   187.17h    7.80d  0.021 y
# Average job time:                 382s       6.36m     0.11h    0.00d
# Longest job:                     4651s      77.52m     1.29h    0.05d
# Submission to last job:        169197s    2819.95m    47.00h    1.96d

    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/hg17/bed/blastz.galGal2
    bash
    # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
    #	fixup machine check, should be kki, not kk
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs:       1894s      31.56m     0.53h    0.02d  0.000 y
# IO & Wait Time:                  6271s     104.52m     1.74h    0.07d  0.000 y
# Average job time:                  24s       0.40m     0.01h    0.00d
# Longest job:                      131s       2.18m     0.04h    0.00d
# Submission to last job:           590s       9.83m     0.16h    0.01d

    #	Third cluster run to convert lav's to axt's
    cd /cluster/data/hg17/bed/blastz.galGal2
    #	The copy of this in mm4 was broken, fixed here
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
#	NOTE(review): the run.2 statistics below are identical, digit for
#	digit, to those recorded for the rn3 run.2 batch earlier in this
#	file; they may have been copied over with the commands rather
#	than measured for chicken - verify before relying on them.
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        426s       7.09m     0.12h    0.00d  0.000 y
# IO & Wait Time:                  7283s     121.39m     2.02h    0.08d  0.000 y
# Average job time:                 168s       2.79m     0.05h    0.00d
# Longest job:                      642s      10.70m     0.18h    0.01d
# Submission to last job:           642s      10.70m     0.18h    0.01d

    # Convert each sorted per-chromosome axt under axtChrom/ into
    # pslChrom/<chrom>_blastzGalGal2.psl  (tcsh syntax, run on eieio)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.galGal2
    mkdir pslChrom
    set track = "blastzGalGal2"
    foreach axt (axtChrom/chr*.axt)
      # :t strips the directory part, :r strips the .axt extension
      set chrom = $axt:t:r
      echo "Processing chr $chrom"
      /cluster/bin/i386/axtToPsl $axt S1.len S2.len \
          pslChrom/${chrom}_${track}.psl
    end
    #	That takes about 30 minutes

    # Load every psl into hg17; the loop is bash syntax, hence the
    # explicit switch to bash first.
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.galGal2/pslChrom
    bash
    for pslFile in *.psl; do
        /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${pslFile}
        echo "done: ${pslFile}"
    done


# GNF ATLAS 2 (DONE - 2004-07-14 - Hiram)
    # Align probes from GNF1H chip.
    #	blat cluster run: every masked contig in genome.lst is aligned
    #	against the single probe file listed in mrna.lst; results land
    #	in run/psl.
    ssh kk
    cd /cluster/data/hg17/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run
    #	This bluearc/geneAtlas2 directory already exists
    # mkdir -p /cluster/bluearc/geneAtlas2
    # cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
    ls -1 /scratch/hg/gs.18/build35/maskedContigs > genome.lst
    ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
    #	genome.lst holds bare file names (ls of a directory), which is why
    #	the template below prefixes $(path1) with the maskedContigs path;
    #	mrna.lst holds a full path and is used as is.
    cat << '_EOF_' > gsub
#LOOP
blat -fine -ooc=/scratch/hg/h/11.ooc  /scratch/hg/gs.18/build35/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    gensub2 genome.lst mrna.lst gsub jobList
    para create jobList
    para try
    para check
    para push
    para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs:      10599s     176.65m     2.94h    0.12d  0.000 y
# IO & Wait Time:                  3893s      64.88m     1.08h    0.05d  0.000 y
# Average job time:                  38s       0.64m     0.01h    0.00d
# Longest job:                      649s      10.82m     0.18h    0.01d
# Submission to last job:           663s      11.05m     0.18h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create ../affyGnf1h.psl (i.e. geneAtlas2/affyGnf1h.psl).
    #	pslSort -> raw.psl, pslReps -> contig.psl, liftUp -> final psl;
    #	the intermediates and the psl directory are removed afterwards.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
	contig.psl /dev/null
    #	Processed 80818 alignments
    liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
    rm -r contig.psl raw.psl psl

    # Load probes and alignments from GNF1H into database.
    ssh hgwdev
    cd /cluster/data/hg17/bed/geneAtlas2
    #	Already symlinked
    # ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \
    #	/gbdb/hgFixed/affyProbes
    hgLoadPsl hg17 affyGnf1h.psl
    hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/gnf1h.fa

    #	Build a U133A-only psl from the combined U133A/B alignments in
    #	../affyUclaNorm: drop lines mentioning U133B, then strip the
    #	exemplar:/consensus:/U133A: prefixes and the first ';' on each line.
    grep -v U133B ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl \
	| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
	| sed -e "s/;//" > affyU133A.psl

    #	hgMapMicroarray is given both psl files (U133A and GNF1H) plus the
    #	hgFixed expression table, and writes gnfAtlas2.bed
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
    	affyU133A.psl  /cluster/data/hg17/bed/geneAtlas2/affyGnf1h.psl

# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 32857,  multiply-mapped 1462, missed 49, unmapped 11839

    hgLoadBed hg17 gnfAtlas2 gnfAtlas2.bed
    #	Loaded 34319 elements of size 15

# LOAD SNPS ( Daryl Thomas; November 7, 2004 ;
#	      snpExceptions added January 8, 2005 ;
#	      updated to build 124 on January 13, 2005;
#	      added affy snps March 5, 2004 )
    #	tcsh variables used throughout this section; $dir is the
    #	download/work area on the bluearc
    set db    = hg17
    set org   = human
    set build = 124
    set dir   = /cluster/bluearc/snp/$db/build$build

    # ssh to some quiet machine with fast access to the bluearc
    # it takes ~4.5 hours to download the data
    # (build 124 directly to /cluster/bluearc/... from eieio)
    # Check to make sure the chrMT file is included
    mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
    cd $dir
    #	liftAll.lft is linked in for the liftUp step later in this section
    ln -s /cluster/data/$db/jkStuff/liftAll.lft .

    #	The download runs inside screen.
    #	NOTE(review): cd/prompt/mget are typed at the ftp prompt, where
    #	$org is presumably not expanded - substitute "human" by hand.
    screen
    ftp ftp.ncbi.nih.gov
    cd snp/$org/XML
    prompt
    mget ds_ch*.xml.gz
    exit # screen
    exit # machine

    # TODO: check chromStart for each locType

    # Install the dbSNP XML parser where the cluster jobs can find it.
    #	${HOME} (rather than {$HOME}) expands the same way in both tcsh
    #	and bash, and matches the ${HOME} usage elsewhere in this section.
    cp -f ${HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
    chmod 775 /cluster/bin/scripts/parseDbSnpXML

    #ssh kk
    # Build the job list: one parseDbSnpXML job per downloaded
    # ds_ch*.xml.gz file.  The list is emptied first because the loop
    # appends with >>; re-running this step would otherwise queue every
    # job twice.
    rm -f jobList
    touch jobList
    foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
	    # :t:r -> file name without directory and without .gz
	    set out = $file:t:r
	    echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
    end

    # I removed ds_chMulti.xml.gz and ds_chNotOn.xml.gz from the job list
    # para create jobList; para push; para check ... 

#Completed: 25 of 25 jobs
#CPU time in finished jobs:      30120s     502.01m     8.37h    0.35d  0.001 y
#IO & Wait Time:                  2533s      42.21m     0.70h    0.03d  0.000 y
#Average job time:                1306s      21.77m     0.36h    0.02d
#Longest job:                     2611s      43.52m     0.73h    0.03d
#Submission to last job:          2611s      43.52m     0.73h    0.03d
    exit # kk

    # Move the parsed build directory from the bluearc to its permanent
    # home under /cluster/data.  mv takes no -r option (it moves a
    # directory as a whole), and the parent directory must exist first.
    mkdir -p /cluster/data/$db/bed/snp
    mv $dir /cluster/data/$db/bed/snp/build$build
    set dir = /cluster/data/$db/bed/snp/build$build
    cd $dir

    # concatenate the details files to make it easier to lift (and load)
    time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
    # 33.380u 24.470s 1:54.79 50.3%   0+0k 0+0io 86pf+0w (hgwdev)
    time gzip $db.build$build.contig.bed
    # 251.160u 16.770s 12:40.77 35.2% 0+0k 0+0io 83pf+0w (hgwdev/bluearc - should have done it on eieio/store5)

    # some of the NT contigs are not in the liftSpec - this is expected as snps that map to
    # alternate assemblies (Celera) are in the original files, but we disregard their mappings.
    #	liftAll.lft is the symlink created in $dir before the download
    time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
    # 232.260u 30.050s 5:09.04 84.8%  0+0k 0+0io 379pf+0w (hgwdev/store5)

    #	same file as produced by liftUp above (was hard-coded as
    #	hg17.build124.bed)
    time gzip $db.build$build.bed
    # 141.980u 8.180s 2:34.43 97.2%   0+0k 0+0io 83pf+0w

    # hgLoadBed is the important step - check to make sure there are no warnings
    time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
    # Loaded 9131054 elements of size 16
    # 225.040u 37.030s 35:20.45 12.3% 0+0k 0+0io 308pf+0w

    # basic snp table is now loaded, but exception column needs to be updated
    # ~ 3 hours wall clock time from here to end

    # run queries from snpException.query against snp table
    #	results are written into a web-visible qa directory, hence the
    #	chmod calls opening it up for "other"
    mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
    cd       /usr/local/apache/htdocs/qa/test-results/snpException/build$build
    #	database argument now uses $db like the output prefix does
    time snpException $db 0 ${db}snpException > ${db}snpException.log
    chmod o+rx .
    chmod o+r  *
    # 10.610u 19.200s 53:59.98 0.9%   0+0k 0+0io 264pf+0w

    # check alignment of flanking sequences
    time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
    # 5205.860u 216.570s 1:55:10.27 78.4%     0+0k 0+0io 72408pf+0w (hgwdev)

    ### NOTE: the pseudoautosomal snps are reported in the chrX files
    ### only, which causes problems for snpValid when checking the
    ### chrY snp mappings.  I got around this by confirming that all
    ### of the 'missing flank' errors (#23) were in pseudoautosomal
    ### regions and ignoring them.  I manually truncated the
    ### hg17snpException.23.bed file before continuing with the next
    ### step.  This could/should be fixed in the next iteration.

    # create list of statements to update the snp table and run them
    #	"tail -n +3" skips the first two lines of every exception file;
    #	the older "tail +3" spelling is rejected by current tail versions.
    #	awk keeps the lines matching /rs/ and prints fields 4, 2 and 5.
    time tail -n +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
    # ~10 seconds
    time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
    # 7.250u 0.390s 0:07.87 97.0%     0+0k 0+0io 337pf+0w
    time hgsql $db < updateExceptionList.sql
    # 8.420u 10.370s 11:58.44 2.6%    0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process)
    # 6.550u  9.370s 14:34.17 1.8%    0+0k 0+0io 413pf+0w build124
    # > wc -l build12*/updateExceptionList.sql
    #  387166 build123/updateExceptionList.sql
    #  383759 build124/updateExceptionList.sql

    # Add Affy SNPs from new submission
    #	What follows was kept as a csh script (loadAffySnps.csh) and run
    #	with the command shown in the second comment line below.
    #!/bin/csh -fe
    # rm -f log ; date ; ./loadAffySnps.csh > & log ; date ; cat log

    set db = hg17
    cd /cluster/data/$db/bed/snp/affy/latest
    #	touch first so that every pattern given to rm matches at least
    #	one file - presumably needed because an unmatched glob aborts a
    #	csh -e script; TODO confirm
    touch affy.txt affy.bed Affy.bed bed.tab
    rm -f affy*.txt affy*.bed Affy.bed* bed.tab

    # datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com]
    tar xfz affyhg17maps_withstrand_alleles.tgz
    wc -l affy*txt

    #	One awk per chip file: lines whose first column starts with
    #	"chrom" are skipped; the rest become 16 tab-separated columns:
    #	chr<col1>, col2, col3, col4, 0, col6, col7, fixed placeholder
    #	values, and the chip name (Affy10K, ...) in the source column.
    awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n",        $1,$2,$3,$4,$6,$7);}' < affy10K.txt         > affy10K.bed
    awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n",      $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt       > affy10Kv2.bed
    awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed
    awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n",   $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt    > affy50K_XbaI.bed
    
    # this is a temporary kluge to fix some bad input data.
    #	(removes the first "_par" on each line of the combined bed)
    cat affy*.bed | sed 's/_par//' > Affy.bed
    
    # the source enum for 'dbSnp' is 2; all of the affy* values are higher.
    #	i.e. this removes any previously loaded Affy rows before reloading
    hgsql $db -e "delete from snp where source > 2 "
    
    #	-oldTable adds the rows to the existing snp table
    hgLoadBed $db snp Affy.bed -oldTable -tab
    
    rm -f affy*.txt affy*.bed bed.tab
    gzip Affy.bed
    
    #mysql> select source, count(*) from hg17.snp group by source;                                                                                                                   
    #+-----------------+----------+
    #| source          | count(*) |
    #+-----------------+----------+
    #| dbSnp           |  9131054 |
    #| Affy10K         |    11344 |
    #| Affy10Kv2       |    10032 |
    #| Affy50K_HindIII |    56859 |
    #| Affy50K_XbaI    |    58494 |
    #+-----------------+----------+

    #March 7, 2005: fix pseudoautosomal snps:
    #SNP_A-1606360
    #SNP_A-1606329
    #SNP_A-1666553
    #SNP_A-1715750
    #SNP_A-1726331
    #SNP_A-1685712
    #SNP_A-1735899
    #SNP_A-1726272
    #SNP_A-1660936
    #SNP_A-1662285
    #SNP_A-1680848
    #SNP_A-1671440
    #SNP_A-1719355
    #SNP_A-1716499
    #SNP_A-1643847
    #SNP_A-1646007
    #SNP_A-1715285
    #SNP_A-1657714
    #SNP_A-1725038
    #SNP_A-1713938
    #SNP_A-1708565
    #SNP_A-1510243
    #SNP_A-1510197
    #SNP_A-1606356

    # SQL for the pseudoautosomal fix of the Affy SNPs listed above; run
    # these statements against the hg17 database (e.g. in an hgsql session).
    #
    # Step 1: drop the existing chrY rows for these SNPs.
    delete from snp
    where  chrom = 'chrY'
    and    name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');

    # Step 2: put every remaining row for these SNPs on chrX.
    update snp
    set chrom = 'chrX'
    where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');

    # Step 3: add a chrY copy of each (now chrX) row, all other columns
    # unchanged.
    insert into snp
	select  bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand,
		observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception
	from    snp
	where   name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');

    # Step 4: check - chrX and chrY should now report the same count.
    # (single terminating semicolon; the doubled ";;" sent an extra
    # empty statement to the client)
    select chrom, count(*)
    from snp
    where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356')
    group by chrom;


## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
    # Data from Rachel Karchin in the Andrej Sali lab at UCSF
    # /cluster/data/hg17/bed/lssnp
    #	Create the two tables from their schema files, then load the
    #	prediction files.  The "mysql>" / "Query OK" lines are a pasted
    #	transcript of an interactive session, not shell commands;
    #	presumably run from the lssnp directory named above.
    hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
    hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
    mysql> load data local infile "snp-human3-function-predictions.txt" into table lsSnpFunction;
    Query OK, 24337 rows affected (1.27 sec)
    mysql> load data local infile "snp-human3-structure-predictions.txt" into table lsSnpStructure;
    Query OK, 34764 rows affected (2.36 sec)

# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]

    # get data from ftp site, unpack in $dir:
    # tar tvfz *gz | more
    # -rw-r--r-- chris/admin 34405061 2005-06-03 13:22:15 AD.SNP.track
    # -rw-r--r-- chris/admin 29869512 2005-06-03 13:22:30 ED.SNP.track
    # -rw-r--r-- chris/admin 27154049 2005-06-03 13:22:41 XD.SNP.track
    # -rw-r--r-- chris/admin 10948753 2005-06-02 21:12:27 AD.tajd.track
    # -rw-r--r-- chris/admin 10928630 2005-06-02 21:12:39 ED.tajd.track
    # -rw-r--r-- chris/admin 10926122 2005-06-02 21:12:51 XD.tajd.track

    #	tcsh session; $dir is where the downloaded tarball was placed
    set db=hg17
    set dir=/cluster/data/$db/bed/tajdpoly/latest
    cd $dir
    
    tar xvfz TajDtracks.tar.gz

    #	Each *.track file is passed through mac2unix and has its lines
    #	containing "track" removed.
    #	SNP files: the three coordinates that appear in scientific
    #	notation (1.02e+08, 8.8e+07, 1.5e+07) are rewritten as integers.
    #	tajd files: reprinted as 4 tab-separated columns, column 3 as an
    #	integer and column 4 with three decimals.
    mac2unix < AD.SNP.track  | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpAd.bed
    mac2unix < ED.SNP.track  | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpEd.bed
    mac2unix < XD.SNP.track  | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpXd.bed
    mac2unix < AD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdAd.bedGraph
    mac2unix < ED.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdEd.bedGraph
    mac2unix < XD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdXd.bedGraph
    
    set chain = /cluster/data/hg17/bed/bedOver/hg17ToHg16.over.chain

    #	For each population: lift the hg17 files to hg16, then load both
    #	the hg16 and hg17 versions.  Note the inner loop reuses (and so
    #	overwrites) the $db variable set at the top of this block.
    foreach pop (Ad Ed Xd)
	liftOver hg17.tajdSnp$pop.bed   $chain hg16.tajdSnp$pop.bed   hg17ToHg16.tajdSnp$pop.unmapped
	liftOver hg17.tajd$pop.bedGraph $chain hg16.tajd$pop.bedGraph hg17ToHg16.tajd$pop.unmapped
	foreach db (hg16 hg17)
	    hgLoadBed -bedGraph=4 $db tajd$pop    $db.tajd$pop.bedGraph
	    hgLoadBed             $db tajdSnp$pop $db.tajdSnp$pop.bed
	end
    end

    #	Clean-up pass: build $db.delete.sql with one DELETE per offending
    #	row and run it, for both databases and all six tajd* tables.
    #	  where1 - rows whose start or end falls inside a gap row with
    #		   the same bin and chrom (checked per chrN_gap table)
    #	  where2 - rows with a negative start, or an end beyond the
    #		   chromosome size in chromInfo
    #	  list   - common select list; the quoted table name is returned
    #		   as column "pop", which "grep -v pop" then uses to drop
    #		   the header line of the query output
    #	NOTE(review): where1 requires t.bin=g.bin and only tests the two
    #	end points, so overlaps across bins or items spanning a whole gap
    #	are not selected; the DELETEs match on chrom and chromStart only.
    set where1 = "where t.bin=g.bin and t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
    set where2 = "t, chromInfo c where t.chromStart < 0 or (t.chrom=c.chrom and t.chromEnd > c.size)"
    set list = "as pop, t.chrom, t.chromStart from"
    foreach db (hg16 hg17)
	rm -f $db.delete.sql
	touch $db.delete.sql
	foreach p (Ad Ed Xd SnpAd SnpEd SnpXd)
	    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
		echo "select 'tajd$p' $list tajd${p} t,chr${c}_gap g $where1" | \
		    hgsql $db | \
		    grep -v pop | \
		    awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}' \
		    >> $db.delete.sql
	    end
	    echo "select 'tajd$p' $list tajd${p} $where2" | \
		hgsql $db | \
		grep -v pop | \
		awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}'\
		>> $db.delete.sql
	end
	hgsql $db < $db.delete.sql
    end


# GENE SORTER (AKA: FAMILY BROWSER) (DONE - 2004-06-16 - Hiram)
# Added knownToU133Plus2 track (2004-10-14)
#	to be done after knownGene tables are complete from known gene
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
# The dated directory is the real work area; geneSorter is a symlink to
# it and is the name used by all later steps.
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2004-06-15
ln -s /cluster/data/hg17/bed/geneSorter.2004-06-15 \
	/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
#	pepPredToFa reads the knownGenePep table and writes known.faa;
#	formatdb builds the database files named known.* from it.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	This command is in /projects/compbio/bin/$MACH/formatdb

# Copy over database to bluearc
#	(any previous copy is removed first; the cluster jobs read the
#	database from /cluster/bluearc/hg17/blastp/known)
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
	/cluster/bluearc/hg17/blastp

#	Had to pick up a new blastall binary (2004-06-15)
#	Our old one would no longer run on our systems that have
#	updated Linux versions
#	The 2.2.9 release is unpacked in /cluster/bluearc/blast229, which is
#	where the blastSome script of the cluster run expects blastall and
#	its data directory.
mkdir /cluster/bluearc/blast229
cd /cluster/bluearc/blast229
wget --timestamping \
    ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/blast-2.2.9-ia32-linux.tar.gz
wget --timestamping \
    ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ChangeLog.txt
wget --timestamping \
    ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ReleaseNotes.txt
tar xvzf blast-2.2.9-ia32-linux.tar.gz


# Split up fasta file into bite sized chunks for cluster
#	faSplit is asked for 8000 pieces named split/kg*.fa
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
#	blastSome arguments: $1 = query fasta chunk, $2 = output file.
#	It runs blastp against the known database copied to the bluearc,
#	with e-value cutoff 0.01 and tabular output (-m 8).
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
	-e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod +x blastSome

# Make gensub2 file
#	one blastSome job per split file, writing out/<root>.tab
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs:     182148s    3035.81m    50.60h    2.11d  0.006 y
# IO & Wait Time:                 22954s     382.56m     6.38h    0.27d  0.001 y
# Average job time:                  26s       0.44m     0.01h    0.00d
# Longest job:                      372s       6.20m     0.10h    0.00d
# Submission to last job:           871s      14.52m     0.24h    0.01d

# Load into database.  This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7749 files
# Loading database with 11799667 rows
#	Hg16 was:       11376875 rows
# real    30m10.761s
# user    5m25.490s
# sys     1m0.630s

cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene
#	hgsql -e "select count(*) from knownToRefSeq;" hg17
#	row count changed from 36078 in Hg16 to 36082

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
	> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#	hgsql -e "select count(*) from knownToLocusLink;" hg17
#	row count went from 36078 in Hg16 to 36082

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
#	hgsql -e "select count(*) from knownToPfam;" hg17
#	row count dropped from 30467 in Hg16 to 29725


# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
#	hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
#	row count dropped from 35817 in Hg16 to 35739

# Create expression distance table - takes about an hour
    hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2
# Got 35739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
    #	hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
    #	row count went from 35,817,000 in Hg16 to 35,739,000
    #	real    108m1.671s
    #	user    89m30.680s
    #	sys     3m6.800s

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
    #	hgsql -e "select count(*) from knownToU133;" hg17
    #	row count went from 37,634 in Hg16 to 36,795

# Create expression distance table.  This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133
    #	211 genes, 42 weights, 26.500000 total wieght
    #	Got 36795 unique elements in affyUclaNorm
    #	real    154m1.058s
    #	user    134m45.000s
    #	sys     3m1.990s

# Create table that maps between known genes and 
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
    #	row count went from 18780 in Hg16 to 18796
    #	The hgFixed.gnfHumanU95Exps argument is unused, so it does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
	hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95
    #	row count went from 17711000 in Hg16 to 17710000
    #	real    21m37.703s
    #	user    13m35.110s
    #	sys     0m28.470s

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes only 10 minutes.)

hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1h
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 9756 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

# create table mapping knownGenes to affyU133Plus2 table (2004-10-14, hartera)
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2

# Make sure that GO database is up to date.
# See README in /cluster/store1/geneOntology.
#	I update this GO database very carefully, checking that all
#	structures in it remain the same from release to release and
#	backing up the current go DB in a backup database.  In this case
#	the backup is go040107 - when it was loaded for Mm4, and the new
#	go database is based on data from Dec 17th 2003 and Feb 2004 according
#	to the time stamp on the fetched data.  This build was done in
#	/cluster/store1/geneOntology/20040217

cd /cluster/data/hg17/bed/geneSorter

# XXX - DO NOT YET HAVE ensGene table - must wait on Ensembl to release that
# XXX - have not created the knownToEnsembl table yet - 2004-07-15 - Hiram
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
#	table row count went from previous version: 36068 to 38251

# Make knownToCdsSnp table (DONE Nov 11, 2004, Heather)
  ssh hgwdev
  nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 165728
# unique 34013
# approx. 5 minutes running time

# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/ce1/blastp should have data

#	The blast jobs below can be run on the kk or kk9 clusters
# Create the ceBlastTab
ssh kk9
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
#	Only takes 10 minutes on an idle cluster
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs:      32023s     533.72m     8.90h    0.37d  0.001 y
# IO & Wait Time:                 20643s     344.05m     5.73h    0.24d  0.001 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest job:                      110s       1.83m     0.03h    0.00d
# Submission to last job:          1911s      31.85m     0.53h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
#	row count went from 27620 to 27616

# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeMm5.doc for procedure
#	the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
	-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs:     139041s    2317.34m    38.62h    1.61d  0.004 y
# IO & Wait Time:                 21227s     353.79m     5.90h    0.25d  0.001 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest job:                      260s       4.33m     0.07h    0.00d
# Submission to last job:          1137s      18.95m     0.32h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
#	row count went from 36471 to 36638

# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeRn3.doc for procedure.
#	Files were put in this directory: /cluster/bluearc/rn3/blastp/

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
    -p blastp -d /cluster/bluearc/rn3/blastp/known \
    -i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
#Completed: 7749 of 7749 jobs
#CPU time in finished jobs:      31035s     517.25m     8.62h    0.36d  0.001 y
#IO & Wait Time:                 38472s     641.20m    10.69h    0.45d  0.001 y
#Average job time:                   9s       0.15m     0.00h    0.00d
#Longest job:                       75s       1.25m     0.02h    0.00d
#Submission to last job:           169s       2.82m     0.05h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7749 files
#Loading database with 25574 rows

# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dr1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dr1
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
	-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs:     100217s    1670.28m    27.84h    1.16d  0.003 y
# IO & Wait Time:                 23697s     394.95m     6.58h    0.27d  0.001 y
# Average job time:                  16s       0.27m     0.00h    0.00d
# Longest job:                      233s       3.88m     0.06h    0.00d
# Submission to last job:          1667s      27.78m     0.46h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
#	row count went from 32971 to 33023

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs:      20738s     345.64m     5.76h    0.24d  0.001 y
# IO & Wait Time:                 22018s     366.96m     6.12h    0.25d  0.001 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest job:                       39s       0.65m     0.01h    0.00d
# Submission to last job:           572s       9.53m     0.16h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
#	row count went from 18286 to 18265

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dm1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs:      82022s    1367.03m    22.78h    0.95d  0.003 y
# IO & Wait Time:                 21982s     366.37m     6.11h    0.25d  0.001 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest job:                      174s       2.90m     0.05h    0.00d
# Submission to last job:          1439s      23.98m     0.40h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
#	row count went from 29322 to 29341

####  Blat knownGene proteins to determine exons (braney 2004-06-20 DONE)
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir blat.hg17KG.2004-06-20
    rm blat.hg17KG
    ln -s  blat.hg17KG.2004-06-20 blat.hg17KG
    cd blat.hg17KG
    pepPredToFa hg17 knownGenePep known.fa
    hgPepPred hg17 generic blastKGPep00 known.fa
    grep ">" known.fa | sed "s/>//" > kgName.lst
    kgName hg17 kgName.lst blastKGRef00
    hgsql hg17 < ~/kent/src/lib/hg/blastRef.sql
    echo "rename table blastRef to blastKGRef00" | hgsql hg17
    echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg17
    ssh kk
    cd /cluster/data/hg17/bed/blat.hg17KG
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 3000 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 human.lst kg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../human.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

# Completed: 133676 of 133676 jobs
# CPU time in finished jobs:   29661130s  494352.16m  8239.20h  343.30d  0.941 y
# IO & Wait Time:               2181179s   36352.99m   605.88h   25.25d  0.069 y
# Average job time:                 238s       3.97m     0.07h    0.00d
# Longest job:                   105972s    1766.20m    29.44h    1.23d

    ssh eieio
    cd /cluster/data/hg17/bed/blat.hg17KG
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    pslUniq cooked.psl hg17KG.psl
    pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft

# BLASTZ MM4 (DONE - 2004-06-22 - Hiram)
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.mm4.2004-06-21
    cd /cluster/data/hg17/bed
    ln -s  blastz.mm4.2004-06-21 blastz.mm4
    cd blastz.mm4

    cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.mm4

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastz.mm4
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# Completed: 43648 of 43648 jobs
# CPU time in finished jobs:   16448001s  274133.36m  4568.89h  190.37d  0.522 y
# IO & Wait Time:                751666s   12527.76m   208.80h    8.70d  0.024 y
# Average job time:                 394s       6.57m     0.11h    0.00d
# Longest job:                     8323s     138.72m     2.31h    0.10d
# Submission to last job:         44244s     737.40m    12.29h    0.51d

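    #	Second cluster run, to lift the raw alignments into lav files.
    #	Do not run this on the big cluster; it would bring
    #	the file server to its knees.  Run this on the small cluster.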
    ssh kki
    cd /cluster/data/hg17/bed/blastz.mm4
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs:       3925s      65.42m     1.09h    0.05d  0.000 y
# IO & Wait Time:                  6208s     103.46m     1.72h    0.07d  0.000 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest job:                      289s       4.82m     0.08h    0.00d
# Submission to last job:          2800s      46.67m     0.78h    0.03d

    #	Third cluster run to convert lav's to axt's
    #	Does not work on kki since /scratch on the iservers is not the
    #	same as /scratch on the other clusters.
    ssh kk
    cd /cluster/data/hg17/bed/blastz.mm4
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       2389s      39.82m     0.66h    0.03d  0.000 y
# IO & Wait Time:                 13374s     222.90m     3.71h    0.15d  0.000 y
# Average job time:                 350s       5.84m     0.10h    0.00d
# Longest job:                     1426s      23.77m     0.40h    0.02d
# Submission to last job:          1440s      24.00m     0.40h    0.02d

    #	chr19 failing due to out of memory.  Run this job individually
    #	on kolossus, adjusting the location of the nib directories:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm4
    sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
	x86_64-chromlav2axt
    chmod +x x86_64-chromlav2axt
    time ./x86_64-chromlav2axt \
	/cluster/data/hg17/bed/blastz.mm4/lav/chr19 \
	/cluster/data/hg17/bed/blastz.mm4/axtChrom/chr19.axt \
	/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
	/cluster/bluearc/scratch/mus/mm4/softNib
    #	real    24m28.955s
    #	user    6m40.990s
    #	sys     1m16.500s

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4
    mkdir -p pslChrom
    set tbl = "blastzMm4"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	This takes more than an hour.  You can shorten this by changing
    #	that command to a simple echo, put the results into a file,
    #	split the file into four parts and run the four files as shell
    #	scripts on eieio to have four processes running at the same
    #	time.  Load on eieio gets up to about 20 which is reasonable.

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/pslChrom
    bash
    for F in chr*_blastzMm4.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
	echo "${F} done"
    done
    # this is a 55 minute job
    # exit bash if you are tcsh

    # featureBits on blastzMm3 or 4 will not work on hgwdev; it runs out of
    # memory.  But if you reset your ~/.hg.conf to use the read-only
    # user and contact the hgwdev host, then you can use the x86_64
    # featureBits instead:
    # featureBits hg16 blastzMm4
    # 1056761609 bases of 2865248791 (36.882%) in intersection
    # featureBits hg17 blastzMm4
    # 1056201417 bases of 2866216770 (36.850%) in intersection

# CHAIN MM4 BLASTZ (DONE - 2004-06-29 - Hiram)
# redone with the 'axtFilter -notQ_random' removed - 2004-06-23

# The axtChain is best run on the small kluster, or the kk9 kluster
    ssh kk9
    mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
    mkdir out chain

    ls -1S /cluster/data/hg17/bed/blastz.mm4/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

#  May need -minScore=5000 for all chroms if chr19 won't finish on kolossus

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
	/iscratch/i/mm4/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain

    # 46 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
# Completed: 45 of 46 jobs
# CPU time in finished jobs:       6575s     109.58m     1.83h    0.08d  0.000 y
# IO & Wait Time:                  9274s     154.57m     2.58h    0.11d  0.000 y
# Average job time:                 352s       5.87m     0.10h    0.00d
# Longest job:                     3121s      52.02m     0.87h    0.04d
# Submission to last job:          3121s      52.02m     0.87h    0.04d
    #	one job wouldn't finish due to memory usage
    #	run the chr19 job on kolossus, takes an hour, gets up to 4 Gb
    #	memory usage

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    #	real    17m17.639s
    #	user    9m54.240s
    #	sys     1m31.210s
    #	(1.9 Gb result file !)

    time chainSplit chain all.chain
    #	real    27m32.278s
    #	user    9m46.970s
    #	sys     2m45.960s

    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg17 ${c}_chainMm4 $i
        echo done $c
    end

    #	featureBits hg17 chainMm4
    #	2829135227 bases of 2866216770 (98.706%) in intersection
    #	featureBits hg16 chainMm4
    #	2828363353 bases of 2865248791 (98.713%) in intersection

# NET MM4 (DONE - 2004-06-29 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
                        /cluster/data/mm4/chrom.sizes ../preNet/$i
    end

    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
	/cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    #	memory usage 2504171520, utime 19373 s/100, stime 5906

    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    time netClass hNoClass.net hg17 mm4 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman
    #	real    19m33.421s
    #	user    10m37.130s
    #	sys     1m45.630s

    # If things look good do
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    time netFilter -syn mouse.net > mouseSyn.net
    #	real    13m24.885s
    #	user    7m37.100s
    #	sys     1m5.760s

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg17 netMm4 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm4 stdin
    #	real    44m20.735s
    #	user    15m58.620s
    #	sys     1m58.720s
    # check results
    # featureBits hg17 netMm4
    #	2824272033 bases of 2866216770 (98.537%) in intersection
    # featureBits hg16 netMm4
    #	2823565051 bases of 2865248791 (98.545%) in intersection

    # featureBits hg17 syntenyNetMm4
    #	2785830955 bases of 2866216770 (97.195%) in intersection
    # featureBits hg16 syntenyNetMm4
    #	2786960572 bases of 2865248791 (97.268%) in intersection

    # Add entries for net and chain to mouse/hg17 trackDb

    # make net
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    mkdir mouseNet
    time netSplit mouse.net mouseNet
    #	real    12m1.478s
    #	user    8m35.050s
    #	sys     1m7.230s

    #	extract axt's from net, and convert to maf's (DONE - Kate - 2004-06-24)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain
    mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
    foreach f (mouseNet/chr*.net)
        set c = $f:t:r
        echo "netToAxt: $c.net -> $c.axt"
        rm -f ../axtNet/$c.axt
        netToAxt mouseNet/$c.net chain/$c.chain \
	    /cluster/data/hg17/nib /cluster/data/mm4/nib stdout | \
	    axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/mm4/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm4.
	echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
    end
'_EOF_'
# << for emacs
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log

    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtBest
    cd /cluster/data/hg17/bed/blastz.mm4/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/axtNet
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)

    #  Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
    /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	S1.len S2.len pslBest/${c}_blastzBestMm4.psl
	echo "Done: ${c}_blastzBestMm4.psl"
    end

    # Load tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/pslBest
    for I in chr*BestMm4.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done ${I}"
    done

     # check results
    # featureBits hg17 blastzBestMm4
    #	1017319919 bases of 2866216770 (35.493%) in intersection
    # featureBits hg16 blastzBestMm4
    #	996722004 bases of 2865248791 (34.787%) in intersection

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg17/axtBest/Mm4
     cd /gbdb/hg17/axtBest/Mm4
     ln -s /cluster/data/hg17/bed/blastz.mm4/axtNet/chr*.axt .
     cd /cluster/data/hg17/bed/blastz.mm4/axtNet
     rm -f axtInfoInserts.sql
     foreach f (/gbdb/hg17/axtBest/Mm4/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
    hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
    #	table axtInfo may already exist, ignore create error.
    hgsql hg17 < axtInfoInserts.sql

# MAKING MOUSE SYNTENY (DONE - 2004-06-29 - Hiram)

ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm4
cd /cluster/data/hg17/bed/syntenyMm4

# Copy all the needed scripts from /cluster/data/hg17/bed/syntenyRn3
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .

./syntenicBest.pl -db=hg17 -table=blastzBestMm4
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm4
./synteny2bed.pl
#	The five commands above
#	real    220m16.227s
#	user    0m22.940s
#	sys     0m3.960s

#	Used to load this in syntenyMm4, but that type is misleading to
#	the table browser and fails the checkTableCoords check.
#	Better to use this ensRatMusHom type:
#	Need a new name here for the Mm4 to not conflict with Rn3
sed -e 's/ensPhusionBlast/ensRatMm4Hom/g' \
      $HOME/kent/src/hg/lib/ensPhusionBlast.sql \
      > ensRatMm4Hom.sql
hgLoadBed hg17 ensRatMm4Hom ucsc100k.bed -sqlTable=ensRatMm4Hom.sql
    #	featureBits hg17 ensRatMm4Hom
    #	2549307611 bases of 2866216770 (88.943%) in intersection
    #	featureBits hg16 syntenyMm4
    #	2560252977 bases of 2865248791 (89.355%) in intersection

# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-06-29 - Hiram)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4/axtNet
    mkdir -p ../axtTight
    foreach i (*.axt)
      echo $i
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end

    # translate to psl
    cd ../axtTight
    mkdir ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
      echo "Done: $i"
    end

    # Load tables into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/pslTight
    for I in chr*TightMm4.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done ${I}"
    done

    #	Compare results with previous assembly:
    #	featureBits hg17 blastzTightMm4
    #	166569246 bases of 2866216770 (5.811%) in intersection
    #	featureBits hg16 blastzTightMm4
    #	162641577 bases of 2865248791 (5.676%) in intersection

    # copy  axt's to download area
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm4/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)

# BLASTZ MM4 CLEAN UP (DONE - 2004-07-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm4
    nice rm -rf raw &
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &


# BLASTZ CHIMP panTro1 (DONE 2004-06-22 kate)
# NOTE: Ran with abridge repeats=0, although SMSK was set
# Looked better than running with abridge=1, which had very
# chopped-up alignments

    ssh kk
    cd /cluster/data/hg17/bed
    mkdir -p blastz.panTro1.2004-06-22
    rm -f blastz.panTro1
    cd blastz.panTro1.2004-06-22

    cat << 'EOF' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# Specific settings for chimp
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/penn/human_chimp.q

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
# not used 
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.chimp
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chimp
SEQ2_DIR=/scratch/chimp/panTro1/nib
# not currently used
SEQ2_RMSK=/iscratch/i/chimp/panTro1/linSpecRep.human
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.panTro1.2004-06-22

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'EOF'
    # << this line keeps emacs coloring happy

    # first cluster run: raw blastz alignments
    ssh kk
    bash # if a csh/tcsh user
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
        # 160270 jobs written to batch
    para try, check, push, check, ....
#CPU time in finished jobs:    2399227s   39987.11m   666.45h   27.77d  0.076 y
#IO & Wait Time:                503100s    8385.00m   139.75h    5.82d  0.016 y
#Average job time:                  18s       0.30m     0.01h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            2073s      34.55m     0.58h    0.02d
#Submission to last job:         10843s     180.72m     3.01h    0.13d


    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    bash # if a csh/tcsh user
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
        # 341 jobs
    para try, check, push, etc ...
# CPU time in finished jobs:       3458s      57.63m     0.96h    0.04d  0.000 y
# IO & Wait Time:                 57996s     966.60m    16.11h    0.67d  0.002 y
# Average job time:                 180s       3.00m     0.05h    0.00d
# Longest job:                      483s       8.05m     0.13h    0.01d
# Submission to last job:          1498s      24.97m     0.42h    0.02d

    # third run: lav -> axt -> psl
    ssh kki
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
    mkdir axtChrom pslChrom run.2
    cd run.2
    # do.csh <lavDir>: per-chromosome job script (one jobList line per
    # ../lav/chr* directory).  It cd's into the lav directory, concatenates
    # the *.lav files in `sort -g` order through lavToAxt and axtSort to
    # write ../../axtChrom/<chr>.axt, then runs axtToPsl on that file to
    # write ../../pslChrom/<chr>.psl.  <chr> is the tail of the directory
    # name ($1:t).
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt stdin \
    /iscratch/i/hg17/bothMaskedNibs /iscratch/i/chimp/panTro1/nib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    for d in ../lav/chr*; do
      echo "do.csh $d" >> jobList
    done
    para create jobList
        # 46 jobs
    para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time:                  38s       0.64m     0.01h    0.00d
#Longest job:                      147s       2.45m     0.04h    0.00d
#Submission to last job:           147s       2.45m     0.04h    0.00d

    # Load database tables (takes an hour or so)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/pslChrom
cat > load.csh << 'EOF'
    # Load each per-chromosome psl file chrN.psl into table chrN_blastzPanTro1.
    foreach f (chr*.psl)
	# $f:r strips the .psl extension, giving e.g. chr1_blastzPanTro1
	set table = $f:r_blastzPanTro1
	echo "loading ${table}"
	# $table already carries the chrom prefix; do not prepend $f:r again
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 -table=${table} $f
    end
'EOF'
# << for emacs
    csh load.csh >&! load.log & 
    tail -100f load.log


# CHAIN CHIMP BLASTZ (6/23/04 kate)
    # Run axtChain on little cluster
    # first copy input to bluearc, as eieio bogs down if even the mini-cluster
    # gets input from it !?
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
    cp -rp axtChrom /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom

    ssh kki
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # doChain <in.axt> <out.chain> <out.log>: chain one chromosome's axt file.
    # axtChain writes to scratch files in /tmp (named with the chrom and the
    # process id), which are then moved to the requested output locations.
    cat << '_EOF_' > doChain
#!/bin/csh -fe
# chrom name: input path with extension and leading directories stripped
set c = $1:r:t
axtChain $1 -scoreScheme=/cluster/data/blastz/human_chimp.q \
        /iscratch/i/hg17/bothMaskedNibs \
        /iscratch/i/chimp/panTro1/nib /tmp/$c.chain.$$ > /tmp/$c.out.$$
set ret = $status
mv -f /tmp/$c.chain.$$ $2
mv -f /tmp/$c.out.$$ $3
# exit with the saved axtChain status, not the status of the last mv
exit $ret
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # Mark the temporary bluearc copy of the chain input for later removal.
    # The directory has to exist for the README to be written, so the
    # removal itself is left as a TODO rather than run before the echo.
    echo "remove after 7/1/04" > /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/README
    # TODO: remove the bluearc copy once it is no longer needed:
    #   rm -fr /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom

    # now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # TODO
    rm run1/chain/*.chain
    echo "remove after 7/1/04" > run1/chain/README

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain hg17 ${c}_chainPanTro1 $i
    end
    # TODO
    featureBits hg16 chainPanTro1Link
        #2627280557 bases of 2865248791 (91.695%) in intersection
    featureBits hg17 chainPanTro1Link
        # 2633869032 bases of 2866216770 (91.894%) in intersection


# NET CHIMP (DONE 2004-6-24 kate)
    # Redone to make chimp.net on 2004-10-11 kate (other files have
    # new times, but are the same as 6-24 versions)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    #chainPreNet all.chain ../S1.len ../S2.len stdout \
    #| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    #| netSyntenic stdin noClass.net

    time chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=10 ../S1.len ../S2.len human.net chimp.net
        # 42.860u 2.080s 2:11.11 34.2% 
    netSyntenic human.net noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    netClass noClass.net hg17 panTro1 human.net
    rm noClass.net

    # Make a 'syntenic' subset:
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    # TODO
    #rm noClass.net
    # Make a 'syntenic' subset of these with netFilter
    # NOTE: we used -chimpSyn filtering for the reciprocal best nets
    # on hg16 -- perhaps should use for nets here as well
    netFilter -chimpSyn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg17 netPanTro1 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyPanTro1 stdin
    # Add entries for chainPanTro1, netPanTro1 to
    # human/hg17 trackDb

    # save chimp net to downloads area
    # (chimp.net was written by chainNet in the dated blastz directory's
    #  axtChain subdirectory, which lives under hg17/bed)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    nice gzip chimp.net
    cp chimp.net.gz /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
    cd /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
    md5sum *.gz > md5sum.txt


# RECIPROCAL BEST CHAINS FOR ENSEMBL GENE BUILD (DONE 2004-10-11 kate)

    # Starting with the chimp-reference net, which contains the best human
    # alignments to chimp, extract the subset of chains in the net.
    # (these are the "best" chains of human alignments to chimp).
    # Net these chains and use the resulting human-reference net (the
    # "reciprocal best" net).  Extract the chains from this net to
    # obtain "reciprocal best" chains of chimp alignments to human.

    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    mkdir rBest
    grep chain all.chain | wc -l

    # extract "best" chains from the chimp-reference net
    time chainSwap all.chain stdout | \
        netChainSubset chimp.net stdin stdout | \
        chainSort stdin rBest/chimp.best.chain
    grep chain rBest/chimp.best.chain | wc -l
        # 64396 

    # for comparison later, extract "best" chains from human-reference net
    netChainSubset human.net all.chain stdout | \
        chainSort stdin rBest/human.best.chain
    cd rBest

    # net the best chains from the chimp net and pull the human-ref net
    # (Daryl accidentally deleted human.rbest.net and rebuilt it with the 
    #  same command on 8/14/2005, resulting in a file of the same size)
    time chainPreNet chimp.best.chain ../../S2.len ../../S1.len stdout | \
       chainNet stdin -minSpace=10 ../../S2.len ../../S1.len \
                         /dev/null human.rbest.net

    # extract "reciprocal best" chains from the "best" human-reference net
    netChainSubset human.rbest.net ../all.chain stdout | \
        chainSort stdin human.rbest.chain

    # take a look
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    cd rBest
    mkdir rBestChain
    chainSplit rBestChain human.rbest.chain
    hgLoadChain hg17 chr7_rBestChainPanTro1 rBestChain/chr7.chain
        # Loading 1639 chains into hg17.chr7_rBestChainPanTro1
    mkdir bestChain
    chainSplit bestChain human.best.chain
    hgLoadChain hg17 chr7_bestChainPanTro1 bestChain/chr7.chain
        # Loading 6516 chains into hg17.chr7_bestChainPanTro1
    # compare 
    hgsql hg16 -s -e "select count(*) from chr7_rBestChainPanTro1"
        # 2416
    # spot-checked by comparing chr7 best and rbest:
    # 1. for a chain appearing in rBest, click thru to human browser,
    #   then via chimp net back to human browser at same region
    # 2. for a chain in "best", but not rBest, do the same, verify
    #   that it produces a different region in the human browser

    # post pre-Q/A file for ensembl download
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/rBest
    gzip human.rbest.chain
    cp human.rbest.chain.gz \
        /usr/local/apache/htdocs/kate/ensembl/hg17-panTro1.rbest.chain.gz
    cd /usr/local/apache/htdocs/kate/ensembl
    md5sum *.gz > md5sum.txt
    mv hg17-panTro1.rbest.chain.gz /usr/local/apache/htdocs/hg17/vsPanTro1/hg17.panTro1.rbest.chain.gz

    # save as reciprocal best liftover chain (2005-02-22 kate)
    gunzip -c  human.rbest.chain.gz > \
        /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.rbest.chain

    # cleanup (TODO -- after QA)
    ssh hgwdev
    hgsql hg17 -e "drop table chr7_rBestChainPanTro1"
    hgsql hg17 -e "drop table chr7_bestChainPanTro1"
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    mv rBest/human.rbest.chain.gz ..
    rm -fr rBest

# RECIPROCAL BEST AXT'S FROM RECIPROCAL BEST CHAIN (2005-08-16 kate)
#       (requested by Daryl)
    cd /cluster/data/hg17/bed/blastz.panTro1
    mkdir -p axtRBestNet
cat > makeRbestAxt.csh << 'EOF'
foreach f (axtChain/rBest/rBestChain/*.chain)
    set c = $f:t:t:r
    echo $c
    chainToAxt $f /cluster/data/hg17/nib  /cluster/data/panTro1/nib stdout \
        | axtSort stdin axtRBestNet/$c.axt
    end
'EOF'
# << for emacs
    csh makeRbestAxt.csh >&! makeRbestAxt.log &


# GENERATE CHIMP MAF FOR MULTIZ FROM NET (DONE 2004-06-24 kate)
    # Redo to fix overlap problem using 8/05 netToAxt (2005-08-16 kate)
    # Replace bad chr5 axtNet and mafNet (2006-01-05 kate) 
    #   There was apparently a bad chr5 nib for a while...
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
    # split the genome-wide net into per-chromosome nets in axtChain/net
    netSplit human.net net

    # makeMaf.csh refers to axtChain/net, axtChain/chain, axtNet and mafNet
    # relative to the blastz directory, so step up out of axtChain first
    cd ..
    mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
# For each per-chromosome net: extract the net's alignments from the matching
# chain file as sorted axt (axtNet/<chr>.axt), then convert that axt to maf
# (mafNet/<chr>.maf) with hg17. / panTro1. sequence-name prefixes.
foreach f (axtChain/net/*.net)
    set c = $f:t:r
    netToAxt $f axtChain/chain/$c.chain /cluster/data/hg17/nib \
        /cluster/data/panTro1/nib stdout | axtSort stdin axtNet/$c.axt
    axtToMaf axtNet/$c.axt  \
        /cluster/data/hg17/chrom.sizes /cluster/data/panTro1/chrom.sizes \
        mafNet/$c.maf -tPrefix=hg17. -qPrefix=panTro1.
    end
'EOF'
# << for emacs
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log

    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp mafNet /cluster/bluearc/hg17/mafNet/panTro1


# MAKE PANTRO1 DOWNLOADABLES (DONE 2004-09-14 kate)
#  Redo panTro1.net.gz (it was truncated) 2004-10-07 kate
# Redo axtNets with non-overlapped versions (2005-08-29 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate) 
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1
    # gzip chains and nets
    mkdir gz
    cd gz
    nice gzip -c ../axtChain/all.chain  > panTro1.chain.gz
    nice gzip -c ../axtChain/human.net > panTro1.net.gz
    wc -l *.gz
    cd ../axtNet
    time nice gzip *.axt
        # 46 mins.
   
    ssh hgwdev
    # copy chains and nets to downloads area
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p vsPanTro1
    cd vsPanTro1
    mv /cluster/data/hg17/bed/blastz.panTro1/gz/*.gz .
    md5sum *.gz > md5sum.txt
    # copy in README and edit
    rmdir /cluster/data/hg17/bed/blastz.panTro1/gz
    mkdir -p axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.panTro1/axtNet/*.axt.gz .
    md5sum *.gz > md5sum.txt


# RESCORE CHICKEN BLASTZ (DONE 6/23/04 angie)
    # Webb noticed low scores when using non-default BLASTZ_Q scoring matrix 
    # and repeats abridged --
    # PSU's restore_rpts program rescored alignments with default matrix 
    # instead of BLASTZ_Q matrix.  Rescore them here so the chainer sees 
    # the higher scores:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
    mkdir axtChrom.rescore
    foreach f (axtChrom/chr*.axt)
      axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
        $f axtChrom.rescore/$f:t
    end
    mv axtChrom axtChrom.preRescore
    mv axtChrom.rescore axtChrom


# CHAIN CHICKEN BLASTZ (DONE 6/23/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # doChain <in.axt> <out.chain> <out.log>: run axtChain on one
    # chromosome's axt file with the HoxD55.q scoring matrix, the
    # chickenHumanTuned.gap linear gap file and -minScore=5000.  The chain is
    # written to $2 and axtChain's stdout is redirected to $3 (these are the
    # chain/ and out/ files named in the gsub template).
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
         -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
         -minScore=5000 $1 \
    /iscratch/i/hg17/bothMaskedNibs \
    /iscratch/i/galGal2/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # axtChrom/chr18_random.axt is empty, so the {out line +} check failed:
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#Average job time:                  46s       0.76m     0.01h    0.00d
#Longest job:                      273s       4.55m     0.08h    0.00d
#Submission to last job:           519s       8.65m     0.14h    0.01d

    # now on the cluster server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain hg17 ${c}_chainGalGal2 $i
    end


# NET CHICKEN BLASTZ (DONE 6/23/04 angie)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    netClass noClass.net hg17 galGal2 human.net

    # Make a 'syntenic' subset:
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with netFilter
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg17 netGalGal2 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyGalGal2 stdin
    # Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to 
    # human/hg17 trackDb

# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
# see makeXenTro1.doc and search for zb.hg17
# The results of this are also symlinked under hg17/bed

# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 6/23/04 angie)
#       Redo net axt's and maf's to fix overlap problem (use 8/05 netToAxt)
#               (2005-08-16 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    netSplit human.net net
    cd ..
    mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
        /cluster/data/galGal2/nib stdout \
      | axtSort stdin axtNet/$chr.axt
      axtToMaf axtNet/$chr.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/galGal2/chrom.sizes \
            mafNet/$chr.maf -tPrefix=hg17. -qPrefix=galGal2.
    end
'EOF'
# << for emacs
    csh makeMaf.csh >&! makeMaf.log &
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp mafNet /cluster/bluearc/hg17/mafNet/galGal2


# MAKE VSGALGAL2 DOWNLOADABLES (REDONE 9/13/04 angie)
#   REDO axtNet's to fix overlaps (2005-09-12 kate)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
    gzip -c all.chain > /cluster/data/hg17/zip/chicken.chain.gz
    gzip -c human.net > /cluster/data/hg17/zip/chicken.net.gz
    mkdir /cluster/data/hg17/zip/axtNet
    foreach f (axtNet/chr*axt)
      gzip -c $f > /cluster/data/hg17/zip/$f.gz
    end
    # Doh! above for loop didn't work because all axt's have been removed 
    # from this dir!  :|  Just this once, regenerate compressed axtNet on 
    # the fly:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/net
    foreach f (*.net)
      set chr = $f:t:r
      echo $chr
      netToAxt $f ../chain/$chr.chain /cluster/data/hg17/nib \
        /cluster/data/galGal2/nib stdout \
      | axtSort stdin stdout \
      | gzip -c > /cluster/data/hg17/zip/axtNet/$chr.axt.gz
    end    

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
    mv /cluster/data/hg17/zip/chicken*.gz .
    mv /cluster/data/hg17/zip/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.

    # REDO axtNet downloads to fix overlaps (2005-09-13 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.galGal2/axtNet
    nice gzip *.axt
    md5sum *.axt.gz > md5sum.txt

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
    mv axtNet axtNet.old
    ln -s /cluster/data/hg17/bed/blastz.galGal2/axtNet .


# 8-WAY MULTIZ MULTIPLE ALIGNMENT WITH MM5 (DONE 2004-07-13 kate)
#       Redo, below to fix overlapping alignments (2005-08-16 kate)

    ssh eieio
    set multizDir = multiz.2004-07-13
    set workingDir = /cluster/bluearc/hg17/$multizDir
    ln -s $workingDir /cluster/bluearc/hg17/multiz8way
    mkdir -p $workingDir
    mkdir -p /cluster/data/hg17/bed/$multizDir
    cd /cluster/data/hg17/bed/$multizDir

# wrapper script for multiz
    # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) 
    # NOTE: next time, modify script so it only needs one arg -- saves the
    # multiple dirname in a file for use by the next run
    # doMultiz.csh <maf1> <maf2> <out.maf>: create the output file's parent
    # directory ($3:h), then run multiz on the two input mafs and redirect
    # its stdout to <out.maf>.  The gsub template supplies all three paths.
    cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
    cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
    chmod +x doMultiz.csh

    ssh eieio
    set workingDir = /cluster/bluearc/hg17/multiz.2004-07-13

    # copy mafs to bluearc -- chimp
    mkdir $workingDir/panTro1
    cp /cluster/data/hg17/bed/blastz.panTro1/mafNet/*.maf \
                $workingDir/panTro1
    ls $workingDir/panTro1/*.maf > chrom.lst

    # mouse
    mkdir $workingDir/mm5
    cp /cluster/data/hg17/bed/blastz.mm5/mafNet/chr*.maf $workingDir/mm5

    # rat
    mkdir $workingDir/rn3
    cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3

    # dog
    mkdir $workingDir/canFam1
    foreach f (/cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet/chr*.maf)
        set c = $f:r:r:t
        echo $c
        cp $f $workingDir/canFam1/$c.maf
    end

    # chicken
    mkdir $workingDir/galGal2
    foreach f (/cluster/data/hg17/bed/blastz.galGal2/mafNet/chr*.maf)
        set c = $f:r:r:t
        cp $f $workingDir/galGal2/$c.maf
    end

    # fugu
    mkdir $workingDir/fr1
    cp /cluster/data/hg17/bed/blastz.fr1/mafNet/chr*.maf $workingDir/fr1

    # zebrafish
    mkdir $workingDir/danRer1
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet/chr*.maf \
                        $workingDir/danRer1

    # first multiz - add in mm5 mouse to human/chimp
    # 
    ssh kki
    set multizDir = multiz.2004-07-13
    set workingDir = /cluster/bluearc/hg17/$multizDir
    cd /cluster/data/hg17/bed/$multizDir
    mkdir run.mm5
    cd run.mm5
    echo "mm5/panTro1" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    para create jobList
        # 46 jobs
    para try, check, push, check
# CPU time in finished jobs:       6620s     110.33m     1.84h    0.08d  0.000 y
# IO & Wait Time:                  3685s      61.42m     1.02h    0.04d  0.000 y
# Average job time:                 224s       3.73m     0.06h    0.00d
# Longest job:                      819s      13.65m     0.23h    0.01d
# Submission to last job:          1474s      24.57m     0.41h    0.02d
    cd ..

    # rat
    mkdir run.rn3
    cd run.rn3
    echo "rn3/panTro1mm5" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    para create jobList
        # 46 jobs
    para try, check, push, check
    cd ..

    # dog
    mkdir run.canFam1
    cd run.canFam1
    echo "canFam1/panTro1mm5rn3" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    para create jobList
        # 46 jobs
    para try, check, push, check
    cd ../

    # chicken
    mkdir run.galGal2
    cd run.galGal2
    echo "galGal2/panTro1mm5rn3canFam1" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    # no alignment file for chr18_random -- create one so we can create jobList
    touch $workingDir/galGal2/chr18_random.maf
    para create jobList
        # 46 jobs
    para try, check, push, check
    # 1 crashed job for empty file chr18_random
    cd ..

    # fugu
    mkdir run.fr1
    cd run.fr1
    echo "fr1/panTro1mm5rn3canFam1galGal2" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    # create empty alignment file for missing one (no alignments)
    touch /cluster/bluearc/hg17/multiz.2004-07-13/fr1/chr6_hla_hap1.maf
    para create jobList
        # 46 jobs
    para try, check, push, check
    # 1 crashed job for empty file chr6_hla_hap1
    cd ..

    # zebrafish
    mkdir run.danRer1
    cd run.danRer1
    echo "danRer1/panTro1mm5rn3canFam1galGal2fr1" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    para create jobList
        # 46 jobs
    para try, check, push, check
    cd ..

    # copy 8-way mafs to build directory
    ssh eieio
    set multizDir = multiz.2004-07-13
    set workingDir = /cluster/bluearc/hg17/$multizDir
    ln -s $workingDir/panTro1mm5rn3canFam1galGal2fr1danRer1 $workingDir/maf
    cd /cluster/data/hg17/bed/multiz.2004-07-13
    mkdir maf
    cp $workingDir/maf/*.maf maf

    # copy to download area (2004-07-27 angie)
    # moved gzipped files to mafDownload dir and recreated symlinks 
    #   (2006-04-23 kate)
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
    # gzipped & copied maf files from /cluster/data/hg17/bed/multiz8way/maf
    # dumped table and gzipped for download (in response to a user request
    # made after the files were removed when the track was replaced by 18way).
    cd /cluster/data/hg17/bed/multiz8way/mafDownloads
    hgsqldump --all -c --tab=. hg17 multiz8way
    ssh kkstore02 \
        'gzip /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}'
    ln -s /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}.gz \
        /usr/local/apache/htdocs/goldenPath/hg17/multiz8way

    # load summary table (2005-09-27)
    cd /cluster/data/hg17/bed/multiz.2004-07-13/maf
    time cat chr*.maf | hgLoadMafSummary hg17 multiz8waySummary stdin
    # 30 minutes ?
    # NOTE: this didn't improve track display time at 5MB, so
    #   I'm leaving out of trackDb (sticking with pairwise maf's) for now
    #   It may be that this helps performance only with larger numbers
    #   of species.

    # Create upstream files for download (2004-09-13 kate)
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz8way
    echo hg17 panTro1 mm5 rn3 canFam1 galGal2 fr1 danRer1 > org.txt
    # mafFrags takes a while
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        mafFrags hg17 multiz8way up.bed upstream$i.maf -orgs=org.txt
        rm up.bed
    end
    ssh eieio
    cd /cluster/data/hg17/bed/multiz8way
    nice gzip upstream{1000,2000,5000}.maf
        # 6 mins.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    ln -s mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 multiz8way
    mv /cluster/data/hg17/bed/multiz8way/upstream*.maf.gz multiz8way



# PHYLO-HMM (PHASTCONS) CONSERVATION FOR 8-WAY WITH MM5 (DONE 2004-07-20 kate)
# (this was partially redone by acs using the new phastCons, 08-28;
# I've tried to merge the two sets of docs into one cohesive
# description)
# More revisions, acs, 09-13

    ssh eieio
    set path = ($path /cluster/bin/phast)
    cd /cluster/data/hg17/bed/multiz.2004-07-13
    mkdir cons
    cd cons

    #break up the genome-wide MAFs into pieces
    mkdir /cluster/bluearc/hg17/chrom
    cd /cluster/data/hg17
    foreach f (`cat chrom.lst`)
        echo $f
        cp $f/*.fa /cluster/bluearc/hg17/chrom
    end

    ssh kki
    cd /cluster/data/hg17/bed/multiz.2004-07-13/cons
    mkdir run.split
    cd run.split
    set WINDOWS = /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/sh
# usage: doSplit.sh <chrom.maf>
# Run msa_split on one chromosome's maf (with that chromosome's .fa from
# FA_SRC as the reference sequence), writing SS-format pieces to local
# /scratch/msa_split.  Each piece is then gzipped into $WINDOWS, the local
# copies are removed, and a <chrom>.done marker is appended in $WINDOWS.
# Exits 1 if msa_split, the cd, or any gzip fails.

PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg17/chrom
WINDOWS=/cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS

maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,fr1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
[ $? -eq 0 ] || exit 1
echo "Copying..."
cd /scratch/msa_split || exit 1
# Check every gzip, not just the last one: a failed copy must not be followed
# by removal of the local pieces and a .done marker.
for file in $c.*.ss ; do
    gzip -c $file > ${WINDOWS}/$file.gz || exit 1
done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs
    chmod +x doSplit.sh
    rm -f jobList
    foreach file (/cluster/bluearc/hg17/multiz.2004-07-13/maf/*.maf) 
        set c = $file:t:r
	echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
    end
    
    para create jobList
        # 46 jobs
    para try
    para check
    para push
        # 2 crashed jobs -- due to no alignments in input maf
        # chr18_random, chr6_hla_hap1
    cd ..

    # now generate conservation scores and predicted elements
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz.2004-07-13/cons
    mkdir run.elements   
    # despite the name, I've put the elements and the new conservation
    # scores here

    # first produce a rough starting model; in this case, we can just
    # use the model previously estimated (see the entry below on PHYLOFIT/PHASTCONS)
    cp /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1/hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod starting-tree.mod
    # In other cases, it would be sufficient to choose an arbitrary
    # input file from the WINDOWS directory (choose one with plenty of
    # data, i.e., large NTUPLES) and run phyloFit on it with the
    # correct tree topology, e.g.,
    # phyloFit -i SS datafile.ss --tree \
    #    "(((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),(fr1,danRer1))" \
    #    --out-root starting-tree

    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.294633 0.205082 0.205189 0.295097
    # This implies a GC content of 0.205 + 0.205 = 0.410

    # If you do *not* have a global tree model and you do not know
    # your GC content, you can get it directly from the MAFs with
    # a command like:
    #	msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,danRer1,fr1 \
    #       -i MAF --summary-only /cluster/data/hg17/bed/multiz.2004-07-13/maf/chr*.maf\
    #	    > maf_summary.txt
    # This will take a little while (30-60 min).  Run on eieio. 

    # now set up cluster job to estimate model parameters.  Parameters
    # will be estimated separately for each alignment fragment then
    # will be combined across fragments
    # doEstimate.sh <window.ss.gz> <logFile> <outRoot>: decompress one
    # alignment window and run phastCons on it in parameter-estimation mode
    # (--estimate-trees <outRoot>, --no-post-probs), starting from
    # starting-tree.mod in the current directory and logging to <logFile>.
    # jobs.lst below passes LOG/<root>.log and TREES/<root> for the last two
    # arguments.
    cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 12 --target-coverage 0.17 --quiet --log $2 --estimate-trees $3
EOF
    # Be sure to substitute in the right G+C content. Also, notice the
    # target coverage of 0.17.  We actually want 5% coverage here but
    # the final (posterior) coverage is only indirectly related to the
    # expected (prior) coverage.  One thing to consider is that we
    # only have about 40% alignment coverage (excluding chimp, which
    # doesn't help us much in identifying conserved regions).  As far
    # as phastCons is concerned, we want to aim for about 0.05 / 0.4 =
    # 0.125 coverage.  In this case, though, --target-coverage
    # 0.125 resulted in only about 4.1% coverage.  I had to iterate
    # a couple of times (using only chromosome 1) to find a value that
    # got me close to the target of 5%

    chmod u+x doEstimate.sh

    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobs.lst
    # watch out: bash assumed below in a few places
    for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do \
	root=`basename $f .ss.gz` ;\
	echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
    done

    # run cluster job
    ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ...
    # takes about an hour

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ls TREES/*.cons.mod > cons.txt
    phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
    ls TREES/*.noncons.mod > noncons.txt
    phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  It's all downhill
    # from here.

    cat << 'EOF' > doPhastCons.sh
#!/bin/sh
# usage: doPhastCons.sh <window.ss.gz>
# Run phastCons on one alignment window using the averaged models
# ave.cons.mod,ave.noncons.mod from the current directory.  The --viterbi
# output goes to ELEMENTS/<window>.bed; phastCons' stdout is collected in a
# scratch file and then gzipped to POSTPROBS/<window>.pp.gz.
# Exits non-zero if phastCons or the gzip fails, so that the cluster job is
# reported as failed instead of leaving a truncated .pp.gz behind.

mkdir -p /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
# chromosome name is the part of the window name before the first dot
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
if [ $? -ne 0 ] ; then
    rm -f $tmpfile
    exit 1
fi
gzip -c $tmpfile > /cluster/bluearc/hg17/phastCons/POSTPROBS/$pref.pp.gz
ret=$?
rm -f $tmpfile
exit $ret
EOF
    chmod u+x doPhastCons.sh

    rm -fr /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS
    rm -f jobs2.lst
    for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done

    # run cluster job
    ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ...
    logout
    # takes about 20 minutes

    # combine predictions and transform scores to be in 0-1000 interval
    # do in a way that avoids limits on numbers of args
    find /cluster/bluearc/hg17/phastCons/ELEMENTS -name "*.bed" > files
    rm -f splitfiles* all.raw.bed
    split files splitfiles
    for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done
    /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
    rm files splitfiles* 

    hgLoadBed hg17 phastConsElements all.bed
    hgLoadBed -chrom=chr1 hg17 phastConsElements all.bed

    # check coverage
    featureBits hg17 phastConsElements
#137850739 bases of 2866216770 (4.810%) in intersection
    # This should be close enough.  If necessary, you can rerun the
    # steps above with a different target coverage.  When hitting the
    # target is important, you may want to perform several iterations
    # using a representative subset of the entire dataset (human chr1
    # seems to work pretty well)

    # set up wiggle
    mkdir -p /cluster/bluearc/hg17/phastCons/wib
    # doWigAsciiToBinary.sh <chr>: concatenate that chromosome's
    # POSTPROBS/<chr>.*.pp.gz pieces, ordered numerically on the second
    # dot-separated field of the path, and pipe them to wigAsciiToBinary,
    # which is given -wibFile=.../wib/<chr>_phastCons.
    cat << 'EOF' > doWigAsciiToBinary.sh
#!/bin/sh
chr=$1
zcat `ls /cluster/bluearc/hg17/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/hg17/phastCons/wib/${chr}_phastCons stdin 
EOF
    chmod u+x doWigAsciiToBinary.sh

    rm -f jobs3.lst
    for chr in `ls /cluster/bluearc/hg17/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs3.lst ; done

    # run a little wigAsciiToBinary cluster job
    ssh kk, etc.

    # copy wibs and wigs from bluearc
    rsync -av /cluster/bluearc/hg17/phastCons/wib .

    # load track
    hgLoadWiggle hg17 phastCons -pathPrefix=/gbdb/hg17/phastCons/wib \
                wib/chr*_phastCons.wig
    mkdir -p /gbdb/hg17/phastCons/wib
    rm -f /gbdb/hg17/phastCons/wib/chr*phastCons.wib
    ln -s /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/wib/*.wib /gbdb/hg17/phastCons/wib
    chmod 775 . wib /gbdb/hg17/phastCons /gbdb/hg17/phastCons/wib
    chmod 664 wib/*.wib

    # move postprobs over and clean up bluearc 
    rsync -av /cluster/bluearc/hg17/phastCons/POSTPROBS .
    # (people sometimes want the raw scores)
    rm -r /cluster/bluearc/hg17/phastCons/ELEMENTS /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/wib

    # set up full alignment/conservation track ("multiz8way")
    # load multiz maf tables 
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz.2004-07-13
    set mafDir = /gbdb/hg17/multiz8way/maf
    set table = multiz8way
    mkdir -p $mafDir/$table
    ln -s `pwd`/maf/*.maf $mafDir/$table
    cd maf
    # pathPrefix must be the directory that holds the .maf symlinks made
    # above ($mafDir/$table); no "maf" subdirectory is created under it.
    hgLoadMaf hg17 -warn multiz8way -pathPrefix=$mafDir/$table

    # someone dropped from hgwdev
    # reload (2007-03-19 kate)
    nice hgLoadMaf hg17 -warn multiz8way -pathPrefix=/gbdb/hg17/multiz8wayFixed
    cat /gbdb/hg17/multiz8wayFixed/*.maf | \
        nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 -maxSize=200000  \
            multiz8waySummary stdin

    # load blastz maf tables
    # TODO: change mafWiggle to use db names instead of species names
    # in speciesOrder 
    # link files into /gbdb table dir
    ln -s /cluster/data/hg17/bed/blastz.panTro1/mafNet $mafDir/chimp_netBlastz
    ln -s /cluster/data/hg17/bed/blastz.mm5/mafNet $mafDir/mouse_netBlastz
    ln -s /cluster/data/hg17/bed/blastz.rn3/mafNet $mafDir/rat_netBlastz
    ln -s /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet $mafDir/dog_netBlastz
    ln -s /cluster/data/hg17/bed/blastz.galGal2/mafNet $mafDir/chicken_netBlastz
    ln -s /cluster/data/hg17/bed/blastz.fr1/mafNet $mafDir/fugu_netBlastz
    ln -s /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet $mafDir/zebrafish_netBlastz

    # remove empty file, disliked by hgLoadMaf
    # NOTE: these shouldn't be empty -- next time, make sure previous
    # alignments are copied over to output maf (multiz won't if there's
    # an empty input file).
    rm chicken/chr18_random.maf
    rm fugu/chr6_hla_hap1.maf

    # load tables
    foreach s (chimp mouse rat dog chicken fugu zebrafish)
        set table = ${s}_netBlastz
        echo "$s $mafDir/$table"
        ~kate/bin/i386/hgLoadMaf hg17 -warn ${s}_netBlastz -pathPrefix=$mafDir/$table
    end

    # trackDb entry:
# track multiz8way
# shortLabel Conservation
# longLabel Chimp/Mouse/Rat/Dog/Chicken/Fugu/Zebrafish Multiz Alignments & Conservation
# group compGeno
# priority 149
# visibility pack
#color 0, 10, 100
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons
# yLineOnOff Off
# autoScale Off
# pairwise netBlastz
# speciesOrder chimp mouse rat dog chicken fugu zebrafish


# PHASTCONS SCORES DOWNLOADABLES (REDONE 6/15/05 angie)
    # Initially done 10/11/04, but using scores from run.cons -- which 
    # had been replaced by scores in run.elements, where I did not think 
    # to look for scores.  :(  !
    ssh eieio
    mkdir /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
    cd /cluster/data/hg17/bed/multiz8way/cons/run.elements/POSTPROBS
    foreach chr (`awk '{print $1;}' /cluster/data/hg17/chrom.sizes`)
      echo $chr
      nice zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \
      | nice gzip -c \
      > /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/$chr.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons
    # Doh!  /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 is 11G now -- 
    # too much to dump on hgwdev's / which is at 94%.  So don't do this:
    #mv /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 .
    # make symbolic links instead:
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
    cd /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
    ln -s /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/* .
    md5sum *.gz > md5sum.txt
    # make a README.txt.


# PHYLOFIT AND TREE-DOCTOR FOR 8-WAY: ESTIMATE PHYLOGENETIC TREE (acs)
# (This was originally done for phastCons but is not necessary with
# the new version.  However, it may be useful for other purposes, so
# I'm leaving it in as a separate entry.)

    # first estimate a model for the mammals
    ssh eieio
    cd /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1

    # collect sufficient stats (takes maybe an hour)
    for file in *.maf ; do echo $file ; msa_view -i MAF $file -o SS --order hg17,panTro1,rn3,mm5,canFam1 > `basename $file .maf`.ss ; done
    ls *.ss | grep -v chr6_hla_hap2 > files
    msa_view '*files' --aggregate hg17,panTro1,rn3,mm5,canFam1 -i SS -o SS > all.ss
    # BTW, this can now be done in one step using something like:
    # msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1 -i MAF -o SS *.maf > all.ss
    # (modify to exclude certain files if necessary)

    # estimate model, with rate variation (takes about a minute)
    phyloFit all.ss --nrates 10 --tree "(((hg17,panTro1),(rn3,mm5)),canFam1)" --alpha 4.4 --EM --log log -i SS --out-root hprmc-rev-dg
    # (Actually, --nrates 4 should be more than adequate for most purposes)

    cat hprmc-rev-dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#TRAINING_LNL: -6889216721.159384
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
#  -0.865237    0.159990    0.554805    0.150442
#   0.229851   -1.194646    0.168269    0.796526
#   0.796651    0.168182   -1.194919    0.230086
#   0.150205    0.553556    0.159985   -0.863747
#TREE: (((1:0.006523,2:0.007997):0.103779,(3:0.104867,4:0.078911):0.265676):0.112364,5:0.112364);

    # now extrapolate to fish and chicken using tree_doctor and the CFTR 25 tree 
    # (replace numbers with names in hprmc-rev-dg.mod; this won't be necessary in the future)
    tree_doctor --rename "1->hg17;2->panTro1;3->rn3;4->mm5;5->canFam1" hprmc-rev-dg.mod > hprmc-rev-dg.names.mod
    # (obtain 8-way subtree from cftr25_hybrid.nh; also map names as necessary to match above)
    tree_doctor /cluster/data/nisc/targets/cftr/phyloHMMcons25/cftr25_hybrid.nh --prune-all-but hg16,chimp,mm3,rn3,dog,chicken,fr1,zfish --rename "hg16->hg17;mm3->mm5;chimp->panTro1;dog->canFam1;chicken->galGal2;zfish->danRer1" > cftr8way.nh
    # now merge (see tree_doctor help page for explanation)
    tree_doctor hprmc-rev-dg.names.mod --merge cftr8way.nh > hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod

    cat hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
#  -0.865237    0.159990    0.554805    0.150442
#   0.229851   -1.194646    0.168269    0.796526
#   0.796651    0.168182   -1.194919    0.230086
#   0.150205    0.553556    0.159985   -0.863747
#TREE: (((((hg17:0.006523,panTro1:0.007997):0.103779,(rn3:0.104867,mm5:0.078911):0.265676):0.019461,canFam1:0.205267):0.377150,galGal2:0.511134):0.536627,(danRer1:0.905323,fr1:0.922995):0.536627);


# CONSERVED NON-CODING (CNS) TRACK (acs 08/29/04)
# (depends on phastConsElements)

    cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements
    featureBits hg17 -bed=possibleCoding.bed -or twinscan:exon xenoMrna mrna intronEst 
    # (add SGP, exoniphy, possib. others if available)
    # now filter out all phastCons elements that overlap
    overlapSelect -nonOverlapping possibleCoding.bed all.bed cns.bed
    hgLoadBed hg17 cns cns.bed

# track cns
# shortLabel CNS
# longLabel Conserved Non-Coding (Cons Elements Minus Predicted Coding)
# priority 109.11
# group compGeno
# visibility hide
# type bed 5 .

    
# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-07-08 - Hiram)
#	Needed to download a new binary for this run.  Our Linux systems
#  XXX - I thought a new binary was needed.  Turned out it was already
#  here in our hg3rdParty CVS project.  All of this discussed here can
#  be simply fetched from cvs:  cvs co hg3rdParty/genscanlinux
#	have been updated since the last time, the old binary would not
#	run.  Go to: http://genes.mit.edu/GENSCAN.html
#	and then to: http://genes.mit.edu/license.html
#	Fill in the license agreement and you can then pick up the
#	README and the Linux version: genscanlinux.tar.uue.tgz
#	To uudecode that file, go to one of the Solaris home machines
#	and use the uudecode command:
#	uudecode genscanlinux.tar.uue.tgz
#	That produces the file: genscanlinux.tar
#	Which contains the files:
# drwxr-xr-x chris/burgelab    0 2003-02-17 11:48:44 ./
# -rw-r--r-- chris/burgelab 219056 2000-09-07 12:39:26 ./Arabidopsis.smat
# -rw-r--r-- chris/burgelab   6622 2000-09-07 12:39:26 ./HUMRASH
# -rw-r--r-- chris/burgelab    849 2000-09-07 12:39:26 ./HUMRASH.sample
# -rw-r--r-- chris/burgelab 219050 2000-09-07 12:39:26 ./HumanIso.smat
# -rw-r--r-- chris/burgelab 155735 2000-09-07 12:39:26 ./Maize.smat
# -rw-r--r-- chris/burgelab  24465 2000-09-07 12:39:26 ./README
# -rw-r--r-- chris/burgelab   6344 2000-09-07 12:39:27 ./HUMRASH.ps
# -rwxr-xr-x chris/burgelab 126365 2003-02-17 11:48:44 ./genscan
#
#	I placed these currently in: /cluster/home/hiram/GENSCAN/
#	I'll check with Angie where it should properly live ...
#  XXX - it already lives in 'cvs co hg3rdParty/genscanlinux'
#  These instructions should simply check it out right here in
#  bed/genscan and make the gsub command refer to these copies.

    ssh hgwdev
    mkdir /cluster/data/hg17/bed/genscan
    cd /cluster/data/hg17/bed/genscan
    cvs co hg3rdParty/genscanlinux

    ssh eieio
    cd /cluster/data/hg17/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the contigs
    # *that do not have pure Ns* (due to heterochromatin, unsequencable 
    # stuff) which would cause genscan to run forever.
    rm -f genome.list
    bash
    for f in `cat /cluster/data/hg17/contig.lst`
    do
      # Test sequence lines only.  The ">" header line presumably carries
      # the contig name (NT_...), which itself contains a letter from
      # [ACGT]; matching the whole file would then accept every contig,
      # including ones whose sequence is entirely N.
      if grep -v '^>' /cluster/data/hg17/$f.masked | grep -q '[ACGT]'; then
        echo /cluster/data/hg17/$f.masked >> genome.list
      fi
    done
    # exit your bash shell if you are [t]csh ...
    #	This egrep matched all the contigs in hg17.  I guess none of
    #	them are complete Ns* at this point.

    # Log into kki (not kk !).  kki is the driver node for the small
    # cluster (kkr2u00 - kkr8u00).  Genscan has problems running on the
    # big cluster, due to limited memory and swap space on each
    # processing node.
    ssh kki
    cd /cluster/data/hg17/bed/genscan
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try
    para check
    para push ... etc ...
# Completed: 379 of 380 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      79998s    1333.30m    22.22h    0.93d  0.003 y
# IO & Wait Time:                  2989s      49.82m     0.83h    0.03d  0.000 y
# Average job time:                 219s       3.65m     0.06h    0.00d
# Longest job:                     2999s      49.98m     0.83h    0.03d
# Submission to last job:          8324s     138.73m     2.31h    0.10d

    #	Running the single failed job on kolossus with a smaller window:
/cluster/bin/x86_64/gsBig /cluster/data/hg17/5/NT_006576/NT_006576.fa.masked \
        gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \
        -subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \
        -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000

    # If there were out-of-memory problems (run "para problems"), then 
    # re-run those jobs by hand but change the -window arg from 2400000
    # to something lower.  In build33, this was 22/NT_011519
    #  In build34 there were NO failures !

    # Convert these to chromosome level files as so:     
    ssh eieio
    cd /cluster/data/hg17/bed/genscan
    $HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf
    $HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \
	warn subopt/N*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/hg17/bed/genscan
    ldHgGene hg17 genscan genscan.gtf
    #	35 minute job
    #	Read 42807 transcripts in 325994 lines in 1 files
    #	  42807 groups 46 seqs 1 sources 1 feature types

    hgPepPred hg17 generic genscanPep genscan.pep
    #	Processing genscan.pep
    hgLoadBed hg17 genscanSubopt genscanSubopt.bed
    #	Reading genscanSubopt.bed
    #	Loaded 517157 elements of size 6
    #	Sorted
    #	Creating table definition for 
    #	Saving bed.tab
    #	Loading hg17

    #	featureBits hg17 genscan
    #	55323340 bases of 2866216770 (1.930%) in intersection

    #	featureBits hg16 genscan
    #	55333689 bases of 2865248791 (1.931%) in intersection

    #	featureBits hg17 genscanSubopt
    #	55986178 bases of 2866216770 (1.953%) in intersection
    #	featureBits hg16 genscanSubopt
    #	56082952 bases of 2865248791 (1.957%) in intersection

    #	Should be zero intersection with rmsk
    #	featureBits -chrom=chr1 hg17 genscan rmsk
    #	794 bases of 222827847 (0.000%) in intersection


# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/1/05 angie)
    # Originally done 7/1/04 for canFam1 -- redone 8/1/05 for canFam2.
    ssh kolossus
    cd /san/sanvol1/scratch/hg17/rmsk
    # Run Arian's DateRepsinRMoutput.pl to add extra columns telling 
    # whether repeats in -query are also expected in -comp species.  
    # Even though we already have the human-mouse linSpecReps,
    # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
    # additions.  So add mouse, then ignore it.  
    # Dog in extra column 1, Mouse in extra column 2
    foreach outfl ( *.out )
        echo "$outfl"
        /cluster/bluearc/RepeatMasker/DateRepeats \
          ${outfl} -query human -comp dog -comp mouse
    end
    # Now extract dog (extra column 1), ignore mouse.
    cd ..
    mkdir linSpecRep.notInDog
    foreach f (rmsk/*.out_canis-familiaris_mus-musculus)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                        linSpecRep.notInDog/$base.out.spec
    end
    # Clean up.
    rm rmsk/*.out_canis*
    rsync -av /san/sanvol1/scratch/hg17/linSpecRep.notInDog \
      /cluster/bluearc/scratch/hg/gs.18/build35/
    # Ask cluster-admin for an rsync.


# BLASTZ DOG (CANFAM1) (DONE 7/8/04 angie)
    ssh kk
    # space is awful tight on store4 -- use store7.  
    mkdir -p /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08
    ln -s /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08 \
      /cluster/data/hg17/bed/
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    # Use default (Human-Mouse) settings for starters.
    cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.canFam1.2004-07-08

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # first cluster run: raw blastz alignments
    ssh kk
    bash # if a csh/tcsh user
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    source DEF
    mkdir $RAW run.0
    /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
    sh ./xdir.sh
    cd run.0
    sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
    para create jobList
    para try, check, push, check, ....
    # Moving the human chr19 jobs up to the top of the jobList probably 
    # would have shaved 4 hours off the total time!  It was almost done 
    # after 6 hours, except for a few chr19 stragglers.
#Completed: 93775 of 93775 jobs
#Average job time:                 202s       3.37m     0.06h    0.00d
#Longest job:                    17806s     296.77m     4.95h    0.21d
#Submission to last job:         35523s     592.05m     9.87h    0.41d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    bash # if a csh/tcsh user
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    source DEF
    mkdir run.1 lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
    cd run.1
    wc -l jobList
    para create jobList
    para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time:                  36s       0.61m     0.01h    0.00d
#Longest job:                      302s       5.03m     0.08h    0.00d
#Submission to last job:          1143s      19.05m     0.32h    0.01d

    # third run: lav -> axt
    # (if non-default BLASTZ_Q is used in the future, put axtRescore in 
    # the pipe after lavToAxt)
    ssh kki
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    mkdir axtChrom pslChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
    /iscratch/i/gs.18/build35/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 46 of 46 jobs
#Average job time:                 300s       5.00m     0.08h    0.00d
#Longest job:                     1669s      27.82m     0.46h    0.02d
#Submission to last job:          1689s      28.15m     0.47h    0.02d


# CHAIN DOG BLASTZ (DONE 7/9/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/canFam1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 46 of 46 jobs
#Average job time:                 266s       4.43m     0.07h    0.00d
#Longest job:                     3578s      59.63m     0.99h    0.04d
#Submission to last job:          3578s      59.63m     0.99h    0.04d
    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Lots of chaff with scores in the 3000's.  Many very-high-scoring 
    # chains.  So filter the chain down somewhat...
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm chain/*
    chainSplit chain all.chain
    gzip all.chain.unfiltered

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg17 ${c}_chainCanFam1 $i
    end
    # Coverage is significantly higher than mouse:
    featureBits hg17 -chrom=chr1 chainCanFam1Link
#123999291 bases of 222827847 (55.648%) in intersection
# before filtering: 124750124 bases of 222827847 (55.985%) in intersection
    featureBits hg17 -chrom=chr1 chainMm5Link
#83773012 bases of 222827847 (37.595%) in intersection


# NET DOG BLASTZ (DONE 7/9/04 angie)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    netClass noClass.net hg17 canFam1 dog.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn dog.net > dogSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    netFilter -minGap=10 dog.net |  hgLoadNet hg17 netCanFam1 stdin
    netFilter -minGap=10 dogSyn.net | hgLoadNet hg17 syntenyNetCanFam1 stdin
    # Add entries for chainCanFam1, netCanFam1 to human/hg17 trackDb


# MAKE VSCANFAM1 DOWNLOADABLES (DONE 9/17/04 kate)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    ln -s all.chain dog.chain
    mkdir gz
    cd gz
    # Use -c on every gzip: it writes the compressed copy to stdout and
    # leaves the original in ../ untouched.  Without -c, gzip compresses
    # the file in place and the redirect captures nothing.
    gzip -c ../dog.chain > dog.chain.gz
    gzip -c ../dog.net > dog.net.gz
    gzip -c ../dogSyn.net > dogSyn.net.gz
    # Angie's notes...
    # Mike Zody asked for raw blastz in chain format, so figure out some 
    # way to translate axt or psl to chain and put it out there.  
    # Actually, it's probably just hg16-canFam1 that he wants for now -- ?
    # Ask when we get to this point.

    # axtNet sits beside axtChain, not inside it, so from axtChain/gz go
    # up two levels.
    cd ../../axtNet
    time gzip *.axt

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p vsCanFam1
    cd vsCanFam1
    mv /cluster/data/hg17/bed/blastz.canFam1/axtChain/gz/*.gz .
    md5sum *.gz > md5sum.txt
    mkdir -p axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.canFam1/axtNet/*.axt.gz .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.

    # REDO downloads of axtNet's to fix overlaps (2005-09-13 kate)
    # Finally, replace bad chr5 files (2006-01-05 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.canFam1/axtNet
    nice gzip *.axt
    md5sum *.axt.gz > md5sum.txt

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam1
    mv axtNet axtNet.old
    ln -s /cluster/data/hg17/bed/blastz.canFam1/axtNet .


# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/9/04 angie)
# Redo net axt's and maf's to fix overlaps (use 8/5 netToAxt)
#       (2005-08-16 kate)
# and replace bad chr5 files (2006-01-05 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
    netSplit dog.net net
    cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
    mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      echo $chr
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
        /cluster/data/canFam1/nib stdout \
      | axtSort stdin axtNet/$chr.axt
      axtToMaf axtNet/$chr.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/canFam1/chrom.sizes \
            mafNet/$chr.maf -tPrefix=hg17. -qPrefix=canFam1.
    end
'EOF'
    # NOTE: makeMaf.csh is started in the background (&); it is what fills
    # axtNet/ and mafNet/, so let it finish before copying mafNet below.
    csh makeMaf.csh >&! makeMaf.log &
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp mafNet /cluster/bluearc/hg17/mafNet/canFam1


# BLASTZ MM5 (DONE - 2004-06-22 - Hiram)
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.mm5.2004-07-01
    cd /cluster/data/hg17/bed
    ln -s  blastz.mm5.2004-07-01 blastz.mm5
    cd blastz.mm5

    cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
# notInRat OK as it is identical to notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.mm5

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy


    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastz.mm5
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
# Completed: 44330 of 44330 jobs
# CPU time in finished jobs:   16250628s  270843.80m  4514.06h  188.09d  0.515 y
# IO & Wait Time:                387936s    6465.60m   107.76h    4.49d  0.012 y
# Average job time:                 375s       6.26m     0.10h    0.00d
# Longest job:                     4417s      73.62m     1.23h    0.05d
# Submission to last job:         43754s     729.23m    12.15h    0.51d

    #	Second cluster run.  On the big cluster this step brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/hg17/bed/blastz.mm5
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs:       2189s      36.48m     0.61h    0.03d  0.000 y
# IO & Wait Time:                  7714s     128.57m     2.14h    0.09d  0.000 y
# Average job time:                  29s       0.48m     0.01h    0.00d
# Longest job:                      165s       2.75m     0.05h    0.00d
# Submission to last job:           830s      13.83m     0.23h    0.01d


    #	Third cluster run to convert lav's to axt's
    #	Does not work on kki since /scratch on the iservers is not the
    #	same as /scratch on the other clusters.
    ssh kk
    cd /cluster/data/hg17/bed/blastz.mm5
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       1638s      27.30m     0.46h    0.02d  0.000 y
# IO & Wait Time:                 12068s     201.13m     3.35h    0.14d  0.000 y
# Average job time:                 305s       5.08m     0.08h    0.00d
# Longest job:                     1124s      18.73m     0.31h    0.01d
# Submission to last job:          2519s      41.98m     0.70h    0.03d
    #	chr19 takes too long, the axtSort becomes too large and the poor
    #	node ends up swapping forever.  When you are down to that last
    #	job running, stop it and go to kolossus.
    #	Adjusting the location of the nib directories, and fixing the
    #	MACHTYPE on the commands in the blastz script:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm5
    sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
	x86_64-chromlav2axt
    chmod +x x86_64-chromlav2axt
    time ./x86_64-chromlav2axt \
	/cluster/data/hg17/bed/blastz.mm5/lav/chr19 \
	/cluster/data/hg17/bed/blastz.mm5/axtChrom/chr19.axt \
	/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
	/cluster/bluearc/scratch/mus/mm5/softNib
    #	real    7m41.719s
    #	user    2m2.850s
    #	sys     0m23.070s

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5
    mkdir -p pslChrom
    set tbl = "blastzMm5"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	This takes more than an hour.  You can shorten this by changing
    #	that command to a simple echo, put the results into a file,
    #	split the file into four parts and run the four files as shell
    #	scripts on eieio to have four processes running at the same
    #	time.  Load on eieio gets up to about 20 which is reasonable.

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/pslChrom
    bash		#	for tcsh users
    for F in chr*_blastzMm5.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
	echo "${F} done"
    done
    # this is a 40 minute job
    # exit bash if you are tcsh

    # featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
    # memory.  But if you reset your ~/.hg.conf to use the read-only
    #	user and contact the hgwdev host, then use the x86_64 featureBits
    # featureBits hg16 blastzMm5
    # 1056761609 bases of 2865248791 (36.882%) in intersection
    # featureBits hg17 blastzMm5
    #	1052077141 bases of 2866216770 (36.706%) in intersection
    # featureBits hg17 blastzMm4
    #  1056201417 bases of 2866216770 (36.850%) in intersection

# CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram)

# The axtChain is best run on the small kluster, or the kk9 kluster
    ssh kki
    mkdir -p /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
    mkdir out chain

    ls -1S /cluster/data/hg17/bed/blastz.mm5/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

#  May need -minScore=5000 for all chroms if chr19 won't finish on kolossus

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
	/iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain

    # 46 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs:       4856s      80.94m     1.35h    0.06d  0.000 y
# IO & Wait Time:                 20083s     334.71m     5.58h    0.23d  0.001 y
# Average job time:                 542s       9.04m     0.15h    0.01d
# Longest job:                     2929s      48.82m     0.81h    0.03d
# Submission to last job:          2929s      48.82m     0.81h    0.03d


    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    #	real    8m42.853s
    #	user    5m59.100s
    #	sys     0m40.320s

    time chainSplit chain all.chain
    #	real    10m52.224s
    #	user    5m52.360s
    #	sys     0m34.870s

    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain/chain
    bash	#	for tcsh users
    for i in *.chain
    do
        c=${i/.chain/}
        hgLoadChain hg17 ${c}_chainMm5 $i
        echo done $c
    done
    # exit bash if you are tcsh
    #	This is a 50 minute job

    #	featureBits hg17 chainMm5
    #	2834490112 bases of 2866216770 (98.893%) in intersection
    #	featureBits hg17 chainMm4
    #	2829135227 bases of 2866216770 (98.706%) in intersection
    #	featureBits hg16 chainMm4
    #	2828363353 bases of 2865248791 (98.713%) in intersection


# NET MM5 (DONE - 2004-07-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir preNet
    cd chain
    bash	#	for tcsh users
    for i in *.chain
    do
      echo preNetting $i
      /cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
                        /cluster/data/mm5/chrom.sizes ../preNet/$i
    done
    # exit bash if you are tcsh
    #	15 minute job

    cd ..
    mkdir n1
    cd preNet
    bash	#	for tcsh users
    for i in *.chain
    do
      n=${i/.chain/}.net
      echo primary netting $i $n
      /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
	/cluster/data/mm5/chrom.sizes ../n1/$n /dev/null
    done
    # exit bash if you are tcsh
    #	9 minute job

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    #	memory usage 2546110464, utime 16327 s/100, stime 3546

    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    time netClass hNoClass.net hg17 mm5 mouse.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
	-qNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman
    #	real    16m38.098s
    #	user    11m38.490s
    #	sys     1m48.470s

    # If things look good do
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    time netFilter -syn mouse.net > mouseSyn.net
    #	real    12m3.701s
    #	user    8m44.180s
    #	sys     1m1.610s

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet hg17 netMm5 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm5 stdin

    # check results
    #	featureBits hg17 netMm5
    #	2830625630 bases of 2866216770 (98.758%) in intersection

    #	featureBits hg17 netMm4
    #	2824272033 bases of 2866216770 (98.537%) in intersection
    # featureBits hg16 netMm5
    #	2823565051 bases of 2865248791 (98.545%) in intersection

    # featureBits hg17 syntenyNetMm5
    #	2799194300 bases of 2866216770 (97.662%) in intersection
    # featureBits hg17 syntenyNetMm4
    #	2785830955 bases of 2866216770 (97.195%) in intersection
    # featureBits hg16 syntenyNetMm5
    #	2786960572 bases of 2865248791 (97.268%) in intersection

    # Add entries for net and chain to mouse/hg17 trackDb

    # make net
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir mouseNet
    time netSplit mouse.net mouseNet
    #	real    11m45.243s
    #	user    8m48.490s
    #	sys     1m13.490s

    #	extract axt's from net, and convert to maf's
    # NOTE: Redo the net axt's and maf's using 8/05 netToAxt
    # in order to remove overlaps (2005-08-16 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
#!/bin/csh -ef
    foreach f (mouseNet/chr*.net)
        set c = $f:t:r
        echo "netToAxt: $c.net -> $c.axt"
        rm -f ../axtNet/$c.axt
        netToAxt mouseNet/$c.net chain/$c.chain \
	    /cluster/data/hg17/nib /cluster/data/mm5/nib stdout | \
	    axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm5.
	echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
    end
'_EOF_'
# << for emacs
    # Run the script in the background; '>&!' sends stdout and stderr to
    # makeMaf.log, overwriting it even if tcsh noclobber is set.
    csh makeMaf.csh >&! makeMaf.log &
    # Follow the log; interrupt tail once the final "Complete:" line for the
    # last chromosome has appeared.
    tail -100f makeMaf.log
    # Copy only after makeMaf.csh has finished, otherwise ../mafNet is
    # still being written.
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/mm5

    ssh hgwdev
    mkdir /cluster/data/hg17/bed/blastz.mm5/axtBest
    cd /cluster/data/hg17/bed/blastz.mm5/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtNet
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)
    #	32 minute gzip
    

    #  Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo -n "processing $c.axt -> ${c}_blastzBestMm5.psl ..."
    /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	S1.len S2.len pslBest/${c}_blastzBestMm5.psl
	echo "Done"
    end

    # Load tables
    #	One hgLoadPsl call per per-chromosome psl file.  The loop is
    #	Bourne-shell syntax, so tcsh users must start bash first and leave
    #	it afterwards: the axtInfo steps further down use tcsh foreach.
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/pslBest
    bash		#	for tcsh users
    for I in chr*BestMm5.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 "${I}"
	echo "done ${I}"
    done
    # exit bash if you are tcsh

     # check results
    # featureBits hg17 blastzBestMm5
    #	1013348528 bases of 2866216770 (35.355%) in intersection
    # featureBits hg17 blastzBestMm4
    #	1017319919 bases of 2866216770 (35.493%) in intersection
    # featureBits hg16 blastzBestMm5
    #	996722004 bases of 2865248791 (34.787%) in intersection

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/hg17/axtBest/Mm5
     cd /gbdb/hg17/axtBest/Mm5
     ln -s /cluster/data/hg17/bed/blastz.mm5/axtNet/chr*.axt .
     cd /cluster/data/hg17/bed/blastz.mm5/axtNet
     rm -f axtInfoInserts.sql
     foreach f (/gbdb/hg17/axtBest/Mm5/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                VALUES ('mm5','Blastz Best in Genome','$chr','$f');" \
         >>! axtInfoInserts.sql
     end
    hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
    #	table axtInfo may already exist, ignore create error.
    hgsql hg17 < axtInfoInserts.sql

    # REDO: replace downloadable axtNet's to remove overlaps (2005-09-12 kate)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
    mv axtNet axtNet.old
    mkdir axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.mm5/axtNet/*.axt .
    nice gzip *.axt
    md5sum *.axt.gz > md5sum.txt


# HG17 TO MM5 LIFTOVER CHAIN (DONE 1/6/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm5/axtChain
    mkdir over
    # The loop below appends (>>) to the output, so remove anything left by
    # an earlier run first; otherwise a rerun would duplicate every chain.
    rm -f /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain
    # (bash syntax) For each per-chromosome chain file, run netChainSubset
    # against the matching net and append the result to the liftOver chain.
    for file in chain/*.chain; do
       chrom=$(basename "$file" .chain)
       netChainSubset mouseNet/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain
    done
    rm -rf over/
    ssh hgwdev
    # -p: do not fail if the download directory already exists
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain .
    gzip hg17ToMm5.chain
    # The /gbdb link points at the uncompressed chain in the build area.
    mkdir -p /gbdb/hg17/liftOver
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain /gbdb/hg17/liftOver/hg17ToMm5.over.chain
    hgAddLiftOverChain -multiple hg17 mm5

# HG17 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.canFam1/axtChain
    mkdir over
    for file in chain/*.chain; do
       chrom=`basename $file .chain`
       netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain
    done
    rm -rf over/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain .
    gzip hg17ToCanFam1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain /gbdb/hg17/liftOver/hg17ToCanFam1.over.chain
    hgAddLiftOverChain -multiple hg17 canFam1

# HG17 TO PANTRO1 LIFTOVER CHAIN (DONE 1/20/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
    mkdir over
    for file in chain/*.chain; do
       chrom=`basename $file .chain`
       netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain
    done
    rm -rf over/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain .
    gzip hg17ToPanTro1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain /gbdb/hg17/liftOver/hg17ToPanTro1.over.chain
    hgAddLiftOverChain -multiple hg17 panTro1   

# HG17 TO RN3 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    #ssh kolossus
    #cd /cluster/data/hg17/bed/blastz.rn3/axtChain
    #mkdir over
    #for file in chain/*.chain; do
    #   chrom=`basename $file .chain`
    #   netChainSubset ratNet/$chrom.net chain/$chrom.chain over/$chrom.over
    #   cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToRn3.chain
    #done
    #rm -rf over/
    # Oh fancy that, there's already a hg17ToRn3.over.chain in the /cluster/data/hg17/bed/liftOver
    # directory generated by Angie.  
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver 
    cp /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain .
    gzip hg17ToRn3.over.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain /gbdb/hg17/liftOver/hg17ToRn3.over.chain
    hgAddLiftOverChain -multiple hg17 rn3   

# HG17 TO GALGAL2 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    # OK there's already a /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain file generated
    # by Angie.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver 
    cp /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain .
    gzip hg17ToGalGal2.over.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain /gbdb/hg17/liftOver/hg17ToGalGal2.over.chain
    hgAddLiftOverChain -multiple hg17 galGal2       

# HG17 TO MONDOM1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    ssh kksilo
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    netSplit human.net.gz net
    ssh kolossus
    cd /cluster/data/monDom1/bed/zb.hg17/axtChain
    mkdir over
    for file in chain/*.chain.gz; do
       chrom=`basename $file .chain.gz`
       netChainSubset net/$chrom.net chain/$chrom.chain.gz over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain
    done
    rm -rf over/ net/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver 
    cp /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain .
    gzip hg17ToMonDom1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain /gbdb/hg17/liftOver/hg17ToMonDom1.over.chain
    hgAddLiftOverChain -multiple hg17 monDom1 

# HG17 TO DANRER2 LIFTOVER CHAIN (DONE 3/2/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    chainSplit chain all.chain.gz
    netSplit zfishdanRer2.net.gz net
    mkdir over
    # FAILED STEPS:
    #for file in chain/*.chain; do
    #   chrom=`basename $file .chain`
    #   netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
    #   cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
    #done
    # Error:
#read 28019 of 28019 chains in chain/chr1.chain
#Processing chr1
#netChainSubset: netChainSubset.c:55: writeChainPart: Assertion `subChain != ((void *)0)' failed.
    # OK instead of using the ones in the chain/ subdir, I'm using the ones in
    # the chainAR/ subdir.  These chain files had an additional step in the process of making
    # them: Rachel used the chainAntiRepeat program.  
    for file in chain/*.chain; do
       chrom=`basename $file .chain`
       if [ $chrom = "chr1" ]; then
           netChainSubset net/$chrom.net chainAR/$chrom.chain over/$chrom.over
       else
           netChainSubset net/$chrom.net chainAR/$chrom.chain.gz over/$chrom.over          
       fi
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
    done  
    rm -rf over/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver 
    cp /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain .
    gzip hg17ToDanRer2.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain /gbdb/hg17/liftOver/hg17ToDanRer2.over.chain
    hgAddLiftOverChain -multiple hg17 danRer2 
    

# HG17 TO TETNIG1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    mkdir over
    for file in chain/*.chain; do
       chrom=`basename $file .chain`
       netChainSubset tetraodonNet/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain
    done
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver 
    cp /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain .
    gzip hg17ToTetNig1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain /gbdb/hg17/liftOver/hg17ToTetNig1.over.chain
    hgAddLiftOverChain -multiple hg17 tetNig1      

# HG17 TO BOSTAU1 LIFTOVER CHAIN (DONE Mar. 18, 2005, Heather)

    ssh kolossus
    cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
    mkdir over
    for file in chain/*.chain; do
       chrom=`basename $file .chain`
       netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain
    done
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver 
    cp /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain .
    gzip hg17ToBosTau1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain /gbdb/hg17/liftOver/hg17ToBosTau1.over.chain
    hgAddLiftOverChain -multiple hg17 bosTau1      

# HG17 TO XENTRO1 LIFTOVER CHAIN (DONE 7/5/05 Andy)
    ssh kolossus
    cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
    mkdir chain net over
    chainSplit chain all.chain
    netSplit human.net net
    for file in chain/*.chain; do
       chrom=`basename $file .chain`
       netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
       cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain
    done
    rm -rf over/ chain/ net/
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cp /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain .
    gzip hg17ToXenTro1.chain
    ln -s /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain /gbdb/hg17/liftOver/hg17ToXenTro1.over.chain
    hgAddLiftOverChain -multiple hg17 xenTro1   

# ADD CHAIN AND NET TO VSMM5 AND VSRN3 DOWNLOAD AREAS (DONE 8/5/04 angie)
    ssh hgwdev
    cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/all.chain.gz \
      /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.chain.gz
    cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/mouse.net.gz \
      /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.net.gz
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
    md5sum *.gz */*.gz > md5sum.txt
    # Update the README.txt
    cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/all.chain.gz \
      /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.chain.gz
    cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/rat.net.gz \
      /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.net.gz
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
    md5sum *.gz */*.gz > md5sum.txt
    # Update the README.txt

# ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, heather)
    ssh hgwdev
    # Publish the mm5-vs-hg17 chain and net under the mm5 download tree.
    cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \
      /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz
    cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \
      /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz
    # Checksum the directory the files were just copied into
    # (mm5/vsHg17, not hg17/vsRn3).
    cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17
    md5sum *.gz */*.gz > md5sum.txt
    # Update the README.txt


# SWAP BLASTZ ZEBRAFISH-HUMAN (danRer1-hg17) to HUMAN-ZEBRAFISH (hg17-danRer1)
# USE RESCORED ALIGNMENTS (see makeDanRer1.doc)
# (DONE, 2004-06-22, hartera)
# CONVERT AXTs TO PSL AND LOAD INTO DATABASE (DONE, 2004-07-08, hartera)
    ssh kolossus
    mkdir /cluster/data/hg17/bed/blastz.danRer1.swap
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    # use rescored axtChrom from blastzHg17 on danRer1
    set aliDir = /cluster/data/danRer1/bed/blastz.hg17
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
# 19G     /cluster/data/danRer1/bed/blastz.hg17/axtChrom
# 19G     unsorted
    rm -r unsorted
   # translate sorted axt files into psl
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    mkdir -p pslChrom
    set tbl = "blastzDanRer1"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl hg17 $f
      echo "$f Done"
    end

# CHAIN ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-23, hartera)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.danRer1.swap
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/*.axt \
                    > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'

    # << this line makes emacs coloring happy

# Reuse gap penalties from hg16 vs chicken run.
    # Write the linear gap cost table passed to axtChain via -linearGap.
    # Columns are separated by real tab characters; the two-character
    # sequence "^V" (left over from typing ctrl-V <tab> in vi) must not
    # appear in the file.
    # NOTE(review): axtChain presumably splits these lines on whitespace --
    # confirm against the axtChain version in use.
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy

cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
                      -linearGap=../../chickenHumanTuned.gap \
                      -minScore=5000 stdin \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/danRer1/nib $2 > $3
'_EOF_'
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       3559s      59.32m     0.99h    0.04d  0.000 y
# IO & Wait Time:                   934s      15.56m     0.26h    0.01d  0.000 y
# Average job time:                 100s       1.66m     0.03h    0.00d
# Longest job:                      502s       8.37m     0.14h    0.01d
# Submission to last job:          2969s      49.48m     0.82h    0.03d
# chr19.axt crashed - out of memory so try again on kolossus

    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/run1
    # need to use nibs on bluearc as iscratch not accessible to kolossus
cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
                      -linearGap=../../chickenHumanTuned.gap \
                      -minScore=5000 stdin \
    /cluster/bluearc/hg17/bothMaskedNibs \
    /cluster/bluearc/danRer1/nib $2 >& $3
'_EOF_'
    chmod +x doChain2

    doChain2 \
         /cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/chr19.axt \
         chain/chr19.chain out/chr19.out
    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg17 ${c}_chainDanRer1 $i
        echo done $c
    end
# tried minScore = 1000 and minScore = 10000 for axtChain
# minScore = 5000 was best for reducing low scoring chains but not reducing 
# overlap with refGene CDS too much

# NET ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-24, hartera)
# REMAKE NET WITHOUT ANCIENT REPEATS (DONE, 2004-07-07, hartera)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
       echo preNetting $i
       /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                     ../preNet/$i
    end

    cd ..
    mkdir n1
    cd preNet
    # Primary netting: one chainNet run per pre-netted chain file, writing
    # the target-side net to ../n1 and discarding the query-side net.
    # (This stanza needs to run only once.)
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                 ../n1/$n /dev/null
    end

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 149086208, utime 868 s/100, stime 173                                                                            
    # Add classification info using db tables:
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    # netClass looks for ancient repeats in one of the databases
    # hg17 has this table - hand-curated by Arian
    # this is only for human rodent comparisons so use -noAr option
    mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInHuman
    mkdir -p /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/* \
       /cluster/bluearc/hg17/linSpecRep.notInZebrafish
    cp /iscratch/i/danRer1/linSpecRep.notInHuman/* \
       /cluster/bluearc/danRer1/linSpecRep.notInHuman
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    # add -noAr option
    # mkdir old
    # mv zebrafish.net ./old/zebrafish.net.old
    time netClass noClass.net hg17 danRer1 zebrafish.net \
        -tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
        -qNewR=/cluster/bluearc/danRer1/linSpecRep.notInHuman -noAr
    # 83.410u 43.650s 3:09.94 66.8%   0+0k 0+0io 198pf+0w
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    netFilter -minGap=10 zebrafish.net | hgLoadNet hg17 netDanRer1 stdin

# EXTRACT AXT'S AND MAF'S FROM ZEBRAFISH (danRer1) NET 
# (DONE, 2004-06-24, hartera) used net where hg17 ancient Repeat table used
# sorted axts and remade mafs as multiz needs axts to be sorted
# (DONE, 2004-06-25, kate)
# Redone to fix overlaps using 8/05 netToAxt (2005-08-16 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate) 
    ssh eieio
    # create axts
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
    netSplit zebrafish.net zebrafishNet
    mkdir -p ../axtNet ../mafNet
cat > makeMaf.csh << 'EOF'
    foreach f (zebrafishNet/chr*.net)
        set c = $f:t:r
        echo $c
        netToAxt zebrafishNet/$c.net chain/$c.chain \
        /cluster/data/hg17/nib /cluster/data/danRer1/nib stdout | \
               axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/danRer1/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=danRer1.
    end
'EOF'
    # Run the script in the background; '>&!' sends stdout and stderr to
    # makeMaf.log, overwriting it even if tcsh noclobber is set.
    csh makeMaf.csh >&! makeMaf.log &
    # Follow the log (one chromosome name is echoed per net file);
    # interrupt tail when the script has finished.
    tail -100f makeMaf.log
    # Copy only after makeMaf.csh has finished, otherwise ../mafNet is
    # still being written.
    mkdir -p /cluster/bluearc/hg17/mafNet
    cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/danRer1


# BLASTZ ZEBRAFISH (danRer1) CLEAN UP (DONE, 2004-07-19, hartera)
# FURTHER CLEANUP (2006-09-01, hartera)
     ssh eieio
     cd /cluster/data/hg17/bed/blastz.danRer1.swap
     nice rm axtChain/run1/chain/* &
     nice rm -fr axtChain/n1 axtChain/hNoClass.net &
     nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &
     # further cleanup (2006-09-01, hartera)
     ssh kkstore02
     cd /cluster/data/hg17/bed/blastz.danRer1.swap
     rm -r axtNet.old axtNet.unsorted mafNet
     cd axtChain
     rm hist*
     # remove chains and nets directories. These can be reconstructed with
     # all.chain.gz and zebrafish.net.gz
     rm -r old chain zebrafishNet preNet
     rm noClass.net.gz
     cd ..
     rm pslChrom/psl.tab.gz

# ZEBRAFISH DANRER1 DOWNLOADS (WORKING 2004-09-17 kate)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet
    gzip *.axt

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p vsDanRer1
    cd vsDanRer1
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/all.chain.gz zebrafish.chain.gz
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/zebrafish.net.gz .
    md5sum *.gz > md5sum.txt
    mkdir -p axtNet
    cd axtNet
    cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet/*.axt.gz .
    md5sum *.gz > md5sum.txt
    # Copy and edit README.txt
      
 
# MAKING MOUSE SYNTENY (DONE - 2004-07-03 - Hiram)

ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm5
cd /cluster/data/hg17/bed/syntenyMm5

# Copy all the needed scripts from the hg17 syntenyRn3 build directory
# (presumably the same Perl scripts as in /cluster/data/hg16/bed/syntenyMm3
#  -- TODO confirm)
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .

./syntenicBest.pl -db=hg17 -table=blastzBestMm5
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm5
./synteny2bed.pl
    #	The five commands above
    #	real    209m28.161s
    #	user    0m21.040s
    #	sys     0m4.100s

#	Used to load this in syntenyMm5, but that type is misleading to
#	the table browser and fails the checkTableCoords check.
#	Better to use this ensRatMusHom type:
#	Need a new name here for the Mm5 to not conflict with Rn3
sed -e 's/ensPhusionBlast/ensRatMm5Hom/g' \
      $HOME/kent/src/hg/lib/ensPhusionBlast.sql \
      > ensRatMm5Hom.sql
hgLoadBed hg17 ensRatMm5Hom ucsc100k.bed -sqlTable=ensRatMm5Hom.sql
    #	featureBits hg17 ensRatMm5Hom
    #	2649530748 bases of 2866216770 (92.440%) in intersection
    #	featureBits hg17 ensRatMm4Hom
    #	2549307611 bases of 2866216770 (88.943%) in intersection
    #	featureBits hg16 syntenyMm5
    #	2560252977 bases of 2865248791 (89.355%) in intersection

# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-02 - Hiram)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5/axtNet
    mkdir -p ../axtTight
    bash	#	for tcsh users
    for I in *.axt
    do
      echo $I
      subsetAxt  $I ../axtTight/$I \
	~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    done
    # exit bash if you are tcsh
    #	An 8 minute job

    # translate to psl
    cd ../axtTight
    mkdir ../pslTight
    bash	#	for tcsh users
    for I in *.axt
    do
      C=${I/.axt/}
      axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightMm5.psl
      echo "Done: $I -> ${C}_blastzTightMm5.psl"
    done
    # exit bash if you are tcsh

    # Load tables into database
    #	One hgLoadPsl call per per-chromosome psl file.  The loop is
    #	Bourne-shell syntax, so tcsh users must start bash first.
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/pslTight
    bash	#	for tcsh users
    for I in chr*TightMm5.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 "${I}"
	echo "done ${I}"
    done
    # exit bash if you are tcsh

    #	Compare results with previous assembly:
    #	featureBits hg17 blastzTightMm5
    #	165862935 bases of 2866216770 (5.787%) in intersection
    #	featureBits hg17 blastzTightMm4
    #	166569246 bases of 2866216770 (5.811%) in intersection
    #	featureBits hg16 blastzTightMm5
    #	162641577 bases of 2865248791 (5.676%) in intersection

    # copy  axt's to download area
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.mm5/axtTight
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
    gzip *.axt
    # add README.txt file to dir (use previous assembly's copy as template)
    #	4 minute gzip

# BLASTZ MM5 CLEAN UP (DONE 2004-07-02 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.mm5
    nice rm -rf raw &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice rm axtChain/run1/chain/* &
    nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &

##############################################################################
#  MAKING BLASTZ SELF (DONE - 2004-07-14 - Hiram)

    # The procedure for lineage spec business with self is to simply
    # use the actual repeat masker output for this human assembly as
    # the lineage specific repeats for itself.  Thus, merely make
    # symlinks to the repeat masker out files and name them as expected
    # for blastz.  In this case they are called notInHuman but they
    # really mean InHuman.  Yes, it is confusing, but that's just the
    # nature of the game in this case.

    ssh eieio
    mkdir /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
    cd /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
    foreach f (../rmsk/*.fa.out)
	set base = $f:t:r:r
	echo $base.out.spec
	ln -s $f $base.out.spec
    end
    #	Same thing done on iscratch
    #	Not worried about pushing this scratch yet, it will get done
    #	sometime later.  Using the actual /cluster/bluearc/scratch/
    #	location below.

    ssh kk
    mkdir /cluster/data/hg17/bed/blastzSelf.2004-07-01
    cd /cluster/data/hg17/bed
    ln -s blastzSelf.2004-07-01 blastzSelf
    cd blastzSelf

    cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/hg17/bed/blastzSelf

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastzSelf
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check, ....
    #	you need a -maxPush=200000 on this one, it is more than 100000
    #	jobs the default push limit.  Also be aware of maxQueue limits
    #	on the KK, may need something more than the default of 200000 if
    #	the KK is busy.
    # XXX - running 2004-07-01 11:26


##############################################################################
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2005-01-03 kate)

   # swap hg16->hg17 chains 

 # LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2004-07-07 kate)

    # run alignment
    # NOTE: split hg16 to /iscratch/i is doc'ed in makeHg16.doc
    ssh kk
    cd /cluster/data/hg17
    makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
                    hg16 /iscratch/i/gs.17/build34/liftOver/split
    # Created parasol job in bed/blat.hg16.2004-07-07/run
    # 1150 jobs
    cd bed/blat.hg16.2004-07-07/run
    para try
    para check
    para push

    # GOT HERE

    # lift results (use bash)
    cd /cluster/data/hg17/bed/blat.hg16
    for file in /cluster/data/hg16/nib/*.nib; do
       chrom=`basename $file .nib`
       liftUp -pslQ psl/$chrom.psl /cluster/bluearc/hg/gs.17/build34/liftOver/lift/$chrom.lft warn raw/chr*_${chrom}.psl
    done
    # There were some errors from not finding .lft files for the chr_random ones.
    ssh kk9
    cd ../liftOver
    ln -s blat.hg16 blat.hg16.2005-01-22
    makeLoChain-chain hg17 /cluster/data/hg17/nib hg16 /cluster/data/hg16/nib 2>chain.error.log >chain.log
    ssh eieio
    makeLoChain-net hg17 hg16
    ssh hgwdev
    makeLoChain-load hg17 hg16

# DROPUNDER CHAIN TO HG15 (DONE 2005-07-21 Andy)
    # Split things up
    ssh eieio
    cd /cluster/bluearc
    mkdir -p hg15/liftOver/split
    cd hg15/liftOver/split/
    mkdir ../lift
    # One faSplit run per hg15 chromosome: writes the split sequence to a
    # single file named after the chromosome and a matching .lft file in
    # ../lift for lifting coordinates back later.
    for chrom in $(cut -f1 /cluster/data/hg15/chrom.sizes)
    do
       echo $chrom
       # directory name is the chromosome name without "chr" and "_random"
       dir=${chrom%_random}
       dir=${dir#chr}
       faSplit -lift=../lift/${chrom}.lft size \
           /cluster/data/hg15/${dir}/${chrom}.fa -oneFile 3000 ${chrom}
    done

    # Move files to santest
    ssh hgwdev
    cd /santest/scratch
    mkdir hg15
    cd hg15/
    cp -r /cluster/bluearc/hg15/liftOver .

    # run alignment
    ssh kk
    cd /cluster/data/hg17
    makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
                    hg15 /santest/scratch/hg15/liftOver/split
    # Created parasol job in bed/blat.hg15.2005-07-21/run
    # 2024 jobs written to batch
    # *** IGNORE the batch created by the script.
    ln -s bed/blat.hg15.2005-07-21 bed/blat.hg15
    cd bed/blat.hg15/
    mv run run.kk
    mkdir run.kk9 run.kki
    cd run.kk/
    sed 's/\.fa\./\./g' spec > tmp; mv tmp spec
    grep Un_random spec > ../run.kki/spec
    grep -v Un_random spec > newspec
    mv newspec spec
    egrep "chr(1|19|X)(\.|_)" spec | grep -v random > ../run.kk9/spec
    grep -Fv -f ../run.kk9/spec spec > newspec
    mv newspec spec
    wc -l spec ../run.kk9/spec ../run.kki/spec
#   1831 spec
#    147 ../run.kk9/spec
#     46 ../run.kki/spec
#   2024 total
    # Checks out
    
    # Run the thing on all 3 clusters.
    para create spec
    para push
#Completed: 1831 of 1831 jobs
#CPU time in finished jobs:    8556066s  142601.10m  2376.69h   99.03d  0.271 y
#IO & Wait Time:                 60428s    1007.13m    16.79h    0.70d  0.002 y
#Average job time:                4706s      78.43m     1.31h    0.05d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:           46724s     778.73m    12.98h    0.54d
#Submission to last job:         46725s     778.75m    12.98h    0.54d
    ssh kk9
    cd /cluster/data/hg17/bed/blat.hg15/run.kk9
    para create spec
    para push
#Completed: 147 of 147 jobs
#CPU time in finished jobs:    1698424s   28307.07m   471.78h   19.66d  0.054 y
#IO & Wait Time:                   874s      14.56m     0.24h    0.01d  0.000 y
#Average job time:               11560s     192.66m     3.21h    0.13d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:           31413s     523.55m     8.73h    0.36d
#Submission to last job:         31413s     523.55m     8.73h    0.36d
    ssh kki
    cd /cluster/data/hg17/bed/blat.hg15/run.kki
    para create spec
    para push
    # OK I don't have para time stuff for this one, but it was the shortest 
    # by far.

    # lift results
    cd /cluster/data/hg17/bed/blat.hg15/lift.run
    for chrom in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
       liftUp -pslQ /cluster/bluearc/hg15/liftOver/psl/${chrom}.psl /cluster/bluearc/hg15/liftOver/lift/${chrom}.lft warn raw/chr*_${chrom}.psl
    done

    # Chain
    # There's been some problems with store5.  
    ssh kk9
    cd /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21
    mkdir chainRun
    mkdir -p /panasas/store/hg15/chainRaw
    ln -s /panasas/store/hg15/chainRaw chainRaw
    cd chainRun/
    ls -1S ../psl/*.psl > in.lst
    # Per-job wrapper: run axtChain with its output on node-local /scratch,
    # then copy the finished chain to its final location.
    #   $1 = input psl   $2 = target nib dir   $3 = query nib dir
    #   $4 = output chain file
    cat > chain.sh << "_EOF_"
#!/bin/bash
# Stop at the first failing step, so that a failed axtChain (or cp) is
# reported to parasol as a failed job instead of being masked by the exit
# status of the final cleanup.
set -e
tmp=/scratch/$(basename "$4")
# Always remove the scratch copy, on success or on failure.
trap 'rm -f "$tmp"' EXIT
axtChain -psl "$1" "$2" "$3" "$tmp"
cp "$tmp" "$4"
_EOF_
    chmod +x chain.sh
    # gensub2 template: one chain.sh job per psl file listed in in.lst;
    # parasol checks that each output chain exists and is non-empty.
    cat > gsub << "_EOF_"
#LOOP
./chain.sh $(path1) /scratch/hg/gs.18/build35/bothMaskedNibs /scratch/hg/gs.16/build33/chromTrfMixedNib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
_EOF_
    # <<
    # (still in chainRun/ from above; no further cd is needed)
    gensub2 in.lst single gsub spec
    para create spec
    para push
#Completed: 44 of 44 jobs
#CPU time in finished jobs:       7448s     124.13m     2.07h    0.09d  0.000 y
#IO & Wait Time:                  9591s     159.85m     2.66h    0.11d  0.000 y
#Average job time:                 387s       6.45m     0.11h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1906s      31.77m     0.53h    0.02d
#Submission to last job:          1906s      31.77m     0.53h    0.02d
    ssh kolossus
    cd /panasas/store/hg15/chainRaw    
    chainMergeSort *.chain | chainSplit /scratch/andy/chain stdin
    cd /scratch/andy
    mkdir net over
    cd chain/
    for chain in *; do
       c=${chain%.chain}
       echo $c
       chainNet $chain /cluster/store12/store5/gs.18/build35/chrom.sizes \
          /cluster/store12/store5/gs.16/build33/chrom.sizes \
          ../net/${c}.net /dev/null
       netChainSubset ../net/${c}.net $chain ../over/${c}.over
    done
    cd ../over/
    cat * >> ../hg17ToHg15.over.chain
    cd ../
    cp -r hg17* over/ /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21/
    cd ../
    rm -rf andy/
    rm -rf /panasas/store/hg15    
    cd /cluster/bluearc/hg15/liftOver/psl
    for psl in *; do
       gzip $psl
    done
    cd ../

    
# (MAKING BLASTZ SELF, continued: results of the first cluster run, run.0)
# Completed: 116281 of 116281 jobs
# CPU time in finished jobs:   21807388s  363456.46m  6057.61h  252.40d  0.692 y
# IO & Wait Time:               2319383s   38656.39m   644.27h   26.84d  0.074 y
# Average job time:                 207s       3.46m     0.06h    0.00d
# Longest job:                    22063s     367.72m     6.13h    0.26d
# Submission to last job:         83402s    1390.03m    23.17h    0.97d


    #	Second cluster run to convert the .out's to .lav's
    #	You do NOT want to run this on the big cluster.  It brings
    #	the file server to its knees.  Run this on the small cluster.
    ssh kki
    cd /cluster/data/hg17/bed/blastzSelf
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...

# Completed: 341 of 341 jobs
# CPU time in finished jobs:       6344s     105.73m     1.76h    0.07d  0.000 y
# IO & Wait Time:                  5413s      90.22m     1.50h    0.06d  0.000 y
# Average job time:                  34s       0.57m     0.01h    0.00d
# Longest job:                      505s       8.42m     0.14h    0.01d
# Submission to last job:          4521s      75.35m     1.26h    0.05d

    #	Third cluster run to convert lav's to axt's

    #	These self alignments do not work well as the usual third cluster job.
    #	Instead, a specialized job here that includes a DropSelf
    #	operation, and in individual lav pieces to avoid out of memory
    #	problems during axtSort
    ssh kki
    cd /cluster/data/hg17/bed/blastzSelf
    mkdir axtChrom run.2
    cd run.2
    cat << '_EOF_' > runLavToAxt.sh
#!/bin/sh
#
# runLavToAxt.sh <chrom> - convert every lav piece of one chromosome to axt,
# dropping self alignments, sorting each piece separately (to keep axtSort's
# memory use down), then concatenate the pieces into axtChrom/<chrom>.axt

BASE=/cluster/data/hg17/bed/blastzSelf
SEQ1_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ2_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs

CHR=$1
if [ -z "$CHR" ]; then
    echo "usage: $0 <chrom>" 1>&2
    exit 255
fi
OUT=axtChrom/$CHR.axt
# Stop here if the lav directory is missing, rather than running the loop
# and the final cat in whatever directory the job happened to start in.
cd ${BASE}/lav/${CHR} || exit 255
for D in *.lav
do
    smallout=$D.axt
    lavToAxt  $D $SEQ1_DIR $SEQ2_DIR stdout \
	| axtDropSelf stdin stdout \
	| axtSort stdin $smallout
done
# concatenate the sorted pieces, ordered by sort -g on the file names
cat `ls -1 *.lav.axt | sort -g` > $BASE/$OUT
'_EOF_'
    # << keep emacs coloring happy
    chmod +x runLavToAxt.sh

    cat << '_EOF_' > gsub
#LOOP
./runLavToAxt.sh $(path1) {check out line ../axtChrom/$(path1).axt}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy

    ls ../lav > chrList
    gensub2 chrList single gsub jobList
    para create jobList
    para try
    para push
    #	This is a tough load on eieio.  Manageable, but the load should
    #	be monitored to make sure it isn't severe.  I saw about 100 to 150.
    #	The chr19 job will not finish; even in parts it takes up too
    #	much memory and the node it runs on ends up swapping endlessly.
    #	Need to go to kolossus to do chr19.
    para stop
    para recover jobList chr19JobList
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzSelf/run.2
    time ./runLavToAxt.sh chr19
    #	real    43m14.797s
    #	user    12m56.670s
    #	sys     3m13.590s

    # translate sorted axt files into psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf
    mkdir pslChrom
    set tbl = "blastzSelf"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #	That takes about 70 minutes

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/pslChrom
    bash # if a csh/tcsh user
    for I in *.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done: ${I}"
    done
    # exit bash if you are tcsh
    #	This is an 80 minute job

    #	Check results
    #	featureBits hg17 blastzSelf
    #	252256266 bases of 2866216770 (8.801%) in intersection
    #	real    40m49.573s
    #	user    21m14.200s
    #	sys     2m10.420s

    #	featureBits hg16 blastzSelf
    #	254410837 bases of 2865248791 (8.879%) in intersection


# CHAIN SELF BLASTZ (DONE - 2004-07-07 - Hiram)

# The axtChain is best run on the small kluster, or the kk9 kluster
    ssh kki
    mkdir -p /cluster/data/hg17/bed/blastzSelf/axtChain/run1
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
    mkdir out chain

    ls -1S /cluster/data/hg17/bed/blastzSelf/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

#  May need -minScore=5000 for all chroms if chr19 won't finish on kolossus

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
	/iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain

    # 46 jobs
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       8519s     141.98m     2.37h    0.10d  0.000 y
# IO & Wait Time:                  4795s      79.92m     1.33h    0.06d  0.000 y
# Average job time:                 296s       4.93m     0.08h    0.00d
# Longest job:                     2407s      40.12m     0.67h    0.03d
# Submission to last job:          3540s      59.00m     0.98h    0.04d

    #	chr19 did fail, on kolossus, try:
    ssh kolossus
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
    time axtChain /cluster/data/hg17/bed/blastzSelf/axtChrom/chr19.axt \
	/cluster/data/hg17/nib \
	/cluster/data/hg17/nib \
	chain/chr19.chain > out/chr19.out
    #	80 minute job, 1.5 Gb result:
    #	-rw-rw-r--    1 1588795432 Jul  7 21:54 chr19.chain

    # now on the file server, sort chains
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    time chainMergeSort run1/chain/*.chain > all.chain
    #	real    27m38.935s
    #	user    23m18.540s
    #	sys     2m39.300s
    #	A 5 Gb file:
    #	-rw-rw-r--    1 5267202936 Jul  7 22:23 all.chain


    time chainSplit chain all.chain
    #	real    29m27.062s
    #	user    22m48.250s
    #	sys     1m57.910s

    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
    bash	#	for tcsh users
    for I in *.chain
    do
        c=${I/.chain/}
        $HOME/bin/i386/hgLoadChain -normScore hg17 ${c}_chainSelf $I
        echo done $c
    done
    # exit bash if you are tcsh
    #	This is almost 3 hours to load

    ssh kolossus
    cd /cluster/data/hg17/bed/blastzSelf.2004-07-01
    time HGDB_CONF=~/.hg.conf.read-only featureBits \
	-noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 &
    #	real    56m34.802s
    #	240976607 bases of 2851352871 (8.451%) in intersection

    #	featureBits hg17 chainSelf
    #	682833453 bases of 2866216770 (23.824%) in intersection
    #	featureBits hg16 chainSelf
    #	626345319 bases of 2865248791 (21.860%) in intersection

    #	DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
    gzip chr*.chain

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
    cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsSelf

    #	fixup README file, request push

# NET SELF (DONE - 2004-07-13 - Hiram)

    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    mkdir preNet
    cd chain
    bash	#	for tcsh users
    for I in *.chain
    do
      echo preNetting $I
      /cluster/bin/i386/chainPreNet $I /cluster/data/hg17/chrom.sizes \
                        /cluster/data/hg17/chrom.sizes ../preNet/$I
    done
    #	23 minutes

    cd ..
    mkdir n1
    cd preNet
    for I in *.chain
    do
      N=${I/.chain/}.net
      echo primary netting $I
      /cluster/bin/i386/chainNet $I -minSpace=10 \
	/cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
	../n1/$N /dev/null
    done
    # exit bash if you are tcsh
    #	5 minute job

    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
    #	memory usage 206442496, utime 3009 s/100, stime 252
    #	memory usage 2510467072, utime 19307 s/100, stime 3181

    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    time netClass hNoClass.net hg17 hg17 human.net \
	-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman \
	-qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
    #	real    9m32.951s
    #	user    2m42.840s
    #	sys     1m23.460s

    # If things look good do
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    rm -r n1 hNoClass.net
    # Make a 'syntenic' subset of these with
    time netFilter -syn human.net > humanSyn.net
    #	real    0m29.851s
    #	user    0m27.200s
    #	sys     0m2.120s

    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet hg17 netSelf stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 syntenyNetSelf stdin

    # check results
    # featureBits hg17 netSelf
    #	620827374 bases of 2866216770 (21.660%) in intersection
    # featureBits hg16 netSelf
    #	563788850 bases of 2865248791 (19.677%) in intersection
    # featureBits hg15 selfNet
    #	749177799 bases of 2866466359 (26.136%) in intersection


    # featureBits hg17 syntenyNetSelf
    #	404535376 bases of 2866216770 (14.114%) in intersection
    # featureBits hg16 syntenyNetSelf
    #	340871322 bases of 2865248791 (11.897%) in intersection

    # Add entries for net and chain to human/hg17 trackDb

    # make net
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf/axtChain
    mkdir humanNet
    time netSplit human.net humanNet
    #	real    0m52.106s
    #	user    0m43.350s
    #	sys     0m5.170s

    # extract axts from net  - this should be combined with the sort and
    # maf conversion below
    mkdir ../axtNet 
    foreach n (humanNet/chr*.net)
	set c=$n:t:r
	echo "netToAxt: $c.net -> $c.axt"
	rm -f ../axtNet/$c.axt
	netToAxt humanNet/$c.net chain/$c.chain \
		/cluster/data/hg17/nib \
		/cluster/data/hg17/nib stdout > ../axtNet/$c.axt
	echo "Complete: $c.net -> axtNet/$c.axt"
    end

    # sort axt's and convert to maf format
    mkdir ../mafNet
    foreach f (../axtNet/chr*.axt)
        set c=$f:t:r
        echo $c.axt
        mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
        axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
        rm ../axtNet/$c.unsorted.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
                ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=hg17.
    end
    # a 3 minute job

    # XXXX - ! ! !   WE DO NOT NEED the Best and Tight tracks for Self ! ! !
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/blastzSelf/axtBest
    cd /cluster/data/hg17/bed/blastzSelf/axtBest
    ln -s ../axtNet/chr*.axt .

    # copy net axt's to download area - XXX Do we need this for Self ?
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/axtNet
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
    nice gzip *.axt
    nice md5sum *.gz > md5sum.txt
    # add README.txt file to dir (use previous assembly's copy as template)

    #  Convert those axt files to psl
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf
    mkdir pslBest
    foreach a (axtBest/chr*.axt)
	set c=$a:t:r
	echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
    /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
	S1.len S2.len pslBest/${c}_blastzBestSelf.psl
	echo "Done: ${c}_blastzBestSelf.psl"
    end

    # Load tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzSelf/pslBest
    bash # if a csh/tcsh user
    for I in chr*BestSelf.psl
    do
	/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
	echo "done ${I}"
    done
    # exit bash if you are tcsh

    #	check results
    #	featureBits hg17 blastzBestSelf
    #	233978156 bases of 2866216770 (8.163%) in intersection
    #	featureBits hg16 blastzBestSelf
    #	225819219 bases of 2865248791 (7.881%) in intersection

# MAKING HUMAN AXTTIGHT FROM AXTBEST (NOT TO BE DONE - 2004-07-13 - Hiram)
    #	XXXX - ! ! !  DO NOT NEED axtBest for Self alignments
    #	Been done anyway, Robert and Gill like to see it.

# BLASTZ SELF CLEAN UP (DONE - 2004-07-15 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/bed/blastzSelf
    nice rm -rf raw &
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/hNoClass.net &
    nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &


# CREATING BIG ZIPS (DONE - 2004-07-23 - Hiram)
    ssh eieio
    cd /cluster/data/hg17/jkStuff
    time ./zipAll.sh > zipAll.out 2>&1

    ssh hgwdev
    #  This stuff has to work in a different way because this stuff
    #	updates on a daily basis.
    cd /usr/local/apache/htdocs/goldenPath/hg17/bigZips
    featureBits hg17 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    rm upstream1000.fa
    featureBits hg17 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    rm upstream2000.fa
    featureBits hg17 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa
    rm upstream5000.fa


# ENCODE REGIONS (DONE 2004-07-28 kate)

    ssh eieio
    cd /cluster/data/hg17/bed
    mkdir encodeRegions
    cd encodeRegions
    liftOver /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
                encodeRegions.bed encodeRegions.unmapped
    wc -l encodeRegions.*
    # 44 encodeRegions.bed
    # 0 encodeRegions.unmapped
    ssh hgwdev
    cd /cluster/data/hg17/bed/encodeRegions
    hgLoadBed hg17 encodeRegions encodeRegions.bed -noBin


# H-INVITATIONAL GENE ANNOTATION DATABASE (WORKING 2004-07-28 kate)
    # http://www.jbirc.aist.go.jp/hinv/top.html
    # Create knownGene table to reference HINV gene ID's
    #  for link on knownGenes details page
    # Also, create an HINV gene track

    # download CDNA file release 1.5 (got release # from downloads page).
    ssh kksilo
    mkdir -p /cluster/data/hinv
    cd /cluster/data/hinv
    wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
    gunzip FCDNA.gz
    mv FCDNA FCDNA.1.5

    # set up assembly work area
    ssh eieio
    cd /cluster/data/hg17
    mkdir -p bed/hinv
    cd bed/hinv

    # extract H-INV ID's and Genbank accessions of mRNAs
    awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
                                                        > accessions.txt
    awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
                                                        > ids.txt
    paste accessions.txt ids.txt > queries.txt
    wc -l ids.txt
        #   41118 ids.txt

    # create PSL file from alignments for these mRNA's, extracted from the 
    #       table of all aligned mRNA's
    ssh hgwdev
    cd /cluster/data/hg17/bed/hinv
    hgsql hg17 -s -e "SELECT * FROM all_mrna"  | cut -f 2- > all_mrna.tab

    ssh eieio
    cd /cluster/data/hg17/bed/hinv
    pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
        # using pslReps to generate the PSL file header
    ~kate/bin/i386/pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
        # NOTE: generated with pslSelect.c v1.3 (1.4 is broken -- test is
        # setup in hg/pslSelect/tests & I requested Robert take a look)

    # load track of mrna alignments
    ssh hgwdev
    cd /cluster/data/hg17/bed/hinv
    hgLoadPsl hg17 -table=HInvGeneMrna hinv_mrna.psl
    # compare the set of aligned transcripts with the previous assembly
    hgsql hg17 -s -e \
        "select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
    hgsql hg16 -s -e \
        "select distinct(qName) from HInvGeneMrna order by qName" > hg16.mrna
    wc -l hg*.mrna
        # 40998 hg16.mrna
        # 41023 hg17.mrna

    comm -1 -3 *.mrna > hg17.aligned
    wc -l hg17.aligned
        # 29 (transcripts newly aligned in hg17)
    comm -2 -3 *.mrna > hg16.aligned
    wc -l hg16.aligned
        # 4 (transcripts no longer aligned in hg17)
    comm -2 -3 ids.txt hg17.mrna > hg17.notaligned
    wc -l hg17.notaligned
        # 95 (transcripts not aligned in hg17 -- checking on why...)

    # also make a table with various useful items for each transcript
    ssh hgwdev
    hgsql hg17 < ~/kent/src/hg/lib/HInv.sql
    cd /cluster/data/hg17/bed/hinv
    /cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.5 > HInv.tab
    echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg17
    hgsql hg16 -s -e "select count(*) from HInv"
        # 41118
    hgsql hg17 -s -e "select count(*) from HInv"
        # 41118

    # create table for knownGenes detail page
    ssh hgwdev
    cd /cluster/data/hg17/bed/hinv
    hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv


# GENEID GENE PREDICTIONS (DONE 7/30/04 angie)
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/geneid
    cd /cluster/data/hg17/bed/geneid
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.gtf
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.prot
    end
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    cp /dev/null geneid.fa
    foreach f (chr*.prot)
      perl -wpe 's/^(>chr\S+)/$1.1/' $f >> geneid.fa
    end
    ldHgGene -gtf -genePredExt hg17 geneid *.gtf 
    hgPepPred hg17 generic geneidPep geneid.fa


# MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/mitopred
    cd /cluster/data/hg17/bed/mitopred
    wget http://mitopred.sdsc.edu/data/hum_30.out
    perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' hum_30.out > mitopred.tab
    cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
    name varchar(10) not null,      # SwissProt ID
    confidence varchar(8) not null, # Confidence level
              #Indices
    PRIMARY KEY(name(6))
);
'_EOF_'
    # << this line makes emacs coloring happy
    hgsql hg17 < mitopred.sql
    hgsql hg17 -e 'load data local infile "mitopred.tab" into table mitopred'


# NUCLEAR PROTEIN DATABASE (IN PROGRESS 7/30/04 angie)
    ssh eieio
    mkdir /cluster/data/hg17/bed/npd
    cd /cluster/data/hg17/bed/npd
    wget ftp://ftp.hgu.mrc.ac.uk/pub/npd/database.zip
    unzip database.zip
    # OK, it's one big .mdb (Microsoft Access DB) file.  
    # Googling... can buy a converter for $40... free trial .exe...


# CREATING REFFULL - DBTSS MRNA (DONE - 2004-08-02 - Hiram)

    ssh eieio
    mkdir /cluster/data/hg17/bed/refFull
    cd /cluster/data/hg17/bed/refFull
    #	wget saves into the current directory by default; a trailing "."
    #	argument would be taken as one more URL to fetch (and fail).
    wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/ref-full.fa.gz"
    wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/readme"
    #	See also: http://dbtss.hgc.jp/index.html

    #	gunzip it and split the ref-full.fa file into about 50 pieces
    #	(faSplit won't do this job if it is run on a pipe, as in:
    #	zcat ref-full.fa.gz | faSplit sequence stdin 50 splitRefFull )
    gunzip ref-full.fa.gz
    faSplit sequence ref-full.fa 50 splitRefFull
    gzip ref-full.fa

    #	copy to Iservers
    ssh kkr1u00
    cd /cluster/data/hg17/bed/refFull
    mkdir /iscratch/i/gs.18/build35/refFull
    cp -p split*.fa /iscratch/i/gs.18/build35/refFull
    /cluster/bin/iSync
    #	no longer need these split files here
    rm -f split*.fa

    #	run alignments on kluster
    ssh kk
    cd /cluster/data/hg17/bed/refFull
    ls -1S /scratch/hg/gs.18/build35/maskedContigs > genome.lst
    ls -1S /iscratch/i/gs.18/build35/refFull > refFull.lst


    #	Use BLAT to generate refFull alignments as so:
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/maskedContigs/$(path1)} {check in exists+ /iscratch/i/gs.18/build35/refFull/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy

    bash # if a csh/tcsh user
    mkdir psl
    cat genome.lst | sed -e "s/.fa//" | while read C
	do
		mkdir psl/${C}
	done
    # exit bash if you are tcsh
    gensub2 genome.lst refFull.lst gsub jobList
    para create jobList
    #	18240 jobs written to batch
    para try
    para check
    para push ... etc ...
# Completed: 18240 of 18240 jobs
# CPU time in finished jobs:      37011s     616.85m    10.28h    0.43d  0.001 y
# IO & Wait Time:                 62630s    1043.84m    17.40h    0.72d  0.002 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest job:                       51s       0.85m     0.01h    0.00d
# Submission to last job:           850s      14.17m     0.24h    0.01d

    #	Process refFull alignments into near best in genome.
    ssh eieio
    cd /cluster/data/hg17/bed/refFull
    pslSort dirs raw.psl tmp psl/*
    pslReps -minCover=0.2 -sizeMatters -minAli=0.965 \
	-nearTop=0.001 raw.psl contig.psl /dev/null
    liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom tmp all_refFull.psl
    pslCat -dir chrom > refFullAli.psl

    #	Load refFull alignments into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/refFull
    hgLoadPsl hg17 -tNameIx refFullAli.psl

# VAR_MULTIZ HG17/MM5/RN3/GALGAL2/FR1 (acs 2004-08-12)

# This is a new, experimental version of multiz written by Minmei at
# PSU and sent by e-mail from Webb.  This version allows for a
# progressive alignment strategy (i.e., alignment construction in a
# post-order traversal of the tree) using only pairwise alignments of
# each sequence with the reference sequence and without any need for
# "staging".  Here's a little blurb about it from the header of
# var_multiz.v3.c.

#  var_multiz.v3.c
#
#  Variant to multiz program. It aligns two files of
#  alignment blocks where top row is always the reference,
#  assuming blocks are increasing ordered based on the
#  start position on the reference sequence. Single-coverage
#  on reference is required at this stage.
#
#  Four arguments are required: char* arg1, char* arg2,
#  int arg3, int arg4. arg1 and arg2 are two files need
#  to be aligned together. The alignment of reference in
#  two files are either fixed or not, determined from
#  arguments arg3 and arg4. arg3 and arg4 are either 1
#  or 0, but cannot be 1 at the same time. 1 means
#  reference is fixed. v1 and v2 cannot be both 1.
#  ...

    mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12

    # unpack source and compile
    cp /cluster/home/acs/var_multiz.tar.gz /cluster/data/hg17/bed/var_multiz.2004-08-12
    cd /cluster/data/hg17/bed/var_multiz.2004-08-12
    tar xfz var_multiz.tar.gz
    cd var_multiz_source
    make

    # NOTE (8/14): this version of the source is already out of date!
    # Source is now checked in under hg3rdParty and updated binaries
    # are being kept under /cluster/bin/penn/var_multiz

    # script for creating the 5-way alignments for a given chromosome
    # (acs, 8/20/04) below revised after e-mail exchange with Minmei
    cat << '_EOF_' > doVarMultiz.csh
#!/bin/csh -fe
#
# doVarMultiz.csh <chr> - build the 5-way var_multiz alignment for one
# chromosome from the pairwise mafs, adding species progressively:
# rat + mouse first, then chicken, then fugu.  After every var_multiz step
# the result is projected onto the human reference with maf_project.
# A species whose maf is missing or empty for this chromosome is skipped
# and the alignment built so far is carried forward unchanged.
# Result goes to $DEST; nothing is written if no input exists.

set chr = $1			# may include _random or _hla_hap[12]

set REF = hg17.$chr
set RAT = /cluster/bluearc/hg17/multiz8way/rn3/$chr.maf
set MOUSE = /cluster/bluearc/hg17/multiz8way/mm5/$chr.maf
set CHICKEN = /cluster/bluearc/hg17/multiz8way/galGal2/$chr.maf
set FISH = /cluster/bluearc/hg17/multiz8way/fr1/$chr.maf
set DEST = /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/$chr.maf

set VMZ = /cluster/bin/penn/var_multiz
set PROJECT = /cluster/bin/penn/var_multiz.2004.08.12/maf_project

mkdir -p $DEST:h

# Start clean: an intermediate left behind on this node by an earlier
# failed run must not be mistaken for a result of this run by the -s
# tests below.
rm -f /scratch/$chr.tmp1.maf /scratch/$chr.tmp2.maf /scratch/$chr.tmp3.maf
rm -f /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf

# Step 1: human/rat + human/mouse -> hrm
if ( -s $RAT && -s $MOUSE ) then
    echo "Aligning $RAT $MOUSE..."
    $VMZ $RAT $MOUSE 0 0 > /scratch/$chr.tmp1.maf
    echo "Projecting on $REF..."
    $PROJECT /scratch/$chr.tmp1.maf $REF > /scratch/$chr.hrm.maf
else if ( -s $RAT ) then
    cp $RAT /scratch/$chr.hrm.maf
else if ( -s $MOUSE ) then
    cp $MOUSE /scratch/$chr.hrm.maf
endif

# Step 2: hrm + human/chicken -> hrmc
if ( -s $CHICKEN && -s /scratch/$chr.hrm.maf ) then
    echo "Adding $CHICKEN..."
    $VMZ /scratch/$chr.hrm.maf $CHICKEN 1 0 > /scratch/$chr.tmp2.maf
    echo "Projecting on $REF..."
    $PROJECT /scratch/$chr.tmp2.maf $REF > /scratch/$chr.hrmc.maf
else if ( -s $CHICKEN ) then
    cp $CHICKEN /scratch/$chr.hrmc.maf
else if ( -s /scratch/$chr.hrm.maf ) then
    cp /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
endif

# Step 3: hrmc + human/fugu -> final maf
if ( -s $FISH && -s /scratch/$chr.hrmc.maf ) then
    echo "Adding $FISH..."
    $VMZ /scratch/$chr.hrmc.maf $FISH 1 0 > /scratch/$chr.tmp3.maf
    echo "Projecting on $REF..."
    $PROJECT /scratch/$chr.tmp3.maf $REF > $DEST
else if ( -s $FISH ) then
    cp $FISH $DEST
else if ( -s /scratch/$chr.hrmc.maf ) then
    cp /scratch/$chr.hrmc.maf $DEST
endif


echo "Done."
# Not every intermediate exists when a species was skipped.  Use rm -f on
# explicit names: under csh -e, an unmatched glob or a plain rm of a
# missing file would make this otherwise successful job exit non-zero.
rm -f /scratch/$chr.tmp1.maf /scratch/$chr.tmp2.maf /scratch/$chr.tmp3.maf
rm -f /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
'_EOF_'
    # << keep emacs coloring happy
    chmod 755 doVarMultiz.csh

    # Every chromosome name that has a pairwise maf for at least one of the
    # four species goes into chrlist (sorted, unique); jobs.lst then gets
    # one doVarMultiz.csh line per chromosome.
    for maf in $(find /cluster/bluearc/hg17/multiz8way/rn3 \
                      /cluster/bluearc/hg17/multiz8way/mm5 \
                      /cluster/bluearc/hg17/multiz8way/galGal2 \
                      /cluster/bluearc/hg17/multiz8way/fr1 \
                      -name "chr*.maf")
    do
        echo $(basename $maf .maf)
    done | sort -u > chrlist
    rm -f jobs.lst
    for c in $(cat chrlist)
    do
        echo "doVarMultiz.csh $c" >> jobs.lst
    done

    # run cluster job
    ssh kk ; cd /cluster/data/hg17/bed/var_multiz.2004-08-12; para create jobs.lst ; para try ; para push 
    # (etc.)

# Completed: 46 of 46 jobs
# CPU time in finished jobs:      71302s    1188.36m    19.81h    0.83d  0.002 y
# IO & Wait Time:                  1162s      19.37m     0.32h    0.01d  0.000 y
# Average job time:                1575s      26.26m     0.44h    0.02d
# Longest job:                     6353s     105.88m     1.76h    0.07d
# Submission to last job:          6362s     106.03m     1.77h    0.07d

    # for now just create an ordinary maf track (conservation later)
    rm -rf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
    mkdir -p /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
    ln -s /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
    /cluster/bin/i386/hgLoadMaf hg17 -warn varMultizMm5Rn3GalGal2Fr1 -pathPrefix=/gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
    chmod 775 /gbdb/hg17/varMultiz /gbdb/hg17/varMultiz/maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 /cluster/data/hg17/bed/var_multiz.2004-08-12 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf
    chmod 664 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf

    # trackDb entry
# track varMultizMm5Rn3GalGal2Fr1
# shortLabel varMultiz5Way
# longLabel Human/Mouse/Rat/Chicken/Fugu Var-Multiz 
# group compGeno
# priority 190
# visibility hide
# type maf


# elephant human blastz alignment by Robert Aug 11 2004
mkdir /cluster/bluearc/elephant
cd /cluster/bluearc/elephant

#get reads and qual scores from trace repository
for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/fasta.loxodonta_africana.$i.gz ; done
for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/qual.loxodonta_africana.$i.gz ; done
for i in `cat trace.lst` ; do zcat fasta.loxodonta_africana.$i.gz > loxodonta_africana.$i.fa ; done     

#trim reads
for i in `cat trace.lst` ; do nice gunzip -c qual.loxodonta_africana.$i.gz > qual.loxodonta_africana.$i; faTrimRead loxodonta_africana.$i.fa qual.loxodonta_africana.$i tmp.$i.fa lift.$i.lft; mv -f tmp.$i.fa loxodonta_africana.$i.fa ; rm -f qual.loxodonta_africana.$i ; done
for i in `cat trace.lst`; do faSize -detailed=on loxodonta_africana.$i.fa > mac.$i.len ; done  
cat mac.0*.len > S2.len
for i in `cat trace.lst`; do sed -e s/S2.len/mac.$i.len/ < DEF > DEF.$i ; done

#split fa reads into 10mb chunks for blastz run and distribute to i-servers.
ssh kkr1u00
for i in `cat trace.lst`; do nice faSplit about loxodonta_africana.$i.fa 10000000 /iscratch/i/elephant/${i}.mac. ; done
cd /iscratch/i/elephant
find split -name \*.fa > /cluster/bluearc/elephant/mac.lst
cd /cluster/bluearc/elephant
hgsql hg17 -N < chromLen.sql > S1.len
cd /iscratch/i/elephant
iSync

#setup cluster run to blastz reads to human genome
ssh kk
cd /cluster/bluearc/elephant
BlastZ_run0.sh
cd run.0
para create jobList
para push

#94798 jobs in batch
#149 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 94797 of 94798 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:   14183153s  236385.89m  3939.76h  164.16d  0.450 y
#IO & Wait Time:                310938s    5182.30m    86.37h    3.60d  0.010 y
#Average job time:                 153s       2.55m     0.04h    0.00d
#Longest job:                     1770s      29.50m     0.49h    0.02d
#Submission to last job:         52186s     869.77m    14.50h    0.60d

ssh kkr9
BlastZ_run1.sh
cd run.1
para create jobList
para push
#341 jobs in batch
#151 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs:     142914s    2381.91m    39.70h    1.65d  0.005 y
#IO & Wait Time:                 14078s     234.63m     3.91h    0.16d  0.000 y
#Average job time:                 460s       7.67m     0.13h    0.01d
#Longest job:                      782s      13.03m     0.22h    0.01d
#Submission to last job:          1954s      32.57m     0.54h    0.02d

#generate lst and fa files for each chromosome for faster lavToAxt
cd /cluster/bluearc/elephant
echo "select chrom from chromInfo;" > chrom.sql
hgsql hg17 -B -N < chrom.sql > chrom.lst

for i in `cat chrom.lst` ; do grep -h '>' lav/$i/* | awk '{print $1}' | sed -e 's/"//g' | sed -e 's/>//g' > mac.$i.lst ; echo $i ; done

ssh kki
cd /cluster/bluearc/elephant
mkdir -p splitChrom
/bin/rm splitChrom/*
gensub2 trace.lst chrom.lst gsub.split spec.split
para create spec.split
para push
#322 jobs in batch
#Checking finished jobs
#Completed: 322 of 322 jobs
#CPU time in finished jobs:        819s      13.65m     0.23h    0.01d  0.000 y
#IO & Wait Time:                  3278s      54.63m     0.91h    0.04d  0.000 y
#Average job time:                  13s       0.21m     0.00h    0.00d
#Longest job:                       51s       0.85m     0.01h    0.00d
#Submission to last job:           462s       7.70m     0.13h    0.01d


cd /cluster/bluearc/elephant/splitChrom
for i in `cat /cluster/bluearc/elephant/chrom.lst` ; do cat mac.*.$i.fa > mac.$i.fa ; echo $i ; done

#lav to axt run
ssh kk
cd /cluster/bluearc/elephant
mkdir -p run.2
#NOTE: chr19 must be run on kolossus with 64bit executables
#change SEQ1_DIR from /scratch to /iscratch for mini cluster
. DEF
echo "#LOOP" > run.2/gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/elephant/splitChrom/mac.$(root1).fa' >> run.2/gsub
echo "#ENDLOOP" >> run.2/gsub
cd run.2
gensub2 ../chrom.lst single gsub jobList
para create jobList
para push
#chrM has no data and crashed
#46 jobs in batch
#Checking finished jobs
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#CPU time in finished jobs:     249970s    4166.17m    69.44h    2.89d  0.008 y
#IO & Wait Time:                  5407s      90.11m     1.50h    0.06d  0.000 y
#Average job time:                5675s      94.58m     1.58h    0.07d
#Longest job:                    27065s     451.08m     7.52h    0.31d
#Submission to last job:         47744s     795.73m    13.26h    0.55d



#split reads by prefix so axtBest will fit in memory
mkdir axtByQ
cat mac*.lst | awk -F\| '{print substr($3,1,3)}' | sort -nu >prefix.lst
for i in `cat prefix.lst` ; do cat axtChrom/*.axt | axtFilter -qStartsWith=gnl\|ti\|$i stdin | axtSwap stdin S1.len S2.len stdout | axtSort stdin axtByQ/q$i.axt ; done
mkdir axtByQBest
#lots of memory needed for reciprocal best
ssh kolossus
cd /cluster/bluearc/elephant/axtByQ
# run axtBest on each per-prefix axt file, then axtSwap the result into ../axtByQBest
for i in *.axt ; do axtBest -quiet "$i" all stdout | axtSwap stdin ../S2.len ../S1.len ../axtByQBest/"$i" ; echo "$i" done ; done
cd /cluster/bluearc/elephant/axtByQBest
# split the combined best alignments into one axt file per target, written here
cat q*.axt | axtSplitByTarget stdin .
# the remaining steps use paths relative to the elephant work directory
# (chrom.lst, S1.len, S2.len, axtByQBest/), so return there first
cd /cluster/bluearc/elephant
mkdir -p axtRecipBest maf mafFilter
for i in `cat chrom.lst` ; do axtSort axtByQBest/$i.axt stdout | axtBest stdin $i axtRecipBest/$i.axt ; echo $i ;done
for i in `cat chrom.lst` ; do axtToMaf axtRecipBest/$i.axt S1.len S2.len maf/$i.maf -tPrefix=hg17. -qPrefix=rm1. -scoreZero ; done
for i in `cat chrom.lst` ; do mafFilter -minScore=1000 maf/$i.maf > mafFilter/$i.maf ; done
# record coverage
cd mafFilter
for i in `cat ../chrom.lst` ; do nice mafCoverage hg17 $i.maf -count=2 > $i.cov ; echo done $i ; done   

# CHIMP DELS FROM HG16 (DONE 2004-08-17 kate)
# NOTE: this track just for development -- it should be regenerated from the latest
#       alignments instead of lifted.
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir -p chimpDels
    cd chimpDels
    hgsql -s hg16 -e "SELECT * FROM chimpDels" | cut -f 2- > chimpDels.hg16.bed
    liftOver chimpDels.hg16.bed \
        /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
                chimpDels.bed chimpDels.unmapped
    wc -l chimpDels.bed chimpDels.unmapped
    # 27662 chimpDels.bed
    # 132 chimpDels.unmapped
    # 27794 total
    hgLoadBed hg17 chimpDels chimpDels.bed -noBin

### CREATE chimpFixedDiff -- panTro1 (Daryl, August 18, 2005)

    # Convert chimp quality scores from uncompressed to compressed
    # chromosome format.  This took 22 minutes on crow.
    ## previously done for hg16
    # cd /cluster/data/panTro1
    # cat */chr*.qa | qaToQac stdin chrom.qac

    # Make single base pair high quality differences into a bed file
    # and load into database
    cd /cluster/data/hg17/bed
    mkdir chimpFixedDiff
    cd chimpFixedDiff
    sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql

    # chimpHiQualDiffs was changed to allow different 
    #    quality parameters as command line options

    set axtDir = /cluster/data/hg17/bed/blastz.panTro1/axtRBestNet
    # This crashed twice at the same place, but ran successfully when
    # each chromosome was run separately.
    ##  time chimpFixedDiffs /$axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
    mkdir chroms; cd chroms
    ls -1 $axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
    # rmdir chr*random
    touch cfd.log
    foreach f (chr*)
	echo -n $f "  "
	ln -s /$axtDir/$f.axt $f/$f.axt
	time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed>>& cfd.log
    end
    rm ../chimpFixedDiffs.bed
    cat chr*bed > ../chimpFixedDiffs.bed

    ## The load (sort) ran out of memory on hgwdev, so I sorted the
    ## file first on kolossus (3 minutes) and then loaded it on hgwdev
    ssh kolossus
    # chimpFixedDiffs.sql and chimpFixedDiffs.bed were written to this directory above
    cd /cluster/data/hg17/bed/chimpFixedDiff
    # NOTE(review): the original notes named hg16 in both hgLoadBed commands.
    # The input comes from /cluster/data/hg17/bed/blastz.panTro1/axtRBestNet, so
    # hg17 is presumably the intended database -- confirm before re-running.
    # -noLoad: sort only; the result is picked up below as bed.tab
    hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg17 chimpFixedDiff chimpFixedDiffs.bed
    exit
    ## hgwdev (37 minutes)
    cd /cluster/data/hg17/bed/chimpFixedDiff
    hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg17 chimpFixedDiff bed.tab

    # TODO: need to filter out polymorphic sites (SNPs)





#  Load firstEF track (DONE 2004-08-18 braney)
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/firstEF
    cd /cluster/data/hg17/bed/firstEF
    wget "http://bioinformatics.med.ohio-state.edu/downloads/firstEFMay04.bed.gz"
    cat << '_EOF_' > sedScript
s/chr23/chrX/g
s/chr24/chrY/g
/^>/d
/^$/d
/^No/d
'_EOF_'

    zcat firstEFMay04.bed.gz | sed -f sedScript | awk  "{OFS=\"\t\"} {\$3 +=1; print  \$0}" > firstEF.bed
    hgLoadBed hg17 firstEF firstEF.bed
    rm firstEF.tab
    gzip *.bed
#done firstEF


# GENE BOUNDS (RNACLUSTER) (DONE 08-18-2004 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd ~sugnet/store1/altSplice/hg17/
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs
foreach f (/cluster/data/hg17/nib/chr*.nib)
    set c = $f:t:r
    set out = chrom/$c.bed
    # Exclude accessions in the RAGE file
    echo clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
    clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
end
hgLoadBed hg17 rnaCluster chrom/*.bed
mkdir /cluster/data/hg17/bed/rnaCluster
cp -r chrom /cluster/data/hg17/bed/rnaCluster

#  miRNA track (CORRECTION 2004-12-09 - Hiram)
    #	Received the following correction from Michel Weber:
    #	Could you please replace the two lines:
    #	chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
    #	chr6 72170017 72170040 hsa-mir-30a-5p 480 - 72170017 72170040
    #	by:
    #	chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
    #	chr6 72169974 72170045 hsa-mir-30a 480 - 72170017 72170040
    #	(The first line remains identical, only the second is changed. The 
    #	repetition of the hsa-mir-30a entry means that both strands of its
    #	hairpin structure are matured into microRNAs, named hsa-miR-30a-3p and 
    #	hsa-miR-30a-5p in Rfam database).
    ssh hgwdev
    cd /cluster/data/hg17/bed/miRNA
    mv miRNA_hg17_1.bed miRNA_hg17_1.bed.0
    cp miRNA_hg17_1.bed.0 miRNA_hg17_1.bed
    #	edit miRNA_hg17_1.bed to change the single line.  Then:
    mv hg17.bed hg17.bed.0
    egrep -v "^track |^browser " miRNA_hg17_1.bed | \
	sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
    #	Check that the edit is in place properly:
    diff hg17.bed.0 hg17.bed
    #	and load it
    hgLoadBed hg17 miRNA hg17.bed
    #	Loaded 221 elements of size 8
    #	featureBits remains the same:
    featureBits hg17 miRNA
    #	18052 bases of 2866216770 (0.001%) in intersection

#  miRNA track (DONE 2004-09-03 - Hiram)(CORRECTED, see above 2004-12-09)
    #	The source data for this was received via email from Sam
    #	Griffiths-Jones to Donna 16 August 2004.  In other email Michel
    #	Weber asked to add one more data line to that file.

    #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
    #	and Michel.Weber@ibcg.biotoul.fr
    #	notify them if this assembly updates to renew this track
    cd /cluster/data/hg17/bed
    mkdir miRNA
    cd miRNA
    #	one name was missing the h in hsa-mir and one was miR instead of
    #	mir
    egrep -v "^track |^browser " miRNA_hg17_1.bed | \
	sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
    hgLoadBed hg17 miRNA hg17.bed
    #	compare with previous results, should be relatively similar
    #	featureBits hg16 miRNA
    #	16923 bases of 2865248791 (0.001%) in intersection
    #	featureBits hg17 miRNA
    #	18052 bases of 2866216770 (0.001%) in intersection

    # entry is already in trackDb/trackDb.ra


## blastz mRNA track for internal use - Robert 8/12/04
mkdir /cluster/bluearc/hg17/mrnaBlastz
cd /cluster/bluearc/hg17/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg -native
mkdir -p split
faTrimPolyA mrna.fa trim.fa
faSplit about trim.fa 1000000 split/mrna
cp -ip trim.fa /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz
faSize trim.fa -detailed=on > S2.len
# NOTE(review): the original notes queried hg16 without -N. This run is against
# hg17 (46 chromosome jobs below), and a header row would add a bogus entry
# to the per-chromosome loops that read S1.len -- confirm against chromInfo.sql.
hgsql hg17 -N < chromInfo.sql > S1.len

BlastZ_run0.sh
cd run.0
para push
para time
#113894 jobs in batch
#207911 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 113894 of 113894 jobs
#CPU time in finished jobs:   14423845s  240397.41m  4006.62h  166.94d  0.457 y
#IO & Wait Time:                334352s    5572.54m    92.88h    3.87d  0.011 y
#Average job time:                 130s       2.16m     0.04h    0.00d
#Longest job:                    38301s     638.35m    10.64h    0.44d
#Submission to last job:         59841s     997.35m    16.62h    0.69d

mkdir run.1
~angie/hummus/do.out2lav DEF > run.1/j
cd run.1
para create j
para push
para time
#341 jobs in batch
#208550 jobs (including everybody's) in Parasol queue.

#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs:      28990s     483.17m     8.05h    0.34d  0.001 y
#IO & Wait Time:                 43139s     718.98m    11.98h    0.50d  0.001 y
#Average job time:                 212s       3.53m     0.06h    0.00d
#Longest job:                     2015s      33.58m     0.56h    0.02d
#Submission to last job:          2187s      36.45m     0.61h    0.03d

#!/bin/tcsh
set base="/cluster/bluearc/hg17/mrnaBlastz"
    cd $base
    mkdir -p pslRaw
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/pslRaw/$chr.psl
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToPsl stdin stdout \
        | sed -e 's@scratch/hg/gs.18/build35/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out
      popd
    end

mkdir run.2

for i in `awk '{print $1}' S1.len` ; do echo doSortFilter.sh ../pslRaw/$i.psl ../pslFilter/$i.psl  >> run.2/spec.dup ; done
cd run.2
para create spec.dup
para push
#46 jobs in batch
#3 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 46 of 46 jobs
#CPU time in finished jobs:       4409s      73.48m     1.22h    0.05d  0.000 y
#IO & Wait Time:                  1082s      18.04m     0.30h    0.01d  0.000 y
#Average job time:                 119s       1.99m     0.03h    0.00d
#Longest job:                     3842s      64.03m     1.07h    0.04d
#Submission to last job:          3842s      64.03m     1.07h    0.04d
cd ..
for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.18/build35/bothMaskedNibs/ -faQ /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa chain/$i.chain >> spec.chain ; done
para create spec.chain
para push

cd run.3
para create spec.filter
para push
cd ..
ls /cluster/data/hg17/nib/*.nib > S1.lst
#Skip chainPreNet it is not good for mrna
#mkdir -p preNet
#
#cd chainFilter
#foreach i ( *.chain)
#chainPreNet $i ../S1.len ../S2.len ../preNet/$i
#end

mkdir run.4
cd run.4
for i in `awk '{print $1}' ../S1.len`; do echo "chainToPsl ../chainFilter/$i.chain ../S1.len ../S2.len ../S1.lst /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa ../psl/$i.psl" >> spec.chain2psl.new  ; done

pslCat psl/*psl > mrnaBlastz.psl
hgLoadPsl hg17 mrnaBlastz.psl

cp trim.fa /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa
ln /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa /gbdb/hg17/mrnaBlastz/ -s

hgLoadSeq -prefix=bz hg17 /gbdb/hg17/mrnaBlastz/hg17Mrna.fa

## end of blastz Mrna track

#### BUILD RETROGENE TRACK (done Robert 8/26/2004)
#### REBUILD RETROGENE TRACK (done Robert 12/24/2004 - but no notes - kuhn)
# diffs before push to beta:
#  1640 hg17.pseudoGeneLink.devOnly
#  9639 hg17.pseudoGeneLink.betaOnly
# 15091 hg17.pseudoGeneLink.common

#  RETROGENE TRACK data update - Robert - 2005-04-08
#  (added by Jen 2006-01-31)
#  - pushQ entry did not include pseudoMrna table. Old table is still
#    present on RR. New data has since been lost on dev.
#    User impact: ~1000 sequences missing links in browser
#  - new all.joiner rule needed to link pseudoMrna to pseudoGeneLink table
#  - current all.joiner rule between knownGene and pseudoGeneLink gives errors.
#    The data types appear to be mismatched. pseudoGeneLink.kgName is
#    a gene symbol, not the same identifier as in knownGene.name
#  - data is to be regenerated soon and errors corrected at that time


mkdir /cluster/data/hg17/bed/pseudo
cd /cluster/data/hg17/bed/pseudo
ls /cluster/data/hg17/nib/*.nib > S1.lst
hgsql hg17 -N -B < allMrna.sql > allMrna.lst
cp  /cluster/data/genbank/data/aligned/genbank.142.0/hg17/full/mrna.native.psl.gz .
gunzip mrna.native.psl.gz
awk  '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl  > mrnaBlat.psl

hgsql hg17 -N -B < refGene.sql > refGene.tab
hgsql hg17 -B -N < mgcGene.sql > mgcGene.tab
cat ../../*/*.fa.out | awk '$5~/chr*/{OFS="\t";print $5,$6,$7}' >rmsk.bed

cd /cluster/bluearc/hg17/mrnaBlastz/
zcat /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseSyn.net.gz | netToBed stdin mouseSyn.bed

hgsql hg17 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
ssh eieio
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg17/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' >  blatBlastzHg17.psl 
ssh hgwdev
cp blatBlastzHg17.psl /scratch/

tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}'  /cluster/data/kgDB/bed/hg17/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
#copy files to iServers for cluster run
ssh kkr1u00
/cluster/home/baertsch/bin/i386/pslSplit nohead -chunkSize=121 /iscratch/i/gs.18/build35/pseudo blatBlastzHg17.psl
cd /cluster/data/hg17/bed/pseudo
cp refGene.tab /iscratch/i/gs.18/build35/pseudo
cp /cluster/data/hg17/bed/simpleRepeat/simpleRepeat.bed /iscratch/i/gs.18/build35/pseudo
cp mrnaHg17.fa /iscratch/i/gs.18/build35/pseudo
cp sortedKnownGene.tab /iscratch/i/gs.18/build35/pseudo
cp rmsk.bed /iscratch/i/gs.18/build35/pseudo
cp all_mrna.psl /iscratch/i/gs.18/build35/pseudo
cp mouseSyn.bed /iscratch/i/gs.18/build35/pseudo
for i in `ls tmp*` ; do echo "doBuildkk.sh ${i%%.psl}" ; done | sed -e 's/tmp//g'  > ~/hg17/pseudo/spec.kk   
cd /iscratch/i/hg/gs.18/build35/pseudo
iSync

para create spec.kk
para push

#post process

# run from eieio
BLUE=/cluster/bluearc/hg17/pseudo
echo catting output
cat $BLUE/pseudoGeneLink[0-9]*.bed | sort -k1,1 -k2,3n >pseudoGeneLinkSort.bed ; /bin/rm $BLUE/pseudoGeneLink[0-9]*.bed
cat $BLUE/pseudo[0-9]*.psl > pseudo.psl ; /bin/rm $BLUE/pseudo[0-9]*.psl &
echo Filtering pseudoGeneLinkSort.bed
tawk '$5 > 10 && $15 > 10000 && $35 > 650 {OFS="\t";print $0}' pseudoGeneLinkSort.bed > pseudoGeneLinkSortFilter.bed
echo Removing Overlaps
doSplit
cd /cluster/bluearc/hg17/pseudo/run.o 
spec.overlap
cd ~/hg17/pseudo
cat /cluster/bluearc/hg17/pseudo/chr*pseudoNoOverlap.bed > pseudoGeneLinkNoOverlap.bed
echo Making psl
awk '{printf("%s\t%s\t%s\n", $5,$2,$3)}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkSelect.tab
## 350 is the sacred magic number and will probably change
tawk '$6>=350{print $0}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkNoOverlapFilter.bed                       

pslSelect -qtStart=pseudoGeneLinkSelect.tab pseudo.psl pseudoMrna.psl
echo Loading Bed
hgLoadBed hg17 pseudoGeneLink pseudoGeneLinkNoOverlapFilter.bed -hasBin -sqlTable=/cluster/home/baertsch/kent/src/hg/lib/pseudoGeneLink.sql
echo Loading Psl
hgLoadPsl hg17 pseudoMrna.psl

## end of retroGene track


# 3-WAY MULTIZ MULTIPLE ALIGNMENT (MM5, RN3) (DONE 2004-08-27 kate)
#       HMR Maf's needed for regulatory potential track
    ssh eieio
    set multizDir = multiz.2004-08-27
    set workingDir = /cluster/bluearc/hg17/$multizDir
    # create both directories before linking to or entering them
    mkdir -p $workingDir
    mkdir -p /cluster/data/hg17/bed/$multizDir
    ln -s $workingDir /cluster/bluearc/hg17/multiz3way
    # make the multiz3way alias beside the dated directory; made from inside
    # it, the relative link would point at a non-existent path
    cd /cluster/data/hg17/bed
    ln -s $multizDir multiz3way
    # doMultiz.csh and gsub below are written into the dated build directory
    cd $multizDir

# wrapper script for multiz
    # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) 
    # NOTE: next time, modify script so it only needs one arg -- saves the
    # multiple dirname in a file for use by the next run
    cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'

# << for emacs
    cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs

    chmod +x doMultiz.csh

    ssh eieio
    set workingDir = /cluster/bluearc/hg17/multiz.2004-08-27
    # chrom.lst is read as ../chrom.lst from run.rn3 under the build directory,
    # so it must be written there rather than in the login directory
    cd /cluster/data/hg17/bed/multiz.2004-08-27

    # copy mafs to bluearc -- mouse
    mkdir $workingDir/mm5
    cp /cluster/data/hg17/bed/blastz.mm5/mafNet/*.maf \
                $workingDir/mm5
    ls $workingDir/mm5/*.maf > chrom.lst

    # rat
    mkdir $workingDir/rn3
    cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3

    # multiz - add in rn3 rat to human/mouse
    # 
    ssh kki
    set multizDir = multiz.2004-08-27
    set workingDir = /cluster/bluearc/hg17/$multizDir
    cd /cluster/data/hg17/bed/$multizDir
    mkdir run.rn3
    cd run.rn3
    echo "rn3/mm5" > species.lst
    gensub2 species.lst ../chrom.lst ../gsub jobList
    para create jobList
        # 47 jobs
    para try, check, push, check

    # copy 3-way mafs to build directory
    ssh eieio
    set multizDir = multiz.2004-08-27
    set workingDir = /cluster/bluearc/hg17/$multizDir
    ln -s $workingDir/mm5rn3 $workingDir/maf
    cd /cluster/data/hg17/bed/multiz.2004-08-27
    mkdir maf
    cp $workingDir/maf/*.maf maf


# BLASTZ TETRAODON (tetNig1) (DONE, 2004-08-26, hartera)
    # blastz requires lineage-specific repeats
    # Treat all repeats as lineage-specific.
    ssh kkr1u00
    mkdir /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon
 foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
 cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon/$f:t:r:r.out.spec
    end

    mkdir /iscratch/i/tetNig1/linSpecRep.notInHuman
    foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/tetNig1/linSpecRep.notInHuman/$f:t:r:r.out.spec
    end
    iSync

    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20
    ln -s /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20 \
            /cluster/data/hg17/bed/blastz.tetNig1
    cd /cluster/data/hg17/bed/blastz.tetNig1
 
    # abridge repeats.
    # Treat all repeats as lineage-specific.
    cat << '_EOF_' > DEF
# human vs. Tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

# use same parameters as for danRer1-fr1
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.notInTetraodon
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Tetraodon
SEQ2_DIR=/iscratch/i/tetNig1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.tetNig1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    # save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.hg17-tetNig1.2004-08-20
 
    # make sure BlastZ_run0.sh, BlastZ_run1.sh and BlastZ_run2.sh scripts
    # are in /cluster/data/hg17/jkStuff
    # edit BlastZ_run0.sh so directory for blastz is /cluster/bin/penn

    bash # if a csh/tcsh user
    . ./DEF
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    # check batch looks ok then
    para try, check, push, check, ....
# para time
# Completed: 19437 of 19437 jobs
# CPU time in finished jobs:    3225816s   53763.60m   896.06h   37.34d  0.102 y
# IO & Wait Time:                174096s    2901.60m    48.36h    2.01d  0.006 y
# Average job time:                 175s       2.92m     0.05h    0.00d
# Longest job:                      709s      11.82m     0.20h    0.01d
# Submission to last job:          5324s      88.73m     1.48h    0.06d

    # second cluster run: lift raw alignments -> lav dir
    ssh kki
    cd /cluster/data/hg17/bed/blastz.tetNig1
    bash # if a csh/tcsh user
    . ./DEF
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, check etc.
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs:        280s       4.66m     0.08h    0.00d  0.000 y
# IO & Wait Time:                  2183s      36.39m     0.61h    0.03d  0.000 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest job:                       41s       0.68m     0.01h    0.00d
# Submission to last job:           469s       7.82m     0.13h    0.01d

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/hg17/bed/blastz.tetNig1
    mkdir axtChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib stdout \
| axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:         52s       0.87m     0.01h    0.00d  0.000 y
# IO & Wait Time:                   256s       4.27m     0.07h    0.00d  0.000 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest job:                       36s       0.60m     0.01h    0.00d
# Submission to last job:           275s       4.58m     0.08h    0.00d

# one job crashed because chr6_hla_hap1.axt is empty. Checked by running this
# again and then looked at the lav file which has no alignments in it.
                                                                                
   # translate sorted axt files into psl
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.tetNig1
    mkdir -p pslChrom
    set tbl = "blastzTetNig1"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.tetNig1/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f
      echo "$f Done"
    end

# original blastzTetNig1: 
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
# featureBits -chrom=chr1 hg17 blastzTetNig1
# 6378680 bases of 222827847 (2.863%) in intersection
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1 -enrichment
# refGene:cds 1.246%, blastzTetNig1 2.863%, both 0.856%, cover 68.70%, 
# enrich 24.00x
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment
# refGene:cds 1.246%, blastzDanRer1 3.934%, both 0.831%, cover 66.72%, 
# enrich 16.96x
# comparable to zebrafish so good
# try same parameters with L=8000
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1L8k -enrichment
# refGene:cds 1.246%, blastzTetNig1L8k 2.095%, both 0.753%, cover 60.47%, 
# enrich 28.87x
# load chr1 with blastz using just H=2000 and default parameters
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1Default -enrichment
# refGene:cds 1.246%, blastzTetNig1Default 1.630%, both 0.808%, cover 64.87%, 
# enrich 39.80x
# rows in chr1_blastzTetNig1 tables 
# blastzTetNig1	95156
# blastzTetNig1L8k 58015
# blastzTetNig1Default 71342
# The default values also used for danRer1 vs fugu give good coverage and
# higher enrichment than blastzTetNig1 with fewer alignments so this will be
# used for the blastz track - now called blastzTetNig1.

# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# Make chains with rescored blastz
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.tetNig1
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
        > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Make our own linear gap file with reduced gap penalties, 
    # in hopes of getting longer chains - works well for species at 
    # chicken-human distance or greater
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy

cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:        553s       9.22m     0.15h    0.01d  0.000 y
# IO & Wait Time:                   102s       1.69m     0.03h    0.00d  0.000 y
# Average job time:                  15s       0.24m     0.00h    0.00d
# Longest job:                       56s       0.93m     0.02h    0.00d
# Submission to last job:           985s      16.42m     0.27h    0.01d

# one job crashed since chr6_hla_hap1.axt is empty - no alignments

   # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
   
    # take a look at score distr's
    # (csh) For each per-chromosome chain file, pull the second field of every
    # line containing "chain", sort the values in descending numeric order
    # into /tmp/score.<name>, then append the name and a textHistogram with
    # bins of 5000 to hist5000.out.
    # $f:t:r is the csh tail+root modifier pair: chain/chr1.chain -> chr1
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist5000.out
      textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
      # blank line on the terminal only; nothing is added to hist5000.out
      echo ""
    end

    # only chr19 has a very large number of chains with score < 5000
    # load chr 1 into table
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
    hgLoadChain hg17 chr1_chainTetnig1 chr1.chain

# featureBits -chrom=chr1 hg17 refGene:cds chainTetnig1Link -enrichment
# refGene:cds 1.246%, chainTetnig1Link 1.582%, both 0.805%, cover 64.59%, 
# enrich 40.83x

    # try filtering with minScore of 5000
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    rm -r chain
    chainSplit chain all.chain
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
    hgLoadChain hg17 chr1_chainTetNig1Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainTetNig1Filt5kLink -enrichment
# refGene:cds 1.246%, chainTetNig1Filt5kLink 1.487%, both 0.789%, cover 63.33%,
# enrich 42.58x
# this cleans it up a lot with little reduction in coverage.
# check in browser - filtered version looks good.

# add all chains for minScore=5000 filtered chains
# remove test chain tables for chr1
    ssh hgwdev
    hgsql -e "drop table chr1_chainTetnig1;" hg17
    hgsql -e "drop table chr1_chainTetnig1Link;" hg17
    hgsql -e "drop table chr1_chainTetNig1Filt5k;" hg17
    hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" hg17

    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain hg17 ${c}_chainTetNig1 $i
        echo done $c
    end

# NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    mkdir preNet
    cd chain
    foreach i (*.chain)
       echo preNetting $i
       /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                     ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                 ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 55373824, utime 415 s/100, stime 45
    # Add classification info using db tables:
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    # netClass looks for ancient repeats in one of the databases
    # hg17 has this table - hand-curated by Arian but this is for
    # human-rodent comparisons so do not use here, use -noAr option
    mkdir -p /cluster/bluearc/hg17/linSpecRep.notInTetraodon
    mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInHuman
    cp /iscratch/i/hg17/linSpecRep.notInTetraodon/* \
       /cluster/bluearc/hg17/linSpecRep.notInTetraodon
    cp /iscratch/i/tetNig1/linSpecRep.notInHuman/* \
       /cluster/bluearc/tetNig1/linSpecRep.notInHuman

    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    time netClass noClass.net hg17 tetNig1 tetNig1.net \
          -tNewR=/cluster/bluearc/hg17/linSpecRep.notInTetraodon \
          -qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInHuman -noAr
    # 54.100u 31.890s 2:20.01 61.4%   0+0k 0+0io 197pf+0w
    netFilter -minGap=10 tetNig1.net |  hgLoadNet hg17 netTetNig1 stdin
    # featureBits hg17 refGene:cds netTetNig1 -enrichment
    # refGene:cds 0.978%, netTetNig1 25.095%, both 0.778%, cover 79.53%, 
    # enrich 3.17x

# TWINSCAN 1.3 GENE PREDICTIONS (Done, 2004-Aug-26, heather)

    # Fetch the Twinscan 1.3 prediction tarball and its checksum list, then
    # verify and unpack it.  Work inside the dedicated twinscan directory so
    # the downloads and the extracted chr_* directories do not land directly
    # in bed/.
    # (tarFile=... / unset are sh-style; the loop further below is csh)
    cd /cluster/data/hg17/bed
    mkdir twinscan
    cd twinscan
    tarFile=hg17_TS13_pseudomasked.tar.gz
    wget http://genes.cs.wustl.edu/predictions/human/NCBI35/hg17_TS13_pseudomasked.tar.gz
    wget http://genes.cs.wustl.edu/predictions/human/NCBI35/md5sum.txt

    # check file transferred correctly: diff should print nothing
    grep gz md5sum.txt > gz.sum
    md5sum $tarFile | diff - gz.sum

    # extract
    tar xvfz $tarFile
    unset tarFile
    
    # check that files unzipped and untarred correctly
    # expect no differences
    # The expected-checksum lists are written next to the chr_* directories
    # rather than inside them, so that "md5sum chr_*/*" does not also
    # checksum the list itself.  Each grep pattern names its directory so
    # that the chr_tx list does not also pick up the chr_ptx lines (a bare
    # "tx" matches both).
    # NOTE(review): assumes the downloaded md5sum.txt names files as
    # chr_gtf/..., chr_ptx/..., chr_tx/..., i.e. the same form that
    # "md5sum chr_gtf/*" prints - confirm against the actual file.
    grep chr_gtf/ md5sum.txt > gtf.expected
    grep chr_ptx/ md5sum.txt > ptx.expected
    grep chr_tx/ md5sum.txt > tx.expected

    md5sum chr_gtf/* > gtf.sum
    diff gtf.sum gtf.expected

    md5sum chr_ptx/* > ptx.sum
    diff ptx.sum ptx.expected

    md5sum chr_tx/* > tx.sum
    diff tx.sum tx.expected

    # pare down protein FASTA header to id and add missing .a:
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
      echo chr$c
      perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
    end
    ldHgGene hg17 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
    hgPepPred hg17 generic twinscanPep chr_ptx/chr*-fixed.fa

# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-08, hartera)
#    Replace with gzipped versions (DONE 2004-09-14 kate)
    ssh kksilo
    # zip chains and nets
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    cp all.chain tetNig1.chain
    zip -j /cluster/data/hg17/zip/tetNig1.chain.zip tetNig1.chain
    rm tetNig1.chain
    zip -j /cluster/data/hg17/zip/tetNig1.net.zip tetNig1.net
   
    ssh hgwdev
    # copy chains and nets to downloads area
    set gp = /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p $gp/vsTetNig1
    cd $gp/vsTetNig1
    mv /cluster/data/hg17/zip/tetNig1*.zip .
    md5sum *.zip > md5sum.txt

    # copy axt files to downloads area and gzip
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChrom
    mkdir -p $gp/vsTetNig1/axtChrom
    cp -p *.axt $gp/vsTetNig1/axtChrom
    cd $gp/vsTetNig1/axtChrom
    gzip *.axt
    md5sum *.gz > md5sum.txt

    # Copy over & edit README.txt w/pointers to chain, net formats.

# BLASTZ TETRAODON (tetNig1) CLEANUP (DONE, 2004-09-10, hartera)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.tetNig1
    nice rm -rf raw &
    nice rm -rf lav &
    nice rm axtChain/run1/chain/* &
    nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &


# regulatory potential 2X track (WORKING - 2004-09-14 - Hiram)
    ssh eieio
    mkdir /cluster/store3/gs.18/build35/bed/regPotential2X
    mkdir /cluster/store3/gs.18/build35/bed/regPotential3X
    cd /cluster/data/hg17/bed
    ln -s /cluster/store3/gs.18/build35/bed/regPotential2X .
    ln -s /cluster/store3/gs.18/build35/bed/regPotential3X .
    cd regPotential2X
    # Fetch the regulatory-potential kit into the current directory.
    # (wget takes no destination argument; a trailing "." would be treated
    # as a second URL and reported as an error.)
    wget --timestamping 'http://www.bx.psu.edu/~james/stuff/rp_kit.tgz'
    tar xvzf rp_kit.tgz
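    # fixup the hmr_rp_score.sh and hm_rp_score.sh to change
    #	RP_DIR=. to read: RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
    #	Also prefix the MAPPING and MATRIX paths with rp_kit/ and fix the
    #	usage of SHIFT and WINDOW.  The following diff shows the changes
    #	('<' lines are the edited script, '>' lines are the original):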
    #	5c5
    #	< RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
    #	---
    #	> RP_DIR=.
    #	8,9c8,9
    #	< MAPPING=rp_kit/hm_5a_mapping.txt
    #	< MATRIX=rp_kit/hm_5a+3_scoreMatrix.dat
    #	---
    #	> MAPPING=hm_5a_mapping.txt
    #	> MATRIX=hm_5a+3_scoreMatrix.dat
    #	12c12
    #	< SHIFT=1
    #	---
    #	> SHIFT=5
    #	24,25c24,25
    #	<     --shiftAmount $SHIFT \
    #	<     --windowSize $WINDOW \
    #	---
    #	>     --shiftAmount 5 \
    #	>     --windowSize 100 \

    mkdir maf
    for A in  `(cd /cluster/data/hg17/bed/blastz.mm5/axtNet; ls chr*.axt)`
    do
	C=${A/.axt}
	echo "/cluster/data/hg17/bed/blastz.mm5/axtNet/${A} -> maf/${C}.maf.gz"
	axtToMaf /cluster/data/hg17/bed/blastz.mm5/axtNet/${A} \
		/cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
		stdout | gzip > maf/${C}.maf.gz
    done
    # Replace bad chr5 axtNet and mafNet (2006-01-05 kate) 

    #	a valid java runtime is only on hgwdev.  This is a java procedure
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential2X
    mkdir rp_scores
    #  WARNING - the following loop takes almost 12 hours !
    #  (bash) For each maf/chrN.maf.gz, run the scoring script on the
    #  uncompressed maf and store the numerically sorted result as
    #  rp_scores/chrN.score.gz.
    for M in maf/chr*.maf.gz
    do
	# strip the ".maf.gz" suffix, then the "maf/" prefix: maf/chr1.maf.gz -> chr1
	C=${M/.maf.gz}
	C=${C#maf/}
	echo "$M -> rp_scores/$C.score.gz"
	# The script is handed /dev/stdin and /dev/stderr as its two arguments
	# (presumably input and output file - confirm in hm_rp_score.sh).
	# Inside the parentheses "2>&1 >/dev/null" first points stderr at the
	# pipe and then discards stdout, so only what is written to stderr
	# reaches sort.
	(zcat ${M} | ./rp_kit/hm_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
	    gzip > rp_scores/${C}.score.gz
    done
    #	real    709m55.805s
    #	user    754m51.030s
    #	sys     20m11.000s

    #	Back to the file server to create the wiggle data

    ssh eieio
    cd /cluster/data/hg17/bed/regPotential2X
    mkdir wigData dataLimits
    for S in rp_scores/chr*.score.gz
    do
	C=${S/.score.gz}
	C=${C#rp_scores/}
	echo "$S -> wigData/$C.wig"
	zcat $S | sort -n | \
	    wigAsciiToBinary -chrom=$C -dataSpan=1 \
	    -wibFile=wigData/$C stdin 2> dataLimits/$C.limits
    done
    #	real    313m0.567s
    #	user    285m37.319s
    #	sys     23m8.301s

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential2X/wigData
    mkdir /gbdb/hg17/wib/regPotential2X
    ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential2X
    time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential2X \
	hg17 regPotential2X chr*.wig
    #	real    2m29.668s
    #	user    0m33.380s
    #	sys     0m8.200s

# regulatory potential 3X track (WORKING - 2004-09-14 - Hiram)
    #	Expects groundwork done above in the 2X track
    #	a valid java runtime is only on hgwdev.  This is a java procedure
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential3X
    # The 3X track is scored with hmr_rp_score.sh, linked here from the kit
    # unpacked for the 2X track.
    ln -s ../regPotential2X/rp_kit/hmr_rp_score.sh .
    mkdir rp_scores
    #  WARNING - the following loop takes almost 12 hours !
    #  NOTE(review): maf/ must exist in this directory and hold the
    #  alignments for this track; nothing above creates it here - confirm
    #  where these maf files come from.
    for M in maf/chr*.maf.gz
    do
	# strip the ".maf.gz" suffix, then the "maf/" prefix: maf/chr1.maf.gz -> chr1
	C=${M/.maf.gz}
	C=${C#maf/}
	echo "$M -> rp_scores/$C.score.gz"
	# Use the hmr script linked above; there is no ./rp_kit directory in
	# regPotential3X, and hm_rp_score.sh is the script used for the 2X track.
	# "2>&1 >/dev/null" sends the script's stderr down the pipe and
	# discards its stdout, so only what is written to /dev/stderr is kept.
	(zcat ${M} | ./hmr_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
	    gzip > rp_scores/${C}.score.gz
    done
    #	real    613m8.230s
    #	user    623m7.110s
    #	sys     20m24.550s

    #	Back to the file server to create the wiggle data

    ssh eieio
    cd /cluster/data/hg17/bed/regPotential3X
    mkdir wigData dataLimits

    for S in rp_scores/chr*.score.gz
    do
	C=${S/.score.gz}
	C=${C#rp_scores/}
	echo "$S -> wigData/$C.wig"
	zcat $S | sort -n | \
	    wigAsciiToBinary -chrom=$C -dataSpan=1 \
		-wibFile=wigData/$C stdin 2> dataLimits/$C.limits
    done

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential3X/wigData
    mkdir /gbdb/hg17/wib/regPotential3X
    ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential3X
    time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential3X \
	hg17 regPotential3X chr*.wig
    #	real    1m45.568s
    #	user    0m32.740s
    #	sys     0m6.140s

# regulatory potential 5X track (DONE - 2005-09-19 - Daryl)
    ssh kkstore02
    mkdir -p /cluster/data/hg17/bed/regPotential5X/rp_scores
    cd /cluster/data/hg17/bed/regPotential5X/rp_scores
    # Plain single-file fetch.  With -r, wget recreates the remote directory
    # tree (~james/rp/...) below the current directory, whereas the tar
    # command that follows expects all_truncate.tar right here.
    wget --timestamping http://www.bx.psu.edu/~james/rp/hg17panTro1mm5rn3canFam1/all_truncate.tar
    tar xvf all_truncate.tar
    cd /cluster/data/hg17/bed/regPotential5X
    mkdir -p wigData dataLimits 
    cd wigData
    ## 8 minutes
    for S in ../rp_scores/chr*.scores.truncated.gz
    do
	C=${S/.scores.truncated.gz}
	C=${C#../rp_scores/}
	echo "$S -> wigData/$C.wig"
	zcat $S | wigEncode stdin $C.wig $C.wib 2> ../dataLimits/$C.limits
    done

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential5X/wigData
    mkdir -p /gbdb/hg17/wib/regPotential5X
    chmod o+rx /gbdb/hg17/wib/regPotential5X
    ln -s /cluster/data/hg17/bed/regPotential5X/wigData/*.wib /gbdb/hg17/wib/regPotential5X
    chmod o+r /gbdb/hg17/wib/regPotential5X/ch*wib
    time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential5X hg17 regPotential5X chr*.wig
    # 57.720u 9.960s 2:26.05 46.3%    0+0k 0+0io 213pf+0w

# SGP GENES (DONE 9/17/04 angie)
    ssh eieio
    mkdir /cluster/data/hg17/bed/sgp
    cd /cluster/data/hg17/bed/sgp
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.gtf
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.prot
    end
    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    cp /dev/null sgpPep.fa
    foreach f (chr*.prot)
      perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
    end
    ssh hgwdev
    cd /cluster/data/hg17/bed/sgp
    ldHgGene -gtf -genePredExt hg17 sgpGene chr*.gtf
    hgPepPred hg17 generic sgpPep sgpPep.fa

# SGP GENES (UPDATE 1/18/2006)
    # sgpPep table dropped, replaced by hgc-generated protein seq in browser

# LIFTOVER RNAGENE FROM HG16 (09/29/04, acs)
    cd /cluster/data/hg17/bed
    mkdir rnaGene
    cd rnaGene
    liftOver -gff /cluster/data/hg16/bed/rnaGene/all.gff \
	/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
	rnaGeneLift.gff rnaGeneMiss.gff
    # 7204 records passed, 16 failed
    hgsql hg17 < ~/kent/src/hg/lib/rnaGene.sql
    hgRnaGenes hg17 rnaGeneLift.gff

    
# BUILD BioCyc TABLES (DONE 10/1/04 Fan)

  - Create bioCycMapDesc table.

	CREATE TABLE bioCycMapDesc (
  	mapID varchar(40) NOT NULL default '',
  	description varchar(255) NOT NULL default '',
  	KEY mapID (mapID)
	) TYPE=MyISAM;

  - Create bioCycPathway table.

	CREATE TABLE bioCycPathway (
  	kgID varchar(40) NOT NULL default '',
  	geneID varchar(40) NOT NULL default '',
  	mapID varchar(40) NOT NULL default '',
  	KEY kgID (kgID),
 	KEY geneID (geneID),
  	KEY mapID (mapID)
	) TYPE=MyISAM;

  Using data files sent by Peter Karp from SRI.
  Per Peter's email of 10/1/04, they do not have a more recent update,
  so the data files received last year are used.
  
  Save the BioCyc Pathway name and description table as names.txt.
  Save the pathway data file as gene-pathway.dat.
  Make sure there is no extra ^M at end of the lines.

  hgsql hg17 -e	'LOAD DATA local INFILE 'names.txt' into table bioCycMapDesc'

  Run hgBioCyc program to generate the file bioCycPathway.tab.

	 hgBioCyc gene-pathway.dat hg17

  Load into hg17.
 
	 hgsql hg17 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway'
	
# MAKING FOLDUTR TABLES (DONE - 2004-10-4 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    mkdir -p /cluster/data/hg17/bed/rnaStruct
    cd /cluster/data/hg17/bed/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa hg17 knownGene utr3 utr3/utr.fa
    utrFa hg17 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/hg17/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 37244 of 37244 jobs
# CPU time in finished jobs:    1036479s   17274.64m   287.91h   12.00d  0.033 y
# IO & Wait Time:                112286s    1871.44m    31.19h    1.30d  0.004 y
# Average job time:                  31s       0.51m     0.01h    0.00d
# Longest job:                     3370s      56.17m     0.94h    0.04d
# Submission to last job:          4355s      72.58m     1.21h    0.05d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 29817 of 29817 jobs
# CPU time in finished jobs:      98143s    1635.72m    27.26h    1.14d  0.003 y
# IO & Wait Time:                105763s    1762.71m    29.38h    1.22d  0.003 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest job:                     2133s      35.55m     0.59h    0.02d
# Submission to last job:          2465s      41.08m     0.68h    0.03d

# Load database
    ssh hgwdev
    cd /cluster/data/hg17/bed/rnaStruct/utr5
    hgLoadRnaFold hg17 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold hg17 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

####### BUILD RGD HUMAN QTL TRACKS (DONE 10/7/04 Fan) ##############

mkdir -p /cluster/store8/rgd/human041007
ln -s /cluster/store8/rgd/human041007 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl

# download data files from RGD

wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff

# remove extra line feed character at the end of lines

rmLf human_QTL.gff > rgdQtl.gff

# create rgdQtl.tab
# Take whitespace-separated fields 1, 4, 5 and 10 of each GFF line, joined
# by tabs, then rewrite "Chr" as "chr" and strip double quotes, "RGD:"
# prefixes and semicolons.
awk '{print $1"\t"$4"\t"$5"\t"$10}'  rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab

# create rgdQtlLink.tab
# Columns: field 12, field 10, then fields 14..NF re-joined with single
# spaces (each followed by a space).  Double quotes, "RGD:", semicolons and
# the word "Note" are then removed everywhere on the line.

awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab

# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab

# check rgdQtl table
checkTableCoords hg17 rgdQtl

# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'

# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.

#### AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2004-10-11, hartera)
     ssh hgwdev
     mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
     # Go to 
#http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus 
     # and download the consensus and exemplar sequences to this directory
     cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
     unzip HG-U133_Plus_2_consensus.zip
     unzip HG-U133_Plus_2_exemplar.zip
     cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa
     perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \
                     U133Plus2_all.fa
     # remove ";" from probe set names
     perl -pi.bak -e "s/;//" U133Plus2_all.fa
     # clean up
     rm *.zip *.bak
     mkdir -p /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
     cp U133Plus2_all.fa /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
     # Set up cluster job to align consensus/exemplars to hg17
     ssh kkr1u00
     mkdir -p /iscratch/i/affy
     mv /cluster/data/hg17/bed/affyU133Plus2.2004-10-11/U133Plus2_all.fa \
        /iscratch/i/affy
     iSync

     ssh kk
     cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
     ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
     ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst

    cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << for emacs
    gensub2 allctg.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    para try, para check, para push .....
# para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs:      24533s     408.88m     6.81h    0.28d  0.001 y
# IO & Wait Time:                  2180s      36.34m     0.61h    0.03d  0.000 y
# Average job time:                  70s       1.17m     0.02h    0.00d
# Longest job:                      751s      12.52m     0.21h    0.01d
# Submission to last job:          2425s      40.42m     0.67h    0.03d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyU133Plus2.psl
    pslSort dirs raw.psl tmp psl
                                                                                
    # use filter parameters for these sequences. only use alignments that
    # cover 30% of sequence and have at least 95% identity in aligned region.
    # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
  pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
    perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
    # load into the database
    ssh hgwdev
    cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
    hgLoadPsl hg17 affyU133Plus2.psl
    # Add sequence data to database
        # Copy probe sequence to /gbdb if it isn't already
    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
    cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
    hgLoadSeq -abbr=U133+2: hg17 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
    
    # clean up
    rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab
# Added knownToU133Plus2 track (2004-10-14) - see GeneSorter section


#### MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
    # First, get ranges of target coverage:
    ssh eieio
    mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
    cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
    cat /cluster/data/hg17/bed/var_multiz.2004-08-12/maf.09-12-04/*.maf \
    | nice mafRanges -notAllOGap stdin hg17 hg17.mafRanges.bed
    # Get pairwise coverage as well.
    ssh kolossus
    cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
    cat /cluster/bluearc/hg17/multiz8way/rn3/*.maf \
    | nice mafRanges -notAllOGap stdin hg17 hg17.rn3.mafRanges.bed
    cat /cluster/bluearc/hg17/multiz8way/mm5/*.maf \
    | nice mafRanges -notAllOGap stdin hg17 hg17.mm5.mafRanges.bed
    cat /cluster/bluearc/hg17/multiz8way/galGal2/*.maf \
    | nice mafRanges -notAllOGap stdin hg17 hg17.galGal2.mafRanges.bed
    cat /cluster/bluearc/hg17/multiz8way/fr1/*.maf \
    | nice mafRanges -notAllOGap stdin hg17 hg17.fr1.mafRanges.bed

    ssh hgwdev
    cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
    # To make subsequent intersections a bit quicker, output a bed with 
    # duplicate/overlapping ranges collapsed:
    nice featureBits hg17 hg17.mafRanges.bed \
      -bed=hg17.mafRangesCollapsed.bed
#1147548420 bases of 2866216770 (40.037%) in intersection
    foreach other (mm5 rn3 galGal2 fr1)
      nice featureBits hg17 hg17.$other.mafRanges.bed \
      -bed=hg17.${other}.mafRangesCollapsed.bed
    end
#1013348528 bases of 2866216770 (35.355%) in intersection
#975533772 bases of 2866216770 (34.036%) in intersection
#101623034 bases of 2866216770 (3.546%) in intersection
#46737824 bases of 2866216770 (1.631%) in intersection

    # mafCoverage barfs currently, so pass on this for now:
    #cat ../maf.09-12-04/*.maf \
    #| nice mafCoverage -count=2 hg17 stdin > hg17.mafCoverage

    # Intersect maf target coverage with gene regions -- 
    # use Adam's knownGene region files:
    nice featureBits hg17 -enrichment \
      ../phastCons/stats2/knownGenesCds.bed \
      hg17.mafRangesCollapsed.bed \
      -bed=hg17.mafCds.bed
#knownGenesCds.bed 1.166%, hg17.mafRangesCollapsed.bed 40.037%, both 1.111%, cover 95.36%, enrich 2.38x

    nice featureBits hg17 -enrichment \
      ../phastCons/stats2/knownGenesUtr3.bed \
      hg17.mafRangesCollapsed.bed \
      -bed=hg17.mafUtr3.bed
#knownGenesUtr3.bed 0.918%, hg17.mafRangesCollapsed.bed 40.037%, both 0.662%, cover 72.18%, enrich 1.80x
    nice featureBits hg17 -enrichment \
      ../phastCons/stats2/knownGenesUtr5.bed \
      hg17.mafRangesCollapsed.bed \
      -bed=hg17.mafUtr5.bed
#knownGenesUtr5.bed 0.266%, hg17.mafRangesCollapsed.bed 40.037%, both 0.198%, cover 74.42%, enrich 1.86x

    # Intersect pairwise target coverages with gene regions:
    foreach other (mm5 rn3 galGal2 fr1)
      nice featureBits hg17 -enrichment \
        ../phastCons/stats2/knownGenesCds.bed \
        hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Cds.bed
      nice featureBits hg17 -enrichment \
        ../phastCons/stats2/knownGenesUtr3.bed \
        hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr3.bed
      nice featureBits hg17 -enrichment \
        ../phastCons/stats2/knownGenesUtr5.bed \
        hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr5.bed
    end
#knownGenesCds.bed 1.166%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 1.093%, cover 93.74%, enrich 2.65x
#knownGenesUtr3.bed 0.918%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.618%, cover 67.37%, enrich 1.91x
#knownGenesUtr5.bed 0.266%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.186%, cover 69.81%, enrich 1.97x
#knownGenesCds.bed 1.166%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 1.071%, cover 91.85%, enrich 2.70x
#knownGenesUtr3.bed 0.918%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.597%, cover 65.09%, enrich 1.91x
#knownGenesUtr5.bed 0.266%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.179%, cover 67.33%, enrich 1.98x
#knownGenesCds.bed 1.166%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.779%, cover 66.84%, enrich 18.85x
#knownGenesUtr3.bed 0.918%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.194%, cover 21.12%, enrich 5.96x
#knownGenesUtr5.bed 0.266%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.056%, cover 21.03%, enrich 5.93x
#knownGenesCds.bed 1.166%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.714%, cover 61.26%, enrich 37.57x
#knownGenesUtr3.bed 0.918%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.073%, cover 7.92%, enrich 4.86x
#knownGenesUtr5.bed 0.266%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.039%, cover 14.82%, enrich 9.09x


# ALTERNATIVE CPG ISLANDS (DONE 10/14/04 angie)
    ssh eieio
    nice tcsh
    mkdir /cluster/data/hg17/bed/cpgIslandAlt
    cd /cluster/data/hg17/bed/cpgIslandAlt
    # Try cpg_ash (WUSTL program modified to not chop islands in half before 
    # scoring) with default params:
    cp /dev/null cpg_ash.default.cpg
    foreach f (../../?{,?}/chr*.fa.masked)
      echo running on $f:t:r:r
      ~angie/cb/hg3rdParty/cpgIslands/cpg_ash.exe $f >> cpg_ash.default.cpg
    end
    awk -f ../cpgIsland/filter.awk cpg_ash.default.cpg > cpgIslandAlt.bed
    # Run Andy Law's script on masked seq:
    cp /dev/null cpgIslandGgfAndyMasked.bed
    foreach f (../../?{,?}/chr*.fa.masked)
      set chr = $f:t:r:r
      echo running on $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyMasked.bed
    end
    # Compare enrichment for knownGene upstream -- an uphill battle for 
    # programs closer to meeting the stated length, GC, O/E params!
    ssh hgwdev
    nice featureBits hg17 -enrichment knownGene:upstream:1000 \
      /cluster/data/hg17/bed/cpgIsland/cpgIsland.bed
#knownGene:upstream:1000 0.857%, cpgIsland.bed 0.741%, both 0.166%, cover 19.37%, enrich 26.13x
    nice featureBits hg17 -enrichment knownGene:upstream:1000 \
      /cluster/data/hg17/bed/cpgIslandAlt/cpgIslandAlt.bed
#knownGene:upstream:1000 0.857%, cpgIslandAlt.bed 1.075%, both 0.200%, cover 23.38%, enrich 21.76x
    nice featureBits hg17 -enrichment knownGene:upstream:1000 \
      /cluster/data/hg17/bed/cpgIslandAlt/cpgIslandGgfAndyMasked.bed
#knownGene:upstream:1000 0.857%, cpgIslandGgfAndyMasked.bed 1.964%, both 0.292%, cover 34.06%, enrich 17.34x
    cd /cluster/data/hg17/bed/cpgIslandAlt
    sed -e 's/cpgIslandExt/cpgIslandAlt/g' \
      ~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandAlt.sql
    hgLoadBed -noBin -tab -sqlTable=cpgIslandAlt.sql \
      hg17 cpgIslandAlt cpgIslandAlt.bed
#Loaded 29998 elements of size 10
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      ~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
      hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
#Loaded 80555 elements of size 10
    # Quick length stats:
    hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandExt'
#|         201 |    764.1913 |       40058 |
    hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandAlt'
#|         200 |   1026.9194 |       32440 |
    hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandGgfAndyMasked'
#|         200 |    698.8257 |      100308 |
    # 1/26/05: Make better island names in cpgIslandGgfAndyMasked,
    # for Dave Burt's cross-species island comparisons.
    ssh eieio
    cd /cluster/data/hg17/bed/cpgIslandAlt
    mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig
    perl -wpe '@w=split("\t"); $w[3] = "hg17.$w[0]." . ($w[1]+1) . ".$w[2]"; \
               $_ = join("\t", @w);' \
      cpgIslandGgfAndyMasked.bed.orig \
    > cpgIslandGgfAndyMasked.bed
    # Now liftOver islands from mm5, rn3, galGal2:
    ssh kolossus
    cd /cluster/data/hg17/bed/cpgIslandAlt
    foreach match (50 95)
      liftOver /cluster/data/mm5/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
        /cluster/data/mm5/bed/bedOver/mm5Tohg17.chain -minMatch=0.$match \
        cpgIslandGAMFromMm5_$match.bed cpgIslandGAMFromMm5_$match.unmapped
      liftOver /cluster/data/rn3/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
        /cluster/data/rn3/bed/bedOver/rn3ToHg17.over.chain -minMatch=0.$match \
        cpgIslandGAMFromRn3_$match.bed cpgIslandGAMFromRn3_$match.unmapped
      liftOver /cluster/data/galGal2/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
        /cluster/data/galGal2/bed/bedOver/galGal2ToHg17.over.chain -minMatch=0.$match \
        cpgIslandGAMFromGalGal2_$match.bed cpgIslandGAMFromGalGal2_$match.unmapped
    end
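    # Load up the renamed islands (only the renamed cpgIslandGgfAndyMasked.bed
    # is reloaded below; the lifted-over islands are not loaded in this step).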
    ssh hgwdev
    cd /cluster/data/hg17/bed/cpgIslandAlt
    hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
      hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed


# MAKE UNIGENE/SAGE TRACK (DONE - 2004-10-15 Fan)

# First get SAGE data and determine which version of UniGene to use first
    ssh hgwdev
    cd ~/kent/src/hg/sage
    make
    # XXX = uniGene build for which SAGE was built -- not necessarily current!
    # Figure out the build number by peeking at this file:
    wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null

# UniGene Build #44  Arabidopsis thaliana
# UniGene Build #61  Bos taurus
# UniGene Build #16  Caenorhabditis elegans
# UniGene Build #171  Homo sapiens
# UniGene Build #19  Medicago truncatula
# UniGene Build #138  Mus musculus
# UniGene Build #52  Oryza sativa
# UniGene Build #14  Pinus taeda
# UniGene Build #132  Rattus norvegicus
# UniGene Build #27  Sus scrofa
# UniGene Build #38  Triticum aestivum
# UniGene Build #11  Vitis vinifera
# UniGene Build #41  Zea mays

# From above info, set Version 171 for hg17
    ls /projects/cc/hg/sugnet/uniGene
#   set Version = XXX
    set Version=171
    mkdir /projects/cc/hg/sugnet/sage/sage.$Version
    cd /projects/cc/hg/sugnet/sage/sage.$Version

    wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/Hs
    wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/readme.txt
    wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt
    wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/extr
    wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/info

#  That downloaded about 1 GB of data

    cd map/Hs/NlaIII
    unzip -j SAGEmap_tag_ug-rel.zip
    cd ../../../extr/
    ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
    ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
    ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
      ./SAGE_*
    ../../scripts/countsPerExp.pl expCounts.tab expList.tab

    cd ../map/Hs/NlaIII/ 
    # Write a throwaway Perl filter to /tmp/t.pl.  For each tab-separated
    # input line it prints the 3rd, 4th and 1st fields, in that order.
    # The here-doc delimiter is quoted, so the shell expands nothing inside.
    cat << '_EOF_' > /tmp/t.pl
#!/usr/local/bin/perl

while (<>) {
 chomp($_);
 @p = split(/\t/, $_);
 print "$p[2]\t$p[3]\t$p[0]\n";
}
'_EOF_'
    chmod +x /tmp/t.pl

    cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \
      > SAGEmap_ug_tag-rel_Hs
    cd ../../../extr
    createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
      tagExpArrays.tab sageSummary.sage

# Create the uniGene alignments 
# /cluster/data/hg17/uniGene/hg17.uniGene.lifted.pslReps.psl

    # Download of the latest UniGene version is now automated by a 
    # cron job -- see /cluster/home/angie/crontab , 
    # /cluster/home/angie/unigeneVers/unigene.csh .  
    # If hgwdev gets rebooted, that needs to be restarted... maybe there's 
    # a more stable place to set up that cron job.  

    # substitute XXX -> the uniGene version used by SAGE.
    # set Version = XXX
    # (csh syntax below; in bash use: export Version=171)
    set Version = 171
    cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
    gunzip Hs.seq.uniq.gz Hs.data.gz
    ../countSeqsInCluster.pl Hs.data counts.tab
    ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
    # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
    ssh kkstore
    set Version = 171 # same as above
    mkdir -p /iscratch/i/uniGene.$Version
    cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
      /iscratch/i/uniGene.$Version
    ssh kkr1u00
    iSync
    ssh kk
    set Version = 171 # same as above
    mkdir -p /cluster/data/hg17/bed/uniGene.$Version
    cd /cluster/data/hg17/bed/uniGene.$Version
    ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > allctg.lst
    ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    # gensub2 template: one blat job per (contig, uniGene fasta) pair.
    # The job line between #LOOP and #ENDLOOP must stay on a single line so
    # that the {check out line+ ...} clause is not split in two.
    cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2)  {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

    gensub2 allctg.lst uniGene.lst template.sub para.spec
    para create para.spec
    mkdir psl
    para try
    para check
    para push
# Completed: 380 of 380 jobs
# CPU time in finished jobs:      35994s     599.91m    10.00h    0.42d  0.001 y
# IO & Wait Time:                  1812s      30.19m     0.50h    0.02d  0.000 y
# Average job time:                  99s       1.66m     0.03h    0.00d
# Longest job:                     1497s      24.95m     0.42h    0.02d
# Submission to last job:          1551s      25.85m     0.43h    0.02d

    ssh eieio
    set Version = 171 # same as above
    cd /cluster/data/hg17/bed/uniGene.$Version
    pslSort dirs raw.psl tmp psl >& pslSort.log
    liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
      stdin hg17.uniGene.lifted.pslReps.psl /dev/null
# Processed 141416 alignments

# use hg17.uniGene.lifted.pslReps.psl for building UNIGENE/SAGE track.

    ssh hgwdev
    set Version = 171
    cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
    addAveMedScoreToPsls \
      /cluster/data/hg17/bed/uniGene.$Version/hg17.uniGene.lifted.pslReps.psl \
      sageSummary.sage  uniGene.wscores.bed
    hgLoadBed hg17 uniGene_2 uniGene.wscores.bed
    hgsql hg17 < ~kent/src/hg/lib/sage.sql 
    echo "load data local infile 'sageSummary.sage' into table sage" \
        | hgsql hg17
    cd ../info
    ../../scripts/parseRecords.pl ../extr/expList.tab  > sageExp.tab
    hgsql hg17 < ~/kent/src/hg/lib/sageExp.sql 
    echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg17
    # update ~/kent/src/hg/makeDb/trackDb/human/hg17/uniGene_2.html 
    # with current uniGene date.


# CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)

    hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
    rm j.tmp

    hgsql hg17 -e 'drop table kgSpAlias';
    hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
    hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'


# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/genomicSuperDups
    cd /cluster/data/hg17/bed/genomicSuperDups
    # A tar file containing files for both hg16 and hg17 was downloaded into 
    # /cluster/data/hg16/bed/genomicSuperDups;  move over the hg17 part.
    mv /cluster/data/hg16/bed/genomicSuperDups/bd35 .
    cd bd35
    # A note from Xinwei She about the contents:
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
    # use tail -n +2 to skip past the header line:
    # actually, celeraDupPositive.tab.gz has one extra bogus line so +3 for it:
    # (the obsolete "tail +N" spelling may be taken as a file name by
    #  current tail implementations, hence the explicit -n)
    zcat celeraDupPositive.tab.gz | tail -n +3 \
    | hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
        hg17 celeraDupPositive stdin
    zcat genomicSuperDups.tab.gz | tail -n +2 \
    | hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
        hg17 genomicSuperDups stdin
    # clean up
    rm bed.tab

# ECGENE TRACK (DONE, 2004-10-29, hartera)
    ssh eieio
    mkdir -p /cluster/data/hg17/bed/ECgene.2004-10-27
    ln -s /cluster/data/hg17/bed/ECgene.2004-10-27 \
          /cluster/data/hg17/bed/ECgene
    cd  /cluster/data/hg17/bed/ECgene
    wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_gene.txt.gz"
    wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_pep.txt.gz"
    gunzip *.gz

    # load database
    ssh hgwdev
    cd /cluster/data/hg17/bed/ECgene
    ldHgGene -predTab hg17 ECgene v1.2_hg17_low_gene.txt
    # 646778 gene predictions
    hgPepPred hg17 tab ECgenePep v1.2_hg17_low_pep.txt
    rm *.tab
    nice gzip *.txt

# LOAD ENSEMBL GENES (DONE, 2004-11-19, hartera)

    mkdir /cluster/data/hg17/bed/ensembl
    cd /cluster/data/hg17/bed/ensembl
    # Get the ensembl protein data from 
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the output.  Choose gzip compression.  Hit export.
    # Save as ensemblGene.gtf.gz

    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Finally, get rid of the ".1" or ".2" after the name
    gunzip -c ensemblGene.gtf.gz \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v ^DR52 \
    | grep -v ^DR53 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
                 || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/\..\"/\"/g' \
    | sed -e 's/chrMT_NC_001807/chrM/' \
    > ensGene.gtf

    # Load the filtered GTF into the ensGene table; run from the directory
    # where ensGene.gtf was written above.
    ssh hgwdev
    cd /cluster/data/hg17/bed/ensembl
    /cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf

    # Read 33666 transcripts in 696579 lines in 1 files
    # 33666 groups 25 seqs 1 sources 4 feature types
    # 33666 gene predictions

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
    # remove header line from ensGtp.txt
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg17

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    gunzip ensemblPep.fa.gz
    hgPepPred hg17 ensembl ensemblPep.fa

# UPDATE KNOWN GENES TABLES (DONE 11/22/04 Fan)

# Make sure the protein databases (sp041115 and proteins041115) were built first.
  
  hgsql hg17 -e "create database kgHg17B"
  
  mkdir -p /cluster/store8/kg/kgHg17B
  cd /cluster/store6/kgDB/bed
  ln -s /cluster/store8/kg/kgHg17B kgHg17B
  cd kgHg17B

    ~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115

# Found gbGetSeqs changed the format of mrna.fa output file 
# (extra version number).  Updated KGprocess.sh and
# re-ran the following manually:

    grep "^>" mrna.fa |awk '{print $1}' > mrna.lis

    kgGetPep 041115 > mrnaPep.fa

    # NOTE(review): the database name was recorded as "kgH17BTemp"; presumably
    # kgHg17BTemp was meant (matching the kgHg17B build and the
    # kgHg17BKgMrna.* output names) -- verify against KGprocess.sh.
    hgKgMrna kgHg17BTemp mrna.fa mrna.ra tight_mrna.psl ll/loc2ref \
        mrnaPep.fa ll/mim2loc ${PDB} > kgHg17BKgMrna.out 2> kgHg17BKgMrna.err

# Then run KGprocess.sh again to continue processing.

    ~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115

hgsql hg17 -e "select * from chromInfo" > chromInfo.tab
getDbTabDef hg17 chromInfo >chromInfo.sql
hgsql kgHg17B <chromInfo.sql
hgsql kgHg17B -e 'load data local infile "chromInfo.tab" into table chromInfo ignore 1 lines'

# Build kgProtMap table.  This table is needed by the Proteome Browser and 
# it should be built before all the KG tables are moved from kgHg17B to hg17.

    ~/src/hg/protein/kgProtMap.sh kgHg17B hg17 041115

# Completed: 7923 of 7923 jobs
# CPU time in finished jobs:    2502923s   41715.39m   695.26h   28.97d  0.079 y
# IO & Wait Time:                175358s    2922.63m    48.71h    2.03d  0.006 y
# Average job time:                 338s       5.63m     0.09h    0.00d
# Longest job:                     2403s      40.05m     0.67h    0.03d
# Submission to last job:          7164s     119.40m     1.99h    0.08d
  
# The script ran successfully with the last message:
# Mon Nov 22 17:11:59 PST 2004 DONE =========================

# Create database hg17Kg1 to store the old KG tables, just in case.

  hgsql hg17

  create database hg17Kg1;

  alter table cgapAlias rename as hg17Kg1.cgapAlias;
  alter table cgapBiocDesc rename as hg17Kg1.cgapBiocDesc;
  alter table cgapBiocPathway rename as hg17Kg1.cgapBiocPathway;
  alter table dupSpMrna rename as hg17Kg1.dupSpMrna;
  alter table keggMapDesc rename as hg17Kg1.keggMapDesc;
  alter table keggPathway rename as hg17Kg1.keggPathway;
  alter table kgAlias rename as hg17Kg1.kgAlias;
  alter table kgProtAlias rename as hg17Kg1.kgProtAlias;
  alter table kgXref rename as hg17Kg1.kgXref;
  alter table knownGene rename as hg17Kg1.knownGene;
  alter table knownGeneLink rename as hg17Kg1.knownGeneLink;
  alter table knownGeneMrna rename as hg17Kg1.knownGeneMrna;
  alter table knownGenePep rename as hg17Kg1.knownGenePep;
  alter table mrnaRefseq rename as hg17Kg1.mrnaRefseq;
  alter table spMrna rename as hg17Kg1.spMrna;
  alter table kgProtMap rename as hg17Kg1.kgProtMap;

# After initial inspection of tables in kgHg17B, do the following
# from mySql prompt:

  alter table kgHg17B.cgapAlias rename as hg17.cgapAlias;
  alter table kgHg17B.cgapBiocDesc rename as hg17.cgapBiocDesc;
  alter table kgHg17B.cgapBiocPathway rename as hg17.cgapBiocPathway;
  alter table kgHg17B.dupSpMrna rename as hg17.dupSpMrna;
  alter table kgHg17B.keggMapDesc rename as hg17.keggMapDesc;
  alter table kgHg17B.keggPathway rename as hg17.keggPathway;
  alter table kgHg17B.kgAlias rename as hg17.kgAlias;
  alter table kgHg17B.kgProtAlias rename as hg17.kgProtAlias;
  alter table kgHg17B.kgXref rename as hg17.kgXref;
  alter table kgHg17B.knownGene rename as hg17.knownGene;
  alter table kgHg17B.knownGeneLink rename as hg17.knownGeneLink;
  alter table kgHg17B.knownGeneMrna rename as hg17.knownGeneMrna;
  alter table kgHg17B.knownGenePep rename as hg17.knownGenePep;
  alter table kgHg17B.mrnaRefseq rename as hg17.mrnaRefseq;
  alter table kgHg17B.spMrna rename as hg17.spMrna;
  alter table kgHg17B.kgProtMap rename as hg17.kgProtMap;

# Old hg17.knownGene has 43,401 entries and the new one has 44,338 entries.
# Check coverage of knownGene (the previous value is noted below):
  
    featureBits hg17 knownGene
#      65728598 bases of 2866216770 (2.293%) in intersection
# Previously, was:
#    	 63983072 bases of 2866216770 (2.232%) in intersection
  
# Connect to genome-testdb and use hgcentraltest DB.
# Update the entry in gdbPdb table:

    delete from gdbPdb where genomeDb='hg17';
    insert into gdbPdb values('hg17', 'proteins041115');

# UPDATE KGSPALIAS TABLE TO BE USED BY PB (Done 12/20/04)

    cd /cluster/data/hg17/bed/pb
    mkdir new
    cd new
    hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
    rm j.tmp

    # Recreate kgSpAlias from its .sql definition and load the freshly
    # generated hg17.kgSpAlias.tab into it.
    hgsql hg17 -e 'drop table kgSpAlias';
    hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
    hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
    # Compress the tab file that was just loaded (the file created above is
    # hg17.kgSpAlias.tab; no plain kgSpAlias.tab is written in this section).
    gzip hg17.kgSpAlias.tab

# Create hg17GeneList.html (to be used by Google).
# This step was done 12/08/04.

    cd /cluster/data/hg17/bed
    mkdir geneList
    cd geneList
    wget -O hg17GeneList.html "http://hgwdev-fanhsu.soe.ucsc.edu/cgi-bin/hgGeneList?db=hg17"
    cp -p hg17GeneList.html /usr/local/apache/htdocs/goldenPath
# Check this html file into CVS.

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProtAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/hg17/bed/kgHg17F
     mkdir index
     cd index
     hgKgGetText hg17 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ix /gbdb/hg17/knownGene.ix
     ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ixx /gbdb/hg17/knownGene.ixx



# UPDATE TABLES NEEDED BY hgGene (DONE 11/30/04 Fan)

# UPDATE BioCyc TABLES
    hgsql hg17 -e 'delete from bioCycPathway'
    hgsql hg17 -e 'delete from bioCycMapDesc'

#  Using data files sent by Peter Carp from SRI,
#  per Peter's email of 10/1/04, they don't have recent update,
#  so data files received last year are used.
  
#  Save the BioCyc Pathway name and description table as pathway-names.dat.
#  Save the pathway data file as gene-pathway.dat.
#  Make sure there is no extra ^M at end of the lines.

# Run hgBioCyc program to generate the file bioCycPathway.tab.

	 hgBioCyc gene-pathway.dat hg17

# Load results into hg17.

    LOAD DATA local INFILE 'pathway-names.dat' into table bioCycMapDesc;
    LOAD DATA local INFILE 'bioCycPathway.tab' into table bioCycPathway;

# REBUILD FOLDUTR TABLES (DONE - 2004-11-30 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mv rnaStruct rnaStruct.2004-10-04
    mkdir -p /cluster/data/hg17/bed/rnaStruct.2004-11-30
    ln -s rnaStruct.2004-11-30 rnaStruct
    cd /cluster/data/hg17/bed/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa hg17 knownGene utr3 utr3/utr.fa
    utrFa hg17 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/hg17/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 38115 of 38115 jobs
# CPU time in finished jobs:    1101680s   18361.33m   306.02h   12.75d  0.035 y
# IO & Wait Time:                100275s    1671.25m    27.85h    1.16d  0.003 y
# Average job time:                  32s       0.53m     0.01h    0.00d
# Longest job:                     3645s      60.75m     1.01h    0.04d
# Submission to last job:          7007s     116.78m     1.95h    0.08d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 30524 of 30524 jobs
# CPU time in finished jobs:     116647s    1944.12m    32.40h    1.35d  0.004 y
# IO & Wait Time:                 80477s    1341.28m    22.35h    0.93d  0.003 y
# Average job time:                   6s       0.11m     0.00h    0.00d
# Longest job:                     2449s      40.82m     0.68h    0.03d
# Submission to last job:          3386s      56.43m     0.94h    0.04d

# Load database
    ssh hgwdev
    cd /cluster/data/hg17/bed/rnaStruct/utr5
    hgLoadRnaFold hg17 foldUtr5 fold
# Parsed 30525 files
# Warning: load of foldUtr5 did not go as planned: 30525 record(s), 2 row(s) skipped, 0 warning(s) loading ./foldUtr5.tab

    cd ../utr3
    hgLoadRnaFold hg17 foldUtr3 fold
# Parsed 38115 files
# Warning: load of foldUtr3 did not go as planned: 38115 record(s), 2 row(s) skipped, 0 warning(s) loading ./foldUtr3.tab

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2004-11-29 - Fan)
#	This should be done after knownGene tables are complete from known gene
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2004-11-24
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2004-11-24 \
	/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	This command is in /projects/compbio/bin/$MACH/formatdb

# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
	/cluster/bluearc/hg17/blastp

#	Had to pick up a new blastall binary (2004-06-15)
#	Our old one would no longer run on our systems that have
#	updated Linux versions
mkdir /cluster/bluearc/blast2210
cd /cluster/bluearc/blast2210
wget --timestamping \
    ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/blast-2.2.10-ia32-linux.tar.gz
wget --timestamping \
    ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/ChangeLog.txt
wget --timestamping \
    ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/ReleaseNotes.txt
tar xvzf blast-2.2.10-ia32-linux.tar.gz

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod +x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:     191136s    3185.59m    53.09h    2.21d  0.006 y
# IO & Wait Time:                 66703s    1111.72m    18.53h    0.77d  0.002 y
# Average job time:                  33s       0.55m     0.01h    0.00d
# Longest job:                      370s       6.17m     0.10h    0.00d
# Submission to last job:           747s      12.45m     0.21h    0.01d

# Load into database.  This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 12810133 rows
# 306.480u 54.190s 26:35.50 22.6% 0+0k 0+0io 206pf+0w

cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene
#	hgsql -e "select count(*) from knownToRefSeq;" hg17
#	row count changed 37611

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
	> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#	hgsql -e "select count(*) from knownToLocusLink;" hg17
#	row count changed to 37611

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
#	hgsql -e "select count(*) from knownToPfam;" hg17
#	row count changed to 36302

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
#	hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
#	row count changed to 36373

# Create expression distance table - takes about an hour
    hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 36373 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

#	hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
#	row count changed to 36373000

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
#	hgsql -e "select count(*) from knownToU133;" hg17
#	row count changed to 37299

# Create expression distance table.  This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 37299 unique elements in affyUclaNorm
# 8212.320u 217.310s 2:38:07.84 88.8%     0+0k 0+0io 267pf+0w
# Create table that maps between known genes and 
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
#	row count changed to 18791
#	hgFixed.gnfHumanU95Exps argument is unused, no need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
	hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 17682 unique elements in hgFixed.gnfHumanU95MedianRatio
#	row count changed to 17682000 

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes only 10 minutes.)

hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 10273 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
#	row count changed to 40015 

# Make sure that GO database is up to date.
# UPDATE GO DATABASE (DONE 11/24/04 Fan)

# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20041124
cd /cluster/store1/geneOntology/20041124

wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200411-assocdb-data.gz

hgsql mysql <<end
create database go041124;
end
zcat go_*data.gz | hgsql go041124

# Fetch the GOA association files.  The long option needs two dashes:
# with a single dash wget parses "-timestamping" as "-t imestamping".
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz

wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz

zcat gene_association.goa_uniprot.gz | hgGoAssociation go041124 goaPart stdin 
# Passed 4502016 of 5291097 of 5291097, 85.09%

# Ask sys-admin to switch the database pointer go to point to go041124.

cd /cluster/data/hg17/bed/geneSorter

# XXX - DO NOT YET HAVE ensGene table - must wait on Ensembl to release that
# XXX - have not created the knownToEnsembl table yet - 2004-07-15 - Hiram
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
#	table row count went from previous version: 36068 to 38251

# Make knownToCdsSnp table (Heather did this table, Nov 29, 2004)
  ssh hgwdev
  nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 168336
# approx. 5 minutes running time

# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# There was no /cluster/bluearc/ce1/blastp, so get the latest wormpep from Sanger
cd /cluster/data/ce1/bed/blastp
mkdir old
cp -p * old
wget --timestamp ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep134/wormpep134
mv wormpep134 wormPep.faa
formatdb -i wormPep.faa -t wormPep -n wormPep
#copy them to /cluster/bluearc
ssh kkr1u00
mkdir -p /cluster/bluearc/ce1/blastp
cp /cluster/data/ce1/bed/blastp/wormPep.p?? /cluster/bluearc/ce1/blastp

#	The blast jobs below can be run on the kk or kk9 clusters
# Create the ceBlastTab
ssh kk9
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
#	Only takes 10 minutes on an idle cluster
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      33235s     553.91m     9.23h    0.38d  0.001 y
# IO & Wait Time:                 19891s     331.52m     5.53h    0.23d  0.001 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest job:                       68s       1.13m     0.02h    0.00d
# Submission to last job:           653s      10.88m     0.18h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
#	row count changed to 28252

# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeMm5.doc for procedure
#	the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
	-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:     141842s    2364.04m    39.40h    1.64d  0.004 y
# IO & Wait Time:                 52251s     870.85m    14.51h    0.60d  0.002 y
# Average job time:                  25s       0.42m     0.01h    0.00d
# Longest job:                      254s       4.23m     0.07h    0.00d
# Submission to last job:           540s       9.00m     0.15h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
#	row count changed to 37549 

# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeRn3.doc for procedure.
#	Files were put in this directory: /cluster/bluearc/rn3/blastp/

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
    -p blastp -d /cluster/bluearc/rn3/blastp/known \
    -i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      31786s     529.77m     8.83h    0.37d  0.001 y
# IO & Wait Time:                 25795s     429.91m     7.17h    0.30d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest job:                       75s       1.25m     0.02h    0.00d
# Submission to last job:           157s       2.62m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
#Loading database with 26133 rows

# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# NOTE: data used to reside in /cluster/bluearc/dr1/blastp
mv /cluster/bluearc/dr1/blastp /cluster/bluearc/danRer1/blastp
#	the directory: /cluster/bluearc/danRer1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/danRer1
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/danRer1/blastp/ensembl \
	-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:     102324s    1705.39m    28.42h    1.18d  0.003 y
# IO & Wait Time:                 47203s     786.72m    13.11h    0.55d  0.001 y
# Average job time:                  19s       0.32m     0.01h    0.00d
# Longest job:                      230s       3.83m     0.06h    0.00d
# Submission to last job:           427s       7.12m     0.12h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Loading database with 33852 rows

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      20983s     349.72m     5.83h    0.24d  0.001 y
# IO & Wait Time:                 25513s     425.21m     7.09h    0.30d  0.001 y
# Average job time:                   6s       0.10m     0.00h    0.00d
# Longest job:                       37s       0.62m     0.01h    0.00d
# Submission to last job:           106s       1.77m     0.03h    0.00d
# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 18489 rows

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dm1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs:      83377s    1389.62m    23.16h    0.97d  0.003 y
# IO & Wait Time:                 39913s     665.21m    11.09h    0.46d  0.001 y
# Average job time:                  16s       0.27m     0.00h    0.00d
# Longest job:                      167s       2.78m     0.05h    0.00d
# Submission to last job:           365s       6.08m     0.10h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 30067 rows

# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 33236

#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-30 - Fan)

    # Get the ensembl gene/protein cross-reference data from
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Feature" box, select gene, transcript, protein,
    #         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
    # Page 4) Choose "Text, tab separated".  choose gzip compression.  hit export.
    # Save as ensXref.txt

    sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab

    hgsql hg17 -e "drop table ensemblXref3"
    hgsql hg17 < ~/src/hg/lib/ensemblXref3.sql

    hgsql hg17 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'

#### BUILD SUPERFAMILY RELATED TABLES (DONE - 2004-11-30 - Fan)

# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk

    mkdir /cluster/store8/superfamily/041128
    ln -s /cluster/store8/superfamily/041128 /cluster/data/superfamily/041128
    cd /cluster/data/superfamily/041128
# ftp over the following two files:
#     ass_28-Nov-2004.tab.gz
#     supfam_28-Nov-2004.sql.gz
    
    gzip -d *.gz
# Load the Superfamily database
    hgsql hg17 -e "create database superfam041128"
    hgsql superfam041128 < supfam_28-Nov-2004.sql
# This may take about an hour.

# Make sure to add an index on id of the des table of superfam041128.
    hgsql superfam041128 -e "create index id on des(id);"

    hgsql superfam041128 < ~/src/hg/lib/sfAssign.sql
    hgsql superfam041128 -e 'load data local infile "ass_28-Nov-2004.tab" into table 
superfam041128.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB

   hgsql hg17 < ~/src/hg/lib/sfAssign.sql

   cd /cluster/data/superfamily/041128  
   hgsql hg17 -e 'load data local infile "ass_28-Nov-2004.tab" into table hg17.sfAssign;'

# If hg17.sfDes already exists, drop it.

   hgsql superfam041128 -e "select * from des" >sfDes.tab
   hgsql hg17 < ~/src/hg/lib/sfDes.sql
   hgsql hg17 -e 'load data local infile "sfDes.tab" into table hg17.sfDes ignore 1 lines;'

# If hg17.superfamily already exists, drop it.
# Create the dated work directory inside bed/ so that the relative "sf"
# symlink made right after it resolves.  (The directory was previously
# created one level up, in /cluster/data/hg17, leaving bed/sf dangling.)
# NOTE(review): a pre-existing bed/sf link from an earlier release would
# have to be removed first -- confirm before re-running.
   cd /cluster/data/hg17/bed
   mkdir /cluster/data/hg17/bed/sf.2004-1128
   ln -s sf.2004-1128 sf
   hgSuperfam hg17 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If hg17.sfDescription exists, drop it.

   hgsql hg17 < ~/src/hg/lib/sfDescription.sql
   hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table hg17.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed hg17 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
   
   cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab \
   | hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper

### HG17 PROTEOME BROWSER TABLES RE-BUILD ####  (DONE - 2004-12-01 - Fan)
# These are instructions for rebuilding tables 
# needed for the Proteome Browser.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This update is based on proteins DBs dated 041115.

# Create the working directory

    ssh hgwdev
    mv /cluster/data/hg17/bed/pb /cluster/data/hg17/bed/pb.2004-06-11
    mkdir /cluster/data/hg17/bed/pb.2004-12-01
    cd /cluster/data/hg17/bed
    ln -s /cluster/data/hg17/bed/pb.2004-12-01 pb

# Move the existing PB tables by:

hgsql hg17
create database hg17Sav2;

alter table hg17.pepCCntDist rename as hg17Sav2.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav2.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav2.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav2.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav2.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav2.pepMwAa;
alter table hg17.pepPi rename as hg17Sav2.pepPi;
alter table hg17.pepPiDist rename as hg17Sav2.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav2.pepResDist;

alter table hg17.pbAaDistA rename as hg17Sav2.pbAaDistA;
alter table hg17.pbAaDistC rename as hg17Sav2.pbAaDistC;
alter table hg17.pbAaDistD rename as hg17Sav2.pbAaDistD;
alter table hg17.pbAaDistE rename as hg17Sav2.pbAaDistE;
alter table hg17.pbAaDistF rename as hg17Sav2.pbAaDistF;
alter table hg17.pbAaDistG rename as hg17Sav2.pbAaDistG;
alter table hg17.pbAaDistH rename as hg17Sav2.pbAaDistH;
alter table hg17.pbAaDistI rename as hg17Sav2.pbAaDistI;
alter table hg17.pbAaDistK rename as hg17Sav2.pbAaDistK;
alter table hg17.pbAaDistL rename as hg17Sav2.pbAaDistL;
alter table hg17.pbAaDistM rename as hg17Sav2.pbAaDistM;
alter table hg17.pbAaDistN rename as hg17Sav2.pbAaDistN;
alter table hg17.pbAaDistP rename as hg17Sav2.pbAaDistP;
alter table hg17.pbAaDistQ rename as hg17Sav2.pbAaDistQ;
alter table hg17.pbAaDistR rename as hg17Sav2.pbAaDistR;
alter table hg17.pbAaDistS rename as hg17Sav2.pbAaDistS;
alter table hg17.pbAaDistT rename as hg17Sav2.pbAaDistT;
alter table hg17.pbAaDistV rename as hg17Sav2.pbAaDistV;
alter table hg17.pbAaDistW rename as hg17Sav2.pbAaDistW;
alter table hg17.pbAaDistY rename as hg17Sav2.pbAaDistY;
alter table hg17.pbAnomLimit rename as hg17Sav2.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav2.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav2.pbStamp;

quit

# Define pep* tables in hg17 DB

	cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

#  First edit out pepPred table definition, then

	hgsql hg17 < pepAll.sql

# Build the pepMwAa table

  hgsql proteins041115 -e "select info.acc, molWeight, aaSize from sp041115.info, sp041115.accToTaxon where 
accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql hg17 <<end
load data local infile "pepMwAa.tab" into table hg17.pepMwAa ignore 1 lines;
end

# Build the pepPi table

  hgsql proteins041115 -e "select info.acc from sp041115.info, sp041115.accToTaxon where 
accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp041115 pepPi.tab

hgsql hg17 <<end
load data local infile "pepPi.tab" into table hg17.pepPi;
end

# Calculate and load pep distributions

  pbCalDist sp041115 proteins041115 9606 hg17 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql hg17

    load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
    load data local infile "pepResDist.tab" into table hg17.pepResDist;
    load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd 041115 9606 hg17

# Create pbAnomLimit and pbResAvgStd tables

   hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
   hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
   hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'

# Create pbStamp table for PB
   
  hgsql hg17 < ~/src/hg/lib/pbStamp.sql
  hgsql hg17Sav2 -e 'select * from pbStamp' > pbStamp.tab

  hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp ignore 1 lines;'

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for hg17, then
#  notify QA for formal review.

####  Blat knownGene proteins to determine exons  (braney DONE 12/11/04)
    # Set up a dated work directory and repoint the generic blat.hg17KG
    # link at it, then dump the known gene peptides to fasta there.
    # The link target must be the directory created by the mkdir
    # (2004-12-08); it was previously spelled 2014-12-08, which left the
    # link dangling and made the following cd fail.
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir blat.hg17KG.2004-12-08
    rm blat.hg17KG
    ln -s  blat.hg17KG.2004-12-08 blat.hg17KG
    cd blat.hg17KG
    pepPredToFa hg17 knownGenePep known.fa
    ssh kk
    cd /cluster/data/hg17/bed/blat.hg17KG
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 3010 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    gensub2 human.lst kg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../human.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

# Completed: 134130 of 134136 jobs
# Crashed: 6 jobs
# CPU time in finished jobs:   29801114s  496685.23m  8278.09h  344.92d  0.945 y
# IO & Wait Time:               1983513s   33058.55m   550.98h   22.96d  0.063 y
# Average job time:                 237s       3.95m     0.07h    0.00d
# Longest job:                    63306s    1055.10m    17.59h    0.73d
# Submission to last job:        169384s    2823.07m    47.05h    1.96d

# did 6 crashed jobs on small cluster
    ssh eieio
    cd /cluster/data/hg17/bed/blat.hg17KG
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    pslUniq cooked.psl hg17KG.psl
    pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
    kgName hg17 hg17KG.psl blastKGRef01
    cut -f 10 hg17KG.psl > kgName.lst
    faSomeRecords known.fa kgName.lst hg17KG.fa
    hgPepPred hg17 generic blastKGPep01 hg17KG.fa

    ssh hgwdev
    cd /cluster/data/hg17/bed/blat.hg17KG
    hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastKGRef01" | hgsql hg17
    echo "load data local infile 'blastKGRef01' into table blastKGRef01" | hgsql hg17

#### TIGR GENE INDEX (DONE 2004-12-04 Fan)
    # Fetch and unpack the TIGR Gene Index track files for build 35.
    # --timestamping (as used for the NCBI download at the top of this
    # file) only re-fetches when the remote copy is newer; the single-dash
    # "-timestamp" spelling is not a valid wget option.
    mkdir -p /cluster/data/hg17/bed/tigr
    cd /cluster/data/hg17/bed/tigr
    wget --timestamping ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_build35.tgz
    tar xvzf TGI*.tgz

    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end

    # For each organism, convert its per-chromosome TIGR files (chr*_<org>*s)
    # into .gff files: skip the first line of each file, rewrite the first
    # "THC" on a line to "TC", and prefix the TC identifier with the
    # organism name (passed to perl through the environment variable O).
    # "tail -n +2" is the POSIX spelling of the obsolete "tail +2".
    foreach o (mouse cow human pig rat)
      echo $o
      setenv O $o
      foreach f (chr*_$o*s)
        tail -n +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end

    ssh hgwdev
    cd /cluster/data/hg17/bed/tigr
    hgsql hg17 -e "drop table tigrGeneIndex"
    hgsql hg17 < ~/kent/src/hg/lib/tigrGeneIndex.sql

    foreach f (*.gff)
        echo Processing $f ...
        /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg17 tigrGeneIndex $f
        hgsql hg17 -e "select count(*) from tigrGeneIndex"
    end
    # Total of 401322 entries created in tigrGeneIndex table.

    hgsql hg17 -e "update tigrGeneIndex set cdsStart = txStart;"
    hgsql hg17 -e "update tigrGeneIndex set cdsEnd = txEnd;"

    checkTableCoords hg17 tigrGeneIndex

    gzip *.gff *TCs

# BLASTZ FOR ZEBRAFISH (danRer2) (DONE, 2004-12-09, hartera)
    ssh kkr1u00
    # blastz requires lineage-specific repeats
    # Treat all repeats as lineage-specific.
 # /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish exists (makeDanRer1.doc)
    mkdir -p /iscratch/i/danRer2/linSpecRep.notInHuman
    foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out)
      cp -p $f /iscratch/i/danRer2/linSpecRep.notInHuman/$f:t:r:r.out.spec
    end
    iSync

    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastz.danRer2.2004-12-08
    ln -s /cluster/data/hg17/bed/blastz.danRer2.2004-12-08 \
          /cluster/data/hg17/bed/blastz.danRer2
    cd /cluster/data/hg17/bed/blastz.danRer2
    # Set L=6000 and abridge repeats - these are the same parameters used
    # for hg16 and Fugu and similar to those for hg16-galgal2
                                                                                
    cat << '_EOF_' > DEF
# human (hg17) vs zebrafish (danRer2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
                                                                                
ALIGN=blastz-run
BLASTZ=blastz
                                                                                
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
                                                                                
# TARGET: Human (hg17)
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
                                                                                
# QUERY: zebrafish (danRer2)
SEQ2_DIR=/iscratch/i/danRer2/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
                                                                                
BASE=/cluster/data/hg17/bed/blastz.danRer2
                                                                                
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod +x DEF
    # Save the DEF file in the current standard place
    cp DEF ~angie/hummus/DEF.hg17-danRer2.2004-12-08
    # prepare first cluster run
    ssh kk
    cd /cluster/data/hg17/bed/blastz.danRer2
    bash # if a csh/tcsh user
    . ./DEF
    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para try, check, push, check ...etc.
# para time
# Completed: 58993 of 58993 jobs
# CPU time in finished jobs:   19583036s  326383.93m  5439.73h  226.66d  0.621 y
# IO & Wait Time:                471090s    7851.50m   130.86h    5.45d  0.015 y
# Average job time:                 340s       5.67m     0.09h    0.00d
# Longest job:                      885s      14.75m     0.25h    0.01d
# Submission to last job:         78245s    1304.08m    21.73h    0.91d

    ssh kki
    cd /cluster/data/hg17/bed/blastz.danRer2
    bash # if a csh/tcsh user
    . ./DEF
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh
    cd run.1
    para try, check, push, etc ...
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs:        789s      13.14m     0.22h    0.01d  0.000 y
# IO & Wait Time:                  2992s      49.87m     0.83h    0.03d  0.000 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest job:                       34s       0.57m     0.01h    0.00d
# Submission to last job:           391s       6.52m     0.11h    0.00d

    #   Third cluster run to convert lav's to axt's
    ssh kki
    cd /cluster/data/hg17/bed/blastz.danRer2
    mkdir axtChrom
    # a new run directory
    mkdir run.2
    cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
 /iscratch/i/danRer2/nib stdout \
| axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.danRer2/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:         99s       1.64m     0.03h    0.00d  0.000 y
# IO & Wait Time:                   862s      14.37m     0.24h    0.01d  0.000 y
# Average job time:                  21s       0.36m     0.01h    0.00d
# Longest job:                       92s       1.53m     0.03h    0.00d
# Submission to last job:           456s       7.60m     0.13h    0.01d
    # crashed job: chr6_hla_hap1.axt is empty - has no alignments
    # translate sorted axt files into psl
    # One psl is written per chromosome axt, named <chrom>_<tbl>.psl in
    # pslChrom/.  $f:t:r takes the file name without its directory and
    # without the .axt extension, i.e. the chromosome name.  S1.len and
    # S2.len are the size files named by SEQ1_LEN/SEQ2_LEN in DEF.
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer2
    mkdir -p pslChrom
    set tbl = "blastzDanRer2"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
                                                                               
    # Load database tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer2/pslChrom
                                                                               
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f
      echo "$f Done"
    end
# try different parameters for blastz with chr1 of hg17
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment
# refGene:cds 1.301%, blastzDanRer1 3.934%, both 0.874%, cover 67.23%, 
# enrich 17.09x
# H=2000, Y=3400, L=6000, K=2200 and HoxD55.q scoring matrix
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2 -enrichment
# refGene:cds 1.301%, blastzDanRer2 3.845%, both 0.879%, cover 67.55%, 
# enrich 17.57x
# same parameters as above but L=8000
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2L8k -enrichment
# refGene:cds 1.301%, blastzDanRer2L8k 2.309%, both 0.778%, cover 59.81%, 
# enrich 25.91x
# enrichment went up but coverage dropped quite a bit.
# Default parameters with H=2000
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2Default -enrichment
# refGene:cds 1.301%, blastzDanRer2Default 1.701%, both 0.846%, cover 65.04%, 
# enrich 38.24x
# same as first run but with no Y option set (default Y)
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2NoY -enrichment
# refGene:cds 1.301%, blastzDanRer2NoY 3.980%, both 0.877%, cover 67.47%, 
# enrich 16.95x

# row count:
# danRer2  122160
# danRer2L8k 62815
# danRer2Default 75818
# danRer2NoY 124129
# can be pruned at the chaining step.
# trackDb - change Zebrafish Blastz to danRer1 Blastz and display this track
# for danRer2 as Zebrafish Blastz

# RESCORE DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
    # Low scores can occur with repeats abridged and using the
    # HoxD55.q matrix. PSU's restore_rpts program rescored alignments
    # with the default matrix instead of the BLASTZ_Q matrix.
    # Rescore them here so the chainer sees the higher scores:
                                                                                
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.danRer2
    mkdir axtChrom.rescore
    # Rescore every chromosome axt with the HoxD55.q matrix, writing the
    # result under the same file name ($f:t) in axtChrom.rescore.
    foreach f (axtChrom/chr*.axt)
        axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
        $f axtChrom.rescore/$f:t
    end
    # Keep the unrescored files as axtChrom.orig and let the rescored set
    # take over the axtChrom name that the later steps read from.
    mv axtChrom axtChrom.orig
    mv axtChrom.rescore axtChrom
                                                                                
#   psl files and blastz tables will be the same regardless of score so
#   no need to reload

# CHAIN DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# RELOAD CHAINS WITH FILTERING (DONE, 2004-12-10, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-22, hartera)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/hg17/bed/blastz.danRer2
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    # create input list
    ls -1S /cluster/data/hg17/bed/blastz.danRer2/axtChrom/*.axt \
        > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
                                                                                
    # Reuse gap penalties from hg16 vs chicken run.
    # The table is written as tab-separated rows, one record per line.
    # (Earlier text carried literal "^V" markers -- presumably terminal
    # echoes from typing ctrl-V TAB -- and hard-wrapped rows, neither of
    # which belongs in the file.)
    # NOTE(review): there is no tGap row here; confirm against the gap
    # file that was actually used whether one is required.
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
# doChain <in.axt> <out.chain> <log>
#   Runs axtChain on one axt file against the hg17 and danRer2 nib
#   directories, writing the chains to the second argument and sending
#   stdout and stderr to the third.
#   Every continuation backslash must be the last character on its line;
#   a trailing blank after it ends the command early.
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
    -linearGap=../../chickenHumanTuned.gap $1 \
    /iscratch/i/gs.18/build35/bothMaskedNibs \
    /iscratch/i/danRer2/nib $2 >& $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:       1837s      30.62m     0.51h    0.02d  0.000 y
# IO & Wait Time:                   441s       7.35m     0.12h    0.01d  0.000 y
# Average job time:                  51s       0.84m     0.01h    0.00d
# Longest job:                      106s       1.77m     0.03h    0.00d
# Submission to last job:           419s       6.98m     0.12h    0.00d

    # crashed job is chr6_hla_hap1 which has no alignments
    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
                                                                                
    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist5000.out
      textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
      echo ""
    end

    # apart from chr19 not too many with chains with scores < 5000
    # load chr1 chain into table and check
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chain
    hgLoadChain hg17 chr1_chainDanRer2 chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.301%, chainDanRer2Link 3.676%, both 0.877%, cover 67.42%,
# enrich 18.34x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2 -enrichment
# refGene:cds 1.301%, chainDanRer2 32.611%, both 1.034%, cover 79.52%, 
# enrich 2.44x
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chain
    chainSplit chainFilt5k all.chain

    # load chr1 filtered chains and check
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chainFilt5k
    hgLoadChain hg17 chr1_chainDanRer2Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5kLink -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5kLink 2.907%, both 0.870%, cover 66.86%,
# enrich 23.00x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5k -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5k 31.343%, both 1.028%, cover 79.02%, 
# enrich 2.52x
# checked browser - when filtered on minScore=5000, the low scoring 
# alignments removed are small and/or poor alignments so use this version.
    
    # remove repeats from filtered chains and reload into database
    # (2004-12-22, hartera)
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    mv chainFilt5k chainRaw
    mkdir chain
    cd chainRaw
    foreach f (*.chain)
       set c = $f:r
       echo $c
       nice chainAntiRepeat /cluster/bluearc/hg17/bothMaskedNibs \
                       /cluster/bluearc/danRer2/nib $f \
                       ../chain/$c.chain
    end
    cd ..
    chainMergeSort ./chain/*.chain > all.chain.antirepeat
    chainSplit chainAR all.chain.antirepeat
    # load filtered chains and check
    ssh hgwdev
    echo 'drop table chr1_chainDanRer2Filt5k;' | hgsql hg17
    echo 'drop table chr1_chainDanRer2Filt5kLink;' | hgsql hg17
    # reload filtered chains instead of unfiltered (2004-12-10, hartera)
    # reload filtered chains with repeats removed (2004-12-22, hartera)
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/
    cd chainAR
    # load each chain file in this directory into the table named
    # <file root>_chainDanRer2
    foreach chainFile (*.chain)
        set root = $chainFile:r
        hgLoadChain hg17 ${root}_chainDanRer2 $chainFile
        echo done $root
    end
# trackDb - change Zebrafish Chain to danRer1 Chain and display this track
# for danRer2 as Zebrafish Chain.
# after chainAntiRepeat
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.304%, chainDanRer2Link 2.742%, both 0.872%, cover 66.81%, 
# enrich 24.36x 

# NET DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# RE-CREATE NET WITH FILTERED CHAINS (DONE, 2004-12-10, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22, hartera) 
    ssh kksilo
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    rm -r preNet
    mkdir preNet
    cd chainAR
    foreach i (*.chain)
       echo preNetting $i
       /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                     ../preNet/$i
    end
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                 ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 133443584, utime 905 s/100, stime 139
                                                                                
    # Add classification info using db tables:
    # netClass looks for ancient repeats in one of the databases
    # hg17 has this table - hand-curated by Arian but this is for
    # human-rodent comparisons so do not use here, use -noAr option
    mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInHuman
    # linSpecRep.notInZebrafish exists for hg17 
    cp /iscratch/i/danRer2/linSpecRep.notInHuman/* \
       /cluster/bluearc/danRer2/linSpecRep.notInHuman
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    time netClass noClass.net hg17 danRer2 zfishdanRer2.net \
         -tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
         -qNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman -noAr
    # 97.230u 54.290s 5:37.50 44.8%   0+0k 0+0io 217pf+0w
    # load net into database
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    netFilter -minGap=10 zfishdanRer2.net |  hgLoadNet hg17 netDanRer2 stdin
# trackDb - change Zebrafish Net to danRer1 Net and display this track
# for danRer2 as Zebrafish Net.
# after chainAntiRepeat:
# featureBits hg17 refGene:cds netDanRer2 -enrichment
# refGene:cds 1.015%, netDanRer2 22.898%, both 0.783%, cover 77.15%, 
# enrich 3.37x

    # index had NULL cardinality, analyze table to fix (2005-1-18, Heather)
    # run as a one-shot statement so no interactive hgsql session is needed
    hgsql hg17 -e 'analyze table netDanRer2;'

# LOAD ACEMBLY TRACK (DONE, 2005-01-24, hartera)
# ACEMBLY TABLE RELOADED AND FINISHED COLOR CODING CODE IN 
# hgTracks (2005-01-28, hartera)
# FINISHED CODE FOR FILTERING BY GENE CLASS (2005-02-03, hartera)
    mkdir -p /cluster/data/hg17/bed/acembly
    cd /cluster/data/hg17/bed/acembly
    # Data is obtained from 
    # Danielle et Jean Thierry-Mieg     mieg@ncbi.nlm.nih.gov
 
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.proteins.fasta.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.gff.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.mrnas.fasta.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.pfamhits.tar.gz
    tar xvzf acembly.ncbi_35.genes.gff.tar.gz
    tar xvzf acembly.ncbi_35.genes.proteins.fasta.tar.gz
    cd acembly.ncbi_35.genes.gff
    # the acembly dataset for hg16 had problems with reverse blocks so
    # check for these
# checkReversedBlocks: for every x1*.gff file in the current directory,
# write one line per feature whose start (field 4) is greater than its end
# (field 5) to <file>.fixed.  An empty .fixed file means that input has no
# reversed blocks.
cat << '_EOF_' > checkReversedBlocks
#!/bin/sh
for i in x1*.gff
do
        echo -n "$i working ..."
        awk -F"\t" '
{
if ($4 > $5) {
    # pass the field as an argument: "$1" inside an awk string is literal text
    printf "reverse blocks problem for %s\n", $1
}
}
' "$i" > "$i.fixed"
        echo " done"
done
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x checkReversedBlocks
    ./checkReversedBlocks  
    ls -l *.fixed
    # all *.fixed files are empty so remove - there is no reversing of blocks
    rm *.fixed

    foreach f (x1.acemblygenes.*.gff)
      set c=$f:r:e
      egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
        perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
      if (-e ../../../$c/lift/random.lft) then
        liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
          ctg-chr${c}_random.gff
      endif
      grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
        grep -v "^chr//" > chr$c.gff
      echo "done $c"
    end
    
    #- Load into database - use extended genePred
    ssh hgwdev
    cd /cluster/data/hg17/bed/acembly
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
    # for entry with 28212470 from chr6.gff, change to chr6 
    # and for 29124352 in chr6.gff, change to chr6 (1/13/05)
    echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
         | hgsql hg17 
    echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
         | hgsql hg17 
    
    # checkTableCoords and runGeneCheck to check data
    # a number of errors so leave on hgwdev for the moment
# checkTableCoords:
# rah.acembly has 16 records with chrom not described in chromInfo.
# rah.acembly item RPL10A.sNov04 chr6:35544172-35546520: end of last block (35546519) is not the same as chromEnd (35546520).
# rah.acembly has 1 records with blockEnd[n-1] != end.
# rah.acembly has 1 records with end > chromSize.
# chr6    acembly exon    35545934        35546101        .       +       0  
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 5
# chr6    acembly intron  35546102        35546520        .       +       0
# gene_id RPL10A; transcript_id RPL10A.sNov04; intron_type fuzzy
# chr6    acembly CDS     35546335        35546384        .       +       0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6
# chr6    acembly exon    35546335        35546519        .       +       0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6
# chr6    acembly stop_codon      35546382        35546384        .       +
# 0       gene_id RPL10A; transcript_id RPL10A.sNov04;
# here the intron overlaps exon 6 so take 35546519 to be txEnd
      echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";'\
           | hgsql hg17
# for record where end > chromSize
# (the SQL is kept on a single line so that no shell continuation
#  character ends up inside the quoted statement)
      echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom and c.size < a.txEnd;' | hgsql hg17
      # KIR2DL5.bNov04 on chr19_random, chr19_random size is 301858, 
      # txEnd is 305266 delete this record
      echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# from runGeneCheck:
# 5780   inFrameStop
# 110664 noStart
# 23085 badCdsSplice
# 23848 noStop
# 14957 badUtrSplice
# 3661 gap
# 4726 badFrame
# 261066 lines in genePred.tab

    # e-mailed authors of data (2004-12-21, hartera) 
#  notiri.aNov04 - has ctg instead of atg at start. others have no start specified: sirora.nNov04
# sirora.zmNov04 - chr1:19389-19392 is AAC (gtt) (-)
# sirora.sNov04 - chr1:8925-8928 CAA (ttg) (-)
# sirora.rNov04 - chr1:8925-8928 CAA (ttg) (-)
    # for entries with 28212470 and 29124352 instead of chr6 change to chr6
    # Re-process this x1 file to chr6.gff (2005-01-24)
    mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
    sed -e "s/^28212470/6/" x1.acemblygenes.6.gff.broken | sed -e \
           "s/^29124352/6/"  > x1.acemblygenes.6.gff
    grep -v ^6\| x1.acemblygenes.6.gff | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
        grep -v "^chr//" > chr6.gff
    # Received a list of genes from Jean and Danielle Mieg
    # showing genes that are "main", "putative" or "cloud" - there should be
    # no "cloud" genes in our data set (2005-01-11)
    # download acembly_gene_lists.tar.gz from e-mail
    cd /cluster/data/hg17/bed/acembly
    tar xvzf acembly_gene_lists.tar.gz
    cd acembly_gene_lists
cat << '_EOF_' > getIDs.pl
#!/usr/bin/perl -w
# Read GFF lines on STDIN and print, one per line, the value that follows
# every attribute tag ending in "gene_id".
use strict;

while (<STDIN>) {
   my @f = split(/\s+/);
   # stop one short of the last field so that $f[$i+1] always exists
   for (my $i = 0; $i < $#f; $i++) {
       if ($f[$i] =~ /gene_id$/) {
          # if field is ID type then next value is the ID
          my $id = $f[$i+1];
          # remove only a trailing ";" so that an ID without the
          # terminator keeps its last character
          $id =~ s/;$//;
          print "$id\n";
       }
   }
}
'_EOF_'
    chmod +x getIDs.pl
    # get gene IDs from gff files
    foreach f (../acembly.ncbi_35.genes.gff/chr*.gff)
       echo "Processing $f"
       perl getIDs.pl < $f >> genesGffs.ids
    end
    # sort the gene IDs and remove duplicates
    sort genesGffs.ids | uniq > genesGffs.ids.uniq
    # reformat gene list to get just the genes and remove first 2 lines and sort
    foreach g (*.list)
       sed -e 's/"//g;' $g | sed -e 's/Gene : //;' | sed -e '1,2d' \
              | sort | uniq > $g.IDsort
    end 
    # remove back slash from some names
    perl -pi.bak -e 's/\\//' *.IDsort
    # check if cloud genes appear in gff files list of genes
    # list of genes in cloud but not in gff
    comm -13 genesGffs.ids.uniq cloud_gene.list.IDsort > gffvscloud.out
    diff gffvscloud.out cloud_gene.list.IDsort
    # there is no difference so none of the cloud genes are in the gff files
    # check if all the other genes in the main and putative lists are in gffs
    comm -13 genesGffs.ids.uniq main_gene.list.IDsort > gffvsmain.out
    comm -13 genesGffs.ids.uniq putative_gene.list.IDsort > gffvsputative.out
    wc -l *.out
    # 14 gffvsmain.out
    # 0 gffvsputative.out

    # there are 14 genes in the main set not in the gff files
    # actually there are 12, as FCA/MR and SLA/LP are in the gff files
    # all putative genes are in the gff set
    wc -l main_gene.list.IDsort putative_gene.list.IDsort
    # 52467 main_gene.list.IDsort
    # 43978 putative_gene.list.IDsort
    # 96445 total
    wc -l genesGffs.ids.uniq    
    # 97042 genesGffs.ids.uniq
    # check discrepancy
    cat main_gene.list.IDsort putative_gene.list.IDsort > mp.ids
    sort mp.ids > mp.sort
    comm -23 genesGffs.ids.uniq mp.sort > gffNotMP.out
    wc -l gffNotMP.out
    # 609 gffNotMP.out
# create table of Acembly gene classifications
# see http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/index.html?human
# in FAQ, describes main, putative and cloud genes. The cloud genes are not 
# well confirmed and so they are not in this data set. 
    # NEED TO FILTER GENES AND RELOAD TABLES:
    # authors Jean and Danielle Mieg e-mailed back. The 12 genes in the
    # main list that are not in the gff files were not exported
    # as they did not find a single putative protein to describe so they
    # were not added to the gffs. They will be added at a later date.
    # Remove these from the acemblyClass table (2005-01-21, hartera)
    # Reload acemblyClass table as problems with the gene names
    # the class table has gene IDs and the acembly table has transcript IDs
    # it is hard to look up class in the class table since just removing the 
    # transcript ID suffixes (e.g. "aNov04" after a ".") does not work as 
    # some gene IDs have a "." in them anyway.

    ssh kksilo
    cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
    comm -13 gffvsmain.out main_gene.list.IDsort > main_gene.list.filt
    wc -l main_gene.list.filt
    # 52455 main_gene.list.filt
    ssh hgwdev
    cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
    # drop acemblyClass table and recreate (2005-01-27, hartera)
    echo 'drop table acemblyClass;' | hgsql hg17
    # prepare a file of genes and classification
    # use transcript IDs - get these and corresponding gene IDs from gff files
    # if gene IDs were used, they would have to be parsed out of the
    # transcript ID (name column) of the acembly genePred table, which is
    # difficult: e.g. notiri.aNov04 is a transcript ID, so the suffix after
    # "." can be removed to obtain the gene ID, but some gene names have "."
    # in them and not all transcript IDs have the suffix.
    # 260446 transcript IDs (use allFiltered.gff - see below) 
    # NOTE(review): getClass.pl is not recorded in this section; presumably
    # it writes classes.txt (gene ID, transcript ID, class) -- confirm.
    perl getClass.pl main_gene.list.filt putative_gene.list.IDsort \
         ../acembly.ncbi_35.genes.gff/allFiltered.gff 
    # tag each ID in the two lists with its class ("main" or "putative")
    # and append the pairs to class.txt
    foreach f (main_gene.list.filt putative_gene.list.IDsort)
       if ($f == "main_gene.list.filt") then
          set t = "main"
       endif
       if ($f == "putative_gene.list.IDsort") then
          set t = "putative"
       endif
       awk 'BEGIN {OFS="\t"} {print $1, "'$t'"}' $f >> class.txt
    end
    # NOTE(review): the loop above appends to class.txt but the sort below
    # reads classes.txt -- confirm which file is intended.
    sort classes.txt | uniq > geneIDtxID.class
    # get transcript ID and class fields for acemblyClass table
    awk 'BEGIN {OFS="\t"} {print $2,$3}' geneIDtxID.class > acemblyClass.tab
    wc -l acemblyClass.tab
    # 260446 acemblyClass.tab
    # make change to acemblyClass.as and check in:
    # change name to be transcript ID instead of gene ID
cat << '_EOF_' > $HOME/kent/src/hg/lib/acemblyClass.as
table acemblyClass
"Class for Acembly genes"
   (
    string name; "Transcript ID for Acembly gene"
    string class; "Class of gene"   
   )
'_EOF_'
    cd $HOME/kent/src/hg/lib/
    autoSql acemblyClass.as acemblyClass
    mv acemblyClass.h $HOME/kent/src/hg/inc
    # do make to check it works and commit the .as, .sql, .c and .h files to CVS
    cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
    echo "drop table acemblyClass" | hgsql hg17
    hgsql hg17 < ~/kent/src/hg/lib/acemblyClass.sql
    # reload table with transcript IDs 
    echo "load data local infile 'acemblyClass.tab' into table acemblyClass" \
         | hgsql hg17
    
    # There were also 609 genes in the gff files that are not in the
    # main, putative or cloud gene lists. Jean and Danielle Mieg say that
    # these were filtered out from their data set but not from the gff files.
    # Remove these from the gff files. (gffNotMP.out) (2005-01-24)
    cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
    cat chr*.gff > all.gff
cat << '_EOF_' > removeGenes.pl
#!/usr/bin/perl -w
# Usage: removeGenes.pl <geneIdList> <gff>
# Copies to STDOUT every line of <gff> whose gene_id is NOT in <geneIdList>
# (one ID per line); lines whose gene_id IS listed are written to
# removed.out instead.  Lines carrying no gene_id are passed through.
use strict;

die "usage: removeGenes.pl <geneIdList> <gff>\n" unless @ARGV == 2;
my $genes = $ARGV[0];
my $gff = $ARGV[1];

open(GENES, $genes) || die "Can not open $genes:$!\n";
open(GFF, $gff) || die "Can not open $gff:$!\n";
open(OUT, ">removed.out") || die "Can not open removed.out:$!\n";

my %genes;

while (<GENES>) {
   chomp;
   $genes{$_} = 1;
}
close GENES;

while (<GFF>) {
   my $l = $_;
   my $id;
   my @line = split(/\s+/);
   # stop one short of the last field so that $line[$i+1] always exists
   for (my $i = 0; $i < $#line; $i++) {
      if ($line[$i] eq "gene_id") {
         $id = $line[$i+1];
      }
   }
   # strip the ";" terminator from the attribute value
   $id =~ s/;// if defined($id);
   # STDOUT is redirected into the filtered GFF, so only GFF lines may be
   # printed to it (no progress or debug messages)
   if (defined($id) && exists($genes{$id})) {
      print OUT $l;
   }
   else {
      print $l;
   }
}
close GFF;
close OUT;
'_EOF_'
    perl removeGenes.pl ../acembly_gene_lists/gffNotMP.out all.gff \
                        > allFiltered.gff 
    # checked that gene IDs in the removed.out file are the same
    # same as those in gffNotMP.out
    # reload into the acembly table
    ssh hgwdev
    cd /cluster/data/hg17/bed/acembly
    echo 'drop table acembly;' | hgsql hg17
    # Reloaded with filtered set 2005-01-23, reload again 2005-01-28 with
    # the genePredExt option to get gene ID in name 2 field
    ldHgGene -gtf -genePredExt hg17 acembly \
             acembly.ncbi_35.genes.gff/allFiltered.gff
    # Read 260446 transcripts in 3656676 lines in 1 files
    #  260446 groups 41 seqs 1 sources 5 feature types
    #  260446 gene predictions
    # remove cdsStartStat, cdsEndStat and exonFrames fields
    echo 'alter table acembly drop column cdsStartStat;' | hgsql hg17
    echo 'alter table acembly drop column cdsEndStat;' | hgsql hg17
    echo 'alter table acembly drop column exonFrames;' | hgsql hg17

    # fix problem data found by checkTableCoords
    # here the intron overlaps exon 6 so take 35546519 to be txEnd
    echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";'\
           | hgsql hg17
    # for record where end > chromSize
    # (the SQL is kept on a single line so that no shell continuation
    #  character ends up inside the quoted statement)
    echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom and c.size < a.txEnd;' | hgsql hg17
    # KIR2DL5.bNov04 on chr19_random, size is 301858, txEnd is 305266
    # delete this record
    echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
    # acembly peptide table
    # need to just grab same sequences that are in acembly
    cd ./acembly.ncbi_35.genes.proteins.fasta
    echo 'select name from acembly;' | hgsql -N hg17 > acembly.name
    cat *.fasta > allPep.fa
    faSomeRecords allPep.fa acembly.name acemblyPep.fa
    # PEPTIDE SEQUENCES NOT LOADED
    # There are 236,554 peptide names that do not match transcript IDs in
    # the acembly table and 110,278 transcript IDs in acembly that do not
    # have a corresponding peptide. Waiting for response about this from
    # Jean and Danielle (2005-01-31)  
    # hgPepPred hg17 generic acemblyPep \
    #         acembly.ncbi_35.genes.proteins.fasta/*.fasta

    # Edit hgTracks.c to get colour coded tracks based on the gene class
    # for each gene as read from the acemblyClass table.
    # Edits to hui.c, hgTrackUi.c and hgTracks.c to allow filtering of 
    # genes based on class.
# acembly trackDb entry:
# track acembly
# shortLabel Acembly Genes
# longLabel AceView Gene Models With Alt-Splicing
# group genes
# priority 41
# visibility dense
# color 155,0,125
# type genePred acemblyPep acemblyMrna
# url http://www.ncbi.nih.gov/IEB/Research/Acembly/av.cgi?db=hg17&l=$$
# itemClassTbl acemblyClass
# geneClasses main putative
# gClass_main 128,0,125
# gClass_putative 200,0,125
# urlLabel Transcript ID:

# search added:
# searchTable acembly
# searchType genePred
# searchMethod prefix
# termRegex [^[:space:]]+
# searchPriority 50

    # Received data with gene product relationship from Jean Thierry-Mieg
    # (2005-02-17)
    ssh eieio
    cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.proteins.fasta
    wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/human/acembly.ncbi_35.gene2product.txt.gz
    gunzip acembly.ncbi_35.gene2product.txt.gz
    # these are gene ID and product mappings, need transcript ID to product
    # mappings. E-mailed Jean Thierry-Mieg to ask for this information
    
# BUILD WGRNA TRACK (DONE, 2004-12-13, Fan)

# Grab data from original miRNA track and convert them into wgRna .tab format.
# Columns 2-9 of the miRNA dump are kept (column 1 is dropped) and the RNA
# type is appended as the last column.  The awk programs are fully quoted,
# braces included, so they do not depend on the shell's brace handling.
    hgsql hg17 --skip-column-names -e 'select * from miRNA' >miRNA.out
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$5,$6,$7,$8,$9,"miRna"}' miRNA.out >wgRna.tab

# Break the original custom track data file, hsa-snoRNA_track.txt, into two files j1 and j2, 
# then remove header and blank lines.
# The first six columns are kept, followed by two zero columns and the type.
    awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5,$6,0,0,"CDBox"}' j1 >>wgRna.tab
    awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5,$6,0,0,"HAcaBox"}' j2 >>wgRna.tab

# load into wgRna table
    hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# create and edit wgRna.html under src/hg/makeDb/trackDb/human/hg17.

# RELOADED wgRna DATA USING wgRNA_corrected.txt SENT BY MICHEL WEBER
# Manually removed the first header line and the first column of the bin field and removed 
# the last empty line.

    cut -f 2- wgRNA_corrected.txt >wgRna.tab
    vi wgRna.tab
    hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab


# UPDATED WGRNA DATA PER EMAIL FROM WEBER (2004-12-14, Fan).

# Added the following 3 lines to j1:
#   chr3 161715396 161715726 U90 480 -
#   chr11 93104316 93104387 Z40 480 -
#   chr11 93106041 93106114 mgh28S-2410 480 -
    
# Regenerated wgRna table
# miRNA.out: keep columns 2-9 and append the type; j1/j2: keep the first six
# columns, add two zero columns and the type.  The awk programs are fully
# quoted, braces included, so they do not depend on the shell's brace
# handling.
    awk 'BEGIN {OFS="\t"} {print $2,$3,$4,$5,$6,$7,$8,$9,"miRna"}' miRNA.out >wgRna.tab
    awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5,$6,0,0,"CDBox"}' j1 >>wgRna.tab
    awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5,$6,0,0,"HAcaBox"}' j2 >>wgRna.tab
    hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab

# Changed the following records to RNA type scaRna.

# Set type to scaRna for the 18 names below.  Each name is listed once,
# grouped into three statements by name family.
hgsql hg17 -e 'update wgRna set type = "scaRna" where name in ("U85", "U87", "U88", "U89", "U90", "U91", "U92", "U93", "U100")'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name in ("ACA26", "ACA35", "ACA45", "ACA47", "ACA57")'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name in ("HBII-382", "mgU2-19/30", "mgU2-25/61", "mgU2-22/U4-8")'

# Updated .../trackDb/human/hg17/wgRna.html.

# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChrom
    set gp = /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p $gp/vsDanRer2/axtChrom
    cp -p *.axt $gp/vsDanRer2/axtChrom
    cd $gp/vsDanRer2/axtChrom
    gzip *.axt
    md5sum *.gz > md5sum.txt
                                                                                
    # copy chains and nets to downloads area
    # re-make chains and net downloadables (2004-12-22, hartera)
    rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
    cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
    gzip -c all.chain.antirepeat > \
            /cluster/data/hg17/zip/zebrafishDanRer2.chain.gz
    gzip -c zfishdanRer2.net > /cluster/data/hg17/zip/zebrafishDanRer2.net.gz
    cd $gp/vsDanRer2
    mv /cluster/data/hg17/zip/zebrafish*.gz .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.
                                                                                
# CLEANUP DANRER2 BLASTZ (DONE, 2004-12-14, hartera)
# RE-DONE (DONE, 2004-12-22, hartera)
# REMOVED RAW AND LAV DIRS (DONE, 2005-02-24, hartera)
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.danRer2
    nice rm axtChain/run1/chain/* &
    # run in the foreground: the gzip below globs axtChain/*.net, which must
    # not pick up noClass.net while it is still being deleted
    nice rm -fr axtChain/n1 axtChain/noClass.net
    nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
    nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
    nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
    nice rm -rf raw &
    nice rm -rf lav &

# EXTRACT AXT'S AND MAF'S FROM TETRAODON (tetNig1) NET 
# (DONE, 2004-12-15, hartera) 
# Redo to remove overlaps (2006-04-07 kate)
    ssh eieio
    # create axts
    cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
    netSplit tetNig1.net tetraodonNet
    mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
    foreach f (tetraodonNet/chr*.net)
        set c = $f:t:r
        echo "axtNet on $c"
        netToAxt tetraodonNet/$c.net chain/$c.chain \
        /cluster/data/hg17/nib /cluster/data/tetNig1/nib ../axtNet/$c.axt
        echo "Complete: $c.net -> $c.axt"
        end
'EOF'

    chmod +x axtNet.csh
    csh axtNet.csh >&! axtNet.log &
    tail -100f axtNet.log

    # sort axts before making mafs - must be sorted for multiz
    cd /cluster/data/hg17/bed/blastz.tetNig1
    mv axtNet axtNet.unsorted
    mkdir axtNet
    foreach f (axtNet.unsorted/*.axt)
        set c = $f:t:r
        echo "Sorting $c"
        axtSort $f axtNet/$c.axt
    end

    # create maf
    ssh eieio
    cd /cluster/data/hg17/bed/blastz.tetNig1
    cd axtNet
    mkdir ../mafNet

cat > makeMaf.csh << 'EOF'
    foreach f (chr*.axt)
      set maf = $f:t:r.tetNig1.maf
      echo translating $f to $maf
      axtToMaf $f \
            /cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
            ../mafNet/$maf -tPrefix=hg17. -qPrefix=tetNig1.
    end
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log

    # the working directory is still axtNet at this point; the paths below
    # are relative to the blastz.tetNig1 directory, so go back there first
    cd /cluster/data/hg17/bed/blastz.tetNig1
    nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &

    # redo axt's and maf's to remove overlaps (2006-04-07 kate)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.tetNig1
    mv axtNet axtNet.old
    mv mafNet mafNet.old
    mkdir -p axtNet mafNet
    cd axtChain

cat > fix.csh << 'EOF'
    date
    foreach f (tetraodonNet/chr*.net)
        set c = $f:t:r
        echo $c
        netToAxt tetraodonNet/$c.net chain/$c.chain \
            /cluster/data/hg17/nib /cluster/data/tetNig1/nib stdout | \
                axtSort stdin ../axtNet/$c.axt
        echo "Complete: $c.net -> $c.axt"
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=tetNig1.
    end
    date
'EOF'
    csh fix.csh >&! fix.log &
    cd /san/sanvol1/scratch/hg17/mafNet
    rm -fr tetNig1
    cp -rp /cluster/data/hg17/bed/blastz.tetNig1/mafNet tetNig1


# 10-WAY MULTIZ -- 8-WAY PLUS FROG AND TETRA (DONE 2004-12-22 kate)
#       Use older multiz (not v10) till bugs fixed

    ssh eieio
    cd /cluster/data/hg17/bed
    rm multiz10way
    mkdir multiz.2004-12-22
    ln -s multiz.2004-12-22 multiz10way
    cd multiz10way

    cat > tree.nh << 'EOF'
((((((hg17,panTro1),(rn3,mm5)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))
'EOF'

    mkdir /cluster/bluearc/hg17/multiz.2004-12-22
    cd /cluster/bluearc/hg17
    mkdir 2004-12-22
    rm multiz10way
    ln -s multiz.2004-12-17 multiz10way.v10
    ln -s multiz.2004-12-22 multiz10way
    # reuse pairwise MAF's on bluearc
    mv multiz10way.v10/{canFam1,danRer1,fr1,galGal2,mm5,panTro1,rn3,tetNig1,xenTro1} multiz10way
    # NOTE: pairwise mafs were moved to /cluster/bluearc/hg17/mafNet

    # make output dir and run dir
    ssh kk9
    cd /cluster/data/hg17/bed
    cd multiz10way
    mkdir -p maf
    mkdir -p run
    cd run

    # create scripts to run multiz on cluster
# oneMultiz.csh <chrom> [<species1> <species2>]
#   Called with only <chrom>, it removes the scratch directory
#   /scratch/$user/multiz10way.<chrom> and exits.
#   Otherwise, for each of the two names it takes <chrom>.maf from
#   /cluster/bluearc/hg17/multiz10way/<name> when that directory exists,
#   else from the scratch directory, and writes into
#   <scratch>/<species2><species1>/: multiz output as <chrom>.maf when both
#   inputs are non-empty, or a copy of whichever single input is non-empty.
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
    set c = $1
    set multi = /scratch/$user/multiz10way.$c
    set pairs = /cluster/bluearc/hg17/multiz10way

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        rm -fr $multi
        exit
    endif

    set s1 = $2
    set s2 = $3

    # locate input files -- in pairwise dir, or multiple dir
    set d1 = $multi
    set d2 = $multi
    if (-d $pairs/$s1) then
        set d1 = $pairs
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
    endif
    set f1 = $d1/$s1/$c.maf
    set f2 = $d2/$s2/$c.maf
    # write to output dir
    set out = $multi/${s2}${s1}
    mkdir -p $out

    # check for empty input file
    if (-s $f1 && -s $f2) then
        echo "Aligning $f1 $f2"
        /cluster/bin/penn/multiz $f1 $f2 - > $out/$c.maf
    else if (-s $f1) then
        cp $f1 $out
    else if (-s $f2) then
        cp $f2 $out
    endif
'EOF'
# << for emacs
chmod +x oneMultiz.csh

# allMultiz.csh <chrom>
#   Runs oneMultiz.csh eight times; each step names one more species plus
#   the output directory of the previous step, so the alignment accumulates
#   one species at a time.  The final <chrom>.maf is then copied to
#   /cluster/data/hg17/bed/multiz10way/maf and oneMultiz.csh is called with
#   only <chrom> to remove the scratch directory.
#   NOTE(review): oneMultiz.csh is invoked without a path -- presumably the
#   job's PATH or working directory makes it resolvable; confirm.
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe

set c = $1
oneMultiz.csh $c mm5 panTro1
oneMultiz.csh $c rn3 panTro1mm5 
oneMultiz.csh $c canFam1 panTro1mm5rn3 
oneMultiz.csh $c galGal2 panTro1mm5rn3canFam1
oneMultiz.csh $c xenTro1 panTro1mm5rn3canFam1galGal2
oneMultiz.csh $c fr1 panTro1mm5rn3canFam1galGal2xenTro1
oneMultiz.csh $c tetNig1 panTro1mm5rn3canFam1galGal2xenTro1fr1
oneMultiz.csh $c danRer1 panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1
# get final alignment file
cp /scratch/$user/multiz10way.$c/panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1danRer1/$c.maf /cluster/data/hg17/bed/multiz10way/maf/$c.maf
#cleanup
oneMultiz.csh $c
'EOF'
# << for emacs
chmod +x allMultiz.csh

cat > gsub << 'EOF'
#LOOP
allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz10way/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
    cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst
    gensub2 chrom.lst single gsub jobList
    para create jobList
    para try; para check
    para push

    # post-process multiz maf with maf_project to "glue" short
    # alignment blocks together
    ssh eieio
    cd /cluster/data/hg17/bed/multiz10way.v8
    mkdir -p mafGlued
    cd maf
    foreach f (*.maf)
        set c = $f:r
        echo "gluing $f"
        /cluster/bin/penn/maf_project $f hg17.$c > ../mafGlued/$c.maf
    end
    # filter out alignment blocks with no alignments in non-reference species,
    # and low-scoring alignments based on Webb Miller's latest
    # recommendations (score < -5 * ncol^2 * nrow)
    # NOTE: Webb hasn't approved the filtered alignments yet,
    # so leaving them in for now.
    #mkdir -p mafFiltered
    #cd ../mafGlued
    #foreach f (*.maf)
        #set c = $f:r
        #echo "filtering $f"
        #~kate/bin/i386/mafFilter -factor $f > ../mafFiltered/$c.maf
    #end
    #cd ..

    grep score mafGlued/chr1.maf | wc -l
    grep score mafFiltered/chr1.maf | wc -l
    grep score mafGlued/bad | wc -l
        # 43692
    grep score=0.0 bad | wc -l
        # 10206
    # load alignments into tables
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz10way.v8
    set mafDir = /gbdb/hg17/mafNet
    mkdir -p $mafDir

    # multiple alignment
    set mafDir = /gbdb/hg17/multiz10way/maf
    mkdir -p $mafDir/multiz10way
    cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued
    ln -s `pwd`/*.maf $mafDir/multiz10way
    hgLoadMaf hg17 -warn multiz10way -pathPrefix=$mafDir/multiz10way

    # load summary table to replace pairwise
    cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued/
    time cat chr*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 hg17 multiz10waySummary stdin
    # Processed 27314693 components in 9081437 mafs from stdin
    # 30 minutes


# CONSERVATION SCORING WITH PHASTCONS (DONE 2005-01-14 kate)
# 1. Partition multiple alignment into windows, using "msa_split"
# 2. Create starting tree model, with branch lengths
#       use "phyloFit" on alignments
# 3. Estimate GC avg. over all species, use "msa_view" on maf
# 4. Estimate other model params, using phastCons (via doEstimate script)

    # NOTE: no alignment filtering done -- the scores don't look
    # particularly meaningful w/ this version of multiz.
    # Next time, run on "glued" (maf_projected) 

    ssh eieio
    cd /cluster/data/hg17/bed/multiz10way.v8
    set mafDir = /cluster/bluearc/hg17/multiz10way.v8/maf
    mkdir -p $mafDir
    cp -r maf/*.maf $mafDir

    ssh kk9
    cd /cluster/data/hg17/bed/multiz10way.v8
    mkdir cons
    cd cons

    # break up the genome-wide MAFs into pieces
    # NOTE: chrom fasta files are already on the bluearc
    # from previous run
    mkdir /cluster/bluearc/hg17/chrom
    cd /cluster/data/hg17
    foreach f (`cat chrom.lst`)
        echo $f
        cp -r $f/*.fa /cluster/bluearc/hg17/chrom
    end

    cd /cluster/data/hg17/bed/multiz10way.v8/cons
    mkdir run.split
    cd run.split
    set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/sh
# Usage: doSplit.sh /path/to/<chrom>.maf
# Runs msa_split on one chromosome's MAF, writing SS pieces to local
# /scratch/msa_split, then gzips each piece into $WINDOWS and appends
# "Done" to $WINDOWS/<chrom>.done (the file the batch "check out" tests).
# Exits 1 as soon as msa_split, the cd, or any single gzip fails.

PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg17/chrom
WINDOWS=/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS

maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,xenTro1,fr1,tetNig1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000 || exit 1
echo "Copying..."
cd /scratch/msa_split || exit 1
# check every gzip individually: testing $? after the loop would only
# reflect the last file, hiding failures (e.g. disk full) on earlier ones
for file in $c.*.ss ; do
    gzip -c $file > ${WINDOWS}/$file.gz || exit 1
done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs

    set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
    chmod +x doSplit.sh
    rm -f jobList
    foreach file (/cluster/bluearc/hg17/multiz10way.v8/maf/*.maf) 
        set c = $file:t:r
	echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
    end
    
    para create jobList
        # 46 jobs
    para try
    para check

    # TODO: cleanup
    # rm -fr $mafDir
    
    # now generate conservation scores and predicted elements
    set path = ($path /cluster/bin/phast); rehash
    cd /cluster/data/hg17/bed/multiz10way.v8/cons
    mkdir run.elements   
    cd run.elements

    # create a starting tree model from one of the chr1 ss files in WINDOWS dir.
    ssh kolossus
    cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
    gunzip -c /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/chr1.14996059-15998256.ss.gz \
                > /tmp/phastCons.$$
     phyloFit -i SS /tmp/phastCons.$$ --out-root starting-tree --tree \
        "((((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))" 
    rm /tmp/phastCons.$$
    cat starting-tree.mod
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -2635749.517410
#BACKGROUND: 0.247225 0.248374 0.250827 0.253574 
#RATE_MAT:
#-0.997890    0.201447    0.648573    0.147870 
#0.200515   -1.020796    0.190184    0.630096 
#0.639258    0.188324   -1.025170    0.197587 
#0.144168    0.617176    0.195447   -0.956791 
#TREE: ((((((hg17:0.006401,panTro1:0.008342):0.099376,(mm5:0.083404,rn3:0.105411):0.242694):0.020883,canFam1:0.221922):0.099131,galGal2:0.275759):0.041997,xenTro1:0.280306):0.064815,((fr1:0.137674,tetNig1:0.091463):0.118573,danRer1:0.250847):0.064815);

    # estimate model parameters
    # estimate avg. cross-species avg. GC content from chr1 maf's
    ssh kolossus
    set path = ($path /cluster/bin/phast); rehash
    cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
    msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,xenTro1,danRer1,tetNig1,fr1 \
        -i MAF \
        --summary-only /cluster/data/hg17/bed/multiz10way.v8/maf/chr1.maf\
            > maf_summary.txt
    awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
       # 0.424

    # generate models from random sample of genome (use 90 1Mb windows,
    # to conveniently run on rack 9 100-node cluster)
    # On first pass, used parameters from 8way alignment:
    #   --expected-lengths 12 --target-coverage .17
    # NOTE: there may be a cleverer way to select the first length param
    # On second pass, used parameters below, based on consEntropy
    # and featureBits coverage of elements, below
    cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.424 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 11 --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'EOF'
    chmod u+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    ls /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.gz > all.windows
    /cluster/bin/phast/chooseLines -k 90 all.windows > subset.windows
    rm -f jobs.lst
    # Build one doEstimate.sh job per sampled window.
    # subset.windows holds absolute paths (all.windows was listed with an
    # absolute glob), so $f is passed as-is; prefixing the WINDOWS directory
    # again would produce a doubled, nonexistent path.
    foreach f (`cat subset.windows`)
        # strip directory and the .ss.gz suffix to name the log/tree outputs
        set root = $f:t:r:r
	echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst
    end
    # run cluster job (about an hour)
    ssh kk9
    cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
    para create jobs.lst
       # 90 jobs written to batch
    para try; para check
    para push
# 2 jobs crashed with out-of-mem; as we are just taking a sample
# this is probably OK, but I've notified Adam
# Average job time:                1055s      17.58m     0.29h    0.01d
# Longest job:                     3647s      60.78m     1.01h    0.04d

    # NOTE: should have used ave.noncons.mod to improve parameter estimation
    # cp ave.noncons.mod starting-tree.mod

    ls TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
    grep TREE ave.cons.mod
# TREE: ((((((hg17:0.002313,panTro1:0.002931):0.036375,(mm5:0.029849,rn3:0.039008):0.095334):0.003258,canFam1:0.078205):0.047189,galGal2:0.158045):0.020103,xenTro1:0.169387):0.028857,((fr1:0.071610,tetNig1:0.057766):0.091165,danRer1:0.138905):0.028857);
    ls TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave.noncons.mod
# TREE: ((((((hg17:0.007342,panTro1:0.009340):0.116009,(mm5:0.095037,rn3:0.124288):0.304355):0.010633,canFam1:0.249367):0.151476,galGal2:0.507037):0.064317,xenTro1:0.549121):0.094733,((fr1:0.231246,tetNig1:0.185161):0.296288,danRer1:0.446734):0.094733);


    # analyze conservation genome-wide
    cat << 'EOF' > doPhastCons.sh
#!/bin/sh
# Usage: doPhastCons.sh /path/to/<window>.ss.gz
# Runs phastCons on one window using ave.cons.mod,ave.noncons.mod from the
# current directory.  The --viterbi output goes to ELEMENTS/<window>.bed and
# phastCons' stdout is gzipped into POSTPROBS/<window>.pp.gz.
# Exits non-zero when phastCons or gzip fails; previously the exit status
# was that of the final rm, so a failed job looked successful.

mkdir -p /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
# chrom name is the part of the window name before the first '.'
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 11 --target-coverage 0.20 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
if [ $? -ne 0 ]; then
    rm -f $tmpfile
    exit 1
fi
gzip -c $tmpfile > /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS/$pref.pp.gz
status=$?
rm -f $tmpfile
exit $status
'EOF'
    chmod u+x doPhastCons.sh

    rm -fr /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS
    rm -f jobs2.lst
    foreach f (/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.ss.gz)
        echo doPhastCons.sh $f >> jobs2.lst
    end

    # run cluster job (it's quick -- 10 minutes or so)
    ssh kk
    cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
    para create jobs2.lst
        # 2932 jobs written to batch
    para try; para check
    para push
# Average job time:                  80s       1.33m     0.02h    0.00d
# Longest job:                      157s       2.62m     0.04h    0.00d
# Submission to last job:           583s       9.72m     0.16h    0.01d

    # combine predictions and transform scores to be in 0-1000 interval
    # do in a way that avoids limits on numbers of args
    rm -f splitfiles* all.raw.bed
    find /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS -name "*.bed" > files
    split files splitfiles
    foreach s (splitfiles*)
       awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed
    end
    /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
    rm files splitfiles* 

    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
    hgLoadBed hg17 phastConsElements10way all.bed
    sort -rn -k 5 all.bed | sed -n '1,100000p' > top100K.bed
    hgLoadBed hg17 phastConsElements10wayTop100K top100K.bed

    # check coverage -- reran estimation and conservation steps with new parameters till
    # coverage close to 5% and expected-length parameter is close to consEntropy recommended length
    
    featureBits hg17 phastConsElements10way
        # first pass
        #       .17 12
        # 132657993 bases of 2866216770 (4.628%) in intersection
        # second pass -- used this
        #       .20 11
        # 143386170 bases of 2866216770 (5.003%) in intersection
    featureBits hg17 phastConsElements
        # 137850739 bases of 2866216770 (4.810%) in intersection

    # check expected-length parameter
    # first pass
    /cluster/bin/phast/consEntropy .17 12 \
                        ave.cons.mod ave.noncons.mod --NH 9.78
        # recommended length 10.4
    # second pass -- good enough according to Adam
    /cluster/bin/phast/consEntropy .20 11 \
                        ave.cons.mod ave.noncons.mod --NH 9.78
        #( Solving for new omega: 11.000000 12.243251 12.155776 12.155369 )
        #Transition parameters: gamma=0.200000, omega=11.000000, mu=0.090909, nu=0.022727
        #Relative entropy: H=1.263205 bits/site
        #Required length: N=7.548911 sites
        #Total entropy: NH=9.535821 bits
        #Recommended expected length: omega=12.155369 sites (for NH=9.780000)
 
    # create wiggle data files
    ssh eieio
    cd /cluster/data/hg17/bed/multiz10way.v8/cons
    # sort post-prob files by chrom position using filename, then 
    #  use wigEncode to create binary files for wiggle
    # NOTE: sort sees the full path, and the directory name "multiz10way.v8"
    # contains a '.', so with -t. the window start coordinate is field 3
    # (field 2 is "v8/phastCons/POSTPROBS/chrN", which has no leading number
    # and therefore compares equal for every file under -n).
    find /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS \
        -name "*.pp.gz" | sort -t\. -k3,3n | xargs zcat | \
        wigEncode stdin phastCons10way.wig phastCons10way.wib

    hgWiggle -db=hg17 -doHistogram -hBinSize=0.001 \
        -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons >histo.8way.data
    hgWiggle -db=hg17 -doHistogram -hBinSize=0.001 \
        -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons10way >histo.10way.data
    hgWiggle -db=hg17 -doStats \
         phastCons > stats.8way.data
    hgWiggle -db=hg17 -doStats \
         phastCons10way > stats.10way.data

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz10way.v8/cons
    set wibDir = /gbdb/hg17/multiz10way/wib/phastCons10way
    mkdir -p $wibDir
    ln -s `pwd`/phastCons10way.wib $wibDir
    hgLoadWiggle hg17 phastCons10way phastCons10way.wig \
                -pathPrefix=$wibDir

    # create tree image:
        # edit tree.nh to create species.nh with common names
        /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
        # photoshop to enhance, then save as gif/jpg
        cp /cluster/data/hg17/bed/multiz10way.v8/species10.jpg \
            /usr/local/apache/htdocs/images/phylo/10way.jpg

    # get stats on the track
    ssh hgwdev
        
    featureBits hg17 -enrichment refGene:cds phastConsElements10way
        # refGene:cds 1.020%, phastConsElements10way 5.003%, both 0.711%, cover 69.73%, enrich 13.94x
        # compare to previous elements (generated from 8way)
    featureBits hg17 -enrichment refGene:cds phastConsElements
        # refGene:cds 1.020%, phastConsElements 4.810%, both 0.747%, cover 73.22%, enrich 15.22x

    # see how gluing reduces number of alignments
    ssh eieio
    cd /cluster/data/hg17/bed/multiz10way.v8
    mkdir stats
    grep score maf/chr22.maf | grep -v 0.0 | wc -l
        #179576
    grep score mafGlued/chr22.maf | grep -v 0.0 | wc -l
        #110550

    # look at distribution of alignment sizes after gluing
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz10way.v8
    mkdir mafTemp
    ln -s `pwd`/maf/chr1.maf mafTemp
    # load temp table
    hgLoadMaf hg17 -pathPrefix=mafTemp multiz10wayChr1
        #Loaded 1246727 mafs
    # again, compare to glued (restricted to chr1 to match the temp table,
    # which was loaded from maf/chr1.maf only); the query has to be piped
    # to hgsql -- echo alone just prints the SQL text
    echo "SELECT COUNT(*) FROM multiz10way WHERE chrom='chr1'" | hgsql -N hg17
        # 738030
        # again, ~40% fewer

    cd stats
    echo "SELECT chromEnd - chromStart FROM multiz10way WHERE chrom='chr1'" | \
        hgsql -N hg17 | sort -n > chr1.maf.glued.sizes
    echo "SELECT chromEnd - chromStart FROM multiz10wayChr1"| \
        hgsql -N hg17 | sort -n > chr1.maf.sizes
    # cleanup
    hgsql hg17 -e "DROP TABLE multiz10wayChr1"
    rm -fr ../mafTemp

    # coverage of multiple alignment, and pairs
    ssh kolossus
    cd /cluster/data/hg17/bed/multiz10way.v8
    cd stats
    nice mafRanges -notAllOGap ../mafGlued/chr1.maf hg17 \
                hg17.chr1.mafRanges.bed
    nice mafRanges -notAllOGap /cluster/data/hg17/bed/multiz8way/maf/chr1.maf \
                hg17 hg17.8way.chr1.mafRanges.bed
    foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1)
        echo $db
        nice mafRanges /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf \
               -notAllOGap hg17 $db.chr1.mafRanges.bed
        ls /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf
    end

    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz10way.v8/stats
    nice featureBits -chrom=chr1 hg17 refGene:cds hg17.chr1.mafRanges.bed -enrichment
# refGene:cds 1.308%, hg17.chr1.mafRanges.bed 95.725%, both 1.307%, cover 99.94%, enrich 1.04x
    nice featureBits -chrom=chr1 hg17 refGene:cds hg17.8way.chr1.mafRanges.bed -enrichment
# refGene:cds 1.308%, hg17.8way.chr1.mafRanges.bed 95.742%, both 1.307%, cover 99.97%, enrich 1.04x

    foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1)
        nice featureBits -chrom=chr1 -enrichment hg17 refGene:cds $db.chr1.mafRanges.bed
    end

#refGene:cds 1.308%, panTro1.chr1.mafRanges.bed 93.472%, both 1.264%, cover 96.65%, enrich 1.03x
#refGene:cds 1.308%, canFam1.chr1.mafRanges.bed 55.377%, both 1.277%, cover 97.64%, enrich 1.76x
#refGene:cds 1.308%, mm5.chr1.mafRanges.bed 37.342%, both 1.280%, cover 97.92%, enrich 2.62x
#refGene:cds 1.308%, rn3.chr1.mafRanges.bed 35.429%, both 1.257%, cover 96.14%, enrich 2.71x
#refGene:cds 1.308%, galGal2.chr1.mafRanges.bed 3.840%, both 0.936%, cover 71.61%, enrich 18.65x
#refGene:cds 1.308%, xenTro1.chr1.mafRanges.bed 3.059%, both 0.881%, cover 67.36%, enrich 22.02x
#refGene:cds 1.308%, fr1.chr1.mafRanges.bed 1.892%, both 0.854%, cover 65.29%, enrich 34.50x
#refGene:cds 1.308%, tetNig1.chr1.mafRanges.bed 1.384%, both 0.805%, cover 61.57%, enrich 44.50x
#refGene:cds 1.308%, danRer1.chr1.mafRanges.bed 2.716%, both 0.847%, cover 64.81%, enrich 23.86x


# MAKE HG17-RN3 OVER.CHAIN FOR LIFTOVER  (DONE 1/25/05 angie)
    ssh kolossus
    set chainDir = /cluster/data/hg17/bed/blastz.rn3/axtChain
    netChainSubset $chainDir/rat.net.gz $chainDir/all.chain.gz \
      /cluster/data/hg17/bed/bedOver/hg17ToRn3.over.chain
    

# MAKE HG17-GALGAL2 OVER.CHAIN FOR LIFTOVER  (DONE 1/25/05 angie)
    ssh kolossus
    set chainDir = /cluster/data/hg17/bed/blastz.galGal2/axtChain
    netChainSubset $chainDir/human.net $chainDir/all.chain \
      /cluster/data/hg17/bed/bedOver/hg17ToGalGal2.over.chain
    

# DOWNLOADS FOR 10-WAY MULTIZ (2005-01-24 kate)
#   Use "glued" mafs

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir -p multiz10way
    cd multiz10way
    foreach f (/cluster/data/hg17/bed/multiz10way.v8/mafGlued/*.maf)
        set c = $f:r:t
        echo $c
        nice gzip -c $f > $c.maf.gz
    end
    # copy README and edit

    # Create upstream files for download 
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz10way.v8
    echo hg17 panTro1 mm5 rn3 canFam1 galGal2 xenTro1 fr1 tetNig1 danRer1 > org.txt
    # mafFrags takes a while
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags hg17 multiz10way up.bed upstream$i.maf -orgs=org.txt
        rm up.bed
    end
    ssh eieio
    cd /cluster/data/hg17/bed/multiz10way.v8
    nice gzip upstream{1000,2000,5000}.maf
        # 6 mins.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mv /cluster/data/hg17/bed/multiz10way.v8/upstream*.maf.gz multiz10way
    cd multiz10way
    md5sum *.gz > md5sum.txt

    #	Create histogram of this phastCons data  (Hiram - 2005-02-07)
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz.2004-12-22/cons
    time hgWiggle -doHistogram \
        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=hg17 phastCons > histogram.data 2>&1
    #	34 minutes

    cat << '_EOF_' > histo.gp
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Hg17 Histogram phastCons track"
set xlabel "Hg17 phastCons score"
set ylabel "p-Value"
set y2label "Cumulative Probability Distribution"
set y2range [0:1]
set y2tics

plot "histogram.data" using 2:5 title " pValue" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CPD" with lines
'_EOF_'
    
    gnuplot histo.gp > histo.png
    display histo.png &


# BLASTZ BOREOEUTHERIAN (BOREUT1) (DONE 1/29/05 braney)
    ssh kk
    mkdir /cluster/data/borEut1/bed/zb.hg17 
    ln -s /cluster/data/borEut1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.borEut1
    cd /cluster/data/hg17/bed/blastz.borEut1
    # Use default (Human-Mouse) settings for starters.
    cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog
SEQ2_DIR=/iscratch/i/borEut1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.borEut1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    /cluster/data/hg17/jkStuff/BlastZ_run0.sh
    cd run.0
    para push
# Completed: 2728 of 2728 jobs
# CPU time in finished jobs:     621440s   10357.34m   172.62h    7.19d  0.020 y
# IO & Wait Time:                 19079s     317.98m     5.30h    0.22d  0.001 y
# Average job time:                 235s       3.91m     0.07h    0.00d
# Longest job:                     2340s      39.00m     0.65h    0.03d
# Submission to last job:          2837s      47.28m     0.79h    0.03d

    ssh kki
    cd /cluster/data/hg17/bed/blastz.borEut1
    /cluster/data/hg17/jkStuff/BlastZ_run1.sh

    cd run.1
    para push
# Completed: 341 of 341 jobs
# CPU time in finished jobs:         95s       1.58m     0.03h    0.00d  0.000 y
# IO & Wait Time:                   825s      13.75m     0.23h    0.01d  0.000 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest job:                       10s       0.17m     0.00h    0.00d
# Submission to last job:            73s       1.22m     0.02h    0.00d

    ssh kk
    cd /cluster/data/hg17/bed/blastz.borEut1
    /cluster/data/hg17/jkStuff/BlastZ_run2.sh
    cd run.2
    para push

# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr18_random.axt is empty
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr19_random.axt is empty
# ..
# Completed: 44 of 46 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:        104s       1.73m     0.03h    0.00d  0.000 y
# IO & Wait Time:                   482s       8.04m     0.13h    0.01d  0.000 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest job:                      134s       2.23m     0.04h    0.00d
# Submission to last job:           142s       2.37m     0.04h    0.00d

# END BLASTZ BOREOEUTHERIAN


##########################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE braney 1/15/05)
# Questions?  weirauch@soe.ucsc.edu or braney@soe.ucsc.edu
#	tfbsConsSites table reloaded 2006-11-03 - Hiram - see below:
## reload tfbsCons table - it was based on a newer version of tfbs names that

ssh hgwdev
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons

# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts  weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg17/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile

nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &

ssh kk
mkdir /cluster/bluearc/braney/tfloc
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push

# When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.

ssh kksilo   # (or hgwdev, or whatever)
nice ./getBedFile.pl &

hgLoadBed -noSort hg17 tfbsConsSites \
	-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
	tfbsConsSites.bed -tab
hgLoadBed -noSort hg17 tfbsConsFactors \
	-sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql \
	tfbsConsFactors.bed -tab

# Feel free to delete or gzip anything in ./tmp
#  (particularly the huge .maf and .bed files)
#   after the final two bed files are successfully loaded

##########################################################################
# CHICKEN RECIPROCAL-BEST NET FOR STRINGENT LIFTOVER (DONE 2/3/05 angie)
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.galGal2/axtChain
    # Run chainNet again, this time keeping both of its outputs:
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin ../S1.len ../S2.len h_g.net g_h.net
    # Get the chicken chains from the chicken-referenced (but human-centric)
    # net:
    chainSwap all.chain g_h.chain
    netChainSubset g_h.net g_h.chain stdout \
    | chainSort stdin g_h.subset.chain
    # Net those (sorted) chicken chains, and keep both outputs, to get 
    # reciprocal best nets referenced to both species:
    chainPreNet g_h.subset.chain ../S2.len ../S1.len stdout \
    | chainNet stdin ../S2.len ../S1.len g_h.rbest.net h_g.rbest.net
    # Get the chains from the recip-best nets for stringent liftOver:
    netChainSubset g_h.rbest.net g_h.chain galGal2ToHg17.rbest.over.chain
    netChainSubset h_g.rbest.net all.chain hg17ToGalGal2.rbest.over.chain



####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 2/5/05 Fan) ##############

mkdir -p /cluster/store8/rgd/human050205
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human050205 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl

# download data files from RGD

wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff

# remove extra line feed character at the end of lines

# !!! manually corrected the line of AASTH7_H because chromStart is greater than chrEnd

rmLf human_QTL.gff > rgdQtl.gff

# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}'  rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab

# create rgdQtlLink.tab

awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab

# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab

# check rgdQtl table
checkTableCoords hg17 rgdQtl

# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'

# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.

# GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2005-02-08, hartera)
    ssh eieio
    mkdir -p /cluster/data/hg17/bed/ecoresTetNig1
    cd /cluster/data/hg17/bed/ecoresTetNig1
                                                                                
    wget --timestamp \
         http://www.genoscope.cns.fr/externe//4ucsc/ExofishHs35Tnig1
    # this is in gff format
    # remove "Ecotig" from name field
    sed -e 's/Ecotig EG/EG/g' ExofishHs35Tnig1 > ExofishHs35Tnig1.gff
    # need to have tabs between fields not a space to load file into table
    sed -e 's/ /\t/g' ExofishHs35Tnig1.gff > Hs35Tnig1format.gff
    # if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
    # correctly into the table.
    sed -e 's/ecore/CDS/' Hs35Tnig1format.gff | sed -e 's/ecotig/transcript/' \
           > Hg17vstetNig1.gff
    # add "chr" in front of the chromosome name in first field (2005-02-08)
    perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg17vstetNig1.gff
    rm *.bak
    # need to reload table

    ssh hgwdev
    cd /cluster/data/hg17/bed/ecoresTetNig1
    echo 'drop table ecoresTetNig1;' | hgsql hg17
    nice ldHgGene hg17 ecoresTetNig1 Hg17vstetNig1.gff
    # Read 40172 transcripts in 186032 lines in 1 files
    #  40172 groups 42 seqs 1 sources 2 feature types
    # 40172 gene predictions
# added ecoresTetNig1 entry to trackDb.ra in trackDb/human
# and created ecoresTetNig1.html.  Genoscope will not be maintaining this
# newest data in their Exofish comparative browser display.

# UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new human protein display IDs to the alias table to support user search
    
    ssh hgwdev
    cd /cluster/data/hg17/bed/pb
    mkdir newDisplayId
    cd newDisplayId
 
    # Collect (kgID, spID, newDisplayId) for every protein whose display ID
    # changed.  -N suppresses the column-name header row; without it the
    # header survives "sort -u" and gets loaded into kgSpAlias as a bogus row.
    hgsql -N proteome -e 'select hg17.kgSpAlias.kgID, hg17.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.tab
    
    hgsql hg17 -e 'load data local infile "hg17.tab" into table hg17.kgSpAlias'

# UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new hg17 protein display IDs to the alias table to support user search
    
    ssh hgwdev
    cd /cluster/data/hg17/bed/pb/newDisplayId

     # Collect (kgID, oldDisplayId, newDisplayId) for every protein whose
     # display ID changed.  -N suppresses the column-name header row, which
     # replaces the manual "vi" step formerly used to delete that header
     # line from the sorted file.
     hgsql -N proteome -e 'select hg17.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.kgProtAlias.tab

    hgsql hg17 -e 'load data local infile "hg17.kgProtAlias.tab" into table hg17.kgProtAlias'



# BLASTZ HUMAN TARGET, COW QUERY (DONE, Nov. 2004 - Jan. 2005, Heather)
  ssh kk
  # use /cluster/data/bosTau1 because more disk space there
  cd /cluster/data/bosTau1/bed
  mkdir zb.hg17

  # create DEF file 
  # for now, not doing ABRIDGE_REPEATS
  # this means I don't need to create lineage specific repeats
  # This is because blastz-run wouldn't take advantage of these
  # because my query is in scaffolds
  #
  # DEF is written into zb.hg17/ because the following steps cd into
  # /cluster/data/bosTau1/bed/zb.hg17 and "source DEF" from there.
  # NOTE(review): BASE below is /cluster/data/hg17/bed/zb.hg17 while the work
  # is done under /cluster/data/bosTau1/bed/zb.hg17 -- presumably a symlink
  # between the two exists (as was made for blastz.borEut1); confirm.

  cat << '_EOF_' > zb.hg17/DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
#SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow
SEQ2_DIR=/iscratch/i/bosTau1/splitDir
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/zb.hg17

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
  # << this line keeps emacs coloring happy
  bash
  cd /cluster/data/bosTau1/bed/zb.hg17
  source DEF
  mkdir $RAW run.0
  # create S2.len so make-joblist doesn't have to
  /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run.0/j
  sh ./xdir.sh
  cd run.0
  # check how many lines in j
  sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
  para create jobList
  para try, para check, para push, para check....

  # convert out to lav
  ssh kki
  cd /cluster/data/bosTau1/bed/zb.hg17
  # run bash shell if not running it already
  source DEF
  mkdir -p $BASE/run.1
  mkdir -p $BASE/lav
  # create a new job list to convert out files to lav
  /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
  cd run.1
  # make sure the job list is OK
  wc -l jobList
  head jobList
  para create jobList
  para try
  para check
  para push

  # lavToAxt
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17
  mkdir axtTemp
  cd lav
  foreach i (*)
      catDir $i | lavToAxt stdin /cluster/data/hg17/nib \
        /cluster/data/bosTau1/bosTau1.2bit ../axtTemp/$i.axt
      echo done $i
  end

  # axtChain
  ssh kki
  cd /cluster/data/bosTau1/bed/zb.hg17
  mkdir -p axtChain/run1
  cd axtChain/run1
  mkdir out chainRaw
  ls -1S /cluster/data/bosTau1/bed/zb.hg17/axtTemp/*.axt > input.lst

  # gensub2 template: one doChain job per input axt file listed in input.lst.
  # Here-document bodies and terminators start in column 0 so that the
  # terminator line matches and the generated files carry no leading blanks.
  cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
  # << this line makes emacs coloring happy

  # doChain wrapper: $1 = input axt, $2 = output chain file,
  # $3 = file receiving axtChain's stdout.
  cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/hg17/bothMaskedNibs /iscratch/i/bosTau1/nib/bosTau1.2bit $2 > $3
'_EOF_'
  # << this line makes emacs coloring happy
  chmod a+x doChain
  gensub2 input.lst single gsub jobList
  para create jobList
  para try
  para check
  para push

  # Completed: 46 of 46 jobs
  # Average job time:                  83s       1.39m     0.02h    0.00d
  # Longest job:                     1240s      20.67m     0.34h    0.01d
  # Submission to last job:          1326s      22.10m     0.37h    0.02d

  # mergesort
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
  chainMergeSort run1/chainRaw/*.chain > all.chain.jan3

  # chainAntiRepeat
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/run1
  mkdir chainAntiRepeat
  # test with just one
  chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
    chainRaw/chr18.chain chainAntiRepeat/chr18.chain
  # do them all
  foreach f (chainRaw/*.chain)
    set f1 = $f:t
    echo $f1
      chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
        $f chainAntiRepeat/$f1
  end

  # mergesort again
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
  chainMergeSort run1/chainAntiRepeat/*.chain > all.chain.jan5
  gzip all.chain.jan3

  # split
  mkdir chain
  chainSplit chain all.chain.jan5

  # look at the distribution
  foreach f (chain/*.chain)
    grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
    echo $f:t:r
    textHistogram -binSize=5000 /tmp/score.$f:t:r
    echo ""
  end
  # see files histogram.out and histogram.interesting

  # run chainFilter
  chainFilter -minScore=5000 all.chain.jan5 > all.chain.jan5.filtered
  gzip all.chain.jan5

  # split
  rm chain/*
  chainSplit chain all.chain.jan5.filtered
  gzip all.chain.jan5.filtered

  # load
  ssh hgwdev
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/chain
  foreach i (*.chain)
    set c = $i:r
    echo loading $c
    hgLoadChain hg17 ${c}_chainBosTau1 $i
  end

    # featureBits -chrom=chr1 hg17 chainBosTau1Link
    # 103272818 bases of 222827847 (46.346%) in intersection
    # featureBits -chrom=chr2 hg17 chainBosTau1Link
    # 105920345 bases of 237506229 (44.597%) in intersection
    # featureBits -chrom=chr3 hg17 chainBosTau1Link
    # 89582887 bases of 194635740 (46.026%) in intersection
    # featureBits -chrom=chr4 hg17 chainBosTau1Link
    # 77513949 bases of 187161218 (41.416%) in intersection
    # featureBits -chrom=chr5 hg17 chainBosTau1Link
    # 80428726 bases of 177702766 (45.260%) in intersection
    # featureBits -chrom=chr6 hg17 chainBosTau1Link
    # 71830264 bases of 167317699 (42.930%) in intersection
    # featureBits -chrom=chr7 hg17 chainBosTau1Link
    # 64561289 bases of 154759139 (41.717%) in intersection
    # featureBits -chrom=chr8 hg17 chainBosTau1Link
    # 55896735 bases of 142612826 (39.195%) in intersection
    # featureBits -chrom=chr9 hg17 chainBosTau1Link
    # 52068957 bases of 117781268 (44.208%) in intersection
    # featureBits -chrom=chr10 hg17 chainBosTau1Link
    # 57427282 bases of 131613628 (43.633%) in intersection
    # featureBits -chrom=chr11 hg17 chainBosTau1Link
    # 58412709 bases of 131130853 (44.545%) in intersection
    # featureBits -chrom=chr12 hg17 chainBosTau1Link
    # 56076163 bases of 130259811 (43.049%) in intersection
    # featureBits -chrom=chr13 hg17 chainBosTau1Link
    # 37951944 bases of 95559980 (39.715%) in intersection
    # featureBits -chrom=chr14 hg17 chainBosTau1Link
    # 39896970 bases of 88290585 (45.188%) in intersection
    # featureBits -chrom=chr15 hg17 chainBosTau1Link
    # 37507979 bases of 81341915 (46.112%) in intersection
    # featureBits -chrom=chr16 hg17 chainBosTau1Link
    # 33883573 bases of 78884754 (42.953%) in intersection
    # featureBits -chrom=chr17 hg17 chainBosTau1Link
    # 31871034 bases of 77800220 (40.965%) in intersection
    # featureBits -chrom=chr18 hg17 chainBosTau1Link
    # 30359555 bases of 74656155 (40.666%) in intersection



# NET 
# run in stages to avoid memory problems
  ssh kolossus
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain

  # PRE
  /cluster/bin/x86_64/chainPreNet all.chain.jan5.filtered ../S1.len ../S2.len chainPreNet.out

  # chainNet
  /cluster/bin/x86_64/chainNet chainPreNet.out \
    -minSpace=1 ../S1.len ../S2.len bosTau1.net.raw /dev/null

  # syntenic (using revision 1.6)
  /cluster/home/heather/bin/x86_64/netSyntenic bosTau1.net.raw bosTau1.net.syn
  # memory usage 2757492736, utime 13404 s/100, stime 616

  # backup/compress
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
  gzip bosTau1.net.raw
  cp bosTau1.net.syn bosTau1.net.syn.backup

  # netClass
  # takes about 4 hours
  ssh hgwdev
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
  netClass -noAr bosTau1.net.syn hg17 bosTau1 bosTau1.net

  # backups
  ssh kksilo
  cp bosTau1.net bosTau1.net.backup
  rm bosTau1.net.syn.backup

  # load
  ssh hgwdev
  cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
  netFilter -minGap=10 bosTau1.net | hgLoadNet hg17 netBosTau1 stdin
  rm bosTau1.net.backup

  # index has NULL cardinality; analyze to fix
  hgsql hg17
  analyze table netBosTau1

  # generate axts 
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17
  mkdir axtNet

  # split first (not required?)
  cd axtChain
  mkdir net
  netSplit bosTau1.net.syn net
  cd net
  # one axt per split net file, written to zb.hg17/axtNet
  foreach i (*.net)
    netToAxt $i ../chain/$i:r.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit ../../axtNet/$i:r.axt
  end
  # return to axtChain, where the whole-genome net files live, before compressing
  cd ..
  gzip bosTau1.net.syn
  gzip bosTau1.net

  # axtSort (takes about 5 minutes)
  ssh kksilo
  cd /cluster/data/bosTau1/bed/zb.hg17
  mkdir axtNetSort
  foreach f ( axtNet/*.axt )
    set c = $f:t:r
    echo "axtSort on $c"
    axtSort $f axtNetSort/$c.axt
  end

  # make maf files
  mkdir mafNet
  foreach f (axtNetSort/*.axt)
    set c = $f:t:r
    echo "axtToMaf on $c"
    axtToMaf $f /cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1.
  end

# MAKE VSBOSTAU1 DOWNLOADABLES (DONE Feb. 15, 2005 Heather)
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg17
    mkdir vsBosTau1
    cd vsBosTau1
    mkdir axtNet
    cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
    cp -p all.chain.gz  /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.chain.gz
    cp -p bosTau1.net.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.net.gz
    cd ../axtNet
    cp -p * /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/axtNet

    cd /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1
    # Make a README.txt which explains the files & formats.
    md5sum *.gz > md5sum.txt
    cd axtNet
    md5sum *.gz > md5sum.txt

# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir pseudoYale
    cd pseudoYale
    # Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
    ldHgGene hg17 pseudoYale pseudoYale.gtf
    # Note - I'm guessing how this goes.  Robert left no record. -jk

# added xenoRefGene track (markd ~2005-02-20)
    add to /cluster/data/genbank/genbank.con
    hg17.refseq.mrna.xeno.load  = yes
    hg17.refseq.mrna.xeno.loadDesc = yes

# BUILD ccdsGene and ccdsInfo tables (markd 2005-02-25)
    # download files to the genbank data area, as this will eventually
    # be done automatically as part of the genbank build process.
    cd  /cluster/data/genbank
    mkdir -p data/ccds/hg17/2005-02-25
    cd data/ccds/hg17/2005-02-25

    # get the basic text dumps of the data, rather than the database dumps
    wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/
    # ends up with:
        About-NcbiHinxton.txt
        NcbiHinxton.txt
        NcbiHinxtonAllAccessions.txt

    # this is a preliminary release, it contained 2 PAR genes that had
    # bad coordinates and 7 genes that were determined to be pseudogenes
    # at the last minute.  The accessions for these 9 genes were
    # placed in skip.ccds and then removed:
    fgrep -v -f skip.ccds  /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.txt >  /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.cleaned.txt

    # create the tab files to load in the database
    /cluster/data/genbank/bin/i386/ccdsImport NcbiHinxtonAllAccessions.cleaned.txt ccdsGene.gp ccdsInfo.tab

    # load ccdsInfo
    hgsql hg17 <../../../../../lib/ccdsInfo.sql 
    hgsql -e 'load data local infile "ccdsInfo.tab" into table ccdsInfo'  hg17

    # load ccdsGene.gp and check
    ldHgGene -predTab -genePredExt hg17 ccdsGene ccdsGene.gp 
    checkTableCoords hg17 -verbose=2 ccdsGene
    rm *.tab
    gzip -9 NcbiHinxton*.txt
  
    
# BUILD refSeqKg TABLE TO SUPPORT CCDS GENES (RE-DONE, Fan 2/26/05)

    hgsql hg17 -N -e "select * from knownGene" >kg.gp
    hgsql hg17 -N -e "select * from refGene"   >ref.gp

    overlapSelect -inCds -strand  -idOutput -fraction=fraction.out -selectCds -overlapSimilarity=0.90 -selectFmt=genePred -inFmt=genePred kg.gp ref.gp refSeqKg.90.tab
    cat fraction.out|sort -u >refSeqKg.tab

    hgsql hg17 -e 'drop table refSeqKg'
    hgsql hg17 < ~/src/hg/lib/refSeqKg.sql
    hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'

   rm fraction.out

# BUILD ccdsGene and ccdsInfo tables  (markd, redone 2005-03-17)
    cd /cluster/store5/genbank/data/ccds/hg17
    wget ftp://ftp.ncbi.nlm.nih.gov/pub/hcds/Hs35.1/CDSTrackDB/CCDS.20050303.tar.gz
    mkdir /scratch/tmp/ccds
    cd /scratch/tmp/ccds
    tar -zxf  /cluster/store5/genbank/data/ccds/hg17/CCDS.20050303.tar.gz     

    # import ccds database tables
    hgsql -e 'create database ccds'
    hgsql ccds </cluster/data/genbank/etc/createTables.sql 
    hgsql ccds </cluster/data/genbank/etc/createKeys.sql 
    /cluster/data/genbank/bin/i386/ccdsImport ccds data/[A-Z]*.txt

    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/i386/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene

    # refSeqKg table

    hgsql -N -e "select * from knownGene" hg17 >kg.gp
    hgsql -N -e "select * from refGene" hg17 >ref.gp

    overlapSelect -statsOutput -strand -inCds -selectCds -overlapSimilarity=0.90 kg.gp ref.gp stdout | tail +2 | sort -u >refSeqKg.tab

    hgsql hg17 -e 'drop table refSeqKg'
    hgsql hg17 < ~/compbio/kent/src/hg/lib/refSeqKg.sql
    hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
    cd ..
    rm -r ccds


# COW BACENDS (Done, Heather, Mar. 21, 2005)
 
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir bacendsCow
cd bacendsCow
 
# Obtain GFF file from Denis; unzip into BACendhg17.gff

# Convert into BED 6:
 
  makebed.pl < BACendhg17.gff > BACendhg17.bed
  hgLoadBed -noBin hg17 bacendsCow BACendhg17.bed
  # 53403 warnings
 
# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra

# make map between ccds and known genes (markd 2005/03/08)
# this should be run whenever either known genes or ccds is updated
/cluster/data/genbank/bin/i386/mkCcdsGeneMap  -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap

# UPDATE WGRNA TRACK (DONE, 2004-12-13, Fan)

# Received updated data file, wg_track_april2005.txt, from Michel Weber by email.

   cut -f 2-10  wg_track_april2005.txt |tail +2 >wg_track_april2005.tab

# Use editor to remove the last blank line.

   hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wg_track_april2005.tab

# Asked Donna to update Reference section according to Michel's email.  

## refresh vega tracks with vega build30        (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30 -s

# fetch each table dump named in the local file "tables"
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
# schema file; its name must match the "source" line in the mysql session below
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible.sql.gz 
gunzip *.gz
##create mysql database
# the lines from "mysql" through "exit" are typed in an interactive mysql session
mysql 
create database vega30
use vega30
source homo_sapiens_vega_30_35c_mysql40_compatible.sql
source dropMt.sql
source load.sql
exit


hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp     
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt 
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp     
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt 

#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt

#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab

hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B

#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred

ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf 
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf 
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B


#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)


####################################################################################
# RE-BUILD KNOWN GENES TABLES, 2ND TRIAL WITH VARIANT PROTEINS (Started 5/13/05 Fan)

# First build protein databases, sp050415 and proteins050415
# See makeProteins050415.doc for details.

# Create working subdirectories and temporary databases (kgHg17F)

  ssh hgwdev
  cd /cluster/store10/kg
  mkdir kgHg17F  
  ln -s /cluster/store10/kg/kgHg17F /cluster/store6/kgDB/bed/kgHg17F
  ln -s /cluster/store10/kg/kgHg17F /cluster/data/hg17/bed/kgHg17F

  hgsql hg17 -e "create database kgHg17F"   
  hgsql hg17 -e "create database kgHg17FTemp"

  mkdir /cluster/bluearc/kgDB/kgHg17F
  mkdir /cluster/bluearc/kgDB/kgHg17F/protBlat
  ln -s /cluster/bluearc/kgDB/kgHg17F/protBlat /cluster/store10/kg/kgHg17F/protBlat
  cd /cluster/store10/kg/kgHg17F/protBlat

################################################################# 
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# The protBlat.psl was built during the first KG II build trial
# The results are still valid, except that kgHg17E was used
# instead of kgHg17F

# Create working subdirectories and temporary databases (kgHg17E)

  ssh hgwdev
  cd /cluster/store10/kg
  mkdir kgHg17E  
  ln -s /cluster/store10/kg/kgHg17E /cluster/store6/kgDB/bed/kgHg17E
  ln -s /cluster/store10/kg/kgHg17E /cluster/data/hg17/bed/kgHg17E

  hgsql hg17 -e "create database kgHg17E"   
  hgsql hg17 -e "create database kgHg17ETemp"

  mkdir /cluster/bluearc/kgDB/kgHg17E
  mkdir /cluster/bluearc/kgDB/kgHg17E/protBlat
  ln -s /cluster/bluearc/kgDB/kgHg17E/protBlat /cluster/store10/kg/kgHg17E/protBlat
  cd /cluster/store10/kg/kgHg17E/protBlat

# Get all human protein sequences

  hgsql -N sp050415 -e \
  'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="9606" and acc=accession' \
  |awk '{print ">" $1;print $2}' >humanProt.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/hg17/bed/kgHg17E/protBlat
  mkdir prot
  faSplit sequence humanProt.fa 1000 prot/prot
  ls /cluster/bluearc/kgDB/kgHg17E/protBlat/prot/* > prot.lis

  ssh hgwdev
  cd /cluster/data/hg17/bed/kgHg17E/protBlat
  hgsql hg17 -N -e 'select chrom from chromInfo' > chrom.lis
  exit
  
  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/hg17/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg17E/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para try
  para check
  para push
  para check ...
# many output .psl files will be empty; the resulting warnings are OK.
[kk:protBlat> para check
45494 jobs in batch
0 jobs (including everybody's) in Parasol queue.
Checking finished jobs
tracking errors: 1
crashed: 12643
ranOk: 32850
total jobs in batch: 45494
[kk:protBlat> para time
45494 jobs in batch
0 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 32850 of 45494 jobs
Crashed: 12643 jobs
para.results: file not found.  paraHub can't write to this dir?
CPU time in finished jobs:   36153510s  602558.50m 10042.64h  418.44d  1.146 y
IO & Wait Time:               1585456s   26424.27m   440.40h   18.35d  0.050 y
Average job time:                1149s      19.15m     0.32h    0.01d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:          155120s    2585.33m    43.09h    1.80d
Submission to last job:        276342s    4605.70m    76.76h    3.20d

# This cluster run took about 3 days.  The crashed jobs are due to empty BLAT results, which is OK.

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/hg17/bed/kgHg17E/protBlat

   mkdir result2
   mkdir result3

   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlat.psl

#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The end of protBlat.psl build, using kgHg17E
################################################################################




############################################################################
# This part processes the variant splice proteins.

# First build variant splice protein tables.

# Get all variant isoform human protein sequences

  ssh hgwdev
  cd /cluster/data/swissprot/050415/build
  wget --timestamp \
 ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
  wget --timestamp \
ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl_varsplic.fasta.gz

  gzip -d *varsplic.fasta.gz

  faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab
  faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab

  cat splicTrembl.tab splicSprot.tab >varProtein.tab
  hgsql sp050415 < ~/src/hg/lib/varProtein.sql
  hgsql sp050415 -e 'load data local infile "varProtein.tab" into table varProtein'

  cat varProtein.tab |cut -f 1>j1
  cut -f 1 j1|sed -e 's/-/\t/g' >j2
  paste j1 j2 >splicProt.tab

  hgsql kgHg17FTemp -e 'drop table splicProt'
  hgsql kgHg17FTemp <~/src/hg/lib/splicProt.sql
  hgsql kgHg17FTemp -e 'load data local infile "splicProt.tab" into table splicProt'

  hgsql kgHg17FTemp -N -e \
  'select varAcc, varProtein.val from sp050415.varProtein,splicProt,proteins050415.spXref3 where accession=parAcc and varProtein.acc=splicProt.varAcc and division="9606"'| \
   awk '{print ">" $1;print $2}' >humanVarProt.fa


   cd /cluster/data/hg17/bed/kgHg17F

# get all Human splicProtBlat records

hgsql hg17 -N -e \
'select splicProtBlat.* from splicProtBlat,proteins050415.spXref3,kgHg17FTemp.splicProt where qName=splicProt.varAcc and parAcc=accession and division="9606"'\
|cut -f 2-22 \
>humanVarProtBlat.psl
  
# Combine the regular protein protBlat records with the variant protein psl records.
  cd /cluster/store10/kg/kgHg17F
  cat ../kgHg17E/protBlat/protBlat.psl humanVarProtBlat.psl >protBlat.psl

  hgLoadPsl hg17 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 104064 record(s), 0 row(s) skipped, 1484 warning(s) loading psl.tab
# Looked into the cause of these 1484 warnings.  They occur because qBaseInsert and tBaseInsert
# have negative values, probably because this is a protein alignment.


# create all_mrna.psl and tight_mrna.psl
   hgsql hg17 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
           all_mrna.psl tight_mrna.psl /dev/null

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat.psl protMrna.stat

   overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl  protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgHg17FTemp < ~/src/hg/lib/spMrna.sql
   hgsql kgHg17FTemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgHg17FTemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   cd /cluster/data/hg17/bed/kgHg17F
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg17 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgHg17FTemp DB.

   faToTab mrna.fa mrnaSeq.tab

   hgsql kgHg17FTemp -e 'drop table mrnaSeq'
   hgsql kgHg17FTemp <~/src/hg/lib/mrnaSeq.sql
   hgsql kgHg17FTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
# Prepare files for cluster run
   ~/src/hg/protein/KG2.sh kgHg17F hg17 050415

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG3.sh kgHg17F hg17 050415

# Collect cluster run results
   cd kgBestMrna

   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis

# Load BLAT results into temp DB.
   hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgHg17FTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgHg17FTemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgHg17FTemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protMrna.out > j1.tmp
   cut -f 32-42 ../protMrna.out > j2.tmp
   cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

# Prepare refGene and all_mrna gp files.

   cd ..
   hgsql hg17 -N -e 'select * from refGene' >ref.gp

   hgsql hg17 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgHg17FTemp -e 'drop table spRef'
   hgsql kgHg17FTemp <~/src/hg/lib/spRef.sql
   hgsql kgHg17FTemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgHg17F hg17 050415
   ~/src/hg/protein/KGRef3.sh kgHg17F hg17 050415

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis

   hgsql kgHg17FTemp -e 'drop table protRefBlat'
   hgsql kgHg17FTemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgHg17FTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgHg17FTemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries
   cd /cluster/data/hg17/bed/kgHg17F
   cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg17/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgHg17FTemp -e 'drop table kgCandidate0'
   hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate0.sql 
   hgsql kgHg17FTemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgHg17FTemp -e 'drop table geneCheck'
   hgsql kgHg17FTemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgHg17FTemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'


# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgHg17FTemp hg17 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgHg17FTemp -e  'drop table kgCandidate'
   hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgHg17FTemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgHg17FTemp -e 'create index alignID on kgCandidate(alignID)'

# ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST
# FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC.
# #######

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab

   hgsql kgHg17FTemp -e  'drop table kgCandidateX'
   hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgHg17FTemp -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

   kgResultBestMrna2 050415 kgHg17FTemp hg17|sort -u >protMrnaBlatScore.tab
   kgResultBestRef2  050415 kgHg17FTemp hg17|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgHg17FTemp -e 'drop table protMrnaScore'
   hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgHg17FTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgHg17FTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'


# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgHg17FTemp kgCandidateX jY.tmp
   cat jY.tmp |sort -u >kgCandidateY.tab
   rm jY.tmp
   hgsql kgHg17FTemp -e  'drop table kgCandidateY'
   hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgHg17FTemp -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgHg17FTemp kgCandidateZ.tab
   hgsql kgHg17FTemp -e  'drop table kgCandidateZ'
   hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgHg17FTemp -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgHg17FTemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgHg17FTemp hg17 proteins050415 kg3.tmp dupSpMrna.tmp
   sort -u dupSpMrna.tmp >dupSpMrna.tab

# Sort KG genes to make the kg3.gp table file.
   ~/kent/src/hg/protein/sortKg.pl kg3.tmp >kg3.gp

   hgsql kgHg17FTemp -e  'drop table knownGene'
   hgsql kgHg17FTemp < ~/src/hg/lib/knownGene.sql
   hgsql kgHg17FTemp -e  'load data local infile "kg3.gp" into table knownGene'

   hgsql hg17 -e  'drop table kg3'
   hgsql hg17 < ~/src/hg/lib/kg3.sql
   hgsql hg17 -e  'load data local infile "kg3.gp" into table kg3'

# Perform analysis before renaming the kg3 table to knownGene.

# Load data into hg17 knownGene table.
   hgsql hg17 -e  'drop table knownGene'
   hgsql hg17 < ~/src/hg/lib/knownGene.sql
   hgsql hg17 -e  'load data local infile "kg3.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.

   kgPepMrna kgHg17FTemp hg17 050415
   hgsql hg17 -e  'drop table knownGeneMrna'
   hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql hg17 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql hg17 -e  'drop table knownGenePep'
   hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
   hgsql hg17 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'


# Build kgXref table

   kgXref2 kgHg17FTemp 050415 hg17

   hgsql hg17 -e  'drop table kgXref'
   hgsql hg17 < ~/src/hg/lib/kgXref.sql
   hgsql hg17 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql hg17 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab

   hgsql hg17 -e  'drop table spMrna'
   hgsql hg17 <~/src/hg/lib/spMrna.sql
   hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build kgProtMap table

    ~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415

# Update and clean up kgResultBestMrna2.c and then check it in.

#####################################
# Build alias tables.		DONE 5/18/05 Fan.
#	kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
#	proteins050415.hugo.withdraws, hg17.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   kgAliasM hg17 proteins050415

#	kgAliasKgXref reads from hg17.knownGene.proteinID,
#	hg17.knownGene.name, hg17.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref hg17


#	kgAliasRefseq reads from hg17.knownGene.name,
#	hg17.knownGene.proteinID, hg17.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq hg17

   hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" hg17 
   hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from hg17.knownGene.name,
#	hg17.knownGene.proteinID, hg17.knownGene.alignID,
#	proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
#	to create kgProtAlias.tab
#

   kgProtAlias hg17 050415

   hgsql hg17 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql hg17 -N -e \
   'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
   hgsql hg17 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql hg17 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql hg17 -e "drop table kgProtAlias;"
    hgsql hg17 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table

    # Gather (kgID, spID, alias) from both the gene and protein alias tables.
    # -N suppresses the column-name header line, so no header rows need to be
    # filtered out of the combined file afterwards.
    hgsql hg17 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql hg17 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp >hg17.kgSpAlias.tab
    rm j.tmp

    # Recreate the table from its schema and load the merged alias list.
    hgsql hg17 -e 'drop table kgSpAlias';
    hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
    hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES (DONE 2005-05-19, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir rnaStruct.2005-05-18
    rm rnaStruct
    ln -s rnaStruct.2005-05-18 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa hg17 knownGene utr3 utr3/utr.fa
    utrFa hg17 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/hg17/bed/rnaStruct
    # Break each UTR fasta into pieces under the matching split/ directory
    # (output prefix "s"); in.lst then lists the piece file names, one per line.
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    # Job template.  The \$ keeps the unquoted here-document from expanding
    # $(path1), so the literal text ends up in gsub.
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    # The 5' UTR run uses the same template.
    cp gsub ../utr5

# Do cluster run for 3' UTRs
#   (still in the utr3 directory; in.lst and gsub were prepared there above)
    # spec is the job list generated from in.lst and the gsub template; it is
    # what gets handed to parasol.
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 35692 of 35692 jobs
# CPU time in finished jobs:    1272085s   21201.42m   353.36h   14.72d  0.040 y
# IO & Wait Time:                102447s    1707.45m    28.46h    1.19d  0.003 y
# Average job time:                  39s       0.64m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6554s     109.23m     1.82h    0.08d
# Submission to last job:          9100s     151.67m     2.53h    0.11d

# Do cluster run for 5' UTRs 
#   Same steps as for the 3' UTRs, using the copy of gsub placed in utr5.
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push

# Completed: 33693 of 33693 jobs
# CPU time in finished jobs:     393764s    6562.74m   109.38h    4.56d  0.012 y
# IO & Wait Time:                126205s    2103.41m    35.06h    1.46d  0.004 y
# Average job time:                  15s       0.26m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           51595s     859.92m    14.33h    0.60d
# Submission to last job:         52057s     867.62m    14.46h    0.60d

# Load both fold directories into hg17, then remove the cluster-run leftovers
    ssh hgwdev
    cd /cluster/data/hg17/bed/rnaStruct/utr5
    hgLoadRnaFold hg17 foldUtr5 fold
    cd /cluster/data/hg17/bed/rnaStruct/utr3
    hgLoadRnaFold hg17 foldUtr3 fold

# Clean up (utr3 first, since that is the current directory)
    rm -r split fold err batch.bak
    cd /cluster/data/hg17/bed/rnaStruct/utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  DONE 5/19/05.  Fan.
   ssh hgwdev
   cd /cluster/store10/kg/kgHg17F
   # Plain mkdir: "md" is not a standard command and only works where a
   # personal alias for it happens to be defined.
   mkdir kegg
   cd kegg

   # KGpath.sh is run from the kegg directory; keggMapDesc.tab and
   # keggPathway.tab, loaded below, are read from here.
   ~/src/hg/protein/KGpath.sh kgHg17F hg17 050415

   # Re-create both tables from their schema files, then load them.
   hgsql hg17 -e "drop table keggMapDesc"
   hgsql hg17 -e "drop table keggPathway"
   hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
   hgsql hg17 <~/src/hg/lib/keggPathway.sql
   hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'
  
# Build CGAP pathway tables
# RELOAD cgapAlias TABLE AFTER REMOVING REPLICATE ROWS (hartera, 2005-07-26)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)

   cd ..
   ~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415

   # cgapAlias: drop, re-create from schema, load
   hgsql hg17 -e "drop table cgapAlias"
   hgsql hg17 <~/src/hg/lib/cgapAlias.sql
   hgsql hg17 -e 'load data local infile "cgapAlias.tab" \
                 into table cgapAlias'

   # cgapBiocDesc: drop, re-create from schema, load
   hgsql hg17 -e "drop table cgapBiocDesc"
   hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'

   # cgapBiocPathway: drop, re-create from schema, load
   hgsql hg17 -e "drop table cgapBiocPathway"
   hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
   hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'

   # RELOAD cgapAlias TABLE. Sort and reload alias tab file to remove 
   # duplicate rows. (hartera, 2005-07-26)
   # DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
   # OR sort -n | uniq.
   #USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ(hartera, 2005-10-06)
   # (sort -nu keeps only one line per numeric key, so rows that share an ID
   # but differ in the rest of the line are lost; uniq drops only lines that
   # are identical in full.)
   cd /cluster/store10/kg/kgHg17F
   hgsql hg17 -e "drop table cgapAlias"
   # cgapAlias.tab has replicated rows so sort and unique before loading
   sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
   # Re-create the empty table from its schema, then load the de-duplicated file.
   hgsql hg17 < ~/kent/src/hg/lib/cgapAlias.sql 
   hgsql hg17 -e 'load data local infile "cgapAliasSorted.tab" \
                 into table cgapAlias'

# LOAD ENSEMBL GENES (DONE, 5/23/05, Fan)
# Ensembl changed things again! Please note there are two subtle changes to make it work.

    mkdir /cluster/data/hg17/bed/ensembl
    cd /cluster/data/hg17/bed/ensembl
    mkdir new
    cd new

    # Get the ensembl protein data from 
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the output.  choose gzip compression.  hit export.
    # Save as ensemblGene.gtf.gz

    # This time, there are some extra lines, like '		1;', 
    # that are causing problems, so added an extra filter in the beginning 
    # to get rid of them.
    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Finally, get rid of the ".1" or ".2" after the name

# The export was saved as ensemblGene.gtf.gz; uncompress it first, since the
# filter below reads the plain .gtf.
gunzip ensemblGene.gtf.gz
# Filter steps, in order:
#   - drop lines containing two consecutive tabs (the stray '		1;' lines)
#   - drop lines starting with 6_DR51, DR51, DR52 or DR53, and lines
#     containing _NT_
#   - prefix the leading chromosome name with "chr"; perl dies on any line
#     that does not start with one of the expected names
#   - rename chrMT to chrM
#   - remove the dot plus one character that precedes a closing double quote
#     (the ".1" / ".2" after the name)
# The perl program is kept on one line so that it does not depend on how the
# shell treats a backslash-newline inside single quotes.
sed -e 's/\t\t/xxxxx/g' ensemblGene.gtf \
    | grep -v xxxxx \
    | grep -v -E '^(6_DR51|DR51|DR52|DR53)|_NT_' \
    | perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/chrMT/chrM/g' -e 's/\..\"/\"/g' \
>ensGene.gtf

    ssh hgwdev
    cd /cluster/data/hg17/bed/ensembl/new
    # Load the filtered ensGene.gtf into the hg17 ensGene table.
    /cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33581 transcripts in 699580 lines in 1 files
# 33581 groups 25 seqs 1 sources 4 feature types
# 33581 gene predictions

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    # Re-create ensGtp from its schema file, then load the export; "ignore 1
    # lines" skips the first line of the file (presumably the column-header
    # row of the export).
    hgsql hg17 -e 'drop table ensGtp'
    hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
    hgsql hg17 -e 'load data local infile "ensGtp.txt" into table ensGtp ignore 1 lines'

# ensMart has some problem with the resulting ensemblPep.fa.gz, so use different
# processing step instead:

    # --timestamping, as used for the other downloads in this file; the
    # single-dash "-timestamp" spelling is not a wget option.
    wget --timestamping \
    ftp://ftp.ensembl.org/pub/current_human/data/fasta/pep/Homo_sapiens.NCBI35.may.pep.fa.gz
    # Start a new ">" line at every "transcript:" and drop the lines that
    # contain "gene:" (presumably leaving the transcript ID as the record
    # name), then convert the fasta to a tab file for loading.
    zcat Homo_sapiens.NCBI35.may.pep.fa.gz | sed -e "s/transcript:/\n>/g" | grep -v 'gene:' >ensPep.fa
    faToTab -type=protein ensPep.fa ensPep.tab
    # Re-create ensPep from its schema file and load the tab file.
    hgsql hg17 -e 'drop table ensPep'
    hgsql hg17 < ~/kent/src/hg/lib/ensPep.sql
    hgsql hg17 -e 'load data local infile "ensPep.tab" into table ensPep'

# kept the following, just in case Ensembl fixed the problem in the future
    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
# gunzip ensemblPep.fa.gz
# hgPepPred hg17 ensembl ensemblPep.fa

# UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED - 2005-05-21, DONE 2005-05-23 - Fan)
#	This should be done after knownGene tables are complete from known gene
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir geneSorter.2005-05-21
# drop the old symbolic link and point it at the new dated directory
rm geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2005-05-21 geneSorter
cd geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical

# Dump the knownGenePep peptides to a fasta file
# and build a blast database from them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
#	pepPredToFa may have to be built first, in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	formatdb is also found in /projects/compbio/bin/$MACH/formatdb

# Publish the database files to bluearc (still in the blastp directory)
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p known.* /cluster/bluearc/hg17/blastp

# Cut the fasta file into bite sized chunks for the cluster
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory (self/run, with out/ for the results)
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir self self/run self/run/out
cd self/run

# Make blast script
cat > blastSome << '_EOF_'
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod +x blastSome

# Make gensub2 file
cat > gsub << '_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	a plain 'ls ../../split/*.fa' is too much, so list the files with echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Give it a couple of minutes, run a para check, and if all is good
# go on with
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:     150459s    2507.64m    41.79h    1.74d  0.005 y
# IO & Wait Time:                 22325s     372.09m     6.20h    0.26d  0.001 y
# Average job time:                  22s       0.37m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             198s       3.30m     0.06h    0.00d
# Submission to last job:          2019s      33.65m     0.56h    0.02d

# Load into database.  This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7739 files
# Loading database with 9836439 rows
# 232.300u 42.580s 23:13.41 19.7% 0+0k 0+0io 205pf+0w

cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene
#	hgsql -e "select count(*) from knownToRefSeq;" hg17
#	row count changed 34667

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
	> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#	hgsql -e "select count(*) from knownToLocusLink;" hg17
#	row count changed to 34667

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
#	hgsql -e "select count(*) from knownToPfam;" hg17
#	row count changed to 36010

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
#	hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
#	row count changed to 32381

# Create expression distance table - takes about an hour
    hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32381 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

#	hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
#	row count changed to 32381000

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
#	hgsql -e "select count(*) from knownToU133;" hg17
#	row count changed to 32886

# Create expression distance table.  This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32886 unique elements in affyUclaNorm

# Create table that maps between known genes and 
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
#	row count changed to 17501
#	hgFixed.gnfHumanU95Exps argument is unused, no need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
	hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16450 unique elements in hgFixed.gnfHumanU95MedianRatio
#	row count changed to 16450000  

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes only 10 minutes.)
# NOTE(review): this hgExpDistance call names gnfAtlas2Distance as its output
# table, the same name used by the earlier run with -lookup=knownToGnfAtlas2,
# so it presumably replaces that result -- confirm this is intended.

hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8814 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
#	row count changed to 35055 

#### UPDATE GO DATABASE (DONE 5/21/05 Fan)

# Fetch the GO term dump and load it into a new, dated database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20050521
cd /cluster/store1/geneOntology/20050521

wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz

hgsql mysql -e "create database go050521;"
# The table options in the dump are rewritten to TYPE=MyISAM before loading.
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go050521 <j.tmp
rm j.tmp

# Fetch the UniProt gene associations and load them as goaPart.
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz

zcat gene_association.goa_uniprot.gz | hgGoAssociation go050521 goaPart stdin 
# Passed 5589891 of 6584507 of 6584507, 84.89%

# Ask sys-admin to switch the database pointer go to point to go050521.

cd /cluster/data/hg17/bed/geneSorter

# Rebuilt Ensembl Gene tables.  See documentation (5/23/05 Fan) above.

# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
#	table row count went from previous version: 38251 to 35436

# Make knownToCdsSnp table 
  ssh hgwdev
  nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 94394
# approx. 5 minutes running time

# C.ELEGANS BLASTP FOR GENE SORTER 
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:

# The following section is done during mm6 build already.
#    ssh eieio
#    mkdir /cluster/data/ce2/bed/blastp
#    cd /cluster/data/ce2/bed/blastp
#    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
#    # to find out the latest version.  Then use that in place of 142 below.
#    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
#    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
#    ssh kkr1u00
#    if (-e /iscratch/i/ce2/blastp) then
#      rm -r /iscratch/i/ce2/blastp
#    endif
#    mkdir -p /iscratch/i/ce2/blastp
#    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
#    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastp/ce2/run/out
    cd /cluster/data/hg17/bed/blastp/ce2/run
    # Make blast script (\$ keeps the here-document from expanding $1/$2)
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch.  Each split file name gets its path relative to
    # this run directory prepended with sed, so split.lst no longer has to be
    # edited by hand; this is the same command the zebrafish run in this file
    # uses to build its list.
    ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
    |sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:      60973s    1016.22m    16.94h    0.71d  0.002 y
# IO & Wait Time:                 21292s     354.86m     5.91h    0.25d  0.001 y
# Average job time:                  11s       0.18m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              50s       0.83m     0.01h    0.00d
# Submission to last job:           570s       9.50m     0.16h    0.01d

# Load into database.  
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastp/ce2/run/out
    hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 25706 rows

# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeMm6.doc for procedure
#	the directory: /cluster/bluearc/scratch/mus/mm6/blastp should have data

# Make parasol run directory (mm6/run, with out/ for the results)
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir mm6 mm6/run mm6/run/out
cd mm6/run

# Make blast script
cat > blastSome << '_EOF_'
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat > gsub << '_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	a simple ls would make the command line too long,
#	so the file list is produced with echo instead
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...

# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:      65337s    1088.95m    18.15h    0.76d  0.002 y
# IO & Wait Time:                 20794s     346.56m     5.78h    0.24d  0.001 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              80s       1.33m     0.02h    0.00d
# Submission to last job:           598s       9.97m     0.17h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7739 files 
#	row count changed to 32880  

# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeRn3.doc for procedure.
#	Files were put in this directory: /cluster/bluearc/rn3/blastp/

# Make parasol run directory (rn3/run, with out/ for the results)
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir rn3 rn3/run rn3/run/out
cd rn3/run

# Make blast script
cat > blastSome << '_EOF_'
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat > gsub << '_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	a simple ls would make the command line too long,
#	so the file list is produced with echo instead
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:      28325s     472.08m     7.87h    0.33d  0.001 y
# IO & Wait Time:                 20416s     340.27m     5.67h    0.24d  0.001 y
# Average job time:                   6s       0.10m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              24s       0.40m     0.01h    0.00d
# Submission to last job:           617s      10.28m     0.17h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 24140 rows

# ZEBRAFISH BLASTP FOR GENE SORTER 
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh kkstore
    mkdir /cluster/data/danRer2/bed/blastp
    cd /cluster/data/danRer2/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz 
    # Uncompress the peptides to ensembl.faa and format them as a blast
    # database named "ensembl".
    zcat Dan*.pep.fa.gz > ensembl.faa
    /scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    # NOTE: the if/then/endif below is csh syntax, so it has to be typed in a
    # csh/tcsh session.  It removes any existing copy before the fresh one is
    # installed.
    if (-e /iscratch/i/danRer2/blastp) then
      rm -r /iscratch/i/danRer2/blastp
    endif
    mkdir -p /iscratch/i/danRer2/blastp
    cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out
    cd /cluster/data/hg17/bed/blastp/danRer2/run
    # Make blast script
    # (\$ keeps the unquoted here-document from expanding $1 and $2)
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    # The sed prepends, to each "kg" file name, the path of the split
    # directory relative to this run directory.
    ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
    |sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:     113595s    1893.26m    31.55h    1.31d  0.004 y
# IO & Wait Time:                 26231s     437.18m     7.29h    0.30d  0.001 y
# Average job time:                  18s       0.30m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              99s       1.65m     0.03h    0.00d
# Submission to last job:           445s       7.42m     0.12h    0.01d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastp/danRer2/run/out
    hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 30731 rows


# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory (sc1/run, with out/ for the results)
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir sc1 sc1/run sc1/run/out
cd sc1/run

# Make blast script
cat > blastSome << '_EOF_'
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat > gsub << '_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch from the list of split fasta files
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ... 
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:      18630s     310.50m     5.17h    0.22d  0.001 y
# IO & Wait Time:                 20776s     346.27m     5.77h    0.24d  0.001 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              15s       0.25m     0.00h    0.00d
# Submission to last job:           295s       4.92m     0.08h    0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 16540 rows

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
#    rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp

# Make parasol run directory (dm1/run, with out/ for the results)
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir dm1 dm1/run dm1/run/out
cd dm1/run

# Make blast script
cat > blastSome << '_EOF_'
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat > gsub << '_EOF_'
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch from the list of split fasta files
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs:      73518s    1225.30m    20.42h    0.85d  0.002 y
# IO & Wait Time:                 45038s     750.63m    12.51h    0.52d  0.001 y
# Average job time:                  15s       0.26m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              69s       1.15m     0.02h    0.00d
# Submission to last job:           762s      12.70m     0.21h    0.01d 

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 27212 rows

# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851

# The new KG process no longer needs entries in knownGeneLink (used to store
# info for DNA based RefSeqs).  So clean out the old data in knownGeneLink.

hgsql hg17 -e "delete from knownGeneLink"

#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)

# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk

    mkdir /cluster/store10/superfamily/050524
    ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
    cd /cluster/data/superfamily/050524

# ftp over the following two files:
#	ass_22-May-2005.tab.gz
#	supfam_22-May-2005.sql.gz
# (listed as comments so they are not mistaken for commands)
    gzip -d *.gz

# Load the Superfamily database.  This may take about an hour.
# It is run in the foreground so that the load has finished before the index
# on the des table (expected to come from this dump) is created below.
    hgsql hg17 -e "create database superfam050524"
    nice hgsql superfam050524 < supfam_22-May-2005.sql

# Make sure to add an index on id of the des table of superfam050524.
    hgsql superfam050524 -e "create index id on des(id);"

# Create sfAssign from its schema file and load the assignment file.  The
# load statement is kept on a single line so the quoted string has no raw
# line break in it.
    hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
    hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table superfam050524.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB

   hgsql hg17 < ~/src/hg/lib/sfAssign.sql

   cd /cluster/data/superfamily/050524  
   hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'

# If hg17.sfDes already exists, drop it.

   hgsql superfam050524 -N -e "select * from des" >sfDes.tab
   hgsql hg17 < ~/src/hg/lib/sfDes.sql
   hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'

# If hg17.superfamily already exists, drop it.
   cd /cluster/data/hg17/bed
   # Create the dated directory here in bed/: the sf link made on the next
   # line is relative, so it only resolves if its target is in this directory.
   mkdir /cluster/data/hg17/bed/sf.2004-1128
   ln -s sf.2004-1128 sf
   # hgSuperfam is run from bed/; its log is written to sf.log here.
   hgSuperfam hg17 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If hg17.sfDescription exists, drop it.

   hgsql hg17 < ~/src/hg/lib/sfDescription.sql
   hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table hg17.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed hg17 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
# NOTE(review): the command below still passes "hs", not "ht" -- confirm which
# value was actually used for this release.
   
   # The assignment file is fed to hgKnownToSuper on stdin.
   cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
   | hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper

# Build tables needed by pbGlobal in proteins050415

   cd /cluster/data/superfamily/050524  
   hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'

   hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'

   cd /cluster/store10/kg/kgHg17F
   hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'

# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
# CCDS <-> knownGene mapping need to be updated (markd 2005-05-29)
# this should be part of the known gene build
    /cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap

# AUGUSTUS GENES (DONE 6/1/2005 Andy)
# Download the predictions, run them through the cleanAugustus.awk script
# copied from the dm2 build, and load the cleaned GTF as the augustus table.
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir augustus
    cd /cluster/data/hg17/bed/augustus
    wget http://augustus.gobics.de/predictions/hg17/hg17.allchr.augustus.gtf.gz
    cp /cluster/data/dm2/bed/augustus/cleanAugustus.awk .
    zcat hg17.allchr.augustus.gtf.gz \
        | awk -f cleanAugustus.awk \
        | gzip > hg17.allchr.augustus.clean.gtf.gz
    ldHgGene -gtf hg17 augustus hg17.allchr.augustus.clean.gtf.gz
    # only the cleaned copy is kept
    rm hg17.allchr.augustus.gtf.gz

# MAKE Mouse Proteins track (DONE for chr13 braney ~5/25/05)
    ssh kkstore01
    mkdir -p /cluster/data/hg17/blastDb
    cd /cluster/data/hg17/blastDb
    awk "{print \$2}" ../*/chr*/*.lft > subChr.lst
    for i in `cat subChr.lst`
    do
	ln -s ../*/chr*/$i.fa 
	echo formatdb -i $i.fa -p F
	formatdb -i $i.fa -p F
    done
    rm *.log *.fa list
    cd ..
    for i in `cat chrom.lst`; do cat  $i/chr*/*.lft  ; done > jkStuff/subChr.lft

    ssh kkr1u00
    rm -rf /iscratch/i/hg17/blastDb
    mkdir -p /iscratch/i/hg17/blastDb
    cd /cluster/data/hg17/blastDb
    for i in nhr nin nsq; do cp *.$i /iscratch/i/hg17/blastDb     ; echo $i; done

    cd
    iSync > sync.out

    mkdir -p /cluster/data/hg17/bed/tblastn.mm6KG
    cd /cluster/data/hg17/bed/tblastn.mm6KG
    echo  /panasas/store/hg17/blastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > query.lst    
    # back to kkstore01
    exit

    cd /cluster/data/hg17/bed/tblastn.mm6KG
    rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa
    mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa
    split -l 560 /cluster/data/mm6/bed/blat.mm6KG/mm6KG.psl /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa/kg
    ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
    mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
    ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    # blastSome: per-job cluster script, invoked through the blastGsub template as
    #     blastSome <blastDb> <queryFa> <outPsl>
    # It runs tblastn of the query proteins ($2) against one blast database ($1),
    # retrying with progressively stricter e-values until blastall exits
    # successfully, converts the report to PSL with blastToPsl, and lifts the
    # result three times (subLiftAll.lft, liftAll.lft, then the protein.lft
    # query-side lift) before validating it with pslCheck.
    # Scratch files live in /tmp as <basename outPsl>.<basename queryFa>.N.
    # The job exits 0 only when pslCheck passes and the output has been moved
    # into place; every other path removes the scratch files and exits 1.
    # (Previously "exit 0" followed the pslCheck test unconditionally, so a
    # failed check reported success and left its scratch files behind.)
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subLiftAll.lft carry $f.2
	liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
	liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/mm6/bed/blat.mm6KG/protein.lft warn $f.4

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3  $f.4
            exit 0
        fi
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/hg17/bed/tblastn.mm6KG
    para create blastSpec
    para push
# Completed: 214524 of 214524 jobs
# CPU time in finished jobs:   44907411s  748456.85m 12474.28h  519.76d  1.424 y
# IO & Wait Time:                712709s   11878.48m   197.97h    8.25d  0.023 y
# Average job time:                 213s       3.54m     0.06h    0.00d
# Longest finished job:            1363s      22.72m     0.38h    0.02d
# Submission to last job:         75910s    1265.17m    21.09h    0.88d

# just for chr13
# Completed: 55290 of 55290 jobs
# CPU time in finished jobs:    1487547s   24792.46m   413.21h   17.22d  0.047 y
# IO & Wait Time:                148854s    2480.89m    41.35h    1.72d  0.005 y
# Average job time:                  30s       0.49m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              98s       1.63m     0.03h    0.00d
# Submission to last job:          3904s      65.07m     1.08h    0.05d


    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    ssh kki
    cd /cluster/data/hg17/bed/tblastn.mm6KG
    tcsh
    cat << '_EOF_' > chainOne
(cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl)
'_EOF_'
    chmod +x chainOne

    for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec

    para create chainSpec
    para push
# CPU time in finished jobs:         90s       1.50m     0.03h    0.00d  0.000 y
# IO & Wait Time:                 19151s     319.18m     5.32h    0.22d  0.001 y
# Average job time:                   3s       0.04m     0.00h    0.00d
# Longest finished job:               5s       0.08m     0.00h    0.00d
# Submission to last job:          1642s      27.37m     0.46h    0.02d

# Completed: 7695 of 7695 jobs
# CPU time in finished jobs:         48s       0.80m     0.01h    0.00d  0.000 y
# IO & Wait Time:                 18931s     315.51m     5.26h    0.22d  0.001 y
# Average job time:                   2s       0.04m     0.00h    0.00d
# Longest finished job:               6s       0.10m     0.00h    0.00d
# Submission to last job:          1618s      26.97m     0.45h    0.02d

    exit
    # back to kkstore01
    cd /cluster/data/hg17/bed/tblastn.mm6KG/blastOut
    # Filter the chained alignments one query chunk (kg??) at a time.
    # Field numbers below follow the PSL column layout: $1=matches,
    # $11=qSize, $12=qStart, $13=qEnd, $14=tName, $17=tEnd.
    for i in kg??
    do 
	# c60: keep chains whose aligned query span (qEnd - qStart) covers
	# more than 60% of the query length
	cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
	# u: c60 sorted by descending leading field (match count) and passed
	# through pslUniq -- presumably keeps the first (best) alignment per
	# query; confirm against pslUniq itself
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	# m60: every c60 chain whose matches/qSize ratio exceeds 90%
	awk "((\$1 / \$11) ) > 0.90 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    # Merge both sets, order by target name and then target end, and drop
    # identical lines.  NOTE(review): the -k 17,17n key is given twice; the
    # repeat has no effect on the ordering.
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/hg17/bed/tblastn.mm6KG/blastMm6KG.psl
    cd ..
    ssh hgwdev
    cd /cluster/data/hg17/bed/tblastn.mm6KG
    # Load the merged alignments written to blastMm6KG.psl by the filtering
    # step in this directory.  hgLoadPsl derives the table name from the file
    # name, so the table loaded is blastMm6KG.
    hgLoadPsl hg17 blastMm6KG.psl
    # 1425966 bases of 64944656 (2.196%) 

    # back to kkstore01
    rm -rf blastOut
# End tblastn of mouse proteins


####################################################################################
# RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (DONE 6/5/05 Fan)

# Start from the step that gene-check is run and kgCandidate0.gp is produced.

   cd
   cd /cluster/store10/kg/kgHg17F

   mkdir try3
   cd try3
   # Drop the previous kgCandidate0 table (once), recreate it from its schema,
   # and reload it from the gene-check candidate file in the parent directory.
   hgsql kgHg17FTempTry3 -e 'drop table kgCandidate0'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate0.sql 
   hgsql kgHg17FTempTry3 -e  'load data local infile "../kgCandidate0.gp" into table kgCandidate0'

   hgsql kgHg17FTempTry3 -e 'drop table geneCheck'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/geneCheck.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "../kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgHg17FTempTry3 hg17 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgHg17FTempTry3 -e  'drop table kgCandidate'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgHg17FTempTry3 -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab

   hgsql kgHg17FTempTry3 -e  'drop table kgCandidateX'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

#   kgResultBestMrna2 050415 kgHg17FTempTry3 hg17|sort -u >protMrnaBlatScore.tab
#   kgResultBestRef2  050415 kgHg17FTempTry3 hg17|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgHg17FTempTry3 -e 'drop table protMrnaScore'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgHg17FTempTry3 -e 'load data local infile "../protMrnaScore.tab" into table protMrnaScore'
   hgsql kgHg17FTempTry3 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'


# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgHg17FTempTry3 kgCandidateX jY.tmp1
   cat jY.tmp1 |sort -u >kgCandidateY.tab
   rm jY.tmp1
   hgsql kgHg17FTempTry3 -e  'drop table kgCandidateY'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgHg17FTempTry3 kgCandidateZ.tab
   hgsql kgHg17FTempTry3 -e  'drop table kgCandidateZ'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgHg17FTempTry3 -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.

   kgPick kgHg17FTempTry3 hg17 proteins050415 kg3Try3.tmp dupSpMrna.tmp

   cat kg3Try3.tmp | grep NM_ > jNM
   cat kg3Try3.tmp | grep -v NM_ >jnoNM
   cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1
   cut -f 2-12  jnoNM >jnoNM2
   paste jnoNM1 jnoNM2 > kg3Try3B.tmp
   cat jNM >> kg3Try3B.tmp

   sort -u dupSpMrna.tmp >dupSpMrna.tab

   hgsql hg17 -e 'drop table dupSpMrna'
   hgsql hg17 < ~/src/hg/lib/dupSpMrna.sql
   hgsql hg17 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'


# Add entries in the put back list
# Obtain from Mark the put back list, kgPutBack.lis, for human RefSeq.

   hgsql kgHg17FTempTry3 -e 'drop table kgPutBack'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgPutBack.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "kgPutBack.lis" into table kgPutBack'

   kgPutBack kgHg17FTempTry3 hg17 proteins050415 kgPutBack kgPutBack.gp

# Sort KG genes to make the kg3Try3.gp table file.

   cat kg3Try3B.tmp kgPutBack.gp >kg3Try3C.tmp
   ~/kent/src/hg/protein/sortKg.pl kg3Try3C.tmp >kg3Try3.gp

# Manually edit to correct one line problem of O75438_BC009691

   hgsql kgHg17FTempTry3 -e  'drop table knownGene'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/knownGene.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "kg3Try3.gp" into table knownGene'

   hgsql hg17 -e  'drop table kg3Try3'
   hgsql hg17 < ~/src/hg/lib/kg3Try3.sql
   hgsql hg17 -e  'load data local infile "kg3Try3.gp" into table kg3Try3'

# Perform analysis before renaming the kg3Try3 table to knownGene.

# Load data into hg17 knownGene table.
   hgsql hg17 -e  'drop table knownGene'
   hgsql hg17 < ~/src/hg/lib/knownGene.sql
   hgsql hg17 -e  'load data local infile "kg3Try3.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.
   hgsql kgHg17FTempTry3 -e  'drop table mrnaSeq'
   hgsql kgHg17FTempTry3 < ~/src/hg/lib/mrnaSeq.sql
   hgsql kgHg17FTempTry3 -e  'load data local infile "../mrnaSeq.tab" into table mrnaSeq'

   kgPepMrna kgHg17FTempTry3 hg17 050415

   hgsql hg17 -e  'drop table knownGeneMrna'
   hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql hg17 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql hg17 -e  'drop table knownGenePep'
   hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
   hgsql hg17 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build spMrna table

   hgsql hg17 -N -e 'select proteinID, name from knownGene' |sort -u >kgSpMrna.tab

   hgsql hg17 -e  'drop table spMrna'
   hgsql hg17 <~/src/hg/lib/spMrna.sql
   hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build kgXref table

   kgXref2 kgHg17FTempTry3 050415 hg17

   hgsql hg17 -e  'drop table kgXref'
   hgsql hg17 < ~/src/hg/lib/kgXref.sql
   hgsql hg17 -e  'load data local infile "kgXref.tab" into table kgXref'
 
# MAKE FOLDUTR TABLES 
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir rnaStruct.2005-06-05
    rm rnaStruct
    ln -s rnaStruct.2005-06-05 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa hg17 knownGene utr3 utr3/utr.fa
    utrFa hg17 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/hg17/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 35774 of 35774 jobs
# CPU time in finished jobs:    1174534s   19575.57m   326.26h   13.59d  0.037 y
# IO & Wait Time:                 98071s    1634.51m    27.24h    1.14d  0.003 y
# Average job time:                  36s       0.59m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5409s      90.15m     1.50h    0.06d
# Submission to last job:          6712s     111.87m     1.86h    0.08d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 33765 of 33765 jobs
# CPU time in finished jobs:     341000s    5683.33m    94.72h    3.95d  0.011 y
# IO & Wait Time:                106605s    1776.75m    29.61h    1.23d  0.003 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           30479s     507.98m     8.47h    0.35d
# Submission to last job:         30622s     510.37m     8.51h    0.35d 

# Load database
    ssh hgwdev
    cd /cluster/data/hg17/bed/rnaStruct/utr5
    hgLoadRnaFold hg17 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold hg17 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build kgProtMap table

# move all files under hg17Kg to old and copy try3/kgXref.tab up.

# Note: it is important that tight_mrna.psl is here!
#       -p preserves the mode and timestamps of the saved copy.  The option
#       is placed before the operands so the command does not rely on
#       GNU-style argument permutation.
    cp -p old/tight_mrna.psl .

    ~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415

# Update and clean up kgResultBestMrna2.c and then check it in.

# Build alias tables

#	kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
#	proteins050415.hugo.withdraws, hg17.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   kgAliasM hg17 proteins050415

#	kgAliasKgXref reads from hg17.knownGene.proteinID,
#	hg17.knownGene.name, hg17.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref hg17


#	kgAliasRefseq reads from hg17.knownGene.name,
#	hg17.knownGene.proteinID, hg17.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq hg17

   hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" hg17 
   hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from hg17.knownGene.name,
#	hg17.knownGene.proteinID, hg17.knownGene.alignID,
#	proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
#	to create kgProtAlias.tab
#

   kgProtAlias hg17 050415

   hgsql hg17 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql hg17 -N -e \
   'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql hg17 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too

   hgsql hg17 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql hg17 -e "drop table kgProtAlias;"
    hgsql hg17 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table

    hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql hg17 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
    rm j.tmp

    hgsql hg17 -e 'drop table kgSpAlias';
    hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
    hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
    

# Build KEGG pathway tables

   ssh hgwdev
   cd /cluster/store10/kg/kgHg17F
   # Use plain mkdir: "md" is not a standard command and only works in a
   # shell where it happens to be aliased.
   mkdir kegg
   cd kegg

   # Run from inside kegg/; the keggMapDesc.tab and keggPathway.tab files
   # loaded by the following hgsql commands are read from this directory.
   ~/src/hg/protein/KGpath.sh kgHg17F hg17 050415

   hgsql hg17 -e "drop table keggMapDesc"
   hgsql hg17 -e "drop table keggPathway"
   hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
   hgsql hg17 <~/src/hg/lib/keggPathway.sql
   hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables

   cd ..
   ~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415
   hgsql hg17 -e "drop table cgapAlias"
   hgsql hg17 -e "drop table cgapBiocDesc"
   hgsql hg17 -e "drop table cgapBiocPathway"
   hgsql hg17 <~/src/hg/lib/cgapAlias.sql
   hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
   hgsql hg17 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
   hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
   hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'

# Build BioCyc pathway tables

# Download BioCyc DB, create and load bioCyc DB
# See makeBioCycDB.doc for details.
   
   hgsql hg17 -e "drop table bioCycMapDesc"
   hgsql hg17 <~/src/hg/lib/bioCycMapDesc.sql
   hgsql hg17 -e 'load data local infile "bioCycMapDesc.tab" into table bioCycMapDesc'

   kgBioCyc |sort -u > bioCycPathway.tab

   hgsql hg17 -e "drop table bioCycPathway"
   hgsql hg17 <~/src/hg/lib/bioCycPathway.sql
   hgsql hg17 -e 'load data local infile "bioCycPathway.tab" into table bioCycPathway'

# CCDS <-> knownGene mapping needs to be updated (Fan redone 2005-06-05)
# this should be part of the known gene build
    /cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap


### HG17 PROTEOME BROWSER TABLES RE-BUILD ####  (DONE - 2005-06-05 - Fan)
# These are instructions for rebuilding tables 
# needed for the Proteome Browser.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This update is based on proteins DBs dated 050415.

# Create the working directory

    ssh hgwdev
    mkdir /cluster/store10/kg/kgHg17F/pb-2005-06-05
    cd /cluster/data/hg17/bed
    rm pb
    ln -s /cluster/store10/kg/kgHg17F/pb-2005-06-05 pb
    cd pb

# Move the existing PB tables by:

hgsql hg17
create database hg17Sav4;

alter table hg17.pepCCntDist rename as hg17Sav4.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav4.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav4.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav4.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav4.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav4.pepMwAa;
alter table hg17.pepPi rename as hg17Sav4.pepPi;
alter table hg17.pepPiDist rename as hg17Sav4.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav4.pepResDist;

alter table hg17.pbAnomLimit rename as hg17Sav4.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav4.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav4.pbStamp;

quit

# Define pep* tables in hg17 DB

	cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

#  First edit out pepPred table definition, then

	hgsql hg17 < pepAll.sql

# Build the pepMwAa table

  hgsql proteins050415 -N -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql hg17 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'

# Build the pepPi table

    # Collect the accessions of all human (taxon 9606) proteins.  -N suppresses
    # the column-name header so that the literal word "acc" does not end up in
    # protAcc.lis as a bogus accession.
    hgsql proteins050415 -N -e \
    "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis

    # Append the knownGene protein IDs that contain a dash.
    hgsql hg17 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis

    # Compute pI values for the listed accessions, then replace the contents
    # of hg17.pepPi with the new results.
    pbCalPi protAcc.lis sp050415 pepPi.tab
    hgsql hg17 -e 'delete from pepPi'
    hgsql hg17 -e 'load data local infile "pepPi.tab" into table hg17.pepPi'

# Calculate and load pep distributions

    pbCalDist sp050415 proteins050415 9606 hg17 >pbCalDist.out
    wc  pbCalDist.out

    hgsql hg17
    load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
    load data local infile "pepResDist.tab" into table hg17.pepResDist;
    load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp050415 9606 hg17

# Create pbAnomLimit and pbResAvgStd tables

   hgsql hg17 -e "drop table pbAnomLimit"
   hgsql hg17 -e "drop table pbResAvgStd"
   hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
   hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
   hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'

# Create pbStamp table for PB
  hgsql hg17 -e "drop table pbStamp"
  hgsql hg17 < ~/src/hg/lib/pbStamp.sql
  hgsql hg17Sav4 -N -e 'select * from pbStamp' > pbStamp.tab
  hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp'

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for hg17, then
# notify QA for formal review.


# RE-BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2005-06-04 - Fan)
#	This should be done after KG tables are complete from known genes build
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2005-06-04
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2005-06-04 /cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	This command is in /projects/compbio/bin/$MACH/formatdb

# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* /cluster/bluearc/hg17/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod +x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:     142764s    2379.39m    39.66h    1.65d  0.005 y
# IO & Wait Time:                 67623s    1127.06m    18.78h    0.78d  0.002 y
# Average job time:                  27s       0.45m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             144s       2.40m     0.04h    0.00d
# Submission to last job:           392s       6.53m     0.11h    0.00d

# Load into database.  This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7735 files
# Loading database with 9757382 rows
# 255.200u 50.520s 25:19.66 20.1% 0+0k 0+0io 247pf+0w

cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene
#	hgsql -e "select count(*) from knownToRefSeq;" hg17
#	row count changed 34667

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 > refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#	hgsql -e "select count(*) from knownToLocusLink;" hg17
#	row count changed to 34773

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
#	hgsql -e "select count(*) from knownToPfam;" hg17
#	row count changed to 29171 

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
#	hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
#	row count changed to 32458

# Create expression distance table - takes about an hour
    hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
    	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32458 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
#	hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
#	row count changed to 32381000

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
#	hgsql -e "select count(*) from knownToU133;" hg17
#	row count changed to 32965

# Create expression distance table.  This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32965 unique elements in affyUclaNorm

# Create table that maps between known genes and 
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
#	row count changed to 17555
#	the hgFixed.gnfHumanU95Exps argument is unused, so that table need not exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
	hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16501 unique elements in hgFixed.gnfHumanU95MedianRatio
#	row count changed to 16450000  

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes only 10 minutes.)

hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8827 unique elements in hgFixed.gnfHumanAtlas2MedianRatio

cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
#	row count changed to 35139  

#### UPDATE GO DATABASE (THIS PART WAS DONE 5/21/05 Fan)

# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20050521
cd /cluster/store1/geneOntology/20050521

wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz

hgsql mysql <<end
create database go050521;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go050521 <j.tmp
rm j.tmp

wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz

zcat gene_association.goa_uniprot.gz | hgGoAssociation go050521 goaPart stdin 
# Passed 5589891 of 6584507 of 6584507, 84.89%

# Ask sys-admin to switch the database pointer go to point to go050521.

cd /cluster/data/hg17/bed/geneSorter

# Rebuilt Ensembl Gene tables.  See documentation (5/23/05 Fan) above.

# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# hgsql hg17 -e "select count(*) from knownToEnsembl" 
#	table row count 35521

# Make knownToCdsSnp table 
  ssh hgwdev
  nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# hgsql hg17 -e "select count(*) from knownToCdsSnp" 
# row count 94633
# approx. 5 minutes running time

# C.ELEGANS BLASTP FOR GENE SORTER 
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:

# The following section is done during mm6 build already.
#    ssh eieio
#    mkdir /cluster/data/ce2/bed/blastp
#    cd /cluster/data/ce2/bed/blastp
#    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
#    # to find out the latest version.  Then use that in place of 142 below.
#    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
#    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
#    ssh kkr1u00
#    if (-e /iscratch/i/ce2/blastp) then
#      rm -r /iscratch/i/ce2/blastp
#    endif
#    mkdir -p /iscratch/i/ce2/blastp
#    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
#    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastp/ce2/run/out
    cd /cluster/data/hg17/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
    |sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...  

# Initial run has 13 jobs crashed.
# crashed: 13
# ranOk: 7722

# para problems show the following typical message:

# total jobs in batch: 7735
# job: blastSome ../../../geneSorter/blastp/split/kg5911.fa out/kg5911.tab
# id: 209522384
# failure type: crash
# host: kkr2u28.kilokluster.ucsc.edu
# start time: Sat Jun  4 11:45:51 2005
# return: 0
# stderr:
# [blastall] FATAL ERROR: blast: Unable to open input file ../../../geneSorter/blastp/split/kg5911.fa

# para push again and these 13 ran fine.
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:      60319s    1005.32m    16.76h    0.70d  0.002 y
# IO & Wait Time:                 31239s     520.65m     8.68h    0.36d  0.001 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              72s       1.20m     0.02h    0.00d
# Submission to last job:           199s       3.32m     0.06h    0.00d

# Load into database.  
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastp/ce2/run/out
    hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 25574 rows

# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already
#	This already exists.  See makeMm6.doc for procedure

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:      85769s    1429.49m    23.82h    0.99d  0.003 y
# IO & Wait Time:                 20587s     343.11m     5.72h    0.24d  0.001 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              78s       1.30m     0.02h    0.00d
# Submission to last job:           206s       3.43m     0.06h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 32951 rows

# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeRn3.doc for procedure.
#	Files were put in this directory: /cluster/bluearc/rn3/blastp/

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc 
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:      27804s     463.40m     7.72h    0.32d  0.001 y
# IO & Wait Time:                 30334s     505.56m     8.43h    0.35d  0.001 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              26s       0.43m     0.01h    0.00d
# Submission to last job:           119s       1.98m     0.03h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 24030 rows

# ZEBRAFISH BLASTP FOR GENE SORTER 
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh kkstore
    mkdir /cluster/data/danRer2/bed/blastp
    cd /cluster/data/danRer2/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz 
    zcat Dan*.pep.fa.gz > ensembl.faa
    /scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/danRer2/blastp) then
      rm -r /iscratch/i/danRer2/blastp
    endif
    mkdir -p /iscratch/i/danRer2/blastp
    cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out
    cd /cluster/data/hg17/bed/blastp/danRer2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
    |sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:     111467s    1857.78m    30.96h    1.29d  0.004 y
# IO & Wait Time:                 21159s     352.65m     5.88h    0.24d  0.001 y
# Average job time:                  17s       0.29m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              95s       1.58m     0.03h    0.00d
# Submission to last job:           223s       3.72m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastp/danRer2/run/out
    hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 30651 rows

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:      18194s     303.23m     5.05h    0.21d  0.001 y
# IO & Wait Time:                 24452s     407.53m     6.79h    0.28d  0.001 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              16s       0.27m     0.00h    0.00d
# Submission to last job:           120s       2.00m     0.03h    0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 16395 rows 

# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make FlyBase protein database and copy it to /cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
#    rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs:      72141s    1202.35m    20.04h    0.83d  0.002 y
# IO & Wait Time:                 41717s     695.28m    11.59h    0.48d  0.001 y
# Average job time:                  15s       0.25m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              57s       0.95m     0.02h    0.00d
# Submission to last job:           204s       3.40m     0.06h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 27109 rows

# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851

# The new KG process no longer needs entries in knownGeneLink (used to store
# info for DNA-based RefSeqs), so clean out the old data in knownGeneLink.

hgsql hg17 -e "delete from knownGeneLink"

#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)

# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk

    mkdir /cluster/store10/superfamily/050524
    ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
    cd /cluster/data/superfamily/050524

# ftp over the following two files:
ass_22-May-2005.tab.gz
supfam_22-May-2005.sql.gz
    gzip -d *.gz

# Load the Superfamily database
    hgsql hg17 -e "create database superfam050524"
    nice hgsql superfam050524 < supfam_22-May-2005.sql &

# This may take about an hour.

# Make sure to add an index on id of the des table of superfam050524.
    hgsql superfam050524 -e "create index id on des(id);"

    hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
    # Populate sfAssign from the downloaded assignment file.  The statement is
    # kept on a single line: tcsh does not accept a bare newline inside quotes.
    hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table superfam050524.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB

   hgsql hg17 < ~/src/hg/lib/sfAssign.sql

   cd /cluster/data/superfamily/050524  
   hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'

# If hg17.sfDes already exists, drop it.

   hgsql superfam050524 -N -e "select * from des" >sfDes.tab
   hgsql hg17 < ~/src/hg/lib/sfDes.sql
   hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'

# If hg17.superfamily already exists, drop it.
   cd /cluster/data/hg17/bed
   mkdir /cluster/data/hg17/sf.2004-1128
   ln -s sf.2004-1128 sf
   hgSuperfam hg17 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If hg17.sfDescription exists, drop it.

   hgsql hg17 < ~/src/hg/lib/sfDescription.sql
   hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table hg17.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed hg17 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
   
   cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
   | hgKnownToSuper hg17 hs stdin
# created 32906 rows in knownToSuper

# Build tables needed by pbGlobal in proteins050415

   cd /cluster/data/superfamily/050524  
   hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'

   hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'

   cd /cluster/store10/kg/kgHg17F
   hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'

# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
# CCDS <-> knownGene mapping need to be updated (markd 2005-05-29)
# this should be part of the known gene build
    /cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap

# Build targetScanS track - (DONE - 2005-06-22 Fan)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    ssh hgwdev 
    mkdir -p /cluster/data/hg17/bed/targetScanS
    cd /cluster/data/hg17/bed/targetScanS
    wget --timestamp http://genes.mit.edu/targetscan/tracks/targetscan.bed

# Remove the first description line of targetscan.bed

    hgLoadBed -tab hg17 targetScanS targetscan.bed

# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg17

# Update mrnaRefseq table (DONE - Fan 6/22/05)
# The old table contains non-human mrna/RefSeqs.
# The new table contains only human mrna/RefSeq and RefSeq/RefSeq.

# First build entrez DB tables, see makeMm6.doc for details.

    hgsql entrez -N -e \
    'select mrna, refseq from entrezRefseq, entrezMrna, hg17.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \
    >mrnaRefseq1.tab

# Include RefSeq as valid mRNA too.
    hgsql hg17 -N -e 'select name, name from refGene' >mrnaRefseq2.tab

    cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab

    hgsql hg17 -e 'drop table mrnaRefseq'
    hgsql hg17 < ~/src/hg/lib/mrnaRefseq.sql
    hgsql hg17 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# BUILD KNOWN GENE LIST FOR GOOGLE.  DONE 6/27/05 Fan.
# make knownGeneLists.html hg17GeneList.html mm5GeneList.html rn3GeneList.html

    cd /cluster/data/hg17/bed
    rm -rf knownGeneList/hg17

# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg17

    hgKnownGeneList hg17

# copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/hg17
    mkdir -p /usr/local/apache/htdocs/knownGeneList/hg17
    cp -Rfp knownGeneList/hg17/* /usr/local/apache/htdocs/knownGeneList/hg17

####  Blat knownGene proteins to determine exons (DONE braney 06-30-05)
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir blat.hg17KG.2005-06-17
    rm blat.hg17KG
    ln -s  blat.hg17KG.2005-06-17 blat.hg17KG
    cd blat.hg17KG
    pepPredToFa hg17 knownGenePep known.fa
    hgPepPred hg17 generic blastKGPep02 known.fa
    grep ">" known.fa | sed "s/>//" > kgName.lst
    kgName hg17 kgName.lst blastKGRef02
    hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastKGRef02" | hgsql hg17
    echo "load data local infile 'blastKGRef02' into table blastKGRef02" | hgsql hg17
    ssh kk
    cd /cluster/data/hg17/bed/blat.hg17KG
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 3020 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 human.lst kg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../human.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push
# Completed: 134320 of 134320 jobs
# CPU time in finished jobs:   22196680s  369944.67m  6165.74h  256.91d  0.704 y
# IO & Wait Time:               1712586s   28543.10m   475.72h   19.82d  0.054 y
# Average job time:                 178s       2.97m     0.05h    0.00d
# Longest finished job:            7691s     128.18m     2.14h    0.09d
# Submission to last job:        608750s   10145.83m   169.10h    7.05d

# Completed: 133676 of 133676 jobs
# CPU time in finished jobs:   29661130s  494352.16m  8239.20h  343.30d  0.941 y
# IO & Wait Time:               2181179s   36352.99m   605.88h   25.25d  0.069 y
# Average job time:                 238s       3.97m     0.07h    0.02d
# Longest job:                   105972s    1766.20m    29.44h    1.23d

    ssh eieio
    cd /cluster/data/hg17/bed/blat.hg17KG
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    pslUniq cooked.psl hg17KG.psl
    pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft

    ssh hgwdev
    cd /cluster/data/hg17/bed/blat.hg17KG
    kgName hg17 hg17KG.psl blastKGRef02
    cut -f 10 hg17KG.psl > kgName.lst
    faSomeRecords known.fa kgName.lst hg17KG.fa
    hgPepPred hg17 generic blastKGPep02 hg17KG.fa
#end blat proteins
    
# MAKE Drosophila Proteins track (DONE 07-05-05 braney)
    ssh kk
    mkdir -p /cluster/data/hg17/bed/tblastn.dm2FB
    cd /cluster/data/hg17/bed/tblastn.dm2FB
    echo  /panasas/store/hg17/blastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > target.lst    
    mkdir fbfa
    # calculate a reasonable number of jobs 
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(264630/`wc target.lst| awk "{print \\\$1}"`\)
# 18929/(350000/5959) = 322.279746
    split -l 322 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir -p /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
# usage: blastSome <blast db> <query fasta> <output psl>
# Runs tblastn for one (db, query) pair, converts the report to PSL, lifts it
# through three lift files and writes the result to <output psl>.
# Intermediate files live in /tmp as $f.1 .. $f.4 and $f.8; all of them are
# removed on every exit path.  Exits 0 on success, 1 on failure.
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
# Try successively smaller e-value cutoffs; keep the first run that blastall
# reports as successful.
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        # three liftUp passes: subLiftAll.lft, liftAll.lft, then the dm2
        # protein.lft applied to the query side (-pslQ)
        liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/hg17/jkStuff/subLiftAll.lft warn $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/hg17/jkStuff/liftAll.lft warn $f.3
        liftUp -isPtoG -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
        # write via $3.tmp and rename so the output only appears once complete
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.4 $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/hg17/bed/tblastn.dm2FB
    para create blastSpec
    para push

# Completed: 351581 of 351581 jobs
# CPU time in finished jobs:   30733031s  512217.19m  8536.95h  355.71d  0.975 y
# IO & Wait Time:               1035790s   17263.16m   287.72h   11.99d  0.033 y
# Average job time:                  90s       1.51m     0.03h    0.00d
# Longest finished job:             816s      13.60m     0.23h    0.01d
# Submission to last job:        135367s    2256.12m    37.60h    1.57d

    ssh kki
    cd /cluster/data/hg17/bed/tblastn.dm2FB
    tcsh
    
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat $2.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push
 
# Completed: 2714 of 2714 jobs
# CPU time in finished jobs:     222508s    3708.46m    61.81h    2.58d  0.007 y
# IO & Wait Time:                 10577s     176.29m     2.94h    0.12d  0.000 y
# Average job time:                  86s       1.43m     0.02h    0.00d
# Longest finished job:            9787s     163.12m     2.72h    0.11d

    cd /cluster/data/hg17/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.*.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/hg17/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/hg17/bed/tblastn.dm2FB
    hgLoadPsl hg17 blastDm2FB.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn

# Build kgReactome table for KG to Reactome xref.  Done 6/28/05 Fan.

    ssh hgwdev
    mkdir -p /cluster/store10/reactome/reactome050613
    rm    /cluster/data/reactome
    ln -s /cluster/store10/reactome/reactome050613 /cluster/data/reactome

    cd /cluster/data/reactome
    
    wget --timestamp http://www.reactome.org/download/current/sql.gz
    
    hgsql hg17 -e 'drop database reactome'
    hgsql hg17 -e 'create database reactome'
    zcat sql.gz| hgsql reactome 

    hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, hg17.kgXref where identifier=spID' >kgReactome.tab;

    hgsql hg17 -e 'drop table kgReactome'
    hgsql hg17 < ~/src/hg/lib/kgReactome.sql
    hgsql hg17 -e 'load data local infile "kgReactome.tab" into table kgReactome'

# UPDATE WGRNA TRACK (DONE, 2005-07-05, Fan)

  ssh hgwdev
  cd /cluster/data/hg17/bed

  mv wgRna wgRna-2005-06-16
  mkdir wgRna-2005-07-05
  cd wgRna-2005-07-05

# Received the data file, wgtrack_july2005.txt, from Michel Weber's email (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-07-05.

  cat wgtrack_july2005.txt|sed -e 's/ /\t/g' >wgRna.tab
  
# edit wgRna.tab to take out the first 5 lines of data field labels.

    hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab

# REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (DONE 7/14/05, Fan)
# hgMapViaSwissProt.c was updated to support this.

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam


## EVOFOLD (DONE, 2005-07-15, Jakob (jsp) ) 
# EvoFold is a new comparative method for predicting functional RNA
# secondary structures based on multiple sequence alignments. The
# predictions generated for the EvoFold track are based on the most
# conserved elements of the 8-way alignment (multiz8way). The current
# data is the result of a pilot study (ongoing research of mine), the
# procedure used to generate the data will therefore be simplified
# when forthcoming evofold tracks for other organism are made. The
# documentation therefore skips the actual data generation, and
# instead starts with a data file I provide.

  ssh -C hg17
  mkdir -p /cluster/data/hg17/bed/evofold
  cd /cluster/data/hg17/bed/evofold
  cp /cluster/home/jsp/data/rnass/genome-scan/vertebrate/folds_hg17.bed foldsHg17.bed
  # The folds_hg17.bed is a 9-column bed file: column 1-6 provide
  # standard bed information, column 7 is element length, column 8 is
  # the RNA secondary structure in parentheses format, and column nine
  # is a comma-separated list of position-specific confidence scores
  # (floats).
  hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg17 evofold foldsHg17.bed

##########################################################################
# TRANSFRAG PHASE 2 TABLES  - lifted from hg15 (Jakob Skou Pedersen)
# Done: July 21, 2005
#
# These tables were lifted for use in my own research, but may be used
# for the 'Affymetrix Transcriptome Project Phase 2' tracks.
  ssh -C hgwdev
  mkdir -p /cluster/data/hg17/bed/transfrag
  cd /cluster/data/hg17/bed/transfrag
  # lifting transfrag tables from hg15 via hg16 to hg17
  # For every transfrag table: dump it from hg15, liftOver hg15 -> hg16 -> hg17,
  # report how many items failed to map at each hop, load the result into hg17,
  # then delete the intermediate files.
  hg15ToHg16=/cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain
  hg16ToHg17=/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain
  for tbl in \
      A375CytosolicPolyAPlusTnFg \
      FHs738LuCytosolicPolyAPlusTnFg \
      HepG2CytosolicPolyAMinusTnFg \
      HepG2CytosolicPolyAPlusTnFg \
      HepG2NuclearPolyAMinusTnFg \
      HepG2NuclearPolyAPlusTnFg \
      JurkatCytosolicPolyAPlusTnFg \
      NCCITCytosolicPolyAPlusTnFg \
      PC3CytosolicPolyAPlusTnFg \
      SKNASCytosolicPolyAPlusTnFg \
      U87CytosolicPolyAPlusTnFg
  do
    # sed drops the first line of hgsql's output before it is saved as BED
    echo "select chrom, chromStart, chromEnd, name from ${tbl};" \
      | hgsql hg15 \
      | sed -e 1d \
      > ${tbl}Hg15.bed
    liftOver ${tbl}Hg15.bed ${hg15ToHg16} ${tbl}Hg16.bed unmappedHg16.bed
    liftOver ${tbl}Hg16.bed ${hg16ToHg17} ${tbl}Hg17.bed unmappedHg17.bed
    # count the "#" lines liftOver wrote to each unmapped file
    lost16=$(grep "#" unmappedHg16.bed | wc -l | awk '{print $1}')
    lost17=$(grep "#" unmappedHg17.bed | wc -l | awk '{print $1}')
    echo "hg16 unmapped count for ${tbl}: " ${lost16}
    echo "hg17 unmapped count for ${tbl}: " ${lost17}
    hgLoadBed hg17 ${tbl} ${tbl}Hg17.bed
    # only the final hg17 BED is kept
    rm ${tbl}Hg15.bed ${tbl}Hg16.bed unmappedHg16.bed unmappedHg17.bed
  done

# GLADSTONE ARRAY TRACK (DONE 7/19/2005 Andy)
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir gladHumES
    cd gladHumES/
    cp /cluster/data/hg16/bed/geneAtlas2/geneAtlas2.bed .
    cut -f1-12 geneAtlas2.bed > bed.hg16
    liftOver bed.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain bed.hg17 /dev/null
    hgMapMicroarray bed.hg17.data hgFixed.gladHumESRatio \
       -bedIn bed.hg17
#Loaded 11087 rows of expression data from hgFixed.gladHumESRatio
#Mapped 10925,  multiply-mapped 382, missed 23266, unmapped 162
    hgLoadBed hg17 gladHumES bed.hg17.data

# PHASTODDS GENESORTER COLUMN (DONE 7/28/2005 Andy)
    ssh kolossus
    cd /panasas/store/andy
    mkdir phastOdds
    cd phastOdds/
    export PATH=${PATH}:/cluster/bin/phast/x86_64
    mkdir sso beds
    cat > runChrom.sh << "_EOF_"
#!/bin/bash
# usage: runChrom.sh <chrom>
# For one chromosome: convert its multiz10way MAF to .sso with msa_view, run
# phastOdds on it with kg/<chrom>.bed passed via -g, and leave the outputs in
# sso/ and gtfs/ under the current directory.  Work files are written to /tmp
# first and then moved into place.
c=$1
# chromosome directory under /cluster/data/hg17: name minus "chr" and "_random"
numDir=`echo ${c#chr} | sed 's/_random//'`
ALNDIR=/cluster/data/hg17/bed/multiz10way

# make sure both output directories exist before anything is moved into them
mkdir -p sso gtfs

echo msa_view $c
/cluster/bin/phast/x86_64/msa_view --in-format MAF ${ALNDIR}/maf/${c}.maf --refseq /cluster/data/hg17/${numDir}/${c}.fa > /tmp/${c}.sso

echo phastOdds $c
/cluster/bin/phast/x86_64/phastOdds -f ${ALNDIR}/cons/run.elements/ave.cons.mod -b ${ALNDIR}/cons/run.elements/ave.noncons.mod -g kg/${c}.bed /tmp/${c}.sso > /tmp/${c}.phastOdds.gtf
mv /tmp/${c}.sso sso/
mv /tmp/${c}.phastOdds.gtf gtfs/
echo $c done
_EOF_
    # the per-chromosome loop below invokes this as ./runChrom.sh
    chmod +x runChrom.sh
    ssh hgwdev
    cd /panasas/store/andy/phastOdds
    genePredToGtf hg17 knownGene kg.gtf
    exit 
    for c in `cut -f1 kg.gtf | sort | uniq`; do 
       grep "\b${c}\b" kg.gtf > kg/${c}.gtf; 
    done
    for f in kg/*.bed; do
       c=`basename $f .bed`;
       echo $c;
       ./runChrom.sh $c;
       addPhastOddsExons $f gtfs/$c.phastOdds.gtf beds/$c.bed
    done
    cat beds/* | sort -k4,4 -k1,1 -k2,2n -k3,3n > phastOdds.kg.bed
    cat > phastOdds.sql << "EOF"
CREATE TABLE phastOdds (
    bin smallint not null, # Speedup.
    chrom varchar(255) not null,        # Human chromosome or FPC contig
    chromStart int unsigned not null,   # Start position in chromosome
    chromEnd int unsigned not null,     # End position in chromosome
    name varchar(255) not null, # Name of item
              #Indices
    score float not null, # phastOdds score.
    index(chrom(8),bin),
    index(name(10))
);
EOF
# <<
    hgLoadBed -sqlTable=phastOdds.sql hg17 phastOdds phastOdds.kg.bed
    # Actually I probably don't need that hg17 table.
    echo create table phastOdds select name, score from hg17.phastOdds | hgsql hgFixed
    echo create index nameIndex on phastOdds (name(10)) | hgsql hgFixed    

##########################################################################
# Illumina SNPs (Heather, July 2005)
# Source: Jeff Ohmen, PhD, johmen@illumina.com, 858/232-2702

# using /cluster/store11 because /cluster/data/hg17 is on store5,
# which is currently being restored

  cd /cluster/store11/heather/illumina
  fix.pl < LinkageIVbSNP.txt > illumina.bed
  hgLoadBed hg17 snpIllumina -tab -strict -sqlTable=snpIllumina.sql illumina.bed
  # Reading illumina.bed
  # Loaded 6008 elements of size 4
  # Sorted
  # Saving bed.tab
  # Loading hg17

  # note: 28 rows where chrom = "chrXY"

# reload rankProp and psiBlast gene sorter tables to link with new
# known genes (markd 2005-07-15)
   (spLoadRankProp -noMapFile=max1k.nomap hg17 rankProp --  /cluster/bluearc/markd/rankprop/results/hs.sw+tr/max1k.rankp.gz)   >&max1k.hg17.out
   (spLoadPsiBlast hg17 spPsiBlast  /cluster/bluearc/markd/rankprop/results/hs.sw+tr.eval.gz) >&pslBlast.hg17.out 


# BLASTZ/CHAIN/NET CANFAM2 (DONE 8/2/05 angie - REDONE 12/12/05 angie - REDONE 2/6/06 angie)
# Unfortunately, this was done with a corrupted 
# /san/sanvol1/scratch/hg17/nib/chr5.nib the first time around; 
# also, a linSpecRep bug in blastz-run-ucsc has been fixed since then.
# Doh, then Kate pointed out that linSpecReps were not snipped properly --
# I had omitted the BLASTZ_ABRIDGE_REPEATS line from the DEF!!!  
# Added an error message to doBlastzChainNet.pl to catch that next time.
# Therefore I'm moving aside the previous run:
    mv /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam2{,.bak}
# And rerunning...
    ssh kkstore02
    mkdir /cluster/data/hg17/bed/blastz.canFam2.2006-02-06
    cd /cluster/data/hg17/bed/blastz.canFam2.2006-02-06
    cat << '_EOF_' > DEF
# human vs. dog

BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInDog
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog
SEQ2_DIR=/san/sanvol1/scratch/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.canFam2.2006-02-06
'_EOF_'
    # << for emacs
    doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \
      -workhorse pk \
      -blastzOutRoot /san/sanvol1/scratch/blastzHg17CanFam2Out >& do.log &
    tail -f do.log
    rm -f /cluster/data/hg17/bed/blastz.canFam2
    ln -s blastz.canFam2.2006-02-06 /cluster/data/hg17/bed/blastz.canFam2

# RE-RUN NETTOAXT, AXTTOMAF FOR CANFAM2 (DONE 10/26/05 angie)
    # Kate fixed netToAxt to avoid duplicated blocks, which is important 
    # for input to multiz.  Regenerate maf using commands from sub-script 
    # netChains.csh generated by doBlastzChainNet.pl above.  
# Obsoleted by re-run of hg17-canFam2 above 12/12/05 angie...
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.canFam2.2005-08-01/axtChain
    netSplit hg17.canFam2.net.gz net
    chainSplit chain hg17.canFam2.all.chain.gz
    cd ..
    mv axtNet axtNet.orig
    mkdir axtNet
    foreach f (axtChain/net/*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /panasas/store/hg17/nib /iscratch/i/canFam2/nib stdout \
      | axtSort stdin stdout \
      | gzip -c > axtNet/$f:t:r.hg17.canFam2.net.axt.gz
    end
    rm -r mafNet
    mkdir mafNet
    foreach f (axtNet/*.hg17.canFam2.net.axt.gz)
      axtToMaf -tPrefix=hg17. -qPrefix=canFam2. $f \
            /cluster/data/hg17/chrom.sizes /cluster/data/canFam2/chrom.sizes \
            stdout \
      | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
    end
    rm -r axtChain/{chain,net}/ axtNet.orig


############
# Sangamo/EIO DNaseI Hypersensitive Sites (2005-08-15 kate)
#   (Sangamo Biosciences and European Inst. Oncology)
#   Contact: Fyodor Umov (fumov@sangamo.com)

    cd /cluster/data/hg17/bed
    mkdir sangamo
    cd sangamo
    grep chr 3314_hs_sites_browser.bed | grep -v browser | \
        hgLoadBed -noBin hg17 sangamoDnaseHs stdin
            # Loaded 3314 elements of size 6
    checkTableCoords -table=sangamoDnaseHs hg17

    # use "antiword" to create plain text from .doc description file

# UPDATE WGRNA TRACK (DONE, 2005-08-24, Fan)

  ssh hgwdev
  cd /cluster/data/hg17/bed

  mkdir wgRna-2005-08-24
  cd wgRna-2005-08-24

# Received the data file, wgtrack_aug2005.txt, from Michel Weber's email 
# (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-08-24.

  cut -f 2-10 wgtrack_aug2005.txt >wgRna.tab
  vi wgRna.tab
  
# edit wgRna.tab to take out the first line of data field labels.

    hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab

# Compared to 7/5 data, one record updated, one record dropped, one record added, out of 741 records.

# Generate snpMask files (Done Heather Sept. 1, 2005)
# Takes about 10-15 minutes
# Consumes about 1 gig of disk
  ssh hgwdev
  cd /usr/local/apache/htdocs/goldenPath/hg17
  mkdir snpMask
  cd snpMask
  foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
      snpMaskChrom hg17 ${chrom} /gbdb/hg17/nib/${chrom}.nib ${chrom}.ambigs.fa
      gzip ${chrom}.ambigs.fa
  end

#############################################################################
# BLASTZ Mm7 (WORKING - 2005-09-06 - Hiram)
#	Experiment: try the alignments without the lineage-specific
#	repeats
    ssh pk
    mkdir /cluster/data/hg17/bed/blastzMm7.2005-09-06
    cd /cluster/data/hg17/bed
    ln -s blastzMm7.2005-09-06 blastz.mm7
    cd blastzMm7.2005-09-06
    cat << '_EOF_' > DEF
# human vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human Hg17
SEQ1_DIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ1_CTGDIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ1_CTGLEN=/cluster/bluearc/hg17/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50

# QUERY: Mouse Mm7
SEQ2_DIR=/cluster/bluearc/mm7/mm7.2bit
SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGLEN=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.sizes
SEQ2_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=3000000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastzMm7.2005-09-06
TMPDIR=/scratch/tmp
'_EOF_'
    # happy emacs

    cp -p /cluster/data/hg17/chrom.sizes ./S1.len
    twoBitInfo /cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit S2.len

    #	establish a screen to control this job
    screen
    time ./doBlastzChainNet.pl -stop chainMerge \
	-bigClusterHub=pk \
	`pwd`/DEF > toChainMerge.run.out 2>&1 &
    #	STARTED - 2005-09-06 - 11:00
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=cat -stop=cat \
	`pwd`/DEF > catStep.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=chainRun \
	`pwd`/DEF > continueChainRun.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=chainMerge -stop=chainMerge \
	`pwd`/DEF > chainMerge.out 2>&1 &
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=net -stop=net \
	`pwd`/DEF > net.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=load -stop=load \
	`pwd`/DEF > load.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-swap -stop=load \
	`pwd`/DEF > swap.out 2>&1 &

    #	Create plain pslChrom files to load a simple blastz track
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastzMm7.2005-09-06
    mkdir -p pslChrom
(cd pslParts; ls | awk -F"." '{print $1}' | sort -u) | while read C
do
    echo -n "working ${C} ... "
    zcat pslParts/${C}.nib*.gz | gzip -c > pslChrom/${C}.psl.gz
    echo "done"
done
    #	Load those alignments
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzMm7.2005-09-06
# Load one table per file in pslChrom/.  The table name is the file name
# with its .psl.gz suffix removed, followed by _blastzMm7.
# (sh/bash syntax, like the pslChrom loop above.)
for F in pslChrom/*.psl.gz
do
    T=$(basename "${F}" .psl.gz)
    echo "hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz"
    hgLoadPsl -fastLoad -noTNameIx hg17 -table="${T}_blastzMm7" "pslChrom/${T}.psl.gz"
done


    #	After this same alignment was done with Hg17 query and Mm7
    #	target, came back to these swapped results in mm7 and manually loaded
    #	the swapped tables as: chainMm7LSR, chainMm7LSRLink and
    #	netMm7LSR
    #	41,223,632 total rows in the chainMm7Link split tables
    #	58,458,613 total rows in the chainMm7LSRLink table

    time featureBits hg17 chainMm7LSRLink
    #	959444893 bases of 2866216770 (33.474%) in intersection
    #	real    36m30.822s
    #	user    14m19.620s
    #	sys     5m13.910s
    time featureBits hg17 chainMm7Link
    #	955168137 bases of 2866216770 (33.325%) in intersection
    #	real    16m13.902s
    #	user    10m20.780s
    #	sys     3m42.810s
    #	And, their intersection:
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \
	chainMm7LSRLink chainMm7Link
    #	952667399 bases of 2866216770 (33.238%) in intersection
    #	real    38m53.448s
    #	user    8m38.853s
    #	sys     2m23.362s

# LOAD ACEMBLY TRACK (DONE, 2005-09-12, Fan)
    mv /cluster/data/hg17/bed/acembly /cluster/data/hg17/bed/acembly_050217
    mkdir -p /cluster/data/hg17/bed/acembly
    cd /cluster/data/hg17/bed/acembly
    # Data is obtained from Jean Thierry-Mieg     mieg@ncbi.nlm.nih.gov
 
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.proteins.fasta.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.gff.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.mrnas.fasta.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.pfamhits.tar.gz
    tar xvzf acembly.ncbi_35g.genes.gff.tar.gz
    tar xvzf acembly.ncbi_35g.genes.proteins.fasta.tar.gz
    cd acembly.ncbi_35.genes.gff
    # the acembly dataset for hg16 had problems with reverse blocks so
    # check for these
# checkReversedBlocks: for every x1*.gff file, write to <file>.fixed one
# message line per GFF record whose start (column 4) is greater than its
# end (column 5).  An empty .fixed file means no reversed blocks.
cat << '_EOF_' > checkReversedBlocks
for i in x1*.gff
do
        echo -n "$i working ..."
        awk -F"\t" '
{
if ($4 > $5) {
    printf "reverse blocks problem for %s\n", $1
}
}
' $i > $i.fixed
        echo " done"
done
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x checkReversedBlocks
    ./checkReversedBlocks  
    ls -l *.fixed
    # all *.fixed files are empty so remove - there is no reversing of blocks
    rm *.fixed

    # For each x1.acemblygenes.*.gff file, write chr<c>.gff and, when a lift
    # file exists for that chromosome, a lifted chr<c>_random.gff.
    foreach f (x1.acemblygenes.*.gff)
      # :r drops the ".gff" extension; :e then keeps the last remaining
      # dot-separated component of the file name
      set c=$f:r:e
      # Select lines that start with <alphanumerics>|NT_<6 digits> and turn
      # that first "|" into "/" (presumably contig-based features that
      # belong on chr<c>_random -- confirm)
      egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
        perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
      # Lift into chr<c>_random.gff only when a random.lft exists for $c
      if (-e ../../../$c/lift/random.lft) then
        liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
          ctg-chr${c}_random.gff
      endif
      # From the same input: drop lines starting with "<c>|" or "Hs",
      # prefix every line with "chr", and discard lines that then start
      # with "chr//"
      grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
        grep -v "^chr//" > chr$c.gff
      echo "done $c"
    end
    
    #- Load into database - use extended genePred
    ssh hgwdev
    cd /cluster/data/hg17/bed/acembly
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
    # for entry with 28212470 from chr6.gff, change to chr6 
    # and for 29124352 in chr6.gff, change to chr6 (1/13/05)
    echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
         | hgsql hg17 
    echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
         | hgsql hg17 
    
    # checkTableCoords and runGeneCheck to check data
    checkTableCoords hg17 acembly

hgPepPred hg17 generic acemblyPep \
        acembly.ncbi_35.genes.proteins.fasta/*.fasta

# create table of Acembly gene classifications
# Column 9 of each GFF is split on ';' (after removing "transcript_id ");
# its 3rd and 2nd sub-fields become the two columns of acemblyClass.tab,
# with Main_gene/Putative_gene shortened to main/putative and all blanks
# removed.
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
# -f: on a first run there is no earlier acemblyClass.tab to remove
rm -f acemblyClass.tab
foreach f (x1.acemblygenes.*.gff)
  cut -f 9 $f | sed -e 's/;/\t/g' | sed -e 's/transcript_id //' > j.tmp
  cut -f 2 j.tmp > j2.tmp
  cut -f 3 j.tmp > j3.tmp
  paste j3.tmp j2.tmp | sed -e 's/Main_gene/main/g' \
    | sed -e 's/Putative_gene/putative/g' | sed -e 's/ //g' >> acemblyClass.tab
end

rm *.tmp
# "if exists" so that the first load does not fail on a missing table
hgsql hg17 -e 'drop table if exists acemblyClass'
hgsql hg17 < ~/src/hg/lib/acemblyClass.sql
hgsql hg17 -e 'load data local infile "acemblyClass.tab" into table acemblyClass'
# keep only the rows classified as main or putative
hgsql hg17 -e 'delete from acemblyClass where class!="main" and class!="putative"'

# build acemblyPep table

hgPepPred hg17 generic acemblyPep \
        acembly.ncbi_35.genes.proteins.fasta/*.fasta

# Please note, per email from Jean Thierry-Mieg on 9/9/05,
# there are AceView genes (~10,000) without corresponding 
# protein sequences.  They will fix it next time.

###########################################################################
# LOADING AFFYTXNPHASE2 TRACK (sugnet)

# cd to where data is downloaded.
cd /cluster/store10/sugnet/affyTranscription/graphs/transcriptome.affymetrix.com/download/publication/polyA_minus/graphs

# lift data from hg16 to hg17. This takes a long time.
./liftWigFilesHg16ToHg17.sh

# make the .wib and .wig files. This takes a long time.
./makeWibWigHg17.sh

# Copy .wib files to /cluster/data/hg17 files
mkdir /cluster/data/hg17/bed/affyTxnPhase2/wigData/
cp `find ./ -name "*.hg17.wib"` /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 775 /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 664 /cluster/data/hg17/bed/affyTxnPhase2/wigData/*

# Make gbdb entry
mkdir /gbdb/hg17/wib/affyTxnPhase2
chmod 775 /gbdb/hg17/wib/affyTxnPhase2
cd /gbdb/hg17/wib/affyTxnPhase2
ln -s /cluster/data/hg17/bed/affyTxnPhase2/wigData/* .
cd -

# Load the database tables (using bash) this takes a while
for file in `find ./ -name "*hg17.wig"`; do
  base=`basename $file .hg17.wig`
  echo "Doing ${base}Txn"
  hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/affyTxnPhase2  hg17 ${base}Txn $file
done

# Do the transfrags
cd ../transfrags
./liftHg16ToHg17.sh
./loadHg17Tnfg.sh

# End of affyTxnPhase2

###########################################################################
#  Creating download files for the affyTxnPhase2 data
#		(DONE - 2006-11-20 - Hiram)
    #	Copy all of the data above to a temporary /san/sanvol1/scratch/
    #	location, and run the following script:
#!/bin/sh
# For every sub-directory whose path contains '_' (except the excluded
# sample directories below), merge its *hg17.bed.gz files into a single
# rawData/hg17/<name>.data file: sorted, cut to the first four columns,
# restricted to lines containing "chr", and passed through avgerizeBed.pl.

mkdir -p rawData/hg17

TOP=$(pwd)
export TOP

dirList=$(find ./ -type d \
    | grep '_' \
    | grep -v A375_cytosolic_polyAPlus \
    | grep -v FHs738Lu_cytosolic_polyAPlus \
    | grep -v HepG2_CytosolVsNucleusDifferenceGraphs \
    | grep -v HepG2_cytosolic_polyAPlus \
    | grep -v HepG2_cytosolic_polyAMinus \
    | sed -e "s#^./##")

for dir in $dirList; do
    # Build the output name from the directory path: '/' and '-' become
    # '_', "polyA-" becomes "polyAMinus", '+' becomes "Plus"; the result
    # is then passed through changeName.pl.
    base=$(echo $dir \
        | sed -e 's/\.\///; s/\//_/g' \
        | sed -e 's/polyA-/polyAMinus/g' \
        | sed -e 's/-/_/g' \
        | sed -e 's/\+/Plus/g' \
        | $TOP/changeName.pl)
    RAW=$TOP/rawData/hg17/$base.data
    echo $RAW
    cd $dir
    zcat $(ls -1 *hg17.bed.gz) \
        | bedSort stdin stdout \
        | cut -f 1,2,3,4 \
        | grep chr \
        | $TOP/avgerizeBed.pl > $RAW
    cd $TOP
done

    #	Then copy the rawData/hg17/ results directory back to:
    #	    /cluster/data/hg17/bed/affyTxnPhase2/rawResults/
    #	And deliver to hgdownloads via symlinks on hgwdev:
    cd /usr/local/apache/htdocs/goldenPath/hg17/affyTxnPhase2/
    # to:
    ln -s /cluster/data/hg17/bed/affyTxnPhase2/rawData/*.data.gz .
    #	Remove the san scratch data

###########################################################################
# ALTGRAPHX TRACK (sugnet)

cd /cluster/store1/sugnet/altSplice
mkdir hg17-2005.03.28

# First get the RNA clusters
cd hg17-2005.03.28

# The cluster beds go into rnaCluster/chrom/, which is where the altSplice
# jobs below look for them (-beds=rnaCluster/chrom/...).
mkdir rnaCluster
cd rnaCluster
mkdir chrom

# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh  hg17 rage.libs

# Make spec file to run: one clusterRna command per chromosome.
foreach c (`echo 'select chrom from chromInfo' | hgsql hg17 | grep -v chrom`)
    set out = chrom/$c.bed
    echo "clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c" >> clusterRna.spec
end

# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
# The spec file is executed directly as a script, so it must be executable.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log

cd ..

# Make script to setup parasol job file for raw altGraphX files on human
# makeRun.sh prints, for each chromosome in chromInfo, a progress echo and
# one altSplice command; its output becomes toRun.sh.
cat << '_EOF_' > makeRun.sh
#!/bin/sh

for chrom in `echo "select chrom from chromInfo" | hgsql hg17 | grep -v chrom`; do
echo "echo \"Doing $chrom\""
echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg17   -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg17.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg17/nib/$chrom.nib"
done
'_EOF_'
    # << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh

# Minicluster down, have to run on hgwdev.
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &

# The per-chromosome .agx files are only complete once the background
# job has finished.
wait
cat agxs/*.agx > hg17.agx

# make raw altGraphX files for mouse
# (sibling directory of the current one, using the mm5 database)
mkdir ../mm5-2005.03.28/
cd ../mm5-2005.03.28/

# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
# clusterRna writes one bed file per chromosome into chrom/
mkdir chrom

# Don't use RAGE libraries for clone bounds.
~/jk/hg/geneBounds/clusterRna/generateRageAccList.csh mm5 rage.libs
# Doing select on mm5 into mm5.rage.libs
# Done.

# Make spec file to run: one clusterRna command per chromosome in chromInfo.
foreach c (`echo 'select chrom from chromInfo' | hgsql mm5 | grep -v chrom`)
    set out = chrom/$c.bed
    echo "clusterRna -mrnaExclude=mm5.rage.libs mm5 /dev/null $out -chrom=$c" >> clusterRna.spec
end

# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
# The spec file is executed directly as a script, in the background.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &


# Make the gene bounds in rnaCluster.
# The background clusterRna.spec run has to be finished first, since the
# altSplice jobs read its chrom/*.bed output.
wait
# Go back up to mm5-2005.03.28/: the jobs refer to rnaCluster/chrom/...
# and agxs/... relative to that directory.
cd ..
mkdir agxs

# This script generates the jobs, one per chromosome.
cat << '_EOF_' > makeRun.sh
#!/bin/sh

for chrom in `echo "select chrom from chromInfo" | hgsql mm5 | grep -v chrom`; do
echo "echo \"Doing $chrom\""
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm5   -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm5.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm5/nib/$chrom.nib"
done
'_EOF_'
    # << this line makes emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log & # Takes an hour or so...

# Consolidate all of the records in a single file, once the background
# job has finished.
wait
cat agxs/*.agx > mm5.agx


# Make the orthologous splicing graphs.
# This is done in the hg17 directory: the jobs below refer to
# ../agxs/hg17.*.agx and ../../mm5-2005.03.28/mm5.agx relative to
# orthoSpliceExoniphy/.
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/

# Get the exoniphy exons...
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed

# Set up the commands for the orthosplice run.
echo 'select chrom, size from chromInfo' | hgsql hg17 | grep -v chrom > chromSizes.tab
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseNet/ nets
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/chain/ chains
mkdir agx report logs

# makeRun.sh prints one orthoSplice command per line of chromSizes.tab.
# The #! line has to be the very first line of the generated file.
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w

open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
    chomp;
    @w = split;
    print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg17.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../agxs/hg17.$w[0].agx -orthoAgxFile=../../mm5-2005.03.28/mm5.agx -db=hg17 -orthoDb=mm5 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg17.mm5.cons.t3.agx -reportFile=report/$w[0].hg17.report -edgeFile=report/$w[0].hg17.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << emacs

chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec
ssh kki
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
para create orthoSplice.para.spec
para push

# After all cluster jobs have completed:
cat agx/*.agx > hg17.mm5.t3.exoniphy.agx

# Make bed file
agxToBed hg17.mm5.t3.exoniphy.agx hg17.mm5.t3.exoniphy.bed

# Load up files
hgLoadBed hg17 agxBed hg17.mm5.t3.exoniphy.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/sugnet/kent/src/hg/lib/altGraphX.sql hg17 altGraphX hg17.mm5.t3.exoniphy.agx

# end altGraphX track

# EXONWALK TRACK (sugnet)

# make altGraphX track (see above)

cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy

# Create the working directory (and its beds/ output directory) before
# entering it.
mkdir -p exonWalk/beds
cd exonWalk

# Make parasol script: one exonWalk job per .agx file in ../agx,
# each writing beds/<name>.bed.
foreach file (../agx/*.agx)
  set base=`basename $file .agx`
  echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg17 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
# After all cluster jobs have completed:
cat beds/*.bed > hg17.mm5.cons.t3.exoniphy.bed

# Predict orfs
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
# NOTE(review): the combined bed was written one level up (exonWalk/), and
# the scripts below read their pieces from beds/ -- confirm this path and
# the output prefix.
splitFile ../../hg17.mm5.cons.t3.exoniphy.bed 500 exonWalk.
# makeFa.sh: run sequenceForBed on each bed file given on the command
# line, writing fa/<name>.fa.
cat << '_EOF_' > makeFa.sh
#!/bin/sh

for file in "$@"
do
 base=`basename $file`
 echo "Doing $file"
 echo "sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa "
 sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa 
done
'_EOF_'
chmod 755 makeFa.sh
makeFa.sh beds/*

# Run borf lots of times...
makeSpec.sh beds/* > para.spec
para create para.spec
para push
mkdir genePred
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh

for file in "$@"
do
  base=`basename $file`
  /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp 
done
'_EOF_'
    # << this line makes emacs coloring happy
chmod 755 makeGenePred.sh 
makeGenePred.sh beds/*

cat beds/* > hg17.mm5.exonWalk.bed
cat genePred/*.gp > hg17.mm5.exonWalk.gp

wc *.bed *.gp
# 155470 1865640 29956585 hg17.mm5.exonWalk.bed
#  98433  984330 32495119 hg17.mm5.exonWalk.gp

# Load it into the database.
ldHgGene -predTab hg17 exonWalk hg17.mm5.exonWalk.gp 

# end exonWalk

####################################################################
### hapmapRecombRate (Daryl; September 19, 2005)
    # Lifted from hg16; see makeHg16.doc for details
    # Update (Jen; October 25, 2005)
    # Data points that lifted to chroms other than 1-22 + X were removed
    # before release to RR (confirmed with Daryl):
    #   chr4_random: 11 data points
    #   chr6_hla_hap1: 25 data points
### hapmapRecombHotspot (Daryl; September 19, 2005)
    # Lifted from hg16; see makeHg16.doc for details

### HapMap SNPs (Daryl; February 4, 2006)
  # most of this work was done in October and November 2005 for the ENCODE workshop
    cd /cluster/store4/gs.17/build34/bed/hapmap/frequencies/2005-10/non-redundant/hapmapSnps
    ln -sf ../hg17.daf.all/daf.txt.gz .
    ln -sf ../hg17.panTro1.rheMac1.txt.gz .
    zcat hg17.panTro1.rheMac1.txt | grep -v chrom | sort >! hg17.panTro1.rheMac1.sort.txt
    zcat daf.txt                  | grep -v chrom | sort >! daf.sort.txt
    # check that order matches; should be empty
    paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '$1!=$17||$2!=$18||$3!=$19||$4!=$20||$5!=$21||$6!=$22||$7!=$23||$8!=$24||$11!=$25||$12!=$27||$15!=$26||$16!=$28{print $0;}'
    paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '{printf "%s\t%d\t%d\t%s\t0\t%c\t%c\t%c\t%c\t%c\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n",$1,$2,$3,$4,$6,$7,$8,$11,$15,$12,$16,$29,$30,$31,$32,$33}' >! hapmapSnps.bed
    hgLoadBed hg17 hapmapSnps -sqlTable=hapmapSnps.sql hapmapSnps.bed

############################################################################################
# HapMap SNPs rel21a (Feb. 2007, Heather)
# June 2007 [partial fix of hapmapAllelesSummary released 6/25/07:
# using hg17 instead of hg18 liftOver files... for most but not all
# chroms! :( not documented below; error found by user]
# 1/11/08, 1/24/08 (angie): regenerated hapmapAllelesSummary with corrected
# hapmapAllelesChimp.

# get files for each chrom, for each population
# these contain data for all individuals
# not using the JPT+CHB files
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/zips
wget http://www.hapmap.org/downloads/genotypes/2007-01/rs_strand/non-redundant/* 

# get population data (needed to define children in CEU and YRI trios)
cd /cluster/store12/snp/hapmap
wget http://www.hapmap.org/downloads/samples_individuals/*gz
gunzip pedinfo2sample_CEU.txt.gz
filterPedigree.pl < pedinfo2sample_CEU.txt > CEU.filtered
cp CEU.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/CEU.list
gunzip pedinfo2sample_YRI.txt.gz
filterPedigree.pl < pedinfo2sample_YRI.txt > YRI.filtered
cp YRI.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/YRI.list

#!/usr/bin/env perl
# filterPedigree.pl: read whitespace-separated rows from STDIN and, for
# every row whose 3rd and 4th fields are both 0, print the 5th
# ':'-separated piece of the 7th field on its own line.
# NOTE(review): presumably fields 3 and 4 are the parents' IDs, so this
# keeps individuals with no listed parents and drops the trio children --
# confirm against the pedinfo2sample file format.
while (<STDIN>) {
    my @fields = split;
    if ($fields[2] == 0 && $fields[3] == 0) {
        # 7th field is a ':'-separated identifier; keep its 5th component
        @subfields = split /:/, $fields[6];
        print $subfields[4];
        print "\n";
    }
}

cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
zcat zips/*chr22_CEU* | head -1 > header.CEU
zcat zips/*chr22_YRI* | head -1 > header.YRI
# add carriage returns to header.CEU and header.YRI
grep -n -f CEU.list header.CEU > offsets.CEU
grep -n -f YRI.list header.YRI > offsets.YRI
# delete ids in offsets.CEU and offsets.YRI so just column numbers remain

# for each population, combine all chroms, and combine all individuals
# for CEU and YRI, filter out children from trios
# This creates CEU.merge, CHB.merge, JPT.merge, YRI.merge

./merge.csh

#!/bin/tcsh
# merge.csh: for each population, run every chromosome's genotype file
# (zips/genotypes_<chrom>_<POP>_r21a_nr.txt.gz, chromosomes taken from
# chrom.list) through that population's filter script and append the
# result to <POP>.merge.

# start from empty output files
foreach pop (CEU CHB JPT YRI)
    rm -f $pop.merge
end

foreach chrom (`cat chrom.list`)

    echo $chrom

    foreach pop (CEU CHB JPT YRI)
        echo "$pop"
        set fileName = zips/genotypes_${chrom}_${pop}_r21a_nr.txt.gz
        zcat $fileName | filter${pop}.pl >> $pop.merge
    end

end


# Below is filterCEU.pl
# The others are very similar: YRI uses "offsets.YRI"
# CHB and JPT just read the input directly

#!/usr/bin/env perl

# filterCEU.pl: condense a HapMap genotype file read from STDIN into one
# line per SNP with allele counts, using only the genotype columns whose
# (1-based) numbers are listed in offsets.CEU.
#
# Each output line is space-separated and ends with a space:
#   chrom start end rsId 0 strand observed
#   A <homo> <hetero> C <homo> <hetero> G <homo> <hetero> T <homo> <hetero>
# A homozygous genotype (e.g. "AA") adds 2 to that base's homo count; any
# other genotype adds 1 to the hetero count of each of its A/C/G/T alleles.
# "NN" means missing data and is not counted; a single N is a fatal error.

use strict;

# 1-based column numbers of the individuals that are kept
my @list;

# read in a list of the columns that we are keeping
sub initList {
    # fail loudly: without the list every count would silently be zero
    open(LIST, "offsets.CEU") or die "Cannot open offsets.CEU: $!";
    chomp(@list = <LIST>);
    close LIST;
}

&initList;

my @bases = ("A", "C", "G", "T");

while (<STDIN>) {
    my @fields = split;
    # skip header
    if ($fields[0] eq "rs#") { next; }

    # chrom, zero-based start coord, end, rsId, score, strand, observed
    print join(" ", $fields[2], $fields[3] - 1, $fields[3], $fields[0],
               0, $fields[4], $fields[1]), " ";

    my %homo   = map { $_ => 0 } @bases;
    my %hetero = map { $_ => 0 } @bases;

    foreach my $column (@list) {
        my $allele  = $fields[$column - 1];
        my $parent1 = substr($allele, 0, 1);
        my $parent2 = substr($allele, 1, 1);

        # Ns must be together
        if (($parent1 eq "N") xor ($parent2 eq "N")) { die "Unexpected input"; }
        # N is used for missing data
        if ($parent1 eq "N") { next; }

        if ($parent1 eq $parent2 && exists $homo{$parent1}) {
            $homo{$parent1} += 2;
            next;
        }
        if (exists $hetero{$parent1}) { $hetero{$parent1}++; }
        if (exists $hetero{$parent2}) { $hetero{$parent2}++; }
    }

    foreach my $base (@bases) {
        print "$base $homo{$base} $hetero{$base} ";
    }
    print "\n";
}

# << emacs
# Switch to C programs from kent/src/hg/snp/snpLoad.

# Determine allele1 and allele2 (set allele2 to "none" if monomorphic)
# Alleles are in alphabetical order
# Calculate score (minor allele frequency)
# Log and skip if wrong number of elements in row
# Log and skip if triallelic or quadallelic
# Log and skip degenerate case (no alleles) 
# No errors this run
# Still running on kkstore05
# Could rename "hapmap1" to "hapmapCondense"
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CEU.merge CEU.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CHB.merge CHB.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 JPT.merge JPT.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 YRI.merge YRI.condense
wc -l hapmap1.log

# save some space
gzip *merge

# load
ssh hgwdev
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
cp /cluster/home/heather/kent/src/hg/lib/hapmapSnps.sql .
# modify hapmapSnps for 4 populations
hgLoadBed hg17 hapmapSnpsCEU -sqlTable=hapmapSnpsCEU.sql CEU.condense
hgLoadBed hg17 hapmapSnpsCHB -sqlTable=hapmapSnpsCHB.sql CHB.condense
hgLoadBed hg17 hapmapSnpsJPT -sqlTable=hapmapSnpsJPT.sql JPT.condense
hgLoadBed hg17 hapmapSnpsYRI -sqlTable=hapmapSnpsYRI.sql YRI.condense

# save some more space
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
gzip *condense

# sanity check
mysql> select count(*) from hapmapSnpsCEU where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
|        0 |
+----------+

mysql> select count(*) from hapmapSnpsCHB where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
|        0 |
+----------+

mysql> select count(*) from hapmapSnpsJPT where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
|        0 |
+----------+

mysql> select count(*) from hapmapSnpsYRI where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
|        0 |
+----------+

mysql> select max(score) from hapmapSnpsCEU;  
+------------+
| max(score) |
+------------+
|        500 |
+------------+


# create indexes
mysql> alter table hapmapSnpsCEU add index name (name);
mysql> alter table hapmapSnpsCEU add index chrom (chrom, bin);
mysql> alter table hapmapSnpsCHB add index name (name);
mysql> alter table hapmapSnpsCHB add index chrom (chrom, bin);
mysql> alter table hapmapSnpsJPT add index name (name);
mysql> alter table hapmapSnpsJPT add index chrom (chrom, bin);
mysql> alter table hapmapSnpsYRI add index name (name);
mysql> alter table hapmapSnpsYRI add index chrom (chrom, bin);

# 2nd step in processing: create hapmapSnpsCombined
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap2 hg17
hgLoadBed hg17 hapmapSnpsCombined -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapSnpsCombined.sql hapmapSnpsCombined.tab

# create indexes (not used by browser)
mysql> alter table hapmapSnpsCombined add index name (name);
mysql> alter table hapmapSnpsCombined add index chrom (chrom, bin);

# errors
# nothing that isn't biallelic
# nothing with mixed positions
# over 500K that were not available in all 4 populations
# YRI: 187,485
# CEU: 129,359
# CHB and JPT: 97,095

# Also, 2 strand corrections done
grep -v missing hapmap2.errors
# different strands for rs1621378
# different strands for rs5768

# cleanup to save space
rm hapmapSnpsCombined.tab

# monomorphism

YRI        867,835 
CEU      1,252,743 
CHB      1,496,438 
JPT      1,539,094 
combined   607,393 

# observed strings
# why is A/T different from other transversions?

A/G      1,344,043
C/T      1,344,542
A/C        352,875
A/T        275,670
C/G        354,299
G/T        354,149
triallelic   1,370
quadallelic    403
other        1,226

# some details on the others:
125     -/A/T
124     -/A/G
107     -/C/T
85      -/A/C
79      -/G/T
25      -/C/G

18      -/A/C/T
13      -/A/G/T
12      -/A/C/G
11      -/C/G/T

7       (LARGEINSERTION)
5       (LARGEDELETION)
6       microsat
2       het

# check for collisions (more than one SNP at the same location)
# none found
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg17 hapmapSnpsCombined > snpCheckCluster2.out

# check against hg17.snp125
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapLookup hg17 hapmapSnpsCombined snp125 snp125Exceptions

# 1817 total that are complex type from dbSNP (hapmapLookup.log)
# This is not based on observed string, only on size, class and locType
# 1176 class = mixed
#  616 class = single but locType != exact
#   11 class = named
#    6 class = insertion
#    4 class = deletion
#    2 class = microsat
#    2 class = het

# Generally if class = single the observed string is bi-allelic as expected
# Exceptions to that:
#    rs700519 quad-allelic, locType = rangeDeletion
#    rs1572672 tri-allelic, locType = between
#    rs2357412 tri-allelic, locType = range
#    rs2364671 tri-allelic, locType = rangeSubstitution
#    rs3959788 quad-allelic, locType = between

# 74 items in hapmapLookup.error
# 59 reverse complement (that's okay)
#  7 multiple alignment (6 from chrX:154,219,000-154,220,500 which is close to PAR)
# Also rs6645103 which is PAR 
mysql> select chrom, chromStart, chromEnd, strand, observed, class, locType, weight from snp125 where name = "rs6645103";
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrom       | chromStart | chromEnd | strand | observed | class  | locType | weight |
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrX_random |     273788 |   273789 | -      | C/T      | single | exact   |      3 |
| chrX        |     421141 |   421142 | +      | C/T      | single | exact   |      3 |
| chrY        |     421141 |   421142 | +      | C/T      | single | exact   |      3 |
+-------------+------------+----------+--------+----------+--------+---------+--------+


#  4 observed with dbSNP complex, hapmap biallelic
#  all positive strand, locType = between
#  all cluster errors in dbSNP
#  rs10485830
#  rs7625205 (intronic)
#  rs713582
#  rs11403115 (class = insertion)

# 3 observed mismatch
# all dbSNP clustering error
# rs2230624 (tri-allelic)
# rs3963317 (monomorphic in hapmap, rangeSubstitution in dbSNP)
# rs5017503 (monomorphic in hapmap)

# a strange one
# rs731449
# dbSNP strand = -, hapmap strand = +
# dbSNP observed = G/T, hapmap observed = C/T
# dbSNP clustering error rs2321451, which is C/T
# hapmap monomorphic for T
# ortho A
# no repeats, no genes, no mRNAs, no conservation

# Counts of rows where 3 populations have one major allele, the 4th has the other

hapmapMixed hg17
# countCEU = 162931
# countCHB = 46543
# countJPT = 48791
# countYRI = 309105

# Generate summary table (used by filters)
# Summary table includes ortho allele and ortho qual score
# Summary table score is heterozygosity
# Individual zygosity is *not* preserved
ssh hgwdev
# 6/25/07: regenerated with mostly-corrected hapmapAllelesChimp
# 1/11/08 angie: regenerated with finally-corrected (I hope) hapmapAllelesChimp
# 1/24/08 angie: regenerated with finally-corrected (I hope!) hapmapAllelesChimp
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapSummary hg17 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque
hgLoadBed hg17 hapmapAllelesSummary -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesSummary.sql hapmapSummary.tab -tab

# sanity check
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCEU > totalAlleleCountCEU;
+----------+
| count(*) |
+----------+
|        0 |
+----------+

mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCHB > totalAlleleCountCHB;
+----------+
| count(*) |
+----------+
|        0 |
+----------+

mysql> select max(score) from hapmapAllelesSummary;
+------------+
| max(score) |
+------------+
|        500 |
+------------+

mysql> select count(*), popCount from hapmapAllelesSummary group by popCount;
+----------+----------+
| count(*) | popCount |
+----------+----------+
|    52479 |        1 |
|    72977 |        2 |
|   207643 |        3 |
|  3700478 |        4 |
+----------+----------+

mysql> select count(*), isMixed from hapmapAllelesSummary group by isMixed; 

+----------+---------+
| count(*) | isMixed |
+----------+---------+
|  3192896 | NO      |
|   840681 | YES     |
+----------+---------+


# histogram of heterozygosity:

  0 ************************************************************ 883400
 25 ************** 204000
 50 ************* 188703
 75 *********** 157404
100 ********** 143119
125 ********* 131575
150 ********* 126916
175 ********* 128585
200 ******** 123440
225 ******** 119815
250 ******** 120646
275 ******** 120239
300 ******** 122654
325 ********* 128233
350 ********* 130069
375 ********** 144699
400 ********** 152829
425 ************ 172513
450 *************** 225645
475 ********************************** 503166
500  5927

############################################################################################

### HapMap LD (Daryl; February 11, 2006)
    ## start from the genotypes files, run Haploview, reformat, and load
    mkdir -p /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant/para
    cd /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant
    # wget all genotype data:
    # ftp://www.hapmap.org/genotypes/2006-01/non-redundant/genotypes_chr*_*.b35.txt.gz

    # Haploview had to be recompiled because there was a missing JPT sample in the ped file
    ##runHaploview.csh
    #!/bin/csh
    # runHaploview.csh - run Haploview on one gzipped genotype file.
    # usage: runHaploview.csh <absolutePath> <genotypeFileName.gz> [<javaMaxMem>]
    # The file is copied to /scratch and gunzipped there; Haploview output goes
    # to <root>.log; the .LD and .CHECK files are gzipped and moved, together
    # with the log, back to <absolutePath>; finally /scratch/<root>* is removed.
    if ( $#argv < 2 ) then
	echo "usage: $0 <absolutePath> <genotypeFileName.gz> [<javaMaxMem>]"
	echo "       $0 /cluster/bin/foo bar.gz 2G"
	exit 1
    endif
    # Do NOT name this variable "path": in csh that is the command search path,
    # and assigning to it replaces $PATH for every command run afterwards.
    set dataDir = $1
    set file = $2
    # strip the trailing .gz extension
    set root = $file:r
    # optional third argument becomes the java -Xmx setting
    set memFlag = ""
    if ( $#argv >= 3  ) then
	set memFlag = "-Xmx$3"
    endif
    cd /scratch
    /bin/cp     -f $dataDir/$file .
    /bin/gunzip -f $file
    /usr/java/jre1.5.0_06/bin/java -d64 $memFlag -jar /cluster/home/daryl/haploview/haploview/Haploview.jar -c -d -n -maxDistance 250 -a $root >&! $root.log
    /bin/gzip   -f $root.LD    $root.CHECK >>& $root.log
    /bin/mv     -f $root.LD.gz $root.CHECK.gz  $root.log $dataDir/
    /bin/rm     -f $root*
    ###

    cd para
    set hv = /cluster/home/daryl/scripts/runHaploview.csh
    set ldDir = /cluster/store5/gs.18/build35/bed/hapmap/genotypes/2006-01/non-redundant
    
    # Build the parasol job list: one runHaploview.csh job (4G java heap) per
    # population and chromosome.  Start from an empty file so that re-running
    # this step does not append duplicate jobs.
    rm -f jobList
    foreach pop (YRI CEU CHB JPT JPT+CHB)
	foreach chrom (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
	    echo $hv $ldDir genotypes_chr${chrom}_${pop}.b35.txt.gz 4G >> jobList
	end
    end
    ssh pk
    # para create, para try, para push -maxNode=25 ...
    #Completed: 120 of 120 jobs
    #CPU time in finished jobs:    1564915s   26081.91m   434.70h   18.11d  0.050 y
    #IO & Wait Time:                 21862s     364.37m     6.07h    0.25d  0.001 y
    #Average job time:               13223s     220.39m     3.67h    0.15d
    #Longest running job:                0s       0.00m     0.00h    0.00d
    #Longest finished job:           40742s     679.03m    11.32h    0.47d
    #Submission to last job:        104809s    1746.82m    29.11h    1.21d

    #### makeDcc.pl 
    #!/usr/bin/perl -W
    # makeDcc.pl <pop> <chr>
    # Reads geno/genotypes_<chr>_<pop>.b35.txt.gz to map the first column of each
    # line to its fourth column, then rewrites each line of the Haploview
    # ld/...LD.gz file as a space-separated record in dcc/ld_<chr>_<pop>.b35.txt,
    # which is gzipped at the end.
    $usage = "usage: makeDcc.pl <pop> <chr>\n";
    $pop   = shift or die $usage;
    $chrom = shift or die $usage;
    $geno  = "geno/genotypes_${chrom}_${pop}.b35.txt.gz";
    $ld    = "ld/genotypes_${chrom}_${pop}.b35.txt.LD.gz";
    $txt   = "dcc/ld_${chrom}_${pop}.b35.txt";
    open(GENO,"zcat $geno | " ) or die "can't open $geno";
    open(LD,  "zcat $ld   | " ) or die "can't open $ld";
    open(TXT, " > $txt      " ) or die "can't open $txt";

    # genotype file: skip the header, remember column 4 keyed by column 1
    <GENO>;
    while (my $row = <GENO>)
    {
	my @cols = split / /, $row;
	$pos{$cols[0]} = $cols[3];
    }
    close(GENO);

    # LD file: skip the header, then emit
    #   pos(marker1) pos(marker2) pop marker1 marker2 col3 col5 col4
    <LD>;
    while (my $row = <LD>)
    {
	my @cols = split /\t/, $row;
	my ($first, $second) = ($cols[0], $cols[1]);
	my $from = $pos{$first};
	my $to   = $pos{$second};
	print TXT "$from $to $pop $first $second $cols[2] $cols[4] $cols[3]\n";
    }
    close(LD);
    close(TXT);
    system("gzip $txt");
    ####

    #### makeDcc.csh
    #!/bin/csh
    # makeDcc.csh - print one makeDcc.pl command line per population and
    # chromosome (the commands are only echoed, not run).
    # The script directory is kept in "scriptDir" rather than "path": in csh,
    # "path" is the command search path and setting it replaces $PATH.
    #set scriptDir = "/cluster/home/daryl/scripts";
    set scriptDir = ".";
    foreach pop (CEU CHB JPT YRI JPT+CHB)
	foreach chr (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
	    echo $scriptDir/makeDcc.pl $pop chr$chr
	end
    end
    ####

    #### makeLdBed.pl
    #!/usr/bin/perl -W
    sub min ($$)
    {
	# Return the smaller of two numbers (the second one when they are equal).
	my ($first, $second) = @_;
	return ($first < $second) ? $first : $second;
    }
    sub encodeDprime($)
    {
	# Encode a value in [-1,1] as a single character: values >= 0 map onto
	# 'a' + val*9 ('a'..'j'), negative values onto 'A' + |val|*9 ('A'..'J').
	# chr() truncates the fractional part.  Dies when the value is out of range.
	# The character code is kept in a lexical instead of the package global
	# $ret that encodeLod also used to write.
	my $val = shift @_;
	if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
	my $code;
	if ($val>=0) { $code = ord('a') + $val*9;}
	else         { $code = ord('A') - $val*9;}
	return chr($code);
    }
    sub encodeRsquared($)
    {
	# Encode an r^2 value: die unless it lies in [0,1], otherwise use the
	# same one-character encoding as encodeDprime.
	my ($val) = @_;
	die "R^2 value ($val) is out of range [0,1]" if ($val < 0) || ($val > 1);
	return encodeDprime($val);
    }
    sub encodeLod($$)
    {
	# Encode a LOD score (together with D') as one character:
	#   LOD >= 2 and |D'| <  0.5  -> 'y'  (high LOD, low D'  -> pink)
	#   LOD >= 2 otherwise        -> 'a' + min(int(LOD - |D'| - 1.5), 9)
	#   LOD <  2 and |D'| > 0.99  -> 'z'  (high D', low LOD  -> blue)
	#   anything else             -> 'a'
	# No longer writes the package global $ret.
	my $lod    = shift @_;
	my $dPrime = shift @_;
	my $absD   = abs($dPrime);
	if ($lod>=2) # high LOD
	{
	    if ($absD<0.5) { return 'y'; } # high LOD, low D'  -> pink
	    return chr(ord('a') + min((int($lod-$absD-1.5)), 9));
	}
	if ($absD>0.99) { return 'z'; } # high D', low LOD  -> blue
	return 'a';
    }
    # Main program: for every population, convert each ld_<chrom>_<pop>.b35.txt.gz
    # file in $inDir (default "data") into $outDir/<pop>_<chrom>.hg17.bed.gz
    # (default "bed").  Consecutive input lines sharing the same start position
    # are merged into one record holding the pair count and three strings of
    # one-character encoded D', r^2 and LOD values.
    #
    # Fixes relative to the earlier version:
    #  - the encoded lists are now initialised from the FIRST line of each file;
    #    before, the first record lost its first pair and, for every file after
    #    the first, started from the lists left over by the previous file;
    #  - the population column of the input is discarded instead of being
    #    assigned to $pop, which is the foreach loop variable (an alias into @pops);
    #  - an empty input file no longer leaves open handles and an empty tmp file.
    $inDir  = shift||"data"; 
    $outDir = shift||"bed";
    @pops   = ("CEU", "CHB", "JPT", "YRI", "JPT+CHB");
    printf("> Starting        \t" . `date` . "\n");
    foreach $pop (@pops)
    {
	opendir(DIR, $inDir) || die "can't open $inDir";
	if ($pop eq "JPT+CHB") { @hmFiles = grep {/^ld_/ && /_JPT/ && /CHB.b35.txt.gz$/} readdir(DIR); }
	else { @hmFiles = grep {/^ld_/ && /_${pop}.b35.txt.gz$/} readdir(DIR); }
	closedir(DIR);
	# NB: $#hmFiles is the last index, i.e. the number of files minus one
	printf "POP:\t$pop\t$#hmFiles\n";
	foreach $hmFile (sort @hmFiles)
	{
	    # file name is ld_<chrom>_<pop>...; only the chrom part is needed
	    (undef, $chrom) = split /_/, $hmFile;
	    $chrom =~ s/chrx/chrX/;
	    $chrom =~ s/chry/chrY/;
	    $outfile = "$outDir/${pop}_${chrom}.hg17.bed";
	    # skip files that were already converted
	    if ((-e $outfile)||(-e "$outfile.gz")) { next; }
	    $tmpFile = "/tmp/${pop}_${chrom}.hg17.bed";
	    printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
	    open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
	    $line = <IN>;
	    # empty input: nothing to write, so do not even create the tmp file
	    if (!defined $line) { close(IN); next; }
	    open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
	    chomp($line);
	    ($chromStart, $chromEnd, undef, $name, undef, $dprime, $rsquared, $lod) = split / /, $line;
	    # the first line opens the first record, exactly like a change of
	    # start position does inside the loop below
	    $ldCount      = 1;
	    $dprimeList   = encodeDprime($dprime);
	    $rsquaredList = encodeRsquared($rsquared);
	    $lodList      = encodeLod($lod, $dprime);
	    while (<IN>)
	    {
	        chomp();
		($chromStartNew, $chromEndNew, undef, $nameNew, undef, $dprime, $rsquared, $lod) = split / /;
		if ($chromStart ne $chromStartNew)
		{
		    # new start position: flush the finished record (start made
		    # 0-based) and open a new one
		    $chromStart--;
		    printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
		    $chromStart   = $chromStartNew;
		    $chromEnd     = $chromEndNew;
		    $name         = $nameNew;
		    $ldCount      = 1;
		    $dprimeList   = encodeDprime($dprime);
		    $rsquaredList = encodeRsquared($rsquared);
		    $lodList      = encodeLod($lod, $dprime);
		}
		elsif ($chromEndNew-$chromStartNew<250000)
		{
		    # same start position and within 250 kb: extend the record
		    $chromEnd     = $chromEndNew;
		    $ldCount++;
		    $dprimeList   .= encodeDprime($dprime);
		    $rsquaredList .= encodeRsquared($rsquared);
		    $lodList      .= encodeLod($lod, $dprime);
		}
	    }
	    close(IN);
	    # flush the last record of the file
	    $chromStart--;
	    printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
	    close(OUT);
	    system("gzip $tmpFile");
	    system("mv $tmpFile.gz $outDir");
	}
    }
    printf("> Finished        \t" . `date` . "\n");
    ####

    #### getMax.csh -- check for consistency by chrom and population
    #!/bin/csh
    # For every dcc/ld_*.b35.txt.gz file, append to maxDist.txt the file name
    # followed by the largest (column 2 - column 1) difference found in the file.
    set out = maxDist.txt
    # start from an empty output file
    rm -f $out
    touch $out
    echo this takes about 4 hours to run completely >> $out
    foreach f (dcc/ld_*.b35.txt.gz)
	# file name first, without a newline, so the awk result lands on the same line
	echo -n "$f    " >> $out
	zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
    end

    #### getSizes.csh -- should all be 249999
    #!/bin/csh
    # For every dcc/*.txt.gz file, append to wcList.txt a label derived from the
    # file name followed by the number of lines left after running uniq on the
    # first space-separated column.
    # NOTE(review): the "should all be 249999" remark in the heading looks like
    # it describes the maximum distance reported by getMax.csh -- confirm.
    set out = wcList.txt
    # start from an empty output file
    rm -f $out
    touch $out
    echo "this takes about 2 hours to run completely"
    foreach f (dcc/*.txt.gz) 
	# label: file name minus two extensions, with "ld_" and "chr" removed and
	# the first remaining "_" turned into a tab
	echo -n $f:r:r "    " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
	zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
    end

    #### load.csh
    #!/bin/csh
    # Create the five per-population hapmapLd* tables in hg17 from the generic
    # hapmapLd.sql definition, then load the per-chromosome bed files into them.
    set db = hg17
    # one table per population: rename hapmapLd in the SQL and feed it to hgsql
    sed 's/hapmapLd/hapmapLdCeu/'    ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
    sed 's/hapmapLd/hapmapLdChb/'    ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
    sed 's/hapmapLd/hapmapLdJpt/'    ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
    sed 's/hapmapLd/hapmapLdYri/'    ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
    sed 's/hapmapLd/hapmapLdChbJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
    # about half an hour to an hour per population
    # -oldTable is given on every call, presumably so each chromosome is added
    # to the table created above rather than replacing it -- confirm
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
	hgLoadBed -noSort -oldTable -strict ${db} hapmapLdCeu CEU_chr${c}.${db}.bed.gz
	hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChb CHB_chr${c}.${db}.bed.gz
	hgLoadBed -noSort -oldTable -strict ${db} hapmapLdJpt JPT_chr${c}.${db}.bed.gz
	hgLoadBed -noSort -oldTable -strict ${db} hapmapLdYri YRI_chr${c}.${db}.bed.gz
	hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChbJpt JPT+CHB_chr${c}.${db}.bed.gz
    end
    # remove the leftover bed.tab (presumably hgLoadBed's intermediate file)
    rm -f bed.tab
    ###


# AFFYHUEX1 TRACK (sugnet Wed Oct  5 12:16:42 PDT 2005)

mkdir hg17
cd hg17
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg17
mkdir gff beds annot
cd gff
# download gff design files

# parse gff script...
#!/usr/bin/perl -w
# parseGff.pl - when called without arguments, print the usage message to
# STDERR and exit with status 1.
if(scalar(@ARGV) == 0) {
    print STDERR "parseGff.pl - Parse out affymetrixes gff annotation 
probesets for human all exon design.
usage:
   parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
    exit(1);
}

sub splitField($) {
    # Return the second space-separated word of the argument (undef when
    # there is no second word).
    my ($text) = @_;
    my $second = (split / /, $text)[1];
    return $second;
}

# Main loop: for every <prefix>.gff file named on the command line, write
#   ../beds/<prefix>.pset.bed - one 6-column BED line per probeset record
#   ../annot/<prefix>.tab     - one tab-separated annotation row per probeset
while($file = shift(@ARGV)) {
    if(!($file =~ /(.+)\.gff/)) {
	die "$file doesn't have .gff suffix\n";
    }
    # everything before ".gff" names the output files
    $prefix = $1;
    print STDERR "Doing file $file.\n";
    open(IN, $file) or die "Can't open $file to read.";
    open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
    open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
    while($line = <IN>) {
	# Only want the probeset records.
	if($line =~ /\tprobeset\t/) {
	    $score = 0;
	    $cds = 0;
	    $bounded = 0;
	    chomp($line);
	    # Strip any Microsoft (carriage return) line ending.
	    $line =~ s/\r$//;
	    @words = split /\t/, $line;
	    # This makes the evidence comma-separated.
	    $words[8] =~ s/\" \"/,/g;
	    # This gets rid of pesky quotes.
	    $words[8] =~ s/\"//g;

	    # Set the score based on the annotation type:
	    # full 200, extended 500, core 900 (first match wins),
	    # then -200 if bounded and +100 if cds; never below 100.
	    if($words[8] =~ /full/) {
		$score = 200;
	    }
	    elsif($words[8] =~ /extended/) {
		$score = 500;
	    }
	    elsif($words[8] =~ /core/) {
		$score = 900;
	    }
	    if($words[8] =~ /bounded/) {
		$score -= 200;
	    }
	    if($words[8] =~ /cds/) {
		$score += 100;
	    }
	    if($score <= 0) {
		$score = 100;
	    }
		
	    # Print out the annotation fields.
	    @fields = split /; /,$words[8];
	    # the BED name is the value of the second attribute
	    $id = splitField($fields[1]);
	    # first attribute value starts the row; the rest follow tab-separated
	    $f = shift(@fields);
	    $f = splitField($f);
	    print ANNOT "$f";
	    while($f = shift(@fields)) {
		# bounded and cds are recorded as 0/1 flags, not as values
		if($f =~ /^bounded/) {
		    $bounded = 1;
		}
		if($f =~ /^cds/) {
		    $cds = 1;
		}
		if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
		    $f = splitField($f);
		    print ANNOT "\t$f";
		}
	    }
	    print ANNOT "\t$bounded\t$cds";
	    print ANNOT "\n";
	    # BED columns: GFF column 1, columns 4 and 5, id, score, column 7
	    print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
	}
    }
    close(IN);
    close(BED);
    close(ANNOT);
}

./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab 

# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
  numIndependentProbes smallint not null,
  probesetId int(11) not null,
  exonClustId int(11) not null,
  numNonOverlapProbes smallint not null,
  probeCount smallint not null,
  transcriptClustId int(11) not null,
  probesetType smallint not null,
  numXHybeProbe smallint not null,
  psrId int(11) not null,
  level varchar(10) not null,
  evidence varchar(255) not null,
  bounded smallint not null,
  cds smallint not null,
  PRIMARY KEY (probesetId)
);
hg17S -A < affyHuEx1Annot.sql 
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg17S -A

# end AFFYHUEX1 track

##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
   ssh hgwdev
   cd /cluster/data/hg17/bed/affyHumanExon
   # Dump the table, drop the header row, skip the first column and subtract 1
   # from the start coordinate, then reload.  "tail -n +2" replaces the
   # obsolete "tail +2" form, which current tail implementations reject.
   echo "select * from affyHuEx1" | hgsql hg17 | \
      tail -n +2 | awk 'BEGIN{OFS="\t"}{print $2,$3-1,$4,$5,$6,$7}' \
    > affyHuEx1.fixed.bed
   hgLoadBed hg17 affyHuEx1 affyHuEx1.fixed.bed

##########################################################################
# NSCAN composite track - (2005-09-29 markd)  loaded proteins 2005-10-13
    cd /cluster/data/hg17/bed/nscan/
    # obtained NSCAN and NSCAN-EST predictions from michael brent's group
    # at WUSTL
    wget http://genome.cse.wustl.edu/predictions/human/hg17_nscan_mm5_9_14_2005/hg17_nscan_mm5_9_14_2005.tar.gz
    tar -zxf hg17_nscan_mm5_9_14_2005.tar.gz 

    wget http://genome.cse.wustl.edu/predictions/human/NCBI35_NSCAN_EST_4-16-2005.tar  
    gzip -9 NCBI35_NSCAN_EST_4-16-2005.tar

    # change protein fasta file to have transcript id in header
    foreach f (chr_ptx/*.ptx)
        awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
    end
    foreach f (NCBI35_NSCAN_EST_4-16-2005/chr_ptx/*.ptx)
        awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
    end

    # load tracks.  Note that these have *utr features, rather than
    # exon features.  Currently ldHgGene creates separate genePred exons
    # for these.
    ldHgGene -gtf -genePredExt hg17 nscanGene  chr_gtf/chr*.gtf
    hgPepPred hg17 generic nscanPep chr_ptx/chr*.fix
    rm -rf chr_* *.tab

    ldHgGene -gtf -genePredExt hg17 nscanEstGene NCBI35_NSCAN_EST_4-16-2005/chr_gtf/chr*.gtf
    hgPepPred hg17 generic nscanEstPep NCBI35_NSCAN_EST_4-16-2005/chr_ptx/chr*.fix
    rm -rf NCBI35_NSCAN_EST_4-16-2005 *.tab

    # update trackDb; need a hg17-specific page to describe informants.
    # Files edited:
    #   human/hg17/nscan.html
    #   human/hg17/trackDb.ra

##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
#  Submitted by Greg Crawford via web site,
#  http://research.nhgri.nih.gov/DNaseHS/May2005/
#  In addition, a file containing the 'randoms' was FTP'ed by Greg
#  Submitted for hg16 -- lifted to hg17.
#  Details of hg16 data prep are in makeHg16.doc

    mkdir /cluster/data/hg17/bed/nhgri
    cd /cluster/data/hg17/bed/nhgri
    cp /cluster/data/hg16/bed/nhgri/hs.bed hs.hg16.bed
    liftOver hs.hg16.bed /gbdb/hg16/liftOver/hg16ToHg17.over.chain \
                hs.hg17.bed hs.unmapped
    grep '^chr' hs.unmapped | wc -l
        # 8 unmapped
    hgLoadBed hg17 nhgriDnaseHs hs.hg17.bed
        # Loaded 14216 elements of size 5
    checkTableCoords hg17 nhgriDnaseHs

# UPDATE WGRNA TRACK (DONE, 2005-10-20, Fan)

  ssh hgwdev
  cd /cluster/data/hg17/bed

  mkdir wgRna-2005-10-20
  cd wgRna-2005-10-20

# Received the data file, wgtrack_no_bin_oct2005.txt, from Michel Weber's email 
# (Michel.Weber@ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg17/bed/wgRna-2005-10-20.

  cp wgtrack_no_bin_oct2005.txt wgRna.tab
  vi wgRna.tab
  
# edit wgRna.tab to take out the first line of data field labels.

    hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab

# Compared to 8/24/05 data, a few records were changed.

##########################################################################
# REBUILD hg17.gnfAtlas2Distance TABLE.  SOMEHOW IT HAD MUCH FEWER RECORDS. (DONE 10/27/05, Fan)

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create expression distance table - takes about an hour
    hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
        hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
        -lookup=knownToGnfAtlas2 &

#       hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
#       row count changed to 32458000
			
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 10/29/05 JK)

# Make the working directory
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir allenBrain
    cd allenBrain

# Remap the probe alignments from mm7 to hg17
    # Map the mm7 probe alignments through the mm7->hg17 chains, sort the
    # result by psl columns 14 and 16, then recompute the match counts against
    # the hg17 nibs and the probe sequences.
    # (The pslMap line needs its trailing backslash; without it the pipeline
    # ends there and the following "| sort" is a syntax error.)
    zcat /cluster/data/mm7/bed/bedOver/mm7.hg17.over.chain.gz \
        |  pslMap -chainMapFile -swapMap \
	       /cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \
	|  sort -k 14,14 -k 16,16n > unscored.psl
    pslRecalcMatch unscored.psl /cluster/data/hg17/nib \
        /cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl
    

# Load the database
   hgsql hg17 < ~/kent/src/hg/lib/allenBrainUrl.sql
   hgsql hg17 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
   hgLoadPsl hg17 allenBrainAli.psl
   mkdir /gbdb/hg17/allenBrain
   ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg17/allenBrain/allProbes.fa
   hgLoadSeq hg17 /gbdb/hg17/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain	
   hgMapToGene hg17 allenBrainAli -type=psl knownGene knownToAllenBrain 

##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 11/07/05 JK)

# Make directory on san for cluster job and copy in sequence
    ssh pk
    mkdir /san/sanvol1/scratch/hg17/nibbPics
    cd /san/sanvol1/scratch/hg17/nibbPics
    cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .

# Make parasol job dir and sequence list files
    mkdir run
    cd run
    mkdir psl
    ls -1 /cluster/sanvol1/scratch/hg17/nib/*.nib > genome.lst
    echo ../nibbImageProbes.fa > rna.lst

# Create parasol gensub file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'

# Create parasol batch
    # The query list written earlier in this section is rna.lst (there is no
    # mrna.lst), so that is the second list handed to gensub2.
    gensub2 genome.lst rna.lst gsub spec
    para create spec

# Do para try/push/time etc.
#Completed: 46 of 46 jobs
#CPU time in finished jobs:      11818s     196.97m     3.28h    0.14d  0.000 y
#IO & Wait Time:                   145s       2.41m     0.04h    0.00d  0.000 y
#Average job time:                 260s       4.33m     0.07h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1022s      17.03m     0.28h    0.01d
#Submission to last job:          1060s      17.67m     0.29h    0.01d

# Make sort and filter
    # Concatenate the per-job psl files, sort on the query columns, keep the
    # best alignments with pslReps, re-sort by target and start, and strip the
    # nib directory prefix and the .nib suffix.
    # The dots in the sed patterns are escaped so that they match literal dots
    # only; the unescaped ".nib" would also hit any character followed by
    # "nib" earlier on the line, e.g. inside a query name.
    catDir psl | sort -k 10 \
        | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
	| sort -k 14,14 -k 16,16n \
	| sed 's/\.\.\/\.\.\/nib\/chr/chr/' \
	| sed 's/\.nib//' > ../nibbImageProbes.psl

# Make bed file and copy in stuff
    ssh hgwdev
    cd /cluster/data/hg17/bed
    mkdir nibbPics
    cd nibbPics
    cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
    cp /san/sanvol1/scratch/hg17/nibbPics/nibbImageProbes.psl .

# Load into database
    ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg17/nibbImageProbes.fa
    hgLoadSeq hg17 /gbdb/hg17/nibbImageProbes.fa
    hgLoadPsl hg17 nibbImageProbes.psl

###########################################################################
# EXONIPHY WITH DOG (acs, 11/22/05) -- MM7, RN3, CANFAM2, HG17

    # first build 4-way multiz alignment from syntenic nets (helps reduce
    # false positive predictions due to paralogous alignments)

    # (prepare mafNet files from syntenic nets and copy to
    # /cluster/bluearc/hg17/mafNetSyn; do this for mm7, rn3, canFam2,
    # and galGal2)

    # make output dir and run dir
    ssh pk
    cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2

    mkdir -p mafSyn runSyn
    cd runSyn

    # create scripts to run multiz on cluster
    cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
    set c = $1
    set multi = /scratch/$user/multiz.hg17Mm7Rn3CanFam2.$c
    set pairs = /cluster/bluearc/hg17/mafNetSyn

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        rm -fr $multi
        exit
    endif

    #  special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        cp $multi/$2/$c.maf $3
        exit
    endif 
        
    set s1 = $2
    set s2 = $3
    set flag = $4

    # locate input files -- in pairwise dir, or multiple dir
    set d1 = $multi
    set d2 = $multi
    if (-d $pairs/$s1) then
        set d1 = $pairs
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
    endif
    set f1 = $d1/$s1/$c.maf
    set f2 = $d2/$s2/$c.maf
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    if (-s $f1 && -s $f2) then
        echo "Aligning $f1 $f2 $flag"
        /cluster/bin/penn/multiz.v10.5 $f1 $f2 $flag > $out/$c.tmp.maf
        echo "Ordering $c.maf"
        /cluster/bin/penn/maf_project $out/$c.tmp.maf hg17.$c > $out/$c.maf
    else if (-s $f1) then
        cp $f1 $out
    else if (-s $f2) then
        cp $f2 $out
    endif
'EOF'
# << for emacs
    chmod +x oneMultiz.csh

    cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
oneMultiz.csh $c mm7 rn3 0
oneMultiz.csh $c mm7rn3 canFam2 1
# get final alignment file
oneMultiz.csh $c mm7rn3canFam2 /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf
#cleanup
oneMultiz.csh $c
'EOF'
# << for emacs
    chmod +x allMultiz.csh

cat > gsub << 'EOF'
#LOOP
allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
    cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst
    set path = (/parasol/bin $path);rehash
    gensub2 chrom.lst single gsub jobList
    para create jobList
        # 46 jobs
    para try; para check
    para push

    # build chromosome-by-chromosome SS files

    cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2
    mkdir run-ss-syn
    cd run-ss-syn
    mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn
    cat > makeSS.csh << 'EOF'
#!/bin/csh -fe
set c = $1
/cluster/bin/phast/msa_view -i MAF -o SS /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa | gzip -c >  /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$c.ss.gz
'EOF'
# << for emacs
    chmod +x makeSS.csh

    rm -f jobList
    foreach chr (`cut -f 1 /cluster/data/hg17/chrom.sizes`)
	echo "makeSS.csh $chr" >> jobList
    end

    para create jobList
        # 46 jobs
    para try; para check
    para push

    # now train hmm, with indel model
    # note: commands below require bash

    # first get a clean set of genes for training (with --indel-strict)
    mkdir -p /cluster/data/hg17/bed/exoniphy/train
    cd /cluster/data/hg17/bed/exoniphy/train
    mkdir -p stats genes
    CHROMS="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22"
    for chr in ${CHROMS} ; do 
	echo $chr 
	zcat /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss.gz | clean_genes genes/refGene.$chr.gff - --stats stats/$chr.stats --conserved --indel-strict --groupby exon_id --offset3 4 --offset5 4 > genes/refGene.$chr.clean.gff 
    done

    # get conserved noncoding seqs and add to GFFs
    mkdir -p cns
    for chr in ${CHROMS} ; do 
	    echo $chr
	    featureBits -bed=cns/$chr.bed -chrom=$chr hg17 phastConsElementsPaper \!knownGene:exon:100 \!refGene:exon:100 \!mrna \!ensGene \!intronEst \!twinscan
	cp genes/refGene.$chr.clean.gff genes/refGene.$chr.withCNS.gff 
	awk '{printf "%s\tphastCons\tCNS\t%d\t%d\t.\t.\t.\texon_id \"CNS.%s\"\n", $1, $2+1, $3, $4}' cns/$chr.bed >> genes/refGene.$chr.withCNS.gff 
    done

    # now train HMM
    # note: actually have to unzip SS files before this step
    rm -f alns gffs
    for chr in ${CHROMS} ; do 
	echo /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss >> alns 
	echo genes/refGene.$chr.withCNS.gff >> gffs 
    done
    hmm_train -m '*alns' -c ~/phast/data/exoniphy/default.cm -g '*gffs' -R exon_id -i SS -I CDS,background,CNS,5\'splice,3\'splice,prestart -t "((hg17,(mm7,rn3)),canFam2)" > indels.hmm

    # training complete; now run exoniphy genome-wide

    # first need to split up alignments
    
    mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-split
    cd /cluster/data/hg17/bed/exoniphy/test/run-split
    
    cat > doSplit.csh << 'EOF'
#!/bin/csh -fe
set c = $1
mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c
/cluster/bin/phast/msa_split /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa -i MAF --windows 100000,0 --between-blocks 5000 --min-informative 1000 --out-format SS --out-root /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c --tuple-size 3
gzip /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c*.ss
'EOF'
# << for emacs
    chmod +x doSplit.csh

    rm -f jobList
    for file in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/chr*.maf ; do echo doSplit.csh `basename $file .maf` >> jobList ; done

    para create jobList
        # 43 jobs
    para try; para check
    para push

    # now set up exoniphy run

    mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-exoniphy
    cd /cluster/data/hg17/bed/exoniphy/test/run-exoniphy
    # Create the training directory before copying into it: doExoniphy.sh reads
    # .../exoniphy/training/indels.hmm, and if "training" does not exist yet,
    # cp would create a plain file of that name instead.
    mkdir -p /cluster/bluearc/hg17/exoniphy/training
    cp -p ../../train/indels.hmm /cluster/bluearc/hg17/exoniphy/training
    mkdir -p /cluster/bluearc/hg17/exoniphy/GFF
    
    cat > doExoniphy.sh << 'EOF'
#!/usr/local/bin/bash
# doExoniphy.sh <fragment.ss.gz>
# Run exoniphy on one gzipped SS fragment and write the predictions to
# /cluster/bluearc/hg17/exoniphy/GFF/<chrom>/<fragment>.gff

ssFile=$1
frag=$(basename "$ssFile" .ss.gz)
# chromosome name = everything before the first dot of the fragment name
chrName=${frag%%.*}
# fragment number: (second "-" or "." separated field + 10000) / 100000, as an integer
fragNo=$(awk 'BEGIN{FS="[-.]"} {printf "%d\n", ($2+10000)/100000}' <<< "$frag")

outDir=/cluster/bluearc/hg17/exoniphy/GFF/$chrName
[ -d "$outDir" ] || mkdir -p "$outDir"

zcat "$ssFile" | /cluster/bin/phast/exoniphy - --hmm  /cluster/bluearc/hg17/exoniphy/training/indels.hmm --reflect-strand --extrapolate default --score --indels --alias "hg17=human; mm7=mouse; rn3=rat; canFam2=dog" --seqname $chrName --idpref $chrName.$fragNo  > "$outDir/$frag.gff"
'EOF'
# << for emacs
    chmod +x doExoniphy.sh

    rm -f jobList
    for dir in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/* ; do find $dir -name '*.ss.gz' | awk '{printf "doExoniphy.sh %s\n", $1}' >> jobList ; done

    para create jobList
        # 27070 jobs
    para try; para check
    para push

#Completed: 27059 of 27070 jobs
#Crashed: 11 jobs
#CPU time in finished jobs:    8573545s  142892.41m  2381.54h   99.23d  0.272 y
#IO & Wait Time:                 73412s    1223.54m    20.39h    0.85d  0.002 y
#Average job time:                 320s       5.33m     0.09h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             593s       9.88m     0.16h    0.01d
#Submission to last job:         22823s     380.38m     6.34h    0.26d    

    # crashed jobs all on random chroms, chrM, etc., and appear to be
    # due to all species not being present; okay to ignore

    # collect predictions and create track
    rm -f exoniphy.gff
    for dir in /cluster/bluearc/hg17/exoniphy/GFF/chr* ; do \
	rm -f files.* tmp.gff ;\
	find $dir -name "chr*.gff" > files ;\
	split -l 1000 files files. ;\
	for l in files.* ; do cat `cat $l` >> tmp.gff ; done ;\
	refeature --sort tmp.gff >> exoniphy.gff ;\
    done
    ldHgGene -genePredExt -gtf hg17 exoniphyDog exoniphy.gff


# COW SYNTENY (Done, Heather, Dec. 2005)
# Data from Harris A. Lewin <h-lewin@uiuc.edu>

ssh hgwdev
cd /cluster/data/hg17/bed
mkdir syntenyCow
cd syntenyCow

hgLoadBed -noBin hg17 syntenyCow syntenyCow.bed

# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra
 
###########################################################################
# New Conservation track (WORKING 2005-12-15 kate)

# Pairwise alignments needed for: monDom2, danRer3, bosTau2
# Use existing alignments for:  
# macaque_rheMac1
# rat_rn3 
# mouse_mm7
# dog_canFam2
# chicken_galGal2
# xenopus_xenTro1
# fugu_fr1
# rabbit_oryCun1
# armadillo_dasNov1
# elephant_loxAfr1
# tenrec_echTel1
# tetraodon_tetNig1

#########################################################################
# BLASTZ danRer3 (DONE - 2005-12-20 kate)
# Includes both randoms
    ssh pk
    mkdir /cluster/data/hg17/bed/blastz.danRer3.2005-12-20
    cd /cluster/data/hg17/bed
    ln -s blastz.danRer3.2005-12-20 blastz.danRer3
    cd blastz.danRer3

    cat << 'EOF' > DEF
# human target, zebrafish query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for  hg16-fr1, danrer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q

# TARGET: Human hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/cluster/bluearc/hg17/linSpecRep.notInZebrafish
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes

# QUERY: zebrafish danRer3
# Use all chroms, including both randoms (chrUn and chrNA)
SEQ2_DIR=/san/sanvol1/scratch/danRer3/nib
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=1000

BASE=/cluster/data/hg17/bed/blastz.danRer3.2005-12-20
TMPDIR=/scratch/tmp
'EOF'
    # << happy emacs

    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose  \
        -stop=net \
	`pwd`/DEF >& blastz.out &

    # mistakenly started this in blastz.danRer3.2005-12-18 dir --
    # need to move DEF file and blastz.out to 2005-12-20 dir.

    # bogus stop at net step -- thinks it can't find chains
    # I'm just restarting there
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose  \
        -continue=net  \
	`pwd`/DEF >& blastz.2.out &

    # stopped because vsDanRer3 downloads already there from
    # previous run. 
    ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/hg17/vsDanRer3"
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose  \
        -continue=download  \
	`pwd`/DEF >& blastz.3.out &

    # measurements
    ssh hgwdev "featureBits hg17 chainDanRer2Link" >& danRer2.fb; cat danRer2.fb
        # 70696998 bases of 2866216770 (2.467%) in intersection
    ssh hgwdev "featureBits hg17 chainDanRer3Link" >& danRer3.fb; cat danRer3.fb
        # 55625762 bases of 2866216770 (1.941%) in intersection

    # not sure why there's lower coverage from the newer assembly.
    # It's possibly due to different parameters used in the other
    # alignment.  Rachel is experimenting with hg18/danRer3, and
    # if warranted, we might replace this later

#########################################################################
# BLASTZ bosTau2 (DONE - 2005-12-19 kate)
    # Align cow (bosTau2, query) against human (hg17, target): make a dated
    # work directory, point the blastz.bosTau2 link at it, write the DEF
    # parameter file and launch doBlastzChainNet.pl in the background.
    ssh pk
    mkdir /cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
    cd /cluster/data/hg17/bed
    # drop the existing blastz.bosTau2 entry so the link can be re-created
    rm blastz.bosTau2
    ln -s blastz.bosTau2.2005-12-19 blastz.bosTau2
    cd blastz.bosTau2

    # The heredoc delimiter is quoted, so DEF is written verbatim.
    cat << 'EOF' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.x86_64

# using parameter used when not using lineage specific repeat
# abridging.  This parameter restricts the # matches used by
# dynamic masking.  (We can't currently use LSR repeat abridging
# when either assembly sequence is in .2bit).

BLASTZ_M=50

# TARGET: Human (hg17)
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow (bosTau2)
# chunk it as we can't do whole-genome on 2bits
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=10000

BASE=/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
TMPDIR=/scratch/tmp
'EOF'
    # << happy emacs
    # use chain parameters for "close" species
    # (stdout and stderr of the background run are collected in blastz.out)
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF >& blastz.out &

    # measurements: featureBits on the bosTau1 and bosTau2 chain link
    # tables, each saved to a .fb file and echoed
    ssh hgwdev "featureBits hg17 chainBosTau1Link" >& bosTau1.fb; cat bosTau1.fb
    ssh hgwdev "featureBits hg17 chainBosTau2Link" >& bosTau2.fb; cat bosTau2.fb

    #	swapping to get the lift over file in the other direction (Hiram)
    #	The -swap run reuses the hg17-vs-bosTau2 DEF file; its stdout and
    #	stderr go to swap.out.
    ssh pk
    mkdir /cluster/data/bosTau2/bed/blastz.hg17.swap
    cd /cluster/data/bosTau2/bed
    ln -s blastz.hg17.swap blastz.hg17
    cd blastz.hg17.swap
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19/DEF > swap.out 2>&1 &
    #	this failed during the load of the tables, but that is OK, we
    #	just wanted the liftOver files from this

    #	manually cleaned this up since the run failed during the MySQL
    #	load due to out of space problems.  These tables do not need to
    #	be loaded anyway.
    #	(log in to kkstore02 with ssh, like every other host hop in this file)
    ssh kkstore02
    cd /cluster/data/bosTau2/bed/blastz.hg17.swap

    #	remove the intermediate psl, chain and net output of the swap run
    rm -fr psl/

    rm -fr axtChain/run/chain/
    rm -f  axtChain/noClass.net
    rm -fr axtChain/net/
    rm -fr axtChain/chain/

#########################################################################
# BLASTZ rheMac2 (2006-02-08 kate)
    # Align rhesus (rheMac2, query) against human (hg17, target).  The
    # pipeline was started once and then restarted three times with
    # -continue at the chainRun, chainMerge and download steps.
    ssh pk
    mkdir /cluster/data/hg17/bed/blastz.rheMac2.2006-02-08
    cd /cluster/data/hg17/bed
    ln -s blastz.rheMac2.2006-02-08 blastz.rheMac2
    cd blastz.rheMac2

    # NOTE(review): the header comment inside DEF says "hg18", but the
    # SEQ1_* paths and the "TARGET" comment both name hg17.
    # BASE points at SAN scratch space rather than at this work directory.
    cat << 'EOF' > DEF
# macaca mulatta vs. hg18
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64

# TARGET - hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes

# QUERY - macaca mulatta
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes 

BASE=/san/sanvol1/scratch/hg17/blastz.rheMac2/
RAW=$BASE/raw
TMPDIR=/scratch/tmp
'EOF'
    # << happy emacs
    # use chain parameters for "close" species
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF >& blastz.out &

    # restart at the chainRun step, with a separate log file
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun \
	`pwd`/DEF >& continueChainRun.out &

    # NOTE: must set -fileServer (e.g. to pk) if using base dir on SAN

    # restart at the chainMerge step, this time with -fileServer=pk
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -fileServer=pk \
        -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainMerge \
	`pwd`/DEF >& continueChainMerge.out &

    # netClass was crashing as it expected a bin in the
    # unsplit gap table.  Robert added the bin field.

    # restart at the download step (no chain options on this invocation)
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -fileServer=pk \
	-continue=download \
	`pwd`/DEF >& continueDownload.out &

    # measurements: featureBits on the rheMac1 and rheMac2 chain link
    # tables, each saved to a .fb file and echoed
    ssh hgwdev "featureBits hg17 chainRheMac1Link" >& rheMac1.fb; cat rheMac1.fb
    ssh hgwdev "featureBits hg17 chainRheMac2Link" >& rheMac2.fb; cat rheMac2.fb

    # copy the net mafs to the SAN directory that holds the per-species
    # mafNet inputs
    ssh kkstore02 
    cd /cluster/data/hg17/bed/blastz.rheMac2
    cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/rheMac2

# SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2) 
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# (DONE, 2006-03-22, hartera) 
    # Do the swap of hg17/rheMac2 alignments over to rheMac2 to produce
    # rheMac2/hg17 alignments. 
    ssh pk
    cd /cluster/data/hg17/bed/blastz.rheMac2
    # use chain parameters for "close" species
    # The -swap run uses the DEF file in this directory and runs in the
    # background; stdout and stderr are collected in swap.log.
    /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
        `pwd`/DEF >& swap.log &
    # Took about 3 hours 40 minutes to run.

#############################################################################
# 17-WAY MULTIZ ALIGNMENTS (DONE - 2005-12-20 kate)
#       # redo fix overlaps from xenTro1 and tetNig1 (2006-04-08 kate)

    # copy net mafs to cluster-friendly storage for multiz run (2006-01-25 kate)

    # monDom2 net mafs go to the SAN mafNet directory first
    ssh kkstore01
    cd /cluster/data/hg17/bed/blastz.monDom2
    cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/monDom2

    # dated work directory for the multiz run, with an undated link to it
    ssh kkstore02
    cd /cluster/data/hg17/bed
    mkdir -p multiz17way.2005-12-20
    ln -s multiz17way.2005-12-20 multiz17way
    cd multiz17way

    # copy MAF's to cluster-friendly server
    # These MAF's already on bluearc:
    #  canFam2, fr1, galGal2, panTro1, rn3
    # so they are linked into the SAN directory rather than copied
    mkdir -p /san/sanvol1/scratch/hg17/mafNet
    cd /san/sanvol1/scratch/hg17/mafNet
    ln -s /cluster/bluearc/hg17/mafNet/{*} .

    # copy others: each species' mafNet directory becomes ./<species>
    # NOTE(review): this list has rheMac1 and monDom1, while 17way.nh names
    # rheMac2 and monDom2; those two are copied by separate commands.
    foreach s (rheMac1 oryCun1 dasNov1 \
            loxAfr1 bosTau2 monDom1 xenTro1 tetNig1 danRer3)
        echo $s
        cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
    end
    # a few more
    set s = echTel1
    cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
    set s = mm7
    cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
    set s = canFam2
    cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
    set s = rheMac2
    cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s

    # thanks for the tree, Hiram! Taken from mm7 17way...
    # Hiram says this is derived from the latest ENCODE
    # tree, with some species removed and branch lengths
    # adjusted.  The ENCODE tree  from the Sept. freeze is:
    # ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/phylo/tree_4d.tba.v2.nh
    # Write the 17-species tree (names are <common>_<assembly>, with
    # branch lengths) and derive a distance-from-human table from it.
    cd /cluster/data/hg17/bed/multiz17way
    cat << '_EOF_' > 17way.nh
(((((((((
(human_hg17:0.006690,chimp_panTro1:0.007571):0.024272,
  macaque_rheMac2:0.0592):0.023960,
  ((rat_rn3:0.081728,mouse_mm7:0.077017):0.229273,
      rabbit_oryCun1:0.206767):0.1065):0.023026,
(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
armadillo_dasNov1:0.149862):0.015994,
(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
monodelphis_monDom2:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro1:0.782453):0.156067,
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
    zebrafish_danRer3:0.782561):0.156067);
'_EOF_'

    # tree drawing goes to 17way.ps, the all_dists output to
    # 17way.distances.txt
    /cluster/bin/phast/draw_tree 17way.nh > 17way.ps
    /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
    # keep the lines that mention hg17, sort them numerically on column 3
    # and print column 3 (4 decimals), a tab, then column 2
    grep hg17 17way.distances.txt | sort -k3,3n | \
        awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
    # edit distances.txt to include featureBits, and chain parameters
    # from blastz run.
    cat distances.txt
        # 0.0143  chimp_panTro1
        # 0.0902  macaque_rheMac2
        # 0.2563  armadillo_dasNov1
        # 0.2651  dog_canFam2
        # 0.2677  elephant_loxAfr1
        # 0.2766  cow_bosTau2
        # 0.3682  rabbit_oryCun1
        # 0.4226  tenrec_echTel1
        # 0.4677  mouse_mm7
        # 0.4724  rat_rn3
    # use loose chain params and score from here, down (5000)
        # 0.7119  monodelphis_monDom1
        # 0.9847  chicken_galGal2
        # 1.4357  xenopus_xenTro1
        # 1.6577  tetraodon_tetNig1
        # 1.6983  fugu_fr1
        # 1.7480  zebrafish_danRer3

    # the order in the browser display will be by tree topology,
    #  not by distance, so it will be: 
#  >>         # 0.0143  chimp_panTro1
#  >>         # 0.0902  macaque_rheMac2
#  >>         # 0.4677  mouse_mm7
#  >>         # 0.4724  rat_rn3
#  >>         # 0.3682  rabbit_oryCun1
#  >>         # 0.2651  dog_canFam2
#  >>         # 0.2766  cow_bosTau2
#  >>         # 0.2563  armadillo_dasNov1
#  >>         # 0.2677  elephant_loxAfr1
#  >>         # 0.4226  tenrec_echTel1
#  >>         # 0.7119  monodelphis_monDom1
#  >>         # 0.9847  chicken_galGal2
#  >>         # 1.4357  xenopus_xenTro1
#  >>         # 1.6577  tetraodon_tetNig1
#  >>         # 1.6983  fugu_fr1
#  >>         # 1.7480  zebrafish_danRer3

    # make output dir and run dir
    ssh pk
    cd /cluster/data/hg17/bed/multiz17way.2005-12-20

    # create species list and stripped down tree for autoMZ
    #   tmp.nh:         17way.nh with the lowercase "name_" prefixes, the
    #                   ":length" branch lengths and the ";" removed
    #   tree-commas.nh: tmp.nh joined onto a single line
    #   tree.nh:        that line with blanks removed and commas turned
    #                   into blanks
    #   species.lst:    tree.nh without parentheses, i.e. the
    #                   blank-separated assembly names
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir -p maf run
    cd run

    # stash binaries: private copies of the multiz v11 programs in
    # run/penn, which autoMultiz.csh puts at the front of its path
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

# autoMultiz.csh <chrom> <output maf>
#   Works in /scratch/tmp/hg17/multiz.<chrom>, which is emptied first.
#   Copies tree.nh and species.lst from the parent of the run directory,
#   then for every species except hg17 stages the pairwise maf as
#   hg17.<species>.sing.maf: unzipped from <chrom>.maf.gz, copied from
#   <chrom>.maf, or, when neither exists, a file holding only a maf
#   header line.  Runs the stashed autoMZ on those files, copies the
#   resulting <chrom>.maf to the output path and removes the work dir.
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
    set db = hg17
    set c = $1
    set maf = $2
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /san/sanvol1/scratch/$db/mafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../{tree.nh,species.lst} $tmp
    pushd $tmp
    foreach s (`cat species.lst`)
        set in = $pairs/$s/$c.maf
        set out = $db.$s.sing.maf
        if ($s == hg17) then
            continue
        endif
        if (-e $in.gz) then
            zcat $in.gz > $out
        else if (-e $in) then
            cp $in $out
        else
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $maf
    rm -fr $tmp
'EOF'
# << happy emacs
    chmod +x autoMultiz.csh

# gensub2 template: one autoMultiz.csh job per line of chrom.lst, with
# $(root1) standing for the chromosome name.
# NOTE(review): "{check out line+ ...}" is presumably parasol's check that
# the named output file has at least one line -- confirm.
cat  << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz17way.2005-12-20/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs

    # chromosome names are the first column of chrom.sizes
    awk '{print $1}' /cluster/data/hg17/chrom.sizes > chrom.lst

    # REDO FOR OVERLAPS (2006-04-07 kate)
    # NOTE(review): no mkdir of a fresh ../maf is recorded after this mv,
    # although the spec above writes into the maf directory -- confirm.
    mv ../maf ../maf.old
    # edit spec file to fix maf dir path
    gensub2 chrom.lst single spec jobList
    para create jobList
        # 46 files
    para try
    para check
    para push
    para time > run.time
        # 36 hrs (not typical -- previous runs were ~16 hrs)


# PHASTCONS CONSERVATION (2006-01-05 kate)
# Redone when multiz redone to fix overlaps (2006-04-12)
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track.  Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliott Margulies' model 
# from the (37-way) ENCODE alignments.
#
# NOTE: Redone 3/20/06, adding rheMac2 to non-informative options,
#       by recommendation of Adam Siepel, to correct unwanted
#       high conservation in regions with primate-only alignments

    # NOTE: reusing cluster-friendly chrom fasta files created earlier
    #cd /cluster/data/hg17
    #foreach f (`cat chrom.lst`)
        #echo $f
        #cp $f/*.fa /cluster/bluearc/hg17/chrom
    #end

    # Split chromosome MAF's into windows and use to generate
    # "sufficient statistics" (ss) files for phastCons input
    # NOTE: as the SAN fs has lotsa space, we're leaving these
    # big (temp) files unzipped, to save time during phastCons run.
    # Note also the larger chunk sizes from previous runs -- this
    # reduces run-time on the split, slows down the actual phastCons
    # enough so jobs don't crash (jobs are very quick, just a minute
    # or so), and according to Adam, will produce better results.
    # The previous small chunks were probably required by
    # the phyloFit step, which we are no longer using for the
    # human alignments.
    ssh pk
    mkdir /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
    cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
    # the tree model is taken from the mm7 conservation run
    cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
        # edit, changing rheMac1 -> rheMac2
    mkdir run.split
    cd run.split
    # start from an empty output directory on the SAN
    set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss
    rm -fr $WINDOWS
    mkdir -p $WINDOWS

    # doSplit.csh <chrom>
    #   Works inside the SAN ss directory: recreates the <chrom>
    #   subdirectory, runs msa_split on <chrom>.maf with the chromosome
    #   fasta given via -M, writing SS-format output under the root
    #   <chrom>/<chrom>, then appends "Done" to <chrom>.done (the file
    #   the parasol job line checks for).
    cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
    set MAFS = /cluster/data/hg17/bed/multiz17way.2005-12-20/maf
    set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss
    cd $WINDOWS
    set c = $1
    echo $c
    rm -fr $c
    mkdir $c
    /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
        -M /cluster/bluearc/hg17/chrom/$c.fa \
        -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
    echo "Done" >> $c.done
'EOF'
# << happy emacs
    chmod +x doSplit.csh

    # Build the parasol job list: one doSplit.csh job per chromosome maf.
    # $f:t:r strips the directory and the .maf extension, leaving the
    # chromosome name; each job is checked through its $WINDOWS/<chrom>.done
    # file.
    rm -f jobList
    foreach f (../../maf/*.maf) 
        set c = $f:t:r
	echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
    end
    
    para create jobList
        # 46 jobs
    para try
    para check
    para push
# CPU time in finished jobs:       9511s     158.52m     2.64h    0.11d  0.000 y
# IO & Wait Time:                  5391s      89.85m     1.50h    0.06d  0.000 y
# Average job time:                 324s       5.40m     0.09h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2354s      39.23m     0.65h    0.03d
# Submission to last job:          2358s      39.30m     0.66h    0.03d

    # check tree model on 5MB chunk, using params recommended by Adam,
    # (to verify branch lengths on 2X species)
    # he ok'ed the results -- not necessary for next human run
    ssh kolossus
    cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
    # phyloFit is run on one chr7 ss window, with the tree topology read
    # from ../tree-commas.nh.
    # NOTE(review): -o presumably sets the output name root (phyloFit.tree)
    # -- confirm against the phyloFit usage text.
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
        --tree "`cat ../tree-commas.nh`" \
        /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss/chr7/chr7.115000658-120000000.ss \
        -o phyloFit.tree

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    # NOTE(review): this "cd .." has to start from cons/run.split so that
    # run.cons lands under cons/ -- doPhast.csh reads ../elliotsEncode.mod
    # and in.list is later written to cons/run.cons.  The preceding
    # phyloFit commands cd to cons/ itself, so confirm which session this
    # was typed in.
    cd ..
    mkdir run.cons
    cd run.cons
    # doPhast.csh <chrom> <ss file root> <expected-length> <target-coverage> <rho>
    #   Copies the ss window and elliotsEncode.mod to /scratch/tmp/<root>,
    #   runs phastCons there (panTro1 and rheMac2 listed as
    #   --not-informative), writing the --viterbi elements to <root>.bed
    #   and the scores from stdout to <root>.pp, then moves both files to
    #   pp/<chrom> and bed/<chrom> on the SAN and removes the work dir.
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
   --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative panTro1,rheMac2 \
	--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
    # emacs happy
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    # The three constants are passed to doPhast.csh as expected-length
    # (14), target-coverage (.008) and rho (.28), in that order.
    cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
    #	happy emacs

    # Create parasol batch and run it
    # in.list holds every ss window path relative to the SAN cons dir,
    # with the trailing .ss removed
    pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
    ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/hg17/bed/multiz17way/cons/run.cons/in.list
    popd

    gensub2 in.list single template jobList
    para create jobList
        # 333 jobs
    para try
    para check
    para push
    # NOTE: these jobs go fast -- some crashed apparently having
    # difficulty accessing input files.  Just restart them and
    # they work
#CPU time in finished jobs:      15520s     258.67m     4.31h    0.18d  0.000 y
#IO & Wait Time:                 15796s     263.27m     4.39h    0.18d  0.001 y
#Average job time:                  94s       1.57m     0.03h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             180s       3.00m     0.05h    0.00d
#Submission to last job:         48266s     804.43m    13.41h    0.56d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
    #	The sed's and the sort get the file names in chrom,start order.
    #	(Hiram tricks -- split each path into columns on [.-/], marking the
    #	separators with x, y and z so the columns can be sorted and the
    #	file name restored afterwards.)
    #	Warning: the sort columns depend on how deep you are in the dir.
    #	For ./bed/<chrom>/<chrom>.<start>-<end>.bed, column 7 is the chrom
    #	name and column 9 the window start.
    #	The awk step rewrites each element as chrom, start, end, "lod=" plus
    #	column 5, and column 5 again, before lodToBedScore.
    find ./bed -name "chr*.bed" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
	awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
	/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons

    # load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/cons
    hgLoadBed -strict hg17 phastConsElements17way mostConserved.bed
        # Loaded 2212445 elements of size 5
    # compare with previous tracks
    hgsql hg17 -e "select count(*) from phastConsElements10way"
        # 2011952
    hgsql hg17 -e "select count(*) from phastConsElements"
        # 1601903
    # Try for 5% overall cov, and 70% CDS cov (used elen=14, tcov=.008, rho=.28)
    
    featureBits hg17 -enrichment refGene:cds phastConsElements17way
        # refGene:cds 1.065%, phastConsElements17way 5.116%, both 0.759%, cover 71.27%, enrich 13.93x

    # compare with previous tracks
    featureBits hg17 -enrichment refGene:cds phastConsElements10way
        # refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
    featureBits hg17 -enrichment refGene:cds phastConsElements
        # refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x

    # experiments
    # previous tracks
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements
        # refGene:cds 0.873%, phastConsElements 4.497%, both 0.630%, cover 72.10%, enrich 16.04x
    hgsql hg17 -e "select count(*) from phastConsElements where chrom='chr7'"
        # 81785
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements10way
        # refGene:cds 0.873%, phastConsElements10way 4.700%, both 0.602%, cover 68.94%, enrich 14.67x
    hgsql hg17 -e "select count(*) from phastConsElements10way where chrom='chr7'"
        # 102959

    # len=13, cov=.007, rho=.27
    # looks best -- similar chr7 measurements to previous tracks
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_007_27
        # refGene:cds 0.874%, phastConsElements17way_13_007_27 4.854%, both 0.607%, cover 69.43%, enrich 14.31x
    hgsql hg17 -e "select count(*) from phastConsElements17way_13_007_27 where chrom='chr7'"

    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_005_28
       # refGene:cds 0.873%, phastConsElements17way_13_005_28 4.802%, both 0.612%, cover 70.12%, enrich 14.60x
    hgsql hg17 -e "select count(*) from phastConsElements17way_13_005_28 where chrom='chr7'"
       # 95203 

    # experiments with other parameters, below
    # len=15, cov=.10 
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_10
        # refGene:cds 0.873%, phastConsElements17way 7.989%, both 0.627%, cover 71.77%, enrich 8.98x
    hgsql hg17 -e "select count(*) from phastConsElements17way_15_10 where chrom='chr7'"
        # 217767 
    # => too much overall covg, and too many elements

    # len=15, cov=.05
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_05
        # refGene:cds 0.873%, phastConsElements17way_15_05 6.880%, both 0.627%, cover 71.77%, enrich 10.43x
    hgsql hg17 -e "select count(*) from phastConsElements17way_15_05 where chrom='chr7'"
        # 166868

    # len=15, cov=.01
    # These values were used by Elliott for ENCODE
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_01
        # refGene:cds 0.873%, phastConsElements17way_15_01 5.721%, both 0.628%, cover 71.89%, enrich 12.57x
    hgsql hg17 -e "select count(*) from phastConsElements17way_15_01 where chrom='chr7'"
        # 106034

    # len=20, cov=.01
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_01
        # refGene:cds 0.873%, phastConsElements17way_20_01 7.751%, both 0.634%, cover 72.56%, enrich 9.36x
    hgsql hg17 -e "select count(*) from phastConsElements17way_20_01 where chrom='chr7'"
        # 106005
    # -> wrong direction on coverage

    # len=10, cov=.01
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_01
        # refGene:cds 0.873%, phastConsElements17way_10_01 4.653%, both 0.616%, cover 70.48%, enrich 15.15x
    hgsql hg17 -e "select count(*) from phastConsElements17way_10_01 where chrom='chr7'"
        # 108279
    # => looks good on coverage and element count, check smoothness in browser
    # => undersmoothed

    # len=10, cov=.05
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_05
        # refGene:cds 0.873%, phastConsElements17way_10_05 5.365%, both 0.615%, cover 70.44%, enrich 13.13x 
    hgsql hg17 -e "select count(*) from phastConsElements17way_10_05 where chrom='chr7'"
        # 178372
    # => fragmented elements

    # len=15, cov=.005
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_005
        # refGene:cds 0.873%, phastConsElements17way_15_005 5.444%, both 0.628%, cover 71.93%, enrich 13.21x
    hgsql hg17 -e "select count(*) from phastConsElements17way_15_005 where chrom='chr7'"
        # 90855

    # len=20, cov=.005
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_005
        # refGene:cds 0.873%, phastConsElements17way_20_005 7.373%, both 0.634%, cover 72.61%, enrich 9.85x
    hgsql hg17 -e "select count(*) from phastConsElements17way_20_005 where chrom='chr7'"
        # 91858

    # len=17, cov=.005 rho=.3
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_17_005
        # refGene:cds 0.873%, phastConsElements17way_17_005 6.126%, both 0.631%, cover 72.24%, enrich 11.79x
    hgsql hg17 -e "select count(*) from phastConsElements17way_17_005 where chrom='chr7'"
        # 91243

    # len=12, cov=.01, rho=.28 -panTro1
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_12_01_28_p
        # refGene:cds 0.873%, phastConsElements17way_12_01_28_p 4.829%, both 0.612%, cover 70.02%, enrich 14.50x
    hgsql hg17 -e "select count(*) from phastConsElements17way_12_01_28_p where chrom='chr7'"
        # 123638

    # len=13, cov=.01, rho=.25 -panTro1
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_01_25_p
        # refGene:cds 0.873%, phastConsElements17way_13_01_25_p 4.793%, both 0.594%, cover 67.99%, enrich 14.19x
    hgsql hg17 -e "select count(*) from phastConsElements17way_13_01_25_p where chrom='chr7'"
        # 131895

    # len=14, cov=.008, rho=.28
    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_14_008_28
       # refGene:cds 0.874%, phastConsElements17way_14_008_28 5.227%, both 0.615%, cover 70.37%, enrich 13.46x
    hgsql hg17 -e "select count(*) from phastConsElements17way_14_008_28 where chrom='chr7'"
        # 106071

    # Create merged posterior probability file and wiggle track data files
    #	pk is currently closer to the san than any other machine
    ssh pk
    cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    #next time try Angie's simpler sort, below
    # All .pp files are concatenated in that order and piped to wigEncode,
    # which writes phastCons17way.wig and phastCons17way.wib.
    find ./pp -name "chr*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
        nice wigEncode stdin phastCons17way.wig phastCons17way.wib
    # about 23 minutes for above
        # GOT HERE ON REDO
        # NOTE: remember to flip /gbdb link from cons.old to cons
    # Trial of the simpler sort on one chromosome: with the foreach
    # commented out, only chr22 is encoded, into separate
    # phastCons17wayNewChr22.* files.
    #foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
      #echo $chr
      set chr = chr22
      cat `ls -1 pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice wigEncode stdin phastCons17wayNewChr22.wig phastCons17wayNewChr22.wib
    #end
    date

    # copy both the .wig and the .wib to the cons directory
    cp -p phastCons17way.wi? /cluster/data/hg17/bed/multiz17way/cons

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/cons
    ln -s /cluster/data/hg17/bed/multiz17way/cons/phastCons17way.wib \
        /gbdb/hg17/multiz17way/phastCons17way.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
        phastCons17way phastCons17way.wig

############################################################################
    ## Run phastCons on Placental mammals
    ssh pk
    cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
    # cons/placental later receives the placental mostConserved.bed and
    # wiggle files; run.cons.alt is the batch directory
    mkdir placental
    mkdir run.cons.alt
    cd run.cons.alt

    # create pruned trees
    set tree_doctor = /cluster/bin/phast/tree_doctor
    # print the species list comma-separated, for pasting into
    # --prune-all-but
    sed 's/ /,/g' ../../species.lst 
    # hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3
    # run.cons.alt/placental holds the model that doPhast.csh reads as
    # $grp/$grp.mod; the pruned list leaves out monDom2, galGal2, xenTro1,
    # tetNig1, fr1 and danRer3
    mkdir placental
    $tree_doctor ../elliotsEncode.mod \
        --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
                > placental/placental.mod

    # doPhast.csh <chrom> <ss file root> <expected-length> <target-coverage> <rho> <group>
    #   Same flow as the vertebrate run, with a sixth argument naming the
    #   species group: the model is read from <group>/<group>.mod
    #   (relative to the batch directory), the work dir is
    #   /scratch/tmp/hg17/<group>/<root>, and the .pp and .bed results go
    #   to <group>/pp/<chrom> and <group>/bed/<chrom> on the SAN.
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
   --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative panTro1,rheMac2 \
	--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
    # << emacs happy

    chmod a+x doPhast.csh

    # Create gsub file
    # Two alternative templates are recorded.  Both write the same file,
    # so the second overwrites the first; they correspond to the two runs
    # timed as ".2" and ".008" (target-coverage values).
    cat > template << 'EOF'
#LOOP
# template for 5% cov
doPhast.csh $(root1) $(file1) 14 .2 .28 placental
#ENDLOOP
'EOF'
    cat > template << 'EOF'
#LOOP
# template same as vertebrate
doPhast.csh $(root1) $(file1) 14 .008 .28 placental
#ENDLOOP
'EOF'
    #	happy emacs

    # Create parasol batch and run it
    # in.list holds every ss window path relative to the SAN cons dir,
    # with the trailing .ss removed
    pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
    ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/hg17/bed/multiz17way/cons/run.cons.alt/in.list
    popd

    gensub2 in.list single template jobList
    para create jobList
        # 333 jobs
    para try
    para check
    para push
#.2
#CPU time in finished jobs:      15164s     252.74m     4.21h    0.18d  0.000 y
#IO & Wait Time:                 14852s     247.53m     4.13h    0.17d  0.000 y
#Average job time:                  90s       1.50m     0.03h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             170s       2.83m     0.05h    0.00d
#Submission to last job:         86364s    1439.40m    23.99h    1.00d

#.008
#CPU time in finished jobs:      13712s     228.53m     3.81h    0.16d  0.000 y
#IO & Wait Time:                 14407s     240.12m     4.00h    0.17d  0.000 y
#Average job time:                  84s       1.41m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             159s       2.65m     0.04h    0.00d
#Submission to last job:          5291s      88.18m     1.47h    0.06d

    ssh pk
    cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
    #	The sed's and the sort get the file names in chrom,start order.
    #	(Hiram tricks -- split each path into columns on [.-/], marking the
    #	separators with x, y and z so the columns can be sorted and the
    #	file name restored afterwards.)
    #	Warning: the sort columns depend on how deep you are in the dir.
    #	For ./bed/<chrom>/<chrom>.<start>-<end>.bed, column 7 is the chrom
    #	name and column 9 the window start.
    find ./bed -name "chr*.bed" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
	awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
	/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons/placental

    # load into database
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/cons/placental
    hgLoadBed -strict hg17 phastConsElementsPlacental mostConserved.bed
        # .2
        # Loaded 3775983 elements of size 5
        # .008
        # Loaded 1290060 elements of size 5

    # compare with vertebrate cons
    hgsql hg17 -e "select count(*) from phastConsElements17way"
        # 2212445
    featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
    featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
    featureBits hg17 -enrichment refGene:cds phastConsElements17way

    featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
# refGene:cds 1.070%, phastConsElementsPlacental 3.844%, both 0.667%, cover 62.32%, enrich 16.21x
# refGene:cds 1.069%, phastConsElementsPlacental_14_008_28 3.844%, both 0.667%, cover 62.37%, enrich 16.22x

    featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
#refGene:cds 1.070%, phastConsElementsPlacental_14_2_28 5.223%, both 0.691%, cover 64.62%, enrich 12.37x
    featureBits hg17 -enrichment refGene:cds phastConsElements17way
#refGene:cds 1.070%, phastConsElements17way 5.116%, both 0.763%, cover 71.27%, enrich 13.93x


    # Create merged posterior probability file and wiggle track data files
    #	pk is currently closer to the san than any other machine
    ssh pk
    cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    #next time try Angie's simpler sort, below
    find ./pp -name "chr*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
        nice wigEncode stdin phastConsPlacental.wig phastConsPlacental.wib
    # about 23 minutes for above
        # GOT HERE ON REDO
        # NOTE: remember to flip /gbdb link from cons.old to cons

    cp -p phastConsPlacental.wi?  \
                /cluster/data/hg17/bed/multiz17way/cons/placental

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/cons/placental
    ln -s \
     /cluster/data/hg17/bed/multiz17way/cons/placental/phastConsPlacental.wib \
        /gbdb/hg17/multiz17way/phastConsPlacental.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
        phastConsPlacental phastConsPlacental.wig

############################################################################
    ## Run phastCons on subgroups (mammals, placentals, and w/o low-cov)
    ssh pk
    cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
    mkdir run.cons.groups
    cd run.cons.groups

    # create pruned trees 
    set tree_doctor = /cluster/bin/phast/tree_doctor
    sed 's/ /,/g' ../../species.lst 
    # hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3

    $tree_doctor ../elliotsEncode.mod \
        --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3 \
                > vertebrate-high.mod
    $tree_doctor ../elliotsEncode.mod \
        --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2 \
                > mammal-high.mod
    $tree_doctor ../elliotsEncode.mod \
        --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2 \
                > placental-high.mod
    $tree_doctor ../elliotsEncode.mod \
        --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2 \
                > mammal.mod
    $tree_doctor ../elliotsEncode.mod \
        --prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
                > placental.mod
    foreach f (*.mod)
        mkdir $f:r
        mv $f $f:r
    end
    # doPhast.csh: run phastCons on one .ss file for one species group.
    # Positional arguments (as supplied by the gensub2 template):
    #   $1 (c)   - sequence name; passed to --seqname and --idpref, and used
    #              as the subdirectory name under ss/, pp/ and bed/
    #   $2 (f)   - .ss file name without the .ss suffix
    #   $3 (len) - value given to --expected-length
    #   $4 (cov) - value given to --target-coverage
    #   $5 (rho) - value given to --rho
    #   $6 (grp) - group name; the model is read from $grp/$grp.mod
    # The job copies its input and model to a per-job directory under
    # /scratch/tmp/hg17, runs phastCons there, then moves the resulting
    # $f.pp and $f.bed to $san/$grp/pp/$c and $san/$grp/bed/$c and removes
    # the temporary directory.  The script runs with csh -fe, so it stops
    # at the first failing command.
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
   --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative panTro1,rheMac2 \
	--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
    # emacs happy
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14   .21 .28 placental-high
doPhast.csh $(root1) $(file1) 14   .2 .28 placental
doPhast.csh $(root1) $(file1) 14   .11 .28 mammal
doPhast.csh $(root1) $(file1) 14   .1 .28 mammal-high
doPhast.csh $(root1) $(file1) 14 .0028 .28 vertebrate-high
#ENDLOOP

'EOF'
    #	happy emacs

    # Create parasol batch for just chr7 (for test purposes) and run it
    pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
    ls -1 ss/chr7/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups/in.list
    popd

    gensub2 in.list single template jobList
    para create jobList
        # 80 jobs
    para try
    para check
    para push
    # 24 minutes


    ## create Alt Most Conserved track
    ssh hgwdev
    cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
# loadAltElements.csh: for every group directory (mammal*, placental*,
# vertebrate*) under the current san cons directory, concatenate the
# per-chunk phastCons .bed files in chrom,start order, rescore them with
# lodToBedScore into $b/<group>/mostConserved.bed, load that file as table
# phastConsElements<Group> and report chr7 enrichment against refGene CDS.
# The table name is derived from the directory name: the first letter is
# upper-cased, and a "-suffix" is folded in with its first letter upper-cased
# (e.g. placental-high -> phastConsElementsPlacentalHigh).
# The perl expression must be single-quoted: inside double quotes csh would
# substitute its own $1 before perl ever sees the capture-group reference.
# The job is started in the background; wait for it to finish before
# grepping the log.
cat > loadAltElements.csh << 'EOF'
    set b = /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
    foreach d (mammal* placental* vertebrate*)
      echo $d
      cd $d
      find ./bed -name "chr*.bed" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
	awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
	/cluster/bin/scripts/lodToBedScore /dev/stdin \
            > $b/$d/mostConserved.bed
        set table = `echo $d | perl -wpe 's/(.*)/phastConsElements\u$1/;s/-(.*)/\u$1/'`
        hgLoadBed -strict hg17 $table $b/$d/mostConserved.bed
        featureBits hg17 -enrichment refGene:cds -chrom=chr7 $table
      cd ..
    end
'EOF'
    csh loadAltElements.csh >&! loadAltElements.log &
    grep refGene loadAltElements.log | sort -n -k4
# refGene:cds 0.884%, phastConsElementsPlacentalHigh 4.828%, both 0.606%, cover 68.51%, enrich 14.19x
# refGene:cds 0.884%, phastConsElementsMammal 4.869%, both 0.580%, cover 65.62%, enrich 13.48x
# refGene:cds 0.884%, phastConsElementsMammalHigh 4.887%, both 0.624%, cover 70.60%, enrich 14.45x
# refGene:cds 0.884%, phastConsElementsPlacental 4.904%, both 0.558%, cover 63.14%, enrich 12.88x
# refGene:cds 0.884%, phastConsElementsVertebrateHigh 4.965%, both 0.652%, cover 73.74%, enrich 14.85x

    featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way
# refGene:cds 0.884%, phastConsElements17way 4.851%, both 0.623%, cover 70.48%, enrich 14.53x

    ssh kkstore02
    cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > makeAltWiggle.csh << 'EOF'
    set b = `pwd`
    set san = /san/sanvol1/scratch/hg17/multiz17way/cons
    pushd $san
    foreach d (mammal* placental* vertebrate*)
      echo $d
      cd $d
      set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
      echo $table
      find ./pp -name "chr*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
	sort -k7,7 -k9,9n | \
	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
	xargs cat | \
        nice wigEncode stdin $table.wig $table.wib
      mv $table.wig $table.wib $b/$d
      cd ..
    end
    popd
'EOF'
    csh makeAltWiggle.csh >&! makeAltWiggle.log &

    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > loadAltWiggle.csh << 'EOF'
    set b = `pwd`
    foreach d (mammal* placental* vertebrate*)
        echo $d
        cd $d
        set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
        echo $table
        ln -s `pwd`/$table.wib /gbdb/hg17/multiz17way
        hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 $table $table.wig
        cd ..
    end
'EOF'
    csh loadAltWiggle.csh >&! loadAltWiggle.log &

    # Create parasol batch for just chr7 (for test purposes) and run it
    pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
    ls -1 ss/chr7/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups/in.list
    popd

    gensub2 in.list single template jobList
    para create jobList
        # 80 jobs
    para try
    para check
    para push
    # 24 minutes

    # Downloads  (2006-02-22 kate)
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way
    mkdir mafDownloads
    cd mafDownloads
    # upstream mafs 
cat > mafFrags.csh << 'EOF'
    date
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags hg17 multiz17way up.bed upstream$i.maf \
                -orgs=../species.lst
        nice gzip upstream$i.maf
        rm up.bed
    end
    date
'EOF'
    time csh mafFrags.csh >&! mafFrags.log &
        # ~1 hour

    # Compress every per-chromosome maf from ../maf into the mafDownloads
    # directory as <chrom>.maf.gz and write md5 checksums for the .gz files.
    # The cd target must be an absolute path: after the ssh the working
    # directory is the login directory, not /.
    ssh kkstore02
    cd /cluster/data/hg17/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
    date
    foreach f (../maf/chr*.maf)
	set c = $f:t:r
        echo $c
	nice gzip -c $f > $c.maf.gz
    end
    md5sum *.gz > md5sum.txt
    date
'EOF'
    time csh downloads.csh >&! downloads.log
        # ~2 hours
        # GOT HERE

    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/hg17/multiz17way
    mkdir $dir
    ln -s /cluster/data/hg17/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
    cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir
    # edit README


# PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-03-20 kate)

    ssh kkstore02
    cd /cluster/data/hg17/bed/multiz17way 
    mkdir phastConsDownloads
    cd phastConsDownloads
cat > downloads.csh << 'EOF'
    date
    cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/pp
    foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
      echo $chr
      cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice gzip -c \
            > /cluster/data/hg17/bed/multiz17way/phastConsDownloads/$chr.gz
    end
    date
'EOF'
    csh downloads.csh >&! downloads.log &

        # ~20 minutes
    # << happy emacs
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/phastConsDownloads
    md5sum *.gz > md5sum.txt
    set dir = /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way
    mkdir $dir
    ln -s /cluster/data/hg17/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir
    cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons/README.txt $dir
    # edit 


# UPDATE MONKEY DOWNLOADS (2006-01-12 kate)
# EXTRACT AXT'S AND MAF'S FROM THE RheMac1 NET
# The chr1 file was hugely oversized -- the others were OK, but
# the axt's were numbered oddly.
    ssh kkstore2
    cd /cluster/data/hg17/bed/blastz.rheMac1/axtChain
    gunzip -c  hg17.rheMac1.net.gz | netSplit stdin humanNet
    gunzip -c hg17.rheMac1.all.chain.gz | chainSplit chain stdin
    mkdir ../axtNet.new ../mafNet.new
cat > makeMaf.csh << 'EOF'
    foreach f (humanNet/chr*.net)
        set c = $f:t:r
        echo "axtNet on $c"
        netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/rheMac1/rheMac1.2bit stdout | axtSort stdin ../axtNet.new/$c.axt
        axtToMaf ../axtNet.new/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/rheMac1/chrom.sizes \
            ../mafNet.new/$c.maf -tPrefix=hg17. -qPrefix=rheMac1.
    end
    cp -rp ../mafNet.new /san/sanvol1/scratch/hg17/mafNet/rheMac1.new
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    # After makeMaf.csh has finished (watch the tail above), replace the old
    # rheMac1 maf directory on the san with the freshly copied rheMac1.new.
    pushd /san/sanvol1/scratch/hg17/mafNet
    rm -fr rheMac1
    mv rheMac1.new rheMac1
    popd

    # Swap the new axt directory into place, then compress and checksum it.
    # NOTE(review): the last cd above was into axtChain, while axtNet.new
    # was created as ../axtNet.new -- presumably these commands were run
    # from /cluster/data/hg17/bed/blastz.rheMac1; confirm before re-running.
    # NOTE(review): ../mafNet.new is never renamed to mafNet here -- verify
    # whether that was done separately.
    rm -fr axtNet
    mv axtNet.new axtNet
    cd axtNet
    nice gzip *.axt
    md5sum *.gz > md5sum.txt
    # cleanup
    # NOTE(review): chain and humanNet were created in axtChain, so this
    # cd .. only reaches them if the commands above ran from axtChain.
    cd ..
    rm -fr chain humanNet

    ssh hgwdev
    ln -s /cluster/data/hg17/bed/blastz.rheMac1/axtNet \
         /usr/local/apache/htdocs/goldenPath/rheMac1/axtNet
    # Request push to downloads server


# UPDATE OPOSSUM DOWNLOADS (2006-01-17 kate)
# Fix overlaps

    ssh kkstore2
    cd /cluster/data/hg17/bed/blastz.monDom1
    mv axtNet axtNet.old
    mv mafNet mafNet.old
    mkdir axtNet mafNet
    cd axtChain/chain
    nice gunzip *.gz
    cd ..
    nice gunzip -c  human.net.gz | netSplit stdin humanNet
cat > makeMaf.csh << 'EOF'
    foreach f (humanNet/chr*.net)
        set c = $f:t:r
        echo "axtNet on $c"
        netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/monDom1/monDom1.2bit stdout | axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/monDom1/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=monDom1.
    end
    cp -rp ../mafNet /san/sanvol1/scratch/hg17/mafNet/monDom1.new
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log
    pushd /san/sanvol1/scratch/hg17/mafNet
    rm -fr monDom1
    mv monDom1.new monDom1
    popd

    # makeMaf.csh above wrote its axt files straight into ../axtNet (the
    # directory created fresh by "mkdir axtNet mafNet"), so there is no
    # axtNet.new to swap in; just compress and checksum the new files.
    cd ../axtNet
    nice gzip *.axt
    md5sum *.gz > md5sum.txt

    # cleanup: the split chain and net directories live in axtChain
    cd ../axtChain
    rm -fr chain humanNet

    ssh hgwdev
    ln -s /cluster/data/hg17/bed/blastz.monDom1/axtNet \
         /usr/local/apache/htdocs/goldenPath/monDom1/axtNet
    # Request push to downloads server


# UPDATE COW DOWNLOADS (2006-01-17 kate)
# Fix overlaps

    ssh kkstore2
    cd /cluster/data/bosTau1/bed/zb.hg17
    mv axtNet axtNet.old
    mv mafNet mafNet.old
    mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
    foreach f (net/chr*.net)
        set c = $f:t:r
        echo "axtNet on $c"
        netToAxt net/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit stdout | axtSort stdin ../axtNet/$c.axt
        axtToMaf ../axtNet/$c.axt \
            /cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes \
            ../mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1.
    end
'EOF'
    csh makeMaf.csh >&! makeMaf.log &
    tail -100f makeMaf.log

    cd axtNet
    nice gzip *.axt
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    ln -s /cluster/data/hg17/bed/blastz.bosTau1/axtNet \
         /usr/local/apache/htdocs/goldenPath/bosTau1/axtNet
    # Request push to downloads server


##### UPDATE hg17 knownToVisiGene (2006-01-21 galt)
# Create table that maps between known genes and visiGene database 
    # mapping to other species such as mouse, zebrafish, frog
    # requires visiGene probe track vgImageProbes be created first
    knownToVisiGene hg17 -fromProbePsl=vgImageProbes


##### UPDATE hg17 mmBlastTab (2006-01-22 galt)
# Make the protein seqs from mm7.knownGenePep
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir mm7
cd mm7
pepPredToFa mm7 knownGenePep known.faa
#       You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known
mkdir -p /cluster/panasas/home/store/mm7/blastp/
cp known.* /cluster/panasas/home/store/mm7/blastp/

# Make parasol run directory
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp/mm7
mkdir run
cd run
mkdir out

# Make blast script 
# NOTE!! left off " b 1" from the end of the script because
#  we wanted to be able to get the near-best, not just the best one.
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm7/blastp/known \
-i $1 -o $2 -e 0.001 -m 8
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#       this echo trick is used because otherwise the command line is
#       too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...

Completed: 7735 of 7735 jobs
CPU time in finished jobs:      97096s    1618.26m    26.97h    1.12d  0.003 y
IO & Wait Time:                564656s    9410.94m   156.85h    6.54d  0.018 y
Average job time:                  86s       1.43m     0.02h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             240s       4.00m     0.07h    0.00d
Submission to last job:          1272s      21.20m     0.35h    0.01d


# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm7/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
Scanning through 7735 files
Loading database with 33306 rows

# changed mm6 to mm7 in src/hg/hgGene/hgGeneData/Human/hg17/otherOrgs.ra
# and checked it in.

# hgLoadBlastTab hg17 mmBlastTabTopN -maxPer=250 *.tab
#  (not done, this was only used for research)

# hgLoadBlastTab hg17 mmBlastNearBest -topPercent=5 *.tab > hgMmNearBest.stats
#  (this will be the new way to go)
Reading seq lengths from hg17.knownGenePep
Finding max gene combined-coverage scores in 7735 files
Scanning through 7735 files
Loading database with 51520 rows


##########################################################################
# MYTOUCH FIX - jen - 2006-01-24
  sudo mytouch hg17 gencodeGeneClassJun05 0508301200.00
  # note - the gencodeGeneClassJun05 table is on dev only
  sudo mytouch hg17 knownGeneLink 0506050000.00
  sudo mytouch hg17 ensGtp 0505241200.00
  sudo mytouch hg17 ccdsInfo 0505241200.00


##########################################################################
# BLASTZ OPOSSUM monDom2 (WORKING - 2006-01-23 - Hiram)

    ssh kk
    #	running out of disk space on store5:
    [hiram@kk /cluster/data/hg17/bed] df -h .
    #Filesystem            Size  Used Avail Use% Mounted on
    #			   1.5T  1.3T   79G  95% /cluster/store5
    #	So, keep this elsewhere, and symlink it:
    cd /cluster/data/hg17/bed
    ln -s /cluster/store9/hg17/bed/blastzMonDom2.2006-01-23 \
	./blastzMonDom2.2006-01-23
    ln -s blastzMonDom2.2006-01-23 blastz.monDom2

    cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23

    cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin

BLASTZ=blastz.v7

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Human (hg17)
SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom2
SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit
SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastzMonDom2.2006-01-23
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &

    #	real    1122m44.191s
    #	failed during the load of chr19
    #	hgLoadChain hg17 chr19_chainMonDom2 chr19.chain
    #	Out of memory needMem - request size 56 bytes

    #	So, go to kolossus:
    ssh kolossus
    #	There isn't any hg17 db here yet, get it established with a
    #	chromInfo and a 2bit sequence:
    hgsql -e "create database hg17;" mysql
    cd /cluster/data/hg17
    twoBitInfo hg17.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/hg17/hg17.2bit\n", $1,$2}' \
		> chromInfo.kolossus.tab
    hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql hg17 \
-e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;'
    #	it appears /gbdb/hg17 already exists
    ln -s /cluster/data/hg17/hg17.2bit /gbdb/hg17/hg17.2bit
    #	now, loading only chr19:
    cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain
    hgLoadChain hg17 chr19_chainMonDom2 chain/chr19.chain
    #	real    33m31.689s
    #	while that is running, back on hgwdev, get the other chains loaded
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain
    cp loadUp.csh loadUp.noChr19.csh
    #	change the foreach line to eliminate the chr19.chain:
    diff loadUp.csh loadUp.noChr19.csh
    < foreach f (*.chain)
    ---
    > foreach f (`ls *.chain | grep -v chr19.chain`)
    #	And then run that script
    time ./loadUp.noChr19.csh > load.noChr19.out 2>&1
    #	real    76m8.757s
    
    #	When the kolossus load finishes, email to push-request and ask
    #	for the two tables to be pushed from kolossus to hgwdev:
    #	chr19_chainMonDom2
    #	chr19_chainMonDom2Link

    #	then, continuing:
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=download -bigClusterHub=pk -chainMinScore=5000 \
	-chainLinearGap=loose `pwd`/DEF > download.out 2>&1 &
    #	real    2m42.505s

    #	now, back on kolossus to run a featurebits
    time featureBits hg17 chainMonDom2Link >fb.hg17.chainMonDom2Link 2>&1
    #	355119482 bases of 2866216770 (12.390%) in intersection
    featureBits hg17 chainMonDom1Link
    #	456069062 bases of 2866216770 (15.912%) in intersection

    #	Then, to swap the results:
    ssh kk
    cd /cluster/data/hg17/bed/blastz.monDom2
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=5000 \
	-chainLinearGap=loose `pwd`/DEF > swap.out 2>&1 &
    #	running 2006-01-30 11:25
    #	real    47m27.082s
    #	failed during the load - as with the Hg18 experiment, something
    #	is really huge about these results.

#########################################################################
# BUILD MAF ANNOTATION FOR MULTIZ17WAY  (kate 2006-02-16)
# Redo to fix overlaps (2006-04-09 kate)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)

    ssh kkstore01
    cd /cluster/data/rheMac2
    twoBitInfo -nBed rheMac2.2bit rheMac2.N.bed

    ssh kkstore02
    cd /cluster/data/hg17/bed/multiz17way
    mkdir anno 
    cd anno
    mkdir maf run
    cd run
    rm sizes nBeds
    foreach i (`cat /cluster/data/hg17/bed/multiz17way/species.lst`)
        ln -s  /cluster/data/$i/chrom.sizes $i.len
        ln -s  /cluster/data/$i/$i.N.bed $i.bed
        echo $i.bed  >> nBeds
        echo $i.len  >> sizes
    end

    rm jobs.csh
    echo date > jobs.csh
    foreach i (../../maf/*.maf)
        echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg17/hg17.2bit ../maf/`basename $i` >> jobs.csh
        echo "echo $i" >> jobs.csh
    end 
    echo date >> jobs.csh

    # do smaller jobs first
    tac jobs.csh > jobsRev.csh
    mv jobsRev.csh jobs.csh
    
    csh jobs.csh >&! jobs.log &
        # 1.5 hrs.
        # 9 hours for redo -- something wrong ?

    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/anno/maf
    mkdir -p /gbdb/hg17/multiz17way/anno/maf
    ln -s /cluster/data/hg17/bed/multiz17way/anno/maf/*.maf \
        /gbdb/hg17/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
    date
    hgLoadMaf -pathPrefix=/gbdb/hg17/multiz17way/anno/maf \
                hg17 multiz17way
    date
'EOF'
    csh loadMaf.csh >&! loadMaf.log &

    # load summary table on kolossus, as it crashes on hgwdev
    ssh kolossus
    cd /cluster/data/hg17/bed/multiz17way/anno/maf
    cat *.maf | \
        nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 \
            -maxSize=200000  multiz17waySummary stdin

        # Created 3212623 summary blocks from 114139253 components and 17522217 mafs from stdin
    # request push to hgwdev

    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql hg17 -e "alter table multiz17waySummary drop index chrom_2"
    hgsql hg17 -e "alter table multiz17waySummary drop index chrom_3"

    ssh kkstore02
    cd /cluster/data/hg17/bed/multiz17way
    set sanDir = /san/sanvol1/scratch/hg17/multiz17way/frames
    mkdir -p $sanDir/maf
    cp -rp maf/* $sanDir/maf
    mkdir frames
    cd frames
    cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
    cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
    # edit Makefile to correct the species names and to set sanDir

    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/frames
    make getGenes >&! getGenes.log &
        # ~1 minute
    make getFrames >&! getFrames.log &
        # ~2 hours
    # NOTE: if jobs get hung up (e.g. running for hours, when
    #   they should run for minutes, do 'para stop' so that
    #   the 'para make' can restart the job
    make loadDb >&! loadDb.log &

    ###
    # rebuild frames to get bug fix, using 1-pass maf methodology
    # (2006-06-09 markd)
    ssh kkstore02
    cd /cluster/data/hg17/bed/multiz17way/frames
    mv mafFrames/ mafFrames.old
    nice tcsh # easy way to get process niced
    (cat  ../maf/*.maf | genePredToMafFrames hg17 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg17 genes/hg17.gp.gz mm7 genes/mm7.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz rn3 genes/rn3.gp.gz xenTro1 genes/xenTro1.gp.gz |  gzip >multiz17way.mafFrames.gz)>&log&
    ssh hgwdev
    cd /cluster/data/hg17/bed/multiz17way/frames

    hgLoadMafFrames hg17 multiz17wayFrames multiz17way.mafFrames.gz >&log&


# EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 2/8/06 angie)
    ssh kolossus
    mkdir /cluster/data/hg17/rmsk
    cd /cluster/data/hg17/rmsk
    ln -s ../*/chr*.fa.out .
    # Run Arian's DateRepsinRMoutput.pl to add extra columns telling 
    # whether repeats in -query are also expected in -comp species.  
    # Even though we already have the human-mouse linSpecReps,
    # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
    # additions.  So add mouse, then ignore it.  
    # Rat in extra column 1, Mouse in extra column 2
    foreach outfl ( *.out )
        echo "$outfl"
        /cluster/bluearc/RepeatMasker/DateRepeats \
          ${outfl} -query human -comp rat -comp mouse
    end
    # Now extract rat (extra column 1), ignore mouse.
    cd ..
    mkdir linSpecRep.notInRat
    foreach f (rmsk/*.out_rat*_mus-musculus)
        set base = $f:t:r:r
        echo $base.out.spec
        /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                        linSpecRep.notInRat/$base.out.spec
    end
    # Distribute and clean up.
    rsync -av linSpecRep.notInRat /san/sanvol1/scratch/hg17/
    rm -r rmsk


# BLASTZ/CHAIN/NET RN4 (DONE 2/10/06 angie)
    ssh kkstore01
    mkdir /cluster/data/hg17/bed/blastz.rn4.2006-02-08
    cd /cluster/data/hg17/bed/blastz.rn4.2006-02-08
    cat << '_EOF_' > DEF
# human vs. rat

BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInRat
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat
SEQ2_DIR=/san/sanvol1/scratch/rn4/nib
SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastz.rn4.2006-02-08
'_EOF_'
    # << for emacs
    doBlastzChainNet.pl DEF -chainLinearGap medium \
      -bigClusterHub pk -smallClusterHub pk -workhorse pk \
      -blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log &
    tail -f do.log
    rm -f /cluster/data/hg17/bed/blastz.rn4
    ln -s blastz.rn4.2006-02-08 /cluster/data/hg17/bed/blastz.rn4

# UPDATE WGRNA TRACK (DONE, 2006-02-15, Fan)

  ssh hgwdev
  cd /cluster/data/hg17/bed

  mkdir wgRna-2006-02-15
  cd wgRna-2006-02-15

# Received the data file, wg_track_hg17_feb2006_completed.txt, by email from
# Michel Weber (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2006-02-15.

  cp -p wg_track_hg17_feb2006_completed.txt wgRna.tab
  hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab

# Compared to previous data, 2 records deleted, 27 records added.


########################################################################
#  BLASTZ Opossum monDom4 (DONE - 2006-02-21 - 2006-02-26 - Hiram)
    ssh pk
    mkdir /cluster/data/hg17/bed/blastzMonDom4.2006-02-21
    cd /cluster/data/hg17/bed
    ln -s blastzMonDom4.2006-02-21 blastz.monDom4
    cd blastzMonDom4.2006-02-21

    cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Human (hg17)
SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/hg17/bed/blastzMonDom4.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
    #  << happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits hg17 chainMonDom4Link \
	> fb.hg17.chainMonDom4Link 2>&1
    time nice -n +19 featureBits monDom4 chainHg17Link \
	> fb.monDom4.chainHg17Link 2>&1

########################################################################
##  Measuring MonDom4 chain pile ups (DONE - 2006-02-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/hg17/bed/blastz.monDom4/axtChain
    #	extract coordinates on the target genome of the chains
    zcat hg17.monDom4.all.chain.gz | grep "^chain " \
    | awk '{printf "%s\t%s\t%s\t%s\t%s\n", $3, $6, $7, $5, $2}' \
        | gzip -c > target.chain.bed.gz
    # turn that into a wiggle graph with bedItemOverlapCount
    # use HGDB_CONF for read-only access to the hg17 DB in bedItemOverlapCount
    #	it wants to read chromInfo ...
    export HGDB_CONF=~/.hg.conf.read-only
    #	ignore chains longer than 1,000,000
    zcat target.chain.bed.gz | awk '$3-$2<1000000 {print}' \
	| sort -k1,1 -k2,2n \
	    | bedItemOverlapCount hg17 stdin \
		| wigEncode stdin monDom4PileUps.wig monDom4PileUps.wib
    #	Do the same for the query coordinates to find out where these
    #	chains are coming from
    zcat hg17.monDom4.all.chain.gz | grep "^chain " \
    | awk '{printf "%s\t%s\t%s\t%s\t%s\n", $8, $11, $12, $10, $2}' \
        | gzip -c > query.chain.bed.gz
    zcat query.chain.bed.gz | awk '$3-$2<1000000 {print}' \
    | sort -k1,1 -k2,2n \
        | bedItemOverlapCount monDom4 stdin \
            | wigEncode stdin hg17PileUps.wig hg17PileUps.wib
    #	load those wiggles
    ssh hgwdev
    cd /cluster/data/hg17/bed/blastz.monDom4/axtChain
    ln -s `pwd`/monDom4PileUps.wib /gbdb/hg17/wib
    ln -s `pwd`/hg17PileUps.wib /gbdb/monDom4/wib

    hgLoadWiggle -verbose=2 hg17 monDom4PileUp monDom4PileUps.wig
    hgLoadWiggle -verbose=2 monDom4 hg17PileUps hg17PileUps.wig
    #	add wiggle track type entries to the respective trackDb.ra files



# UPDATE hg17 knownToVisiGene (2006-03-07 galt)
# Create table that maps between known genes and visiGene database 
    # mapping to other species such as mouse, zebrafish, frog
    # requires visiGene probe track vgImageProbes be created first
    knownToVisiGene hg17 -fromProbePsl=vgImageProbes

############################################################################
# Add Landmark track (2006-03-08 giardine)
# Note: This track is for regulatory regions and other landmarks that are not
    #included in other tracks.  It is being gathered from the locus experts
    #that are contributing data to the Human Mutation track.  This should
    #be helpful in understanding the data in the mutation track.
#table definitions for autoSql
autoSql landmark.as landmark -dbLink
#change index on bin to normal index not primary key
#move bin in struct so works as bed 4+
#copy autoSql files to hg/lib and hg/inc (add .o file to makefile)
#cat together landmark files from sources in landmark.bed then sort
grep "^chr" landmark.bed | sort -k1,1 -k2,2n > sortedLandmark.bed
#loading
hgsql hg17 < landmark.sql
hgLoadBed hg17 landmark sortedLandmark.bed -noSort -oldTable -tab
#add to trackDb.ra file (human hg17 level)

#changed landmark track to provide links and attributes in prep for ORegAnno
#data.  Got set of test data by grabbing their .gff file used for custom
#tracks and converting to bed, then to landmarks format.
cd humPhen/landmarkData/June06/
#convert data to new formats then
cat newLandmark.txt landmarkORA.txt  > allLandmarks.txt
grep "^chr" allLandmarks.txt | sort -k1,1 -k2,2n > sortedAllLandmark.txt
#start new tables
cd humPhen/kent/src/hg/lib/
autoSql landmark.as landmark -dbLink
#move bin in .h file to end of structure, to make load work
mv landmark.h ../inc/landmark.h
#change primary key to indexes where not unique, add index on landmarkId
#limit name, landmarkType, raKey size to 64
hgsql -e "drop table landmark;" hg17
hgsql hg17 < landmark.sql
cd ~giardine/humPhen/landmarkData/June06/
hgLoadBed hg17 landmark sortedAllLandmark.txt -noSort -oldTable -tab
hgsql hg17
        load data local infile "landmarkAttrORA.txt" into table landmarkAttr;
        load data local infile "landmarkAttrLinkORA.txt" into table landmarkAttrLink;
        load data local infile "landmarkAttrCat.txt" into table landmarkAttrCat;
cd ../../kent/src/
make clean
make libs
cd hg
make cgi
cd makeDb/trackDb
make DBS=hg17 update
#test in hgwdev-giardine

#redo landmarks, moving categories out of database
convertORAformat < ORegAnnoBed
#start new tables
cd humPhen/kent/src/hg/lib/
autoSql landmark.as landmark -dbLink
#move bin in .h file to end of structure, to make load work
mv landmark.h ../inc/landmark.h
#change primary key to indexes, add primary key on landmarkId
#limit name, landmarkType, raKey size to 64
#only need to reload attributes rest of data & tables same
hgsql -e "drop table landmarkAttr;" hg17
hgsql -e "drop table landmarkAttrCat;" hg17
cd ../../../../landmarkData/June06/
hgsql hg17
        #cut and paste in create table landmarkAttr
        load data local infile "landmarkAttrORA.txt" into table landmarkAttr;
#Records: 2028  Deleted: 0  Skipped: 0  Warnings: 8 ???
cd ../../kent/src/
make clean
make libs
cd hg
make cgi
cd makeDb/trackDb
make DBS=hg17 update
#test in hgwdev-giardine


############################################################################
# hg15 -> hg17 LIFTOVER CHAINS (STARTED 3/9/06, DONE 3/10/06 Fan)
    # I used a size of 10kb instead of 3kb for the split (blat query) sizes in
    # hg17.  This had a huge effect on the amount of hits in the blat, which
    # then had a huge effect on the amount of chains.  I should also mention 
    # that hg17 chromosomes chr1 and chr2 were split further 
    # into more than a single query file.  This helped a LOT in avoiding 
    # cluster hippos classically associated with those chroms.
    
    ######## LIFTOVER PREPARATION
    # Split up hg17
    ssh pk
    cd /san/sanVol1/scratch/hg17
    mkdir -p liftSplits/{split,lift}
    bash
    # one 10000-base split file plus a matching .lft lift file per chromosome
    # (single/double character chrom directories and the *hap* directories)
    for chromFa in /cluster/data/hg17/?{,?,*hap*}/*.fa
    do
      chromName=$(basename "$chromFa" .fa)
      echo "$chromName"
      faSplit -lift="liftSplits/lift/${chromName}.lft" size "$chromFa" \
        -oneFile 10000 "liftSplits/split/$chromName"
    done
    mkdir -p biggerSplits/split
    cd biggerSplits/
    ln -s ../liftSplits/lift
    cd split/
    ln -s ../../liftSplits/split/* .
    faSplit sequence chr1.fa 5 chr1_
    faSplit sequence chr2.fa 5 chr2_
    rm chr{1,2}.fa

    # Make some dirs
    cd /san/sanVol1/scratch
    mkdir -p hg15

    # Copy 11.ooc files to hg15 subdirectory.
    cp -p /cluster/store5/gs.16/build33/11.ooc hg15

    ## First, copy over scripts. (Already done before)

    # mkdir -p /san/sanVol1/scratch/fan
    # cp -p /san/sanVol1/scratch/andy/*.sh /san/sanVol1/scratch/fan
    # cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan

    ######## LIFTOVER BLATTING  

    # HG15
    ssh pk
    cd /cluster/data/hg15
#    makeLoChain-align hg15 /scratch/hg/hg15/bothMaskedNibs hg17 \     

    makeLoChain-align hg15 /scratch/hg/hg15/chromTrfMixedNib hg17 \
/san/sanVol1/scratch/hg17/biggerSplits/split
    cd bed

    mv blat.hg17.2006-03-09 /san/sanVol1/scratch/hg15 
    cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/run/
    sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg15ToHg17"}' > newspec
    para create newspec
    para try
    para push
# Saw some failures, keep pushing again, they finally all finished.  
# The problems were all from one node.  
# Used "para remove machine ..." to remove that node from the cluster.
# Completed: 2376 of 2376 jobs
# CPU time in finished jobs:     626355s   10439.25m   173.99h    7.25d  0.020 y
# IO & Wait Time:                 49512s     825.20m    13.75h    0.57d  0.002 y
# Average job time:                 284s       4.74m     0.08h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3693s      61.55m     1.03h    0.04d
# Submission to last job:          4165s      69.42m     1.16h    0.05d

    ######## LIFTOVER CHAINING
    # LIFTING
    ssh pk
    cd /san/sanVol1/scratch/fan
    cp mm7SplitLift.sh hg17SplitLift.sh

    # change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random 
    vi hg17SplitLift.sh

    cat << 'EOF' > hg17ChainMergeSplit.sh
#!/bin/bash
# hg17ChainMergeSplit.sh - run from a directory that contains chainRaw/.
# Copies chainRaw/ to local scratch, merge-sorts the raw chains, splits the
# merged stream into chain/ with chainSplit, and copies chain/ back to the
# directory the script was started from.

set -e			# stop before the final rm -rf if any step fails

# Remember the start directory as an absolute path.  The output of
# "dirs +1" may be ~-abbreviated and is not expanded again when it comes
# out of a command substitution.
startDir=$(pwd)
workDir=/scratch/fan/hg17Lifts

# The work dir must exist so that cp creates $workDir/chainRaw instead of
# copying the contents of chainRaw straight into a new $workDir.
mkdir -p "$workDir"
cp -r chainRaw/ "$workDir"
cd "$workDir"
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain "$startDir"
rm -rf chain chainRaw
'EOF'

    chmod +x hg17ChainMergeSplit.sh

    # HG15
    cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/raw
    /san/sanVol1/scratch/fan/hg17SplitLift.sh
    cd ../    
    mkdir chainRun chainRaw
    cd chainRun
    cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg15/chromTrfMixedNib  /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para push
    para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs:       3546s      59.10m     0.98h    0.04d  0.000 y
# IO & Wait Time:                   895s      14.92m     0.25h    0.01d  0.000 y
# Average job time:                  97s       1.61m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             270s       4.50m     0.07h    0.00d
# Submission to last job:           270s       4.50m     0.07h    0.00d

    ######### CHAINMERGE/NET/NETSUBSET
    ssh kolossus
    mkdir -p /scratch/fan/hg17Lifts
    cd /scratch/fan/hg17Lifts

    cp -r /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/chainRaw/ .
    mkdir chain
    /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.

    cp -rp chain /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/
    rm -rf chain
    rm -rf chainRaw

    ssh pk
    cd /san/sanvol1/scratch/fan
    cat << 'EOF' > netOver.sh
#!/bin/bash
# netOver.sh - net one chain file and keep the subset of chains used in the net.
# usage: netOver.sh <chrom>.chain <chrom.sizes of the old assembly>
# Output goes to net/<chrom>.net and over/<chrom>.over in the parent of the
# directory that holds the chain file.

set -e		# do not run netChainSubset if chainNet failed

if [ $# -ne 2 ]; then
    echo "usage: $0 <chrom>.chain <old assembly chrom.sizes>" 1>&2
    exit 1
fi

chain=$1
chrom=$(basename "$chain" .chain)
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=$(dirname "$chain")
blatDir=$(dirname "$chainDir")
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over

mkdir -p "${blatDir}"/{over,net}
# the second net output of chainNet is not needed, so it goes to /dev/null
/cluster/bin/x86_64/chainNet "$chain" "$sizesHGOld" "$sizesHG17" "$net" /dev/null
/cluster/bin/x86_64/netChainSubset "$net" "$chain" "$over"
'EOF'
# << emacs
    chmod +x netOver.sh

    mkdir netRun

    cd netRun/

    find /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/chain -name "*.chain" \
     | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg15/chrom.sizes"}' >> spec
    para create spec
    para push
    para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs:        427s       7.12m     0.12h    0.00d  0.000 y
# IO & Wait Time:                   248s       4.13m     0.07h    0.00d  0.000 y
# Average job time:                  15s       0.26m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              29s       0.48m     0.01h    0.00d
# Submission to last job:            46s       0.77m     0.01h    0.00d

# seems much faster than mm7.

    ########## FINISHING
    ssh hgwdev

    # HG15
    cd /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/over
    cat * >> ../hg15ToHg17.over.chain
    cd ../
    rm -rf psl/ net/ chain/ chainRaw/ over/
    cd ../
    cp -rp blat.hg17.2006-03-09/ /cluster/data/hg15/bed

    cd /cluster/data/hg15/bed
    ln -s blat.hg17.2006-03-09 blat.hg17
    ln -s `pwd`/blat.hg17/hg15ToHg17.over.chain liftOver/hg15ToHg17.over.chain
    ln -s `pwd`/liftOver/hg15ToHg17.over.chain /gbdb/hg15/liftOver/hg15ToHg17.over.chain
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg15/liftOver
    cd /usr/local/apache/htdocs/goldenPath/hg15/liftOver
    cp /gbdb/hg15/liftOver/hg15ToHg17.over.chain .
    gzip hg15ToHg17.over.chain
    hgAddLiftOverChain hg15 hg17 /gbdb/hg15/liftOver/hg15ToHg17.over.chain

# UPDATED hg17.knownToVisiGene (2006-03-14 galt)
#  after making sure hg17.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg17 -fromProbePsl=vgAllProbes

########################################################################
### microRNA targets tracks  (DONE - 2006-03-17 - 2006-04-27 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/picTar
    cd /cluster/data/hg17/bed/picTar
    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/new_mammals_bed' -O newMammals.bed

    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/new_mammals_chicken_bed' \
	-O newMammalsChicken.bed

    grep -v "^track" newMammals.bed \
	| hgLoadBed -strict hg17 picTarMiRNA4Way stdin
    #	Loaded 205263 elements of size 9
    grep -v "^track" newMammalsChicken.bed \
	| hgLoadBed -strict hg17 picTarMiRNA5Way stdin
    #	Loaded 43081 elements of size 9

    nice -n +19 featureBits hg17 picTarMiRNA4Way
    #	608549 bases of 2866216770 (0.021%) in intersection
    nice -n +19 featureBits hg17 picTarMiRNA5Way
    #	109059 bases of 2866216770 (0.004%) in intersection

############################################################################
# dbSNP BUILD 125 (Heather, March 2006)

# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSnp
mkdir 125
cd 125
mkdir shared
mkdir shared/data
mkdir shared/schema
mkdir organisms
mkdir organisms/human_9606
mkdir organisms/human_9606/rs_fasta
mkdir organisms/human_9606/database
mkdir organisms/human_9606/database/organism_data
mkdir organisms/human_9606/database/organism_data/hg17
mkdir organisms/human_9606/database/schema

# Get data from NCBI

# Shared data includes the data dictionary and
# defined types such as validity, class, function, locType
# Actually this is independent of hg17 build and should go in separate makeDoc
cd shared/data
ftp ftp.ncbi.nih.gov
cd snp/database/organism_shared_data
mget *.gz

cd ../schema
ftp ftp.ncbi.nih.gov
cd snp/database/schema/shared_schema
mget *.gz

# using headers of fasta files for molType, class and observed
# The local working directory is still 125/shared/schema at this point, so
# go up two levels to reach 125/organisms.  The cd and mget lines that
# follow "ftp" are typed at the ftp prompt, not in the local shell.
cd ../../organisms/human_9606/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/rs_fasta
mget *.gz

cd ../database/organism_data/hg17
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_35_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_35_1.bcp.gz
get b125_ContigInfo_35_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_35_1.bcp.gz
# SNP has validation status and heterozygosity
get SNP.bcp.gz
# done with FTP

# rename
mv b125_SNPContigLoc_35_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_35_1.bcp.gz ContigLocusId.gz
mv b125_ContigInfo_35_1.bcp.gz ContigInfo.gz
mv b125_SNPMapInfo_35_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz

# edit table descriptions
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
# get CREATE statements from human_9606_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp

# get header lines from rs_fasta
cd /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta
/bin/csh gnl.csh

# load on kkr5u00
ssh kkr5u00
# -e takes the SQL statement as its argument; the database to connect to
# ("mysql") goes last
hgsql -e 'create database dbSnpHumanBuild125' mysql
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
# create the tables from the converted/renamed CREATE statements
hgsql dbSnpHumanBuild125 < table.sql
cd ../organism_data/hg17
/bin/csh load.csh

# note rowcount
# ContigLoc     24135144 
# SNP           10430754 
# MapInfo       10271016 
# ContigLocusId  9539145 

# create working /scratch dir
cd /scratch/snp
mkdir 125
cd 125
mkdir human
cd human

# get hg17 ctgPos, load into dbSnpHumanBuild125, compare contig list between ctgPos and ContigInfo

# get gnl files
cp /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta/*.gnl .

# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_haplotype" 

# filter ContigLoc into ContigLocFilter
# this gets rid of alternate assemblies and poor quality alignments
# uses ContigInfo and MapInfo (weight == 10 || weight == 3)
# assumes all contigs are positively oriented
# will abort if not true

mysql> desc ContigLocFilter;
#  +---------------+-------------+------+-----+---------+-------+
#  | Field         | Type        | Null | Key | Default | Extra |
#  +---------------+-------------+------+-----+---------+-------+
#  | snp_id        | int(11)     | NO   |     |         |       |
#  | ctg_id        | int(11)     | NO   |     |         |       |
#  | chromName     | varchar(32) | NO   |     |         |       |
#  | loc_type      | tinyint(4)  | NO   |     |         |       |
#  | phys_pos_from | int(11)     | NO   |     |         |       |
#  | phys_pos      | varchar(32) | YES  |     | NULL    |       |
#  | orientation   | tinyint(4)  | NO   |     |         |       |
#  | allele        | blob        | YES  |     | NULL    |       |
#  +---------------+-------------+------+-----+---------+-------+
 
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocFilter  10113426
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 9161012

# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount 
# ContigLocusIdFilter  5352542

# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order
# will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense dbSnpHumanBuild125 
# note rowcount
# expect about 50% for human
# ContigLocusIdCondense 4129899
# could delete ContigLocusIdFilter table here

# create chrN_snpFasta tables from *.gnl files
# snpLoadFasta.error will report all SNPs with "lengthTooLong"
# here we have 4428 SNPs with lengthTooLong
# these are noted as ObservedNotAvailable
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta dbSnpHumanBuild125 

# split ContigLocFilter by chrom (could start using pipeline.csh here)
# pipeline.csh takes about 35 minutes to run
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom dbSnpHumanBuild125 ref_haplotype

# generate true coords using loc_type
# possible errors logged to snpLocType.error:
# "Missing quotes in phys_pos for range"
# "Chrom end <= chrom start for range"
# "Wrong size for exact"
# "Unknown locType"
# "Unable to get chromEnd"
# We got none of these

# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# this run got just 40 

# morph chrN_snpTmp 

mysql> desc chr1_snpTmp;

#  +---------------+-------------+------+-----+---------+-------+
#  | Field         | Type        | Null | Key | Default | Extra |
#  +---------------+-------------+------+-----+---------+-------+
#  | snp_id        | int(11)     | NO   |     |         |       |
#  | ctg_id        | int(11)     | NO   |     |         |       |
#  | chromStart    | int(11)     | NO   |     |         |       |
#  | chromEnd      | int(11)     | NO   |     |         |       |
#  | loc_type      | tinyint(4)  | NO   |     |         |       |
#  | orientation   | tinyint(4)  | NO   |     |         |       |
#  | allele        | blob        | YES  |     | NULL    |       |
#  +---------------+-------------+------+-----+---------+-------+

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype dbSnpHumanBuild125 ref_haplotype

# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# this run had 63 of these
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# this run has 512 

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele dbSnpHumanBuild125 ref_haplotype

# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort dbSnpHumanBuild125 ref_haplotype
# get hg17 nib files
# get hg17 chromInfo, load into dbSnpHumanBuild125 with edited path
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" dbSnpHumanBuild125

# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC dbSnpHumanBuild125

# morph chrN_snpTmp 

mysql> desc chr1_snpTmp;

#  +--------------------+-------------+------+-----+---------+-------+
#  | Field              | Type        | Null | Key | Default | Extra |
#  +--------------------+-------------+------+-----+---------+-------+
#  | snp_id             | int(11)     | NO   |     |         |       |
#  | ctg_id             | int(11)     | NO   |     |         |       |
#  | chromStart         | int(11)     | NO   |     |         |       |
#  | chromEnd           | int(11)     | NO   |     |         |       |
#  | loc_type           | tinyint(4)  | NO   |     |         |       |
#  | orientation        | tinyint(4)  | NO   |     |         |       |
#  | allele             | blob        | YES  |     | NULL    |       |
#  | refUCSC            | blob        | YES  |     | NULL    |       |
#  | refUCSCReverseComp | blob        | YES  |     | NULL    |       |
#  +--------------------+-------------+------+-----+---------+-------+

# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 49763   RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles dbSnpHumanBuild125

# add class, observed and molType from chrN_snpFasta tables
# log errors to snpReadFasta.errors
# errors detected: no data available, duplicate data
# This run we got:
# 49 no data available
# 226048 duplicate
# chrN_snpFasta has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpReadFasta dbSnpHumanBuild125

# morph chrN_snpTmp
#  +--------------------+---------------+------+-----+---------+-------+
#  | Field              | Type          | Null | Key | Default | Extra |
#  +--------------------+---------------+------+-----+---------+-------+
#  | snp_id             | int(11)       | NO   |     |         |       |
#  | chromStart         | int(11)       | NO   |     |         |       |
#  | chromEnd           | int(11)       | NO   |     |         |       |
#  | loc_type           | tinyint(4)    | NO   |     |         |       |
#  | class              | varchar(255)  | NO   |     |         |       |
#  | orientation        | tinyint(4)    | NO   |     |         |       |
#  | molType            | varchar(255)  | NO   |     |         |       |
#  | allele             | blob          | YES  |     | NULL    |       |
#  | refUCSC            | blob          | YES  |     | NULL    |       |
#  | refUCSCReverseComp | blob          | YES  |     | NULL    |       |
#  | observed           | blob          | YES  |     | NULL    |       |
#  +--------------------+---------------+------+-----+---------+-------+

# generate exceptions for class and observed

# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType

# ObservedNotAvailable
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch

# RangeSubstitutionLocTypeExactMatch

# SingleClassTriAllelic
# SingleClassQuadAllelic

# This will also detect IUPAC symbols in allele

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved dbSnpHumanBuild125

# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction dbSnpHumanBuild125

# add validation status and heterozygosity
# log error if validation status > 31 or missing
# this run we got 8 missing
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP dbSnpHumanBuild125

# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable dbSnpHumanBuild125

# PAR SNPs
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR dbSnpHumanBuild125
hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp125Exceptions' dbSnpHumanBuild125

# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh

# load
hgsql dbSnpHumanBuild125 < /cluster/home/heather/kent/src/hg/lib/snp125.sql
hgsql -e 'load data local infile "snp125.tab" into table snp125' dbSnpHumanBuild125

# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple dbSnpHumanBuild125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;

# run and review snpCompareLoctype (currently tuned for 124/125 differences)
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype dbSnpHumanBuild125 snp124subset snp125
# cat snpCompareLoctypeCounts.out
# exactToExact = 8310192
# exactToBetween = 107956
# exactToRange = 16200
# betweenToBetween = 206224
# betweenToExact = 4012
# betweenToRange = 715
# rangeToRange = 98648
# rangeToBetween = 3151
# rangeToExact = 6198
# oldToNew = 10224

# 12043 coord changes in exact (.1%)
# 1370 moved to different chroms

# 3664 coord changes in between (1.7%)
# 2260 off-by-one
# 13 moved to different chroms

# 22198 coord changes in range (22.5%)
# 19548 look like fixes: observedLengthOld != coordSpanOld && observedLengthNew == coordSpanNew
# 1296 look like errors: observedLengthOld == coordSpanOld && observedLengthNew != coordSpanNew

# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp
hgsql dbSnpHumanBuild125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/snp125Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp125.tab' into table snp125; 

# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);

mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions; 
mysql> alter table snp125Exceptions add index name(name);

# create snp125ExceptionDesc table
cd /cluster/data/dbSnp
# add counts to exception.template
hgsql hg17 < snp125ExceptionDesc.sql
# the keyword is "infile", as in the other load statements in this section
mysql> load data local infile 'exception.template' into table snp125ExceptionDesc;

#######
# Add new case for ObservedWrongSize (Heather June 9, 2006)
# revisions 1.25 and 1.26 kent/src/hg/snp/snpLoad/snpCheckClassAndObserved.c
ssh kkr5u00
cd /scratch/snp/125/human
/bin/csh pipeline.csh
# wait 35 minutes
grep ObservedWrongSize snpCheckClassAndObserved.exceptions > ObservedWrongSize
grep ObservedWrongSize snpPARexceptions.tab >> ObservedWrongSize
cp ObservedWrongSize /cluster/home/heather/transfer/snp
ssh hgwdev
# drop the name index, append the new ObservedWrongSize rows, then rebuild
# the index on the name column (same definition as when it was first made)
hgsql -e 'alter table snp125Exceptions drop index name' hg17
hgsql -e 'load data local infile "/cluster/home/heather/transfer/snp/ObservedWrongSize" into table snp125Exceptions' hg17
hgsql -e 'alter table snp125Exceptions add index name (name)' hg17

# fix counts
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' hg17

+----------+------------------------------------+
| count(*) | exception                          |
+----------+------------------------------------+
|   785903 | MultipleAlignments                 |
|      623 | NamedClassWrongLocType             |
|     7686 | ObservedMismatch                   |
|     4333 | ObservedNotAvailable               |
|       97 | ObservedWrongFormat                |
|    73558 | ObservedWrongSize                  |
|      466 | RangeSubstitutionLocTypeExactMatch |
|       62 | RefAlleleMismatch                  |
|    99849 | RefAlleleNotRevComp                |
|     1278 | RefAlleleWrongSize                 |
|    20749 | SingleClassBetweenLocType          |
|     2306 | SingleClassQuadAllelic             |
|    15639 | SingleClassRangeLocType            |
|    19330 | SingleClassTriAllelic              |
+----------+------------------------------------+

# edit /cluster/data/dbSnp/exception.template (need to automate this)
# (directory is spelled dbSnp, as in the cd commands earlier in this section)
# empty the description table and reload it from the edited template
hgsql -e 'delete from snp125ExceptionDesc' hg17
hgsql -e 'load data local infile "/cluster/data/dbSnp/exception.template" into table snp125ExceptionDesc' hg17


###########################

# add rs_fasta to seq/extFile (Heather Nov 2006)
# use 126 rs_fasta files because I didn't save 125 version
ssh hgwdev
mkdir /gbdb/hg17/snp
ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg17/snp/snp.fa
cd /cluster/store12/snp/126/human/rs_fasta
# load through the hg17 symlink made above rather than the /gbdb/hg18 path,
# so hg17 does not depend on another assembly's /gbdb tree
# NOTE(review): if the original run really used /gbdb/hg18/snp/snp.fa, the
# path recorded for this file in hg17.extFile should be checked
hgLoadSeq hg17 /gbdb/hg17/snp/snp.fa
# clean up after hgLoadSeq
rm seq.tab

# look up id in extFile
# move into separate table
hgsql hg17 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 33852294' hg17
hgsql -e 'delete from seq where extFile = 33852294' hg17
hgsql -e 'alter table snpSeq add index acc (acc)' hg17

#############################################################
# Get panTro2 and rheMac2 allele for all SNPs (Heather, Dec 2006, Feb 2007 and
# June 2007) [partial fix released 6/25/07: using hg17 instead of hg18 liftOver
# files... for most but not all chroms! :( not documented below; error found 
# by user]
# 1/11/08 (angie): re-running panTro2Qual and subsequent chimp & summary 
# steps, so hg17 liftOver files will have been used for all outputs.
# Deletions will probably lift okay
# The insertions have start == end so none of them will lift
# 1/24/08 (angie): constant quality score of 98 for chimp chr{21,M,Y,Y_random}
# was previously put in score field -- corrected to orthoScore.

ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2All
mkdir rheMac2All
mkdir input
cd input
# dump snp125 as 6-column bed, then split it into 100000-line pieces named
# snp-*; lineFileSplit must read the file that was just written
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from snp125' hg17 > snp125.bed
lineFileSplit snp125.bed lines 100000 snp-
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/panTro2All/input
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/rheMac2All/input
cd ../panTro2All
./makeJobList.csh
mkdir output
mkdir unmapped
cd ../rheMac2All
./makeJobList.csh
mkdir output
mkdir unmapped

# cluster run
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:      67758s    1129.29m    18.82h    0.78d  0.002 y
# IO & Wait Time:                   961s      16.02m     0.27h    0.01d  0.000 y
# Average job time:                 636s      10.60m     0.18h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1543s      25.72m     0.43h    0.02d
# Submission to last job:         61513s    1025.22m    17.09h    0.71d

cd /san/sanvol1/snp/liftOver/hg17/rheMac2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:       1833s      30.56m     0.51h    0.02d  0.000 y
# IO & Wait Time:                  1744s      29.06m     0.48h    0.02d  0.000 y
# Average job time:                  33s       0.55m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              82s       1.37m     0.02h    0.00d
# Submission to last job:         59987s     999.78m    16.66h    0.69d

# add sequence
# next time do this at the same time as lift

cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Seq
mkdir panTro2Seq/input
mkdir panTro2Seq/output
cp panTro2All/output/snp*out panTro2Seq/input
cd panTro2Seq

# write the job-list generator first, then make it executable and run it;
# it emits one fetchSeq job per file in input/, with results in output/
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
    set baseName = $fileName:t
    echo $baseName
    echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/panTro2/panTro2.2bit output/$baseName" >> jobList
end
'EOF'
# << emacs
chmod +x makeJobList.csh
./makeJobList.csh

cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Seq
mkdir rheMac2Seq/input
mkdir rheMac2Seq/output
cp rheMac2All/output/snp*out rheMac2Seq/input
cd rheMac2Seq
# job-list generator: one fetchSeq job per file in input/, results in output/
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
    set baseName = $fileName:t
    echo $baseName
    echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/rheMac2/rheMac2.2bit output/$baseName" >> jobList
end
'EOF'
# << emacs
# the cluster run below needs jobList, so the generator has to be run here
chmod +x makeJobList.csh
./makeJobList.csh

# cluster run for sequence
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:      30509s     508.48m     8.47h    0.35d  0.001 y
# IO & Wait Time:                   325s       5.42m     0.09h    0.00d  0.000 y
# Average job time:                 286s       4.76m     0.08h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             551s       9.18m     0.15h    0.01d
# Submission to last job:          1195s      19.92m     0.33h    0.01d

cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs:      28517s     475.28m     7.92h    0.33d  0.001 y
# IO & Wait Time:                   576s       9.61m     0.16h    0.01d  0.000 y
# Average job time:                 269s       4.49m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             509s       8.48m     0.14h    0.01d
# Submission to last job:          1166s      19.43m     0.32h    0.01d

# quality scores 
# This takes about 24 hours for each species!!  Ugh.
# Solution is to use -bedFile argument to hgWiggle
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
cd panTro2Seq/output
# concat.csh appends every snp*.out file in this directory to all.out
cat << 'EOF' > concat.csh
#!/bin/tcsh
rm -f all.out
foreach fileName (`ls snp*.out`)
    cat $fileName >> all.out
end
'EOF'
# << emacs
# run it: all.out has to exist before it can be sorted
chmod +x concat.csh
./concat.csh

sort all.out > all.sort
rm all.out

cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Qual
cp panTro2Seq/output/all.sort panTro2Qual
cd panTro2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output

# If we do this again, we should write a c program to read qac files into
# memory -- much faster than one hgWiggle process per line.
cat << 'EOF' > addQual.pl
#!/usr/bin/perl -W
# addQual.pl <db> <chromName> < rows on stdin
# For each whitespace-separated input row, run hgWiggle on the "quality"
# track at chrom:chromStart-chromStart and print the row once per value
# returned, with that value (truncated to an integer) as the fifth column.
$db=shift;
$chromName=shift;
while (<STDIN>)
{
    my @fields = split;
    my $chrom = $fields[0];
    my $chromStart = $fields[1];
    my $chromEnd = $fields[2];
    my $name = $fields[3];
    my $strand = $fields[5];
    my $allele = $fields[6];

    $cmd="hgWiggle -db=$db -chrom=$chromName -position=$chrom:$chromStart-$chromStart -rawDataOut quality";
    open(RESULT, "$cmd |") or die "can't start '$cmd'\n";
    while ($line = <RESULT>)
    {
        $score = int($line);
        print "$chrom\t$chromStart\t$chromEnd\t$name\t$score\t$strand\t$allele\n";
    }
    # finish with this hgWiggle process before starting the next one
    close(RESULT);
}
'EOF'
# << emacs
cat << 'EOF' > getQual.csh
#!/bin/tcsh
# run addQual.pl once per file in input/, writing output/<chromName>
foreach fileName (`ls input/*`)
    set chromName = $fileName:t:r
    echo $chromName
    ./addQual.pl panTro2 $chromName < $fileName > output/$chromName
end
'EOF'
# << emacs
# both scripts are invoked directly, so they must be executable; addQual.pl
# is called as ./addQual.pl so it does not depend on "." being in the path
chmod +x addQual.pl getQual.csh
./getQual.csh

# same steps for macaque; use an absolute path because the working
# directory here is panTro2Qual, not /san/sanvol1/snp/liftOver/hg17
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq/output
# concat.csh was written in panTro2Seq/output; it has no species-specific
# content, so copy it here
cp ../../panTro2Seq/output/concat.csh .
chmod +x concat.csh
./concat.csh
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Qual
cp rheMac2Seq/output/all.sort rheMac2Qual
cd rheMac2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
# addQual.pl and getQual.csh were written in panTro2Qual; getQual.csh
# hard-codes the panTro2 database name, so substitute rheMac2
# NOTE(review): assumes rheMac2 has a "quality" wiggle table like panTro2
cp ../panTro2Qual/addQual.pl .
sed -e 's/panTro2/rheMac2/' ../panTro2Qual/getQual.csh > getQual.csh
chmod +x addQual.pl getQual.csh
./getQual.csh

# concatenate, merge and load
# chimp has no qual scores for chr21, chrY and chrM, just use seq files
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual/output
grep chr21 ../../panTro2Seq/output/all.sort > chr21
grep chrY ../../panTro2Seq/output/all.sort | grep -v random > chrY
grep chrY ../../panTro2Seq/output/all.sort | grep random > chrY_random
grep chrM ../../panTro2Seq/output/all.sort > chrM
#-----------------------------------------------------------------------------
# 1/11/08: replace outputs for chroms that apparently were skipped in the June
# run, and re-run subsequent steps for chimp.
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual
mv output output-jun25
# The mv above leaves no output/ directory; recreate it so the regenerated
# per-chrom files and qual.tab below have somewhere to go.
mkdir output
# List the June outputs that are byte-identical to the Feb. outputs
# (cmp prints nothing when two files are the same).
foreach f (output-jun25/chr*)
  if ( "X"`cmp $f output-feb26/$f:t` == "X" ) then
    echo $f:t
  endif
end
#chr21
#chrM
#chrY
#chrY_random
# <<-- those are the ones that may not have actually been regenerated.

# It appears that the Feb. outputs, instead of the June Seq files, were copied
# to the June output for those chroms.  oops!
# As a minor improvement, skip duplicate rows instead of just copying.
foreach chr (chr21 chrM chrY chrY_random)
  echo $chr
  uniq input/$chr.sort > output/$chr
end
# << emacs

# Set the stale June copies aside, then build qual.tab from the remaining
# June outputs plus the four regenerated files.
mkdir output-jun25-incorrect
mv output-jun25/chr{21,M,Y,Y_random} output-jun25-incorrect
cat output-jun25/chr* output/chr* > output/qual.tab

# end 1/11/08 fix-specific; proceeding to post-concat.csh chimp steps.
#-----------------------------------------------------------------------------
# Chimp: concatenate, run snpOrtho against hg17 snp125 using qual.tab, and
# load the resulting snpOrtho.tab as snp125OrthoPanTro2.
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoPanTro2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoPanTro2.sql
#Loaded 9591230 elements of size 17
# previously 9590961
# add index
# (the indented statements below are typed inside the hgsql session)
hgsql hg17
  alter table snp125OrthoPanTro2 add index name (name);
  alter table snp125OrthoPanTro2 add index chrom (chrom, bin);
# 1/24/08: these used to set score; should have set orthoScore all along.
# tweak to match panTro2 assembly
# chr21, chrY and chrY_random are the chroms whose rows came from the Seq
# files rather than from quality scores; their orthoScore is forced to 98.
  update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21";
#Query OK, 129170 rows affected (25.37 sec)
#Rows matched: 129170  Changed: 129170  Warnings: 0
  update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY";
#Query OK, 22081 rows affected (25.16 sec)
#Rows matched: 22081  Changed: 22081  Warnings: 0
  update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random";
#Query OK, 155 rows affected (25.41 sec)
#Rows matched: 155  Changed: 155  Warnings: 0

# macaque
# Same steps as chimp: concatenate, run snpOrtho against hg17 snp125 and
# load the result as snp125OrthoRheMac2.
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Qual/output
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoRheMac2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoRheMac2.sql
# add index
# (run through hgsql so the statements are usable from the shell as written)
hgsql -e "alter table snp125OrthoRheMac2 add index name (name)" hg17
hgsql -e "alter table snp125OrthoRheMac2 add index chrom (chrom, bin)" hg17

# get hapmap subset for chimp
# skip if lift wasn't size 1
# this run 124822 skipped
cd /cluster/data/hg17/bed/hapmap/rel21a
time /cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 \
  hapmapSnpsCombined snp125OrthoPanTro2
#108.505u 16.869s 2:26.22 85.7%  0+0k 0+0io 4pf+0w
hgLoadBed hg17 hapmapAllelesChimp hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesChimp.sql
#Loaded 3930564 elements of size 13
hgsql hg17 -e 'alter table hapmapAllelesChimp add index name(name); \
  alter table hapmapAllelesChimp add index chrom (chrom, bin);'

# get hapmap subset for macaque
# this run 106607 skipped
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 hapmapSnpsCombined snp125OrthoRheMac2
hgLoadBed hg17 hapmapAllelesMacaque hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesMacaque.sql
# remove the intermediate files left in the working directory
rm hapmapOrtho.tab
rm hapmapOrtho.err
rm bed.tab
# add the indexes through hgsql, same form as the chimp table above
hgsql hg17 -e 'alter table hapmapAllelesMacaque add index name(name); \
  alter table hapmapAllelesMacaque add index chrom (chrom, bin);'

##############################################################################
# HapMap Recombination Rate Phase 2 (Heather Feb. 2006)
# Contacts:
# Gil McVean [mcvean@stats.ox.ac.uk]                                                                                                 
# Colin Freeman [cfreeman@stats.ox.ac.uk]                                                                                            
# Simon Myers [smyers@broad.mit.edu]         
# Data is missing chromEnd.  I am setting chromEnd = chromStart + 1 as a
# kludge for now.
# Solution is to interpolate range but remove gaps.

## ****************************************
# This is a bad assumption about the data format -- here is a description.
## ****************************************
# The recombination rates are for the regions _between_ snps, so these
# files need to be processed slightly differently.  For each line i in
# the file (except the header and the last line), the recombination
# rate is for the position on the current line minus 1 [pos(${i})-1] to
# the position on the subsequent line [pos({$i+1})].  The precision is
# a bit obnoxious and can be truncated to 3 or 4 significant figures.
# (Note that the recombination rate on the last line is 0, as this is a
# placeholder.)  Here is an example:
#
# > head genetic_map_chr1.txt 
# position COMBINED_rate(cM/Mb) Genetic_Map(cM)
# 45413 2.98182170902573 0
# 72434 2.08241435350679 0.0805718043995841
# 78032 2.08135840137317 0.0922291599505152
# 244859 2.88844902005393 0.439455937976397
# 604461 2.88749757426825 1.47814798248583
# 604484 2.88586385769306 1.47821439493004
# 605296 2.88389196108775 1.48055771638249
#
### BED format (like a bedGraph)
# chr1 45412  72434  2.982
# chr1 72433  78032  2.082
# chr1 78031  244859 2.081
# chr1 244858 604461 2.888
# chr1 604460 604484 2.887
# chr1 604483 605296 2.886
# chr1 605295 .....  2.884
#
# See /cluster/data/hg16/bed/hapmap/recombination/Perlegen/makeBed.pl for an example.  /cluster/data/hg16/bed/hapmap/recombination/Perlegen/cmds.csh is also useful. 
## ****************************************

# Create the recombination/phase2 working directory and fetch the Phase 2
# rates/hotspots tarball.
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap
mkdir recombination
cd recombination
mkdir phase2
cd phase2
wget --no-check-certificate -N https://mathgen.stats.ox.ac.uk/HapMap_Phase2_rates_hotspots/HapMap_Phase2_rates_hotspots.tgz  
# data also available at
# http://www.hapmap.org/downloads/recombination/2006-10_rel21_phaseII
# gunzip turns the .tgz into a .tar, which is then unpacked
gunzip *.tgz
tar xvf *.tar
# makeBed.csh: rebuilds recomb.bed from scratch.  For each name in
# chrom.list it runs makeBed.pl on genetic_map_<chrom>.txt and appends the
# output; chrX is handled separately because its map comes in three files
# (par1, non-par, par2), all emitted under the chrom name chrX.
# NOTE(review): chrom.list is not created by any step shown here -- confirm
# where it comes from.
cat << 'EOF' > makeBed.csh
#!/bin/tcsh
rm -f recomb.bed
foreach chrom (`cat chrom.list`)
    echo $chrom
    set fileName=`echo $chrom | awk '{printf "genetic_map_%s.txt", $1}'`
    makeBed.pl $chrom < $fileName >> recomb.bed
end
makeBed.pl chrX < genetic_map_chrX_par1.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_non-par.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_par2.txt >> recomb.bed
'EOF'
# makeBed.pl <chromName> < genetic_map_XXX.txt
# Implements the interval format described in the data-format notes above
# and supersedes the chromEnd = chromStart + 1 kludge: the rate on line i
# applies to the region from pos(i)-1 to pos(i+1), so one bedGraph row is
# written per consecutive pair of lines.  The last line of each file carries
# only a placeholder rate and just closes the final interval.  Rates are
# rounded (not truncated) to 3 decimal places, matching the example above.
cat << 'EOF' > makeBed.pl
#!/usr/bin/env perl
use strict;
use warnings;

my $chromName = shift;
my ($prevPos, $prevRate);
while (<STDIN>) {
    my @fields = split;
    # skip blank lines and the header
    next if (@fields < 2 || $fields[0] eq "position");
    if (defined $prevPos) {
        # "+ 0" turns the rounded string back into a number (no trailing zeros)
        my $rate = sprintf("%.3f", $prevRate) + 0;
        print join("\t", $chromName, $prevPos - 1, $fields[0], $rate), "\n";
    }
    ($prevPos, $prevRate) = @fields[0, 1];
}
'EOF'
# Build recomb.bed, load it as a 4-column bedGraph table, and index it.
./makeBed.csh
hgLoadBed hg17 snpRecombRateHapmapPhase2 recomb.bed -tab -bedGraph=4
# the index goes on the table loaded above (snpRecombRateHapmapPhase2)
hgsql -e 'alter table snpRecombRateHapmapPhase2 add index chrom (chrom, bin)' hg17

############
# UPDATE hg17 knownToVisiGene (2006-04-05 galt)
# Create table that maps between known genes and visiGene database 
    # mapping to other species such as mouse, zebrafish, frog
    # requires visiGene probe track vgImageProbes be created first
    # (that track is the one passed to -fromProbePsl below)
    knownToVisiGene hg17 -fromProbePsl=vgImageProbes
#############################################################
# ADD A NEW TRACK GROUP (DONE, 6/3/06, Fan)
# Create a new track group, "phenDis".

    # Add the phenDis row to hg17.grp by handing the statement to hgsql -e.
    hgsql hg17 -e 'INSERT INTO grp (name, label, priority) VALUES ("phenDis", "Phenotype and Disease Associations", 2.5)'

#############################################################
# hgMut - Human Mutation track - Belinda Giardine
# list of tables by show tables like 'hgMut%'
# summary of current load June 7, 2006
#table definitions for autoSql
autoSql hgMut.as hgMut -dbLink
#move bin in struct so works as bed 4+
#hgMut.sql: change INDEXes as needed, put in enums
#shrink mutId to 64 chars, plus acc to 48
#data files and details under ~giardine/humPhen/
cd humPhen/hgMutData/April2006/
#combine the per-source position files; keep only rows starting with "chr"
#and sort them by chrom, then numerically by start
cat hgMutHbVar.txt hgMutPah.txt hgMutBgmut.txt hgMutCftr.txt hgMutARdb.txt > hgMutUnsorted.txt
grep "^chr" hgMutUnsorted.txt | sort -k1,1 -k2,2n > hgMut.bed
#create tables
hgsql hg17 < ../../hgMut.sql
#loading
#(-oldTable: load into the table created by hgMut.sql above; -noSort: the
#file is already sorted)
hgLoadBed hg17 hgMut hgMut.bed -noSort -oldTable -tab
#load small vocab control tables
hgsql hg17 < hgMutLink.sql
hgsql hg17 < hgMutAttrClass.sql
hgsql hg17 < hgMutAttrName.sql
hgsql hg17 < hgMutSrc.sql
#from hgsql hg17
#(the statements below are typed inside an interactive hgsql hg17 session)
load data local infile "hgMutExtLinkHbVar.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkARdb.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkBgmut.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkCFTR.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkPah.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkSP.txt" into table hgMutExtLink;
load data local infile "hgMutAttrHbVar2.txt" into table hgMutAttr;
load data local infile "hgMutAttrHbvarProt2.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdb.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdbProt.txt" into table hgMutAttr;
load data local infile "hgMutAliasHbVar.txt" into table hgMutAlias;
load data local infile "hgMutAliasARdb.txt" into table hgMutAlias;
load data local infile "hgMutAliasBgmut.txt" into table hgMutAlias;
load data local infile "hgMutAliasPah.txt" into table hgMutAlias;
load data local infile "hgMutExtLinkHbVarOmim.txt" into table hgMutExtLink;
load data local infile "hgMutAttrLink.txt" into table hgMutAttrLink;
load data local infile "hgMutAttrSP.txt" into table hgMutAttr;

#############################################################
# gv*  Belinda Giardine
# These tables are to replace the hgMut tables
# Most data is converted by me (on PSU machines) to loadable format and copied.
# The Swiss-Prot/UniProt data, is generated from the UniProt database at UCSC, 
# using perl scripts and table dumps.
# scripts in kent/src/hg/utils/gvParsers/swissProt/
# everything redone to not depend on the dv track in July 2006

#make list of variants from Swiss-Prot (make sure featureClass 23 is variant)
#(the query keeps only features whose accession maps to taxon 9606 and
# writes acc, start, length (end-start), feature type and feature id)
hgsql -N uniProt > spVars.txt <<end
select feature.acc, start, end-start, featureType.val, featureId.val from
feature, featureType, accToTaxon, featureId where featureClass=23 and
featureType=featureType.id and accToTaxon.acc=feature.acc and taxon=9606 and
feature.featureId=featureId.id;
end
#need list mapping 3 letter amino acids to 1 letter. (aminoInfoDump from PSU)
#known gene protein map (kgProtMap) has psl data from (blastp)
# with qName being the spId
hgsql -N hg17 > kgProtMapDump.txt <<end
select kgProtMap.* from kgProtMap, uniProt.feature where kgProtMap.qName =
uniProt.feature.acc;
end
#table join duplicates; perl script to throw out extra before use
uniqueRows < kgProtMapDump.txt > kgProtMapUniq.txt
#check variables for output and input file names
computeSpVars > errors.txt
#errors.txt will list variants that couldn't be mapped
#July 18, 2006
#37 gaps, 564 proteins (2228 variants) not in kgProtMap (test one did align)
#found 22389
#Swiss-Prot attributes:
#(each list*.sql query is dumped to a .txt file; the convert* scripts then
# write the two link files that are concatenated into gvLinkSp.txt)
hgsql hg17 < listSPconnections.sql > listSPconnections.txt
hgsql proteome < listSpXref2.sql > listSpXref2.txt
convertOmimTitle > gvLinkSPomim.txt
hgsql hg17 < listGeneVals.sql > listGeneVals.txt
convertDisPoly > gvLinkSPuni.txt
cat gvLinkSPuni.txt gvLinkSPomim.txt > gvLinkSp.txt
cp gvLinkSp.txt ../../../gv/gvData/

#creating gv* tables and loading
#June 27, 2006
autoSql gv.as gv -dbLink
#edit indexes and string lengths in .sql file
#(the next two lines are notes on those edits, not commands)
#    id=48, srcId=48, raKey=48, attrType=48,
#    primary key=index on bin and attr and link ids (id, attrType for attrs)
#do enums
#add unique index, to prevent doubles
#    UNIQUE KEY (chrom(12), chromStart, chromEnd, name)
#added id field to gvPos struct so can keep ID when change name
#    char *id;   /* Added field to hold ID if change name */
#set to null in gv.c file

#reload data July 2006 with more data and corrected Swiss-Prot data
#also moved gv*(except gvPos) and omimTitle to hgFixed
#prep data: concatenate all the gvPos data, sort
#(the cat is one command continued over two lines; all eight source files
# must end up in gvPosAll.txt)
cat gvPosSP.txt gvPosHbVar.txt gvPosARdb.txt gvPosBgmut.txt gvPosCftr.txt \
    gvPosPah.txt gvPosSrd5a2.txt gvPosBrca.txt > gvPosAll.txt
#keep only rows starting with "chr"; sort by chrom, then numerically by start
grep "^chr" gvPosAll.txt | sort -k1,1 -k2,2n > gvPosSortedHg17.bed

#load tables
#(-oldTable: load into the existing gvPos table; -noSort: file already sorted)
hgLoadBed hg17 gvPos gvPosSortedHg17.bed -noSort -oldTable -tab
hgsql hg17 < gvSrc.sql
#the indented statements are typed inside the hgsql hg17 session
#NOTE(review): the Swiss-Prot prep step above writes gvLinkSp.txt, while
#gvLinkSP.txt is loaded here -- confirm the file name.
#NOTE(review): gvAttrConservedDisease.txt is produced by the
#computeDiseaseAssocCons step documented further down -- confirm ordering.
hgsql hg17
        load data local infile "gvBrca.txt" into table gv;
        load data local infile "gvAttrBrca.txt" into table gvAttr;
        load data local infile "gvLinkBrca.txt" into table gvLink;
        load data local infile "gvLinkSP.txt" into table gvLink;
        load data local infile "gvLinkSPgene.txt" into table gvLink;
        load data local infile "gvSP.txt" into table gv;
        load data local infile "gvAttrSP.txt" into table gvAttr;
        load data local infile "gvAttrLongSP.txt" into table gvAttrLong;
        load data local infile "gvLinkHbVar.txt" into table gvLink;
        load data local infile "gvHbVar.txt" into table gv;
        load data local infile "gvAttrHbVar.txt" into table gvAttr;
        load data local infile "gvARdb.txt" into table gv;
        load data local infile "gvAttrARdb.txt" into table gvAttr;
        load data local infile "gvBgmut.txt" into table gv;
        load data local infile "gvAttrBgmut.txt" into table gvAttr;
        load data local infile "gvAttrLongBgmut.txt" into table gvAttrLong;
        load data local infile "gvLinkBgmut.txt" into table gvLink;
        load data local infile "gvCftr.txt" into table gv;
        load data local infile "gvAttrCftr.txt" into table gvAttr;
        load data local infile "gvPah.txt" into table gv;
        load data local infile "gvAttrPah.txt" into table gvAttr;
        load data local infile "gvLinkPah.txt" into table gvLink;
        load data local infile "gvSrd5a2.txt" into table gv;
        load data local infile "gvAttrSrd5a2.txt" into table gvAttr;
        load data local infile "gvAttrConservedDisease.txt" into table gvAttr;

#get disease association predictions for conserved variants
#get list of variants that are already done
#(ids of all gvAttr rows whose attrType is 'disease')
hgsql -N hg17 > gvWithDiseaseStatus.txt <<end
select id from gvAttr where attrType = 'disease';
end
#use table browser to get variants that intersect most conserved track
#set conserved variants that are null to likely
computeDiseaseAssocCons > gvAttrConservedDisease.txt

#Belinda Giardine Sept 2006
#reload tables, removed ones with sequence mismatches, added label and strand
#added new lsbd BTKbase
#Sequence mismatches were determined by using the position in the reference
#sequence to fetch the sequence affected by the variant.  Then for substitutions
#and deletions with the nts deleted listed, the sequence was compared.
#Insertions and large deletions could not be checked.

#Belinda Giardine Dec 2006
#reload tables, additions to previous sources and more IDbases
#details in hg18 doc

#Belinda Giardine Jan 2007
#reload tables, additions and corrections, details in hg18 doc

#############################################################
# Illumina Hap300 (Heather, July 2006)

# Trim the Illumina SNP list, stage it in illuminaTmp, then look each SNP up
# in snp125 / snp125Exceptions to produce the rows for snpArrayIllumina300.
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir illumina
cd illumina
trim.pl < Illumina_HumanHap300_SNPlist_01.13.2006.txt > trim.out
hgsql hg17 < illuminaTmp.sql
hgsql -e "load data local infile 'trim.out' into table illuminaTmp" hg17
# illuminaLookup generates bin
# (last two arguments are the output file and the error file)
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg17 illuminaTmp snp125 snp125Exceptions illuminaLookup.out illuminaLookup.err
# errors:
# unexpected chrom chr1 for snp rs1291584
# unexpected chrom chr17 for snp rs3826555
# unexpected locType between for snp rs2036773
# unexpected locType between for snp rs2249255
# unexpected locType between for snp rs8051412
# unexpected locType between for snp rs1017238
# unexpected locType between for snp rs5019493
# 16 with locType = range*
# 402 not found!
# None that have multiple alignments.
# create the final table, load the lookup output and add the two indexes
hgsql hg17 < snpArrayIllumina300.sql
hgsql -e "load data local infile 'illuminaLookup.out' into table snpArrayIllumina300" hg17
hgsql -e "alter table snpArrayIllumina300 add index name (name)" hg17
hgsql -e "alter table snpArrayIllumina300 add index chrom (chrom, bin)" hg17

#############################################################
# Illumina Hap550 and Hap650 (Heather, April 2007)
# Transfer from hg18 for Bert Gold at NCI

ssh hgwdev
cd /cluster/data/hg17/bed/illumina
# dump the hg18 array tables
hgsql hg18 < getHg18-550.sql > 550.hg18
hgsql hg18 < getHg18-650.sql > 650.hg18
# get name, chrom, chromStart, chromEnd, strand observed from snp125
# where class = "single" and locType = "exact" and chromEnd = chromStart + 1
# Including tri/quad allelic and multiple-aligning for now
hgsql hg17 < getHg17.sql > snp125single.hg17
# sort and join
# (join needs both inputs sorted on the join field, the first column)
sort 550.hg18 > 550.hg18.sort
sort 650.hg18 > 650.hg18.sort
sort snp125single.hg17 > snp125single.hg17.sort
# 560704 lines in 550.join
# 660137 lines in 650.join
# 687 lines in 550.missing
# 706 lines in 650.missing
join 550.hg18.sort snp125single.hg17.sort > 550.join
join 650.hg18.sort snp125single.hg17.sort > 650.join
# -v 1: hg18 rows with no match in the hg17 snp125 dump
join -v 1 550.hg18.sort snp125single.hg17.sort > 550.missing
join -v 1 650.hg18.sort snp125single.hg17.sort > 650.missing
# fix column order
# (name moves from column 1 to column 4; a constant 0 is inserted as column 5)
awk '{print $2, $3, $4, $1, 0, $5, $6}' 550.join > 550.bed
awk '{print $2, $3, $4, $1, 0, $5, $6}' 650.join > 650.bed
# load
hgLoadBed hg17 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql
hgLoadBed hg17 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql
# indices
# (issued through hgsql -e, same form as the Hap300 section, so the lines
# can be run from the shell as written)
hgsql -e "alter table snpArrayIllumina550 add index name (name)" hg17
hgsql -e "alter table snpArrayIllumina550 add index chrom (chrom, bin)" hg17
hgsql -e "alter table snpArrayIllumina650 add index name (name)" hg17
hgsql -e "alter table snpArrayIllumina650 add index chrom (chrom, bin)" hg17

#############################################################
# Affy 500K (Heather, September 2006)
# look up rsId using position

ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/500K
# awk to create bed format from tsv files
/bin/csh cmds.csh
# create the two staging tables and load the Mapping250K bed files into
# them; these staging tables are the input to affyLookup
hgsql hg17 < affy250Nsp.sql
hgsql hg17 < affy250Sty.sql
hgsql -e "load data local infile 'Mapping250K_Nsp.bed' into table affy250Nsp" hg17
hgsql -e "load data local infile 'Mapping250K_Sty.bed' into table affy250Sty" hg17

# look up dbSNP rsIDs using position
# affyLookup writes affyLookup.out / affyLookup.err; both are renamed per
# array before the next run.
# -sqlTable uses the same snpArrayAffy250*.sql file as the hgsql create
# step on the line before it.
# affy250Nsp
# 4311 missing, 7276 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Nsp snp125
mv affyLookup.out affy250Nsp.bed
mv affyLookup.err affy250Nsp.err
hgsql hg17 < snpArrayAffy250Nsp.sql
hgLoadBed hg17 snpArrayAffy250Nsp affy250Nsp.bed -sqlTable=snpArrayAffy250Nsp.sql -tab
hgsql -e "alter table snpArrayAffy250Nsp add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Nsp add index chrom (chrom, bin)" hg17

# affy250Sty
# 3540 missing, 6901 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Sty snp125
mv affyLookup.out affy250Sty.bed
mv affyLookup.err affy250Sty.err
hgsql hg17 < snpArrayAffy250Sty.sql
hgLoadBed hg17 snpArrayAffy250Sty affy250Sty.bed -sqlTable=snpArrayAffy250Sty.sql -tab
hgsql -e "alter table snpArrayAffy250Sty add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Sty add index chrom (chrom, bin)" hg17


#############################################################
# Affy 10K (Sept. 2006, Heather)
# look up rsId using position

ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/10K100Kagain

# affy10
# 14 missing, 807 multiple
# Stage the raw positions in affy10Temp, run affyLookup on that staging
# table against snp125, then load the lookup output as snpArrayAffy10.
cp affy10K.txt affy10Temp.bed
hgLoadBed hg17 affy10Temp affy10Temp.bed -sqlTable=affy10Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10Temp snp125
mv affyLookup.out affy10.bed
mv affyLookup.err affy10.err
hgLoadBed hg17 snpArrayAffy10 affy10.bed -sqlTable=snpArrayAffy10.sql -tab

# affy10v2
# 12 missing, 716 multiple
# Same procedure as affy10: stage in affy10v2Temp, look up, load.
hgLoadBed hg17 affy10v2Temp affy10v2Temp.bed -sqlTable=affy10v2Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10v2Temp snp125
mv affyLookup.out affy10v2.bed
# error file is named after the array, like the other arrays' .err files
mv affyLookup.err affy10v2.err
hgLoadBed hg17 snpArrayAffy10v2 affy10v2.bed -sqlTable=snpArrayAffy10v2.sql -tab

# affy50HindIII
# 156 missing, 1396 multiple
# affy50HindIII.bed is first the staging input; after the lookup it is
# overwritten by the mv with the lookup output that gets loaded.
hgLoadBed hg17 affy50HindIIITemp affy50HindIII.bed -sqlTable=affy50HindIIITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50HindIIITemp snp125
mv affyLookup.out affy50HindIII.bed
mv affyLookup.err affy50HindIII.err
hgLoadBed hg17 snpArrayAffy50HindIII affy50HindIII.bed -sqlTable=snpArrayAffy50HindIII.sql -tab
hgsql -e "alter table snpArrayAffy50HindIII add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50HindIII add index chrom (chrom, bin)" hg17

# affy50XbaI
# 115 missing, 1745 multiple
# affy50XbaI.bed is first the staging input; after the lookup it is
# overwritten by the mv with the lookup output that gets loaded.
hgLoadBed hg17 affy50XbaITemp affy50XbaI.bed -sqlTable=affy50XbaITemp.sql -tab -noBin
# look up against the staging table loaded on the previous line
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50XbaITemp snp125
mv affyLookup.out affy50XbaI.bed
mv affyLookup.err affy50XbaI.err
hgLoadBed hg17 snpArrayAffy50XbaI affy50XbaI.bed -sqlTable=snpArrayAffy50XbaI.sql -tab
hgsql -e "alter table snpArrayAffy50XbaI add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50XbaI add index chrom (chrom, bin)" hg17


#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-14 - Hiram)
    #	download data from "James Taylor" <james@bx.psu.edu>
    #	The working directory lives on store11 and is reached through a
    #	symlink under /cluster/data/hg17/bed.
    ssh kkstore02
    mkdir /cluster/store11/hg17/bed/regPotential7X
    cd /cluster/data/hg17/bed
    ln -s /cluster/store11/hg17/bed/regPotential7X ./regPotential7X
    cd regPotential7X
    
    #	This is a lot of data
    #	(one chr<C>.scores.truncated.bz2 file per chromosome)
    time for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22
    do
    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
    done
    #	real    115m1.855s

    #	the track description page is saved as description.html
    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/trackDb.html" -O description.html

    #	decompress every chromosome, in the order listed, into a single
    #	stream that wigEncode turns into one .wig/.wib pair
    time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
    do
	bzcat chr${C}.scores.truncated.bz2
    done | wigEncode -noOverlap  stdin regPotential7X.wig regPotential7X.wib
    #	Converted stdin, upper limit 1.00, lower limit -0.00
    #	real    33m48.487s

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential7X
    ln -s /cluster/data/hg17/bed/regPotential7X/regPotential7X.wib \
	/gbdb/hg17/wib/regPotential7X.wib
    #	using the tmpDir is faster since it is on local disk and it will
    #	clean up any temporary .tab file it creates there
    time hgLoadWiggle -tmpDir=/scratch/tmp \
	hg17 regPotential7X regPotential7X.wig

    #	How about a histogram of the data.
    #	(100 bins of width 0.01 starting at 0.0; stdout and stderr both go
    #	to histogram.data)
    ssh kolossus
    cd /cluster/data/hg17/bed/regPotential7X
    time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
	-hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
    #	real    2m48.810s
    #	73 % of the data values are zero

    #	Recompress each per-chromosome bz2 score file as a gzip download
    #	file named <chrom>.regPotential7X.hg17.gz, one progress line per file.
    for bz2File in chr*.scores.truncated.bz2; do
	# everything before the first "." in the file name is the chrom name
	chromName="${bz2File%%.*}"
	gzFile="${chromName}.regPotential7X.hg17.gz"
	echo -n "${gzFile} working ... "
	bzcat ${bz2File} | gzip > ${gzFile}
	echo
    done

#########################################################################
####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 06/21/06 Fan) ##############
# DELETED RECORD FROM rgdQtlLink SO CONSISTENT WITH REMOVAL FROM rgdQtl
# (DONE, 2006-06-30, hartera)

# Point /cluster/data/hg17/bed/rgdQtl at a fresh store8 directory (the old
# entry is removed first) and work there.
ssh hgwdev
mkdir -p /cluster/store8/rgd/human12062005
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human12062005 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl

# download data files from RGD

wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/rgd_human_qtl_12062005.gff

# remove extra line feed character at the end of lines

rmLf rgd_human_qtl_12062005.gff > rgdQtl.gff

# create rgdQtl.tab
# (whitespace-split fields 1, 4, 5 and 10 of the GFF; "Chr" becomes "chr",
# and quotes, "RGD:" and semicolons are stripped; duplicates are dropped)
awk '{print $1"\t"$4"\t"$5"\t"$10}'  rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' |sort -u > rgdQtl.tab

# create rgdQtlLink.tab
# (tab-separated column 9 of the GFF is split into tab-separated pieces at
# the "Note" markers; the second piece is then moved to the front and
# duplicates are dropped)

cat rgdQtl.gff |cut -f 9 |sed -e 's/; Note /\t/g'|\
sed -e 's/Alignment //' |sed -e 's/;Note /\t/' |\
sed -e 's/"//g' |sed -e 's/RGD://' >j.tmp
cut -f 2 j.tmp >j.1
cut -f 1,3 j.tmp >j.2
paste j.1 j.2 |sort -u >rgdQtlLink.tab
rm j.1 j.2 j.tmp

# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab

# check rgdQtl table
checkTableCoords hg17 rgdQtl

# Got the following error messages:

#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end < start.

# Clip every rgdQtl record that runs past the end of its chromosome.
# The query writes one line per offending record of the form
#   do1 <name> <chromSize>
# into the file doall; do1 is a one-line script that sets chromEnd to its
# second argument for the record named by its first argument.
hgsql hg17 -N -e 'select "do1", name, c.size from rgdQtl r, chromInfo c where chromEnd > c.size and r.chrom=c.chrom' >doall

cat << '_EOF_' > do1
hgsql hg17 -e "update rgdQtl set chromEnd = '${2}' where name='${1}'"
'_EOF_'

chmod +x do*
doall

checkTableCoords hg17 rgdQtl
#hg17.rgdQtl has 1 records with end < start.

hgsql hg17 -e 'select * from rgdQtl where chromEnd < chromStart'
# bin     chrom   chromStart      chromEnd        name
# 9       chr10   7135612 371019  BW63_H
# 0       chr20   77628133        5242324 AASTH39_H

# Don't know why checkTableCoords only catches one of the two errors.
# Swap start and end for BW63_H.
# NOTE(review): the query output above shows 371019 for BW63_H, but the
# update below sets chromStart to 271019 -- confirm which value is intended.
hgsql hg17 -e "update rgdQtl set chromStart = 271019 where name='BW63_H'"
hgsql hg17 -e "update rgdQtl set chromEnd = 7135612 where name='BW63_H'"

# Delete the following record.  The RGD QTL is very questionable.
hgsql hg17 -e "delete from rgdQtl where name='AASTH39_H'"

# load rgdQtlLink table
# (drop, recreate from rgdQtlLink.sql, then load rgdQtlLink.tab)
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'

# Delete the record from rgdQtlLink table that was removed from the rgdQtl
# table above. (hartera, 2006-06-30)
hgsql hg17 -e "delete from rgdQtlLink where name='AASTH39_H'"

########################################################################

#########################################################################
#Reload omimTitle table			Belinda Giardine June 28, 2006
#fetched omim.txt.Z from OMIM downloads.
#parse out title lines (*FIELD* TI)
convertTitle < omim.txt > omimTitle.txt
#load into omimTitle table
#(the two statements below are SQL typed inside an hgsql session; the table
# is emptied and then refilled from omimTitle.txt)
#NOTE(review): the database is not shown here -- presumably hg17 at this
#date; confirm.
truncate table omimTitle;
load data local infile "omimTitle.txt" into table omimTitle;

#############################################################
# Lift SV track from hg16 (Heather, July 2006)
# hg16 SV track is comprised of 7 subtracks:
# cnpFosmid, cnpSebat, cnpIafrate, cnpSharp, delConrad, delMccarroll, delHinds
# Use the same table formats as hg16; pre-create
# (No bin for del tables)

cd /cluster/data/hg17/bed
mkdir svMixed
cd svMixed

# I got hg17 coords from Andy Sharp for cnpFosmid and delHinds
trimFosmid.pl < cnpFosmid.txt > cnpFosmid.bed
hgLoadBed -tab hg17 cnpFosmid cnpFosmid.bed
hinds.pl < hinds.txt > delHinds.bed
hgLoadBed -tab -noBin hg17 delHinds delHinds.bed

# (7-27-2006 Brooke Rhead -- edited the cnpFosmid table)
# According to Andy Sharp, the name='Gap' items should be removed from
# cnpFosmid.  I dumped the table, removed the 'Gap' lines, then dumped the
# table again.

cd /cluster/data/hg17/bed/svMixed
# keep a copy of the table as loaded, including the 'Gap' items
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withGaps.bed

# remove the 'Gap' items (given with -e so the step can be pasted as-is
# instead of needing an interactive hgsql session)
hgsql hg17 -e "delete from cnpFosmid where name='Gap'"

# dump again so the before/after files can be compared
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withoutGaps.bed

# Simple lifts for delMccarroll
# Written to its own script file: the delConrad script below is saved as
# liftConrad.csh and would overwrite this one if it used the same name.
cat << '_EOF_' > liftMccarroll.csh
#!/bin/csh
hgsql -N -e 'select * from delMccarroll' hg16 > delMccarroll.hg16
liftOver -minMatch=0.7 delMccarroll.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delMccarroll.bed delMccarroll.err
hgLoadBed -sqlTable=delMccarroll.sql -tab -noBin hg17 delMccarroll delMccarroll.bed
'_EOF_'

# Lift both chromStart/chromEnd and thickStart/thickEnd for delConrad and join
cat << '_EOF_' > liftConrad.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.1
hgsql -N -e 'select chrom, thickStart, thickEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.2
liftOver -minMatch=0.7 delConrad.hg16.1 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.1 delConrad.err.1
liftOver -minMatch=0.7 delConrad.hg16.2 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.2 delConrad.err.2
trimConrad.pl < delConrad.tmp.1 > delConrad.trim.1
trimConrad.pl < delConrad.tmp.2 > delConrad.trim.2
sort delConrad.trim.1 > delConrad.sort.1
sort delConrad.trim.2 > delConrad.sort.2
join delConrad.sort.1 delConrad.sort.2 > delConrad.join
awk '{print $2, $3, $4, $1, 1000, $5, $7, $8}' delConrad.join > delConrad.bed
hgLoadBed -sqlTable=delConrad.sql -noBin hg17 delConrad delConrad.bed
'_EOF_'

# Andy Sharp says the Sebat data has already been lifted, so be conservative here
# Create hg16.cnpSebatLiftCandidate that excludes 5 rows that had wild proliferations
cat << '_EOF_' > liftSebat.csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, probes, individuals from cnpSebatLiftCandidate' hg16 > cnpSebat.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpSebat.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSebat.bed cnpSebat.err
hgLoadBed -sqlTable=cnpSebat.sql -tab hg17 cnpSebat cnpSebat.bed
'_EOF_'

# For Andy's data, use bacEndPairs first, then lift the remainder
cat << '_EOF_' > liftSharp.csh
# assumes a copy of hg16.cnpSharp in hg17.cnpSharpHg16Copy
/cluster/home/heather/kent/src/hg/snp/snpLoad/cnpLookup hg17 bacEndPairs cnpSharpHg16Copy cnpSharpLookup.out cnpSharpLookup.lift cnpSharpLookup.log
sed -e 's/Gain and Loss/GainAndLoss/' cnpSharpLookup.lift > cnpSharpLookup.lift.fix
mv cnpSharpLookup.lift.fix cnpSharpLookup.lift
liftOver -minMatch=0.7 -bedPlus=4 cnpSharpLookup.lift /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSharp.bed cnpSharp.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpSharp.bed > cnpSharp.bed.fix
mv cnpSharp.bed.fix cnpSharp.bed
hgLoadBed -tab -sqlTable=cnpSharp.sql hg17 cnpSharp cnpSharpLookup.out
hgLoadBed -tab -oldTable hg17 cnpSharp cnpSharp.bed
'_EOF_'

# For the Iafrate data, the BAC End lookup wasn't good, so just lift
# Create hg16.cnpIafrateLiftCandidate that excludes 2 rows that had wild proliferations
cat << '_EOF_' > liftIafrate.csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, variationType, score from cnpIafrateLiftCandidate' hg16 > cnpIafrate.hg16
sed -e 's/Gain and Loss/GainAndLoss/' cnpIafrate.hg16 > cnpIafrate.hg16.fix
mv cnpIafrate.hg16.fix cnpIafrate.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpIafrate.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpIafrate.bed cnpIafrate.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpIafrate.bed > cnpIafrate.bed.fix
mv cnpIafrate.bed.fix cnpIafrate.bed
hgLoadBed -sqlTable=cnpIafrate.sql -tab hg17 cnpIafrate cnpIafrate.bed
'_EOF_'

##############################################################################
# Add HapMap CNVRs from Matt Hurles (Heather Dec 2006)
ssh hgwdev
cd /cluster/data/hg17/bed/svRedon
# File from Matthew Hurles (meh@sanger.ac.uk) was essentially bed 4
# I decided to use bed 6 with score always 0 and strand always +
# Output columns (tab-separated): chrom ($1), start ($4), end ($5),
# "cnp" prefixed to $3, literal score 0, and $7.
awk '{printf "%s\t%d\t%d\tcnp%s\t0\t%s\n", $1, $4, $5, $3, $7}' input.gff > input.bed
hgLoadBed hg17 cnpRedon input.bed

##############################################################################
#  dbRIP POLYALUL1SVA track added (2006-07-14 - DONE - Hiram)
#  dbRIP polyAluL1SVA
#	Data provider: Dr. Liang at the Liang lab:
#	http://falcon.roswellpark.org/index.html
#	Ping.Liang@roswellpark.org

#	Adding this track is a new data type into our browser.
#	data definitions for dbRIP and polyGenotype were added to
#	the hg/lib/ directory:
#	-rw-rw-r--  1  351 Jul 13 12:20 polyGenotype.as
#	-rw-rw-r--  1  694 Jul 13 12:22 polyGenotype.sql
#	-rw-rw-r--  1 6398 Jul 13 12:22 polyGenotype.c
#	-rw-rw-r--  1   980 Jul 10 17:59 dbRIP.as
#	-rw-rw-r--  1 11408 Jul 13 11:16 dbRIP.c
#	-rw-rw-r--  1  1578 Jul 13 12:06 dbRIP.sql
#	With associated .h files in hg/inc/
#	-rw-rw-r--  1 4600 Jul 10 18:00 dbRIP.h
#	-rw-rw-r--  1 4375 Jul 13 16:16 polyGenotype.h
#	Changes in hgTracks and hgc to make this track appear as it does
#	at their browser:
#	http://falcon.roswellpark.org:9090/cgi-bin/hgTables

#	For this first instance of the track, the data was obtained
#	directly from their Genome browser via the tables browser,
#	dumping the tables:
#	hg17.polyAluL1 and hg17.polyGenotype
#	saving these data dumps to:
#		(after a couple of versions were used ...)
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/dbRIP
    cd /cluster/data/hg17/bed/dbRIP
    #	-rw-rw-r--  1  994485 Aug  1 16:03 dbRIP.2006-08-01.txt.gz
    #	-rw-rw-r--  1   18532 Aug  1 16:05 polyGenotype.2006-08-01.txt.gz

    #	Rearrange their data columns to more closely match the
    #	standard BED definitions, and split into three different
    #	data sets:
    #	For each data row: chromStart is input column 6 minus 1,
    #	chromEnd is input column 7 unchanged, the fifth output column
    #	is a literal 0, and the remaining input columns are re-ordered
    #	as listed in the printf arguments.  Only rows whose name
    #	(column 1) begins with RIP_SVA_ are written here; the output
    #	is sorted by column 1, then numerically by column 2.
    #	(headRest 1 presumably drops the header line - confirm.)
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_SVA_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.SVA.txt

zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_L1_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.L1.txt

zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_Alu_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.Alu.txt

    #	Create three specific sql table create definitions:
    sed -e "s/dbRIP/dbRIP_SVA/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_SVA.sql
    sed -e "s/dbRIP/dbRIP_L1/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_L1.sql
    sed -e "s/dbRIP/dbRIP_Alu/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_Alu.sql

    #	And loading those three data tables:
    hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
        -sqlTable=dbRIP_SVA.sql hg17 dbRIP_SVA dbRIP.SVA.txt

    hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
	-sqlTable=dbRIP_L1.sql hg17 dbRIP_L1 dbRIP.L1.txt

    hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
	-sqlTable=dbRIP_Alu.sql hg17 dbRIP_Alu dbRIP.Alu.txt

    #	And an associated table of genotype frequencies
    #	Add three extra columns (sampleSize, alleleFreq, heteroZyg) to the
    #	original data to provide a better handle on MySQL lookups for
    #	allele Frequency
    hgsql hg17 -e "drop table polyGenotype;"
    hgsql hg17 < $HOME/kent/src/hg/lib/polyGenotype.sql
    zcat polyGenotype.2006-08-01.txt.gz | headRest 1 stdin | \
awk -F'\t' '
{
sampleSize = $3 + $4 + $5
plus = ($3 * 2) + $4
minus = ($5 * 2) + $4
if ((plus + minus) < 1) { alleleFreq=0 } else
        { alleleFreq = plus / (plus + minus) }
if (sampleSize > 0) {
heteroZyg = (2 * alleleFreq * (1.0 - alleleFreq)) * ((sampleSize * 2)/((sampleSize * 2) - 1))
} else {
heteroZyg = 2 * alleleFreq * (1.0 - alleleFreq)
}
printf "%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\n", $1, $2, $3, $4, $5, sampleSize, alleleFreq, heteroZyg
}
' > polyGenotype.txt

    hgsql hg17 -e \
	'load data local infile "polyGenotype.txt" into table polyGenotype;'

    #	A composite track was added to human/hg17/trackDb.ra to contain
    #	these three tracks, and search methods to get the name column
    #	participating in the search.  Need to figure out how to get some
    #	of the other text-rich columns participating in the search.

##############################################################################
# hg17 -> hg15 LIFTOVER CHAINS (DONE 7/27/06 Fan)
    # I used a size of 10kb instead of 3kb for the split (blat query) sizes in
    # hg15.  This had a huge effect on the amount of hits in the blat, which
    # then had a huge effect on the amount of chains.  I should also mention 
    # that hg15 chromosomes chr1 and chr2 were split further 
    # into more than a single query file.  This helped a LOT in avoiding 
    # cluster hippos classically associated with those chroms.
    
    ######## LIFTOVER PREPARATION
    # Split up hg15
    ssh pk
    cd /san/sanVol1/scratch/hg15
    mkdir -p liftSplits/{split,lift}
    bash
    for fa in /cluster/data/hg15/?{,?}/*.fa; do
      c=`basename $fa .fa`
      echo $c
      faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
    done
    mkdir -p biggerSplits/split
    cd biggerSplits/
    ln -s ../liftSplits/lift
    cd split/
    ln -s ../../liftSplits/split/* .
    faSplit sequence chr1.fa 5 chr1_
    faSplit sequence chr2.fa 5 chr2_
    rm chr{1,2}.fa

    # Make some dirs
    # cd /san/sanVol1/scratch
    # mkdir -p hg17

    # Copy 11.ooc files to hg17 subdirectory.
    # cp -p /cluster/store5/gs.16/build33/11.ooc hg17

    ## First, copy over scripts. (Already done before)

    # mkdir -p /san/sanVol1/scratch/fan
    # cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan
    # cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan

    ######## LIFTOVER BLATING  

    # HG17
    ssh kk
    cd /cluster/data/hg17
    #makeLoChain-align hg17 /scratch/hg/hg17/nib hg15 /san/sanVol1/scratch/hg15/biggerSplits/split

    makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg15  /san/sanVol1/scratch/hg15/liftOver/biggerSplits/split 

# Completed: 2392 of 2392 jobs
# CPU time in finished jobs:   25651277s  427521.28m  7125.35h  296.89d  0.813 y
# IO & Wait Time:                 74118s    1235.30m    20.59h    0.86d  0.002 y
# Average job time:               10755s     179.25m     2.99h    0.12d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           82545s    1375.75m    22.93h    0.96d
# Submission to last job:         82579s    1376.32m    22.94h    0.96d

    ssh kkstore02

    cd /cluster/data/hg17
    cd bed

    mv blat.hg15.2006-07-25 /san/sanVol1/scratch/hg17 

    ssh pk
    cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/run/
    sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg15"}' > newspec
    para create newspec
    para try
    para push
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs:     612316s   10205.26m   170.09h    7.09d  0.019 y
# IO & Wait Time:                 12421s     207.02m     3.45h    0.14d  0.000 y
# Average job time:                 261s       4.35m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3524s      58.73m     0.98h    0.04d
# Submission to last job:          3588s      59.80m     1.00h    0.04d

    ######## LIFTOVER CHAINING
    # LIFTING
    ssh pk
    cd /san/sanVol1/scratch/fan
    cp mm7SplitLift.sh hg15SplitLift.sh

    # change andy to fan, mm7 to hg15, and chrX to chr2, and remove chrUn_random 
    vi hg15SplitLift.sh

    cat << 'EOF' > hg15ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg15Lifts
pushd /scratch/fan/hg15Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'

    chmod +x hg15ChainMergeSplit.sh

    # HG17
    cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/raw
    /san/sanVol1/scratch/fan/hg15SplitLift.sh
# There was an extra file, nib22.fa, under /cluster/data/hg15/nib, which should not be there.
# -rw-rw-r--  1 2429 protein  50466533 May 20  2003 nib22.fa
# This caused hg15SplitLift.sh to end abnormally.

    cd ../    
    mkdir chainRun chainRaw
    cd chainRun
    cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs  /san/sanVol1/scratch/hg15/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
    para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs:       3596s      59.94m     1.00h    0.04d  0.000 y
# IO & Wait Time:                   919s      15.31m     0.26h    0.01d  0.000 y
# Average job time:                 103s       1.71m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             274s       4.57m     0.08h    0.00d
# Submission to last job:           284s       4.73m     0.08h    0.00d
    ######### CHAINMERGE/NET/NETSUBSET
    ssh kolossus
    mkdir -p /scratch/fan/hg15Lifts
    cd /scratch/fan/hg15Lifts

    cp -r /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/chainRaw/ .
    mkdir chain
    /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.

    cp -rp chain /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/
    rm -rf chain
    rm -rf chainRaw

    ssh pk
    cd /san/sanvol1/scratch/fan
    cat << 'EOF' > netOver.sh 
#!/bin/bash

chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG15=/cluster/data/hg15/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over

mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG15 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
    chmod +x netOver.sh

    mkdir netRun

    cd netRun/

    find /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/chain -name "*.chain" \
     | awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' > spec
    para create spec
    para push
    para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        438s       7.30m     0.12h    0.01d  0.000 y
# IO & Wait Time:                   118s       1.97m     0.03h    0.00d  0.000 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              28s       0.47m     0.01h    0.00d
# Submission to last job:            67s       1.12m     0.02h    0.00d

    ########## FINISHING
    ssh hgwdev

    # HG17
    cd /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/over
    cat * >> ../hg17ToHg15.over.chain
    cd ../
    rm -rf psl/ net/ chain/ chainRaw/ over/
    cd ../
    cp -rp blat.hg15.2006-07-25/ /cluster/data/hg17/bed

    cd /cluster/data/hg17/bed
    ln -s blat.hg15.2006-07-25 blat.hg15

    #	Compress the chain file before creating any links to it: a symlink
    #	made to the uncompressed hg17ToHg15.over.chain is left dangling
    #	once gzip replaces that file with hg17ToHg15.over.chain.gz.
    gzip /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain

    #	Make the compressed chain available under /gbdb and keep a copy
    #	in the bed/liftOver directory.
    ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /gbdb/hg17/liftOver/
    cp -p /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /cluster/data/hg17/bed/liftOver

    #	Link into the goldenPath download area and register the chain.
    mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
    ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz hg17ToHg15.over.chain.gz
    hgAddLiftOverChain hg17 hg15

############################################################################

############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-08-15 markd)
    cd /cluster/data/genbank/data/ccds/hg17
    ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
    ftp> get CCDS.20060815.tar.gz
    mkdir /scratch/tmp/ccds
    cd /scratch/tmp/ccds
    tar -zxf  /cluster/data/genbank/data/ccds/hg17/CCDS.20060815.tar.gz

    # import ccds database tables
    hgsql -e 'create database ccds'
    hgsql ccds </cluster/data/genbank/etc/createTables.sql 
    hgsql ccds </cluster/data/genbank/etc/createKeys.sql 
    /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt

    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
    checkTableCoords hg17 -verbose=2 ccdsGene

    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
    rm -rf /scratch/tmp/ccds

    # 2006-08-23 - found bug with some source genes missing from ccdsInfo, fixed ccdsMkTables
    # and reload
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
    checkTableCoords hg17 -verbose=2 ccdsGene
    joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
    # request push of 
    #   ccdsGene
    #   ccdsInfo
    #   ccdsKgMap

    # << emacs
##########################################################################
#  hars 1 to 202  Sol 09/10/2006

    set bedDir = /gbdb/hg17/haseq/bed
    mkdir -p $bedDir/hars
    pushd /projects/hg/wet/Sol/hars1to49
    cp -p hars_1to202.hg17.bed  $bedDir/hars/hars_1to202.bed
    hgLoadBed hg17 hars         $bedDir/hars/hars_1to202.bed
    rm -f                       $bedDir/hars/hars_1to202.bed
    popd

# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06)

# First, build hprdToCdna.tab and hprdToUniProt.tab. 
# See hg18.txt for details.

    cd ~/data/hprd
    mkdir hg17
    cd hg17

    hgsql hg17 -e 'drop table hprdToCdna'
    hgsql hg17 <~/src/hg/lib/hprdToCdna.sql
    hgsql hg17 -e 'load data local infile "../hprdToCdna.tab" into table hprdToCdna'

    hgsql hg17 -e 'drop table hprdToUniProt'
    hgsql hg17 <~/src/hg/lib/hprdToUniProt.sql
    hgsql hg17 -e 'load data local infile "../hprdToUniProt.tab" into table hprdToUniProt'

# build knownToHprd table

    hgsql hg17 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1
    hgsql hg17 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2

    cat j.kg1 j.kg2 |sort -u >knownToHprd.tab
    wc knownToHprd.tab

    hgsql hg17 -e 'drop table knownToHprd'
    hgsql hg17 <~/src/hg/lib/knownToHprd.sql

    hgsql hg17 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
    hgsql hg17 -e 'select count(*) from knownToHprd'

# 19,345 records created.

# remove temporary files.

    rm j*

# The same procedure was done for hg18.  See hg18.txt for details.


############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-09-20 markd)
# Reloaded due to bug that results in multiple versions of the same accession
# in the ccdsInfo table.

    cd /cluster/data/genbank/data/ccds/hg17
    ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
    get CCDS.20060920.tar.gz
    mkdir /scratch/tmp/ccds
    cd /scratch/tmp/ccds
    tar -zxf  /cluster/data/genbank/data/ccds/hg17/CCDS.20060920.tar.gz

    # import ccds database tables
    hgsql -e 'drop database ccds; create database ccds'
    hgsql ccds </cluster/data/genbank/etc/createTables.sql 
    hgsql ccds </cluster/data/genbank/etc/createKeys.sql 
    /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt

    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
    checkTableCoords hg17 -verbose=2 ccdsGene
    joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
    rm -rf /scratch/tmp/ccds

    # request push of 
    #   ccdsGene
    #   ccdsInfo
    #   ccdsKgMap

    # << emacs


#########################################################
# BUILD GAD TRACK (Done, 9/26/06, Fan)

   mkdir /cluster/store/gad060926
   ln -s /cluster/store/gad060926 /cluster/data/gad

# Receive "allxlsAStxt.txt" from GAD/NIA 
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov  
   
   hgsql hg17 -e 'drop table gadAll'
   hgsql hg17 <~/src/hg/lib/gadAll.sql
   hgsql hg17 -e 'load data local infile "allxlsAStxt.txt" into table gadAll ignore 2 lines'
   hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'

# create gad table

   hgsql hg17 -N -e \
   'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0'|\
   sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed

   hgLoadBed hg17 gad gad.bed

#####################################################################
# YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED 
# USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-20, hartera)
# Data is from the paper: Bertone et al. Science 24 December 2004:
# Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale.
# Contact at Yale: Joel S. Rozowsky, joel.rozowsky@yale.edu
# The data consist of Transcriptionally Active Regions (TARs or TransFrags)
# found using Affymetrix genome tiling arrays. The data is from the lab
# of Mark Gerstein at Yale.
     ssh kkstore02
     mkdir /cluster/data/hg17/bed/yaleBertoneTars/
     cd /cluster/data/hg17/bed/yaleBertoneTars/
     # download Bertone et al. data from this URL:
    #http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt
     # and put it in this directory.
     # The sequences used to design the microarrays were from
     # UCSC hg13/NCBI Build 31 so the sequences
     # should be aligned again using Blat since this is probably better
     # than using liftOver across so many assemblies.

     # Get sequences from TARs file and put in FASTA format:
     # Remove characters from Windows:
     dos2unix TAR_data_NCBI31.txt
     # The TARs are in order of IDs in the file so the first TAR has ID 1, the
     # second is 2 up to the last which is 17517. These IDs are used to link
     # to the DART database of TARs at Yale so use these IDs in the FASTA
     # header lines. Need to add "TAR" as prefix to ID so that it is unique
     # in the seq table.
   awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \
         TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa
     ssh pk
     mkdir -p /san/sanvol1/scratch/hg17/TARs/
     cp /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
        /san/sanvol1/scratch/hg17/TARs/
     # Set up to Blat the TAR sequences against hg17
     cd /cluster/data/hg17/bed/yaleBertoneTars
     ls -1 /san/sanvol1/scratch/hg17/TARs/yaleBertoneTARSeqs.fa > tars.lst
     ls -1 /san/sanvol1/scratch/hg17/nib/*.nib > genome.lst
     # output dir
     mkdir psl

     cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg17/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << for emacs
     gensub2 genome.lst tars.lst template.sub para.spec
     para create para.spec
     para try, para check, para push ...
     para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs:        429s       7.16m     0.12h    0.00d  0.000 y
# IO & Wait Time:                   153s       2.54m     0.04h    0.00d  0.000 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              38s       0.63m     0.01h    0.00d
# Submission to last job:           107s       1.78m     0.03h    0.00d
     # sort and then filter
     pslSort dirs raw.psl tmp psl
     # use these parameters as for Genbank alignments of native mRNAs
     # for finished assemblies.
     pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \
       -minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \
       raw.psl yaleBertoneTars.psl
#                         seqs    aligns
#             total:     17512   37530
# drop minNonRepSize:     121     254
#     drop minIdent:     3827    14532
#     drop minCover:     571     897
#       weird over:     232     837
#        kept weird:     197     201
#    drop localBest:     2359    3896
#              kept:     17498   17951
     
     # 99.9% were kept. 
     # check how many aligned
     grep '>' yaleBertoneTARSeqs.fa | wc -l
     # 17517
     # 99.89% of the original set of sequences are in this filtered PSL file.
     pslCheck yaleBertoneTars.psl
     # psl is ok

     # load into database
     ssh hgwdev
     cd /cluster/data/hg17/bed/yaleBertoneTars
     hgLoadPsl hg17 yaleBertoneTars.psl

     # Add sequences to /gbdb/hg17 and to seq and extFile tables.
     mkdir -p /gbdb/hg17/yaleTARs/
     ln -s /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
           /gbdb/hg17/yaleTARs/
     hgLoadSeq hg17 /gbdb/hg17/yaleTARs/yaleBertoneTARSeqs.fa
    
     # trackDb.ra entry is in trackDb/human/trackDb.ra and 
     # a description exists already as this track is also on hg18.

######################################################################
## reload tfbsCons table - it was based on a newer version of tfbs names that
#	are not yet public domain (2006-11-03 - Hiram)
    mkdir /cluster/data/hg17/bed/tfbsCons
    cd /cluster/data/hg17/bed/tfbsCons
    cp -p /cluster/store6/weirauch/TFLOC/hg17/tfbsConsSites.bed .
    hgLoadBed -strict hg17 tfbsConsSites \
	-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
	tfbsConsSites.bed -tab
# this leads to a bunch of extra names in Factors
    hgsql -N -e "select name from tfbsConsSites;" hg17 | sort -u > names.new
    hgsql -N -e "select name from tfbsConsFactors;" hg17 \
	| sort -u > names.factors
    #	comm -13 keeps only the lines unique to the second file, i.e. the
    #	factor names that no longer occur in tfbsConsSites.
    comm -13 names.new names.factors > names.extra.factors
#	Delete each of those extra names from tfbsConsFactors, echoing the
#	statement first so there is a record of what was removed.
#	The loop must read the same file that comm just wrote.
for N in `cat names.extra.factors`
do
        echo "delete from tfbsConsFactors where name=\"${N}\";" hg17
        hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg17
done

#########################################################
# BUILD GAD TRACK (Re-Re-Done, 12/12/06, Fan)

   mkdir /cluster/store12/gad061211
   rm /cluster/data/gad
   ln -s /cluster/store12/gad061211 /cluster/data/gad

# Receive "GAD-Hg17DATA.txt" from GAD/NIA 
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov  
   
   hgsql hg17 -e 'drop table gadAll'
   hgsql hg17 <~/src/hg/lib/gadAll.sql
   hgsql hg17 -e 'load data local infile "GAD-Hg17DATA.txt" into table gadAll ignore 1 lines'
   hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'

# create gad table

   hgsql hg17 -N -e \
   'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll
   where chromStart <>0 and chromosome<>""'|\
   sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed

   hgLoadBed hg17 gad gad.bed

##########################################################################
#  xxBlastTab - Help filter out unwanted paralogs  (Galt 2007-01-11)
#
# Background: The xxBlastTab tables are made with a simple blastall 
# (blastp with -b 1) which chooses the best match.  Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage. 
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking $db.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly.  Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between 
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# I created a new utility script called synBlastp.csh since I have to do this
# several times. 
#
# we want to update hg17 for rat and mouse, 
# so check ./hgGeneData/Human/hg17/otherOrgs.ra for current settings

ssh hgwdev

synBlastp.csh hg17 rn3
#hg17.rnBlastTab results:
#new number of unique query values:
#10728
#new number of unique target values
#5177
#old number of unique query values:
#24030
#old number of unique target values
#5535

synBlastp.csh hg17 mm7
#new number of unique query values:
#25506
#new number of unique target values
#13462
#old number of unique query values:
#32951
#old number of unique target values
#14803



#####################################################################
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg17

############
# UPDATE hg17 knownToVisiGene (DONE galt 2007-02-15)
# Create table that maps between known genes and visiGene database 
    # mapping to other species such as mouse, zebrafish, frog
    # requires visiGene probe track vgImageProbes be created first
    knownToVisiGene hg17 -fromProbePsl=vgImageProbes

#########################################################
# Chimp Paralogy data from Eichler's lab (DONE Heather Feb. 2007)
cd /cluster/data/hg17/bed/eichler
hgLoadBed hg17 chimpParalogy chimpParalogy.bed -tab -sqlTable=chimpParalogy.sql

############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)

    cd /cluster/data/genbank/data/ccds/
    ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
    get CCDS.20070228.tar.gz
    mkdir /scratch/tmp/ccds
    cd /scratch/tmp/ccds
    tar -zxf /cluster/data/genbank/data/ccds/CCDS.20070228.tar.gz

    # import ccds database tables
    /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt

    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
    checkTableCoords hg17 -verbose=2 ccdsGene
    joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner

    # build initial version of ccdsMgcMap table, updated by nightly  genbank update
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg17 ccdsGene mgcGenes ccdsMgcMap
    
    # load trackDb
    cd kent/src/hg/makeDb/trackDb
    make alpha
    # check in browser

    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
        ccdsMgcMap
    # << emacs

#####################################################
# Vista Enhancers (galt 2007-02-23 done)
#
# Vista from Lawrence-Berkeley has assayed
# 301 human conserved non-coding intra- and inter-
# genic elements for their ability to promote
# lacZ in mouse embryos.  A positive looks like
# a mouse with a purple spine.
#
# They provided a custom track with two tracks for pos and neg.
# http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi
# I am combining the tracks into one with high score for pos.
#
cd /cluster/data/hg17/bed
mkdir vistaEnhancers
cd vistaEnhancers
wget -O custTrk "http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi"
# custTrk holds the pos and neg tracks back to back.  Lines 1 and 117 are
# skipped (presumably the two "track" header lines -- confirm if the
# download changes): lines 2-116 go to pos, lines 118-end go to neg.
# Use the POSIX "-n +N" form; the obsolete "tail +N" spelling is treated
# as a file name by POSIX-2001 conforming tail.
head -n 116 custTrk | tail -n +2 > pos
tail -n +118 custTrk > neg
# bed 5: positives score 900, negatives score 200
gawk '{print $1"\t"$2"\t"$3"\t"$4"\t900"}' pos > bed5
gawk '{print $1"\t"$2"\t"$3"\t"$4"\t200"}' neg >> bed5
wc -l bed5
#301 bed5
hgLoadBed hg17 vistaEnhancers bed5
#Loaded 301 elements of size 5

# add to human/trackDb.ra
track vistaEnhancers
shortLabel Vista Enhancers
longLabel Vista HMR-Conserved Non-coding Human Enhancers from LBNL
group regulation
priority 93
visibility hide
color 50,70,120
type bed 5 .
useScore 1
url http://enhancer-test.lbl.gov/cgi-bin/imagedb.pl?form=presentation&show=1&experiment_id=$$

###
# UPDATES (2007-10-18, conodera)
# see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile

cd /projects/compbiousr/wet/browser/vista_enhancer/

# download data file from the vista browser (coordinates are for hg17)
# http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# save as enhancerbrowser.datadownload.txt

# give elements with positive label a score of 900, 
# give elements with negative label a score of 200.
# print to 5-field bed file
vista_enhancer.hg17.txt: enhancerbrowser.datadownload.txt
        grep ">" $< \
        | sed -e 's/>//' \
        | tr :- '       ' \
        | sed -e 's/positive/900/'\
        | sed -e 's/negative/200/' \
        | awk '{print $$1"\t"$$2"\t"$$3"\telement_"$$6"\t"$$8}' \
        > $@; \
hgLoadBed hg17 vistaEnhancers vista_enhancer.hg17.txt;
# loaded 446 elements of length 5


#########################################################################
# EPONINE-TSS (TRANSCRIPTION START SITE) PREDICTION 
# (DONE, 2007-03-08, hartera)
# The Eponine software is version 2 and has not changed in several years
# (contact: Thomas Down at Sanger, td2@sanger.ac.uk). The version downloaded
# for hg16 should be the same as the current version but download again just
# to check. The application includes the TSS model file: eponine-tss2.xml 

     ssh kkstore02
     # Eponine runs fine on 2.5Mb contig, but barfs on much larger contig;
     # chop up sequence at gaps into ~2.5Mb chunks for cluster run.
     mkdir /san/sanvol1/scratch/hg17/chunks
     cd /cluster/data/hg17
     foreach f (?{,?}/NT_*/NT_??????.fa)
       set ctg = $f:t:r
       /cluster/bin/x86_64/faSplit -minGapSize=10 \
        -lift=/san/sanvol1/scratch/hg17/chunks/${ctg}.lft \
        gap $f 2500000 /san/sanvol1/scratch/hg17/chunks/${ctg}.chunk
     end
     # seems to ignore the chunk part of the file name
     mkdir /cluster/data/hg17/bed/eponine
     cd /cluster/data/hg17/bed/eponine
     wget --timestamping \
       http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
     # file has the same date and same size as the one downloaded for hg16
     # the script requires all of the path setting found in my .tcshrc file.
     # Using only set path = (/usr/java/jre1.5.0_06/bin $path)
     # as in the doEpo file for hg16 does not work.
     cat << '_EOF_' > doEpo
#!/bin/csh -ef
set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \
             /usr/local/bin . /cluster/home/hartera/bin/x86_64 \
             /cluster/bin/x86_64 /projects/compbio/bin/x86_64 \
             /projects/compbio/bin /projects/compbio/bin/x86_64-linux \
             /cluster/bin/scripts)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
     # << emacs
     chmod a+x doEpo
     cp /dev/null jobList
     foreach f (/san/sanvol1/scratch/hg17/chunks/NT*.fa)
        echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
      >> jobList
     end
     mkdir out
     ssh pk
     cd /cluster/data/hg17/bed/eponine
     /parasol/bin/para create jobList
     /parasol/bin/para try, check, push, check etc.....
     /parasol/bin/para time
# Completed: 1415 of 1415 jobs
# CPU time in finished jobs:     104501s    1741.68m    29.03h    1.21d  0.003 y
# IO & Wait Time:                  6594s     109.91m     1.83h    0.08d  0.000 y
# Average job time:                  79s       1.31m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             127s       2.12m     0.04h    0.00d
# Submission to last job:           488s       8.13m     0.14h    0.01d
     # lift chunks -> contigs
    mkdir contigs/
    foreach l (/san/sanvol1/scratch/hg17/chunks/*.lft)
      set ctg = $l:t:r
      liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
    end
    # lift contigs -> chrom
    liftUp eponine.gff /cluster/data/hg17/jkStuff/liftAll.lft \
           warn contigs/NT_*.gff
    # Translate to bed 4 + float-score -- it would be a shame to lose 
    # those scores in genePred or bed 5 (int score)
    awk 'BEGIN {i=0;} \
         {printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
          i = i + 1;}' \
      eponine.gff > eponine.bed
    # load up
    ssh hgwdev
    cd /cluster/data/hg17/bed/eponine
    sed -e 's/bed6FloatScore/eponine/g' \
      $HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
    hgLoadBed hg17 eponine eponine.bed -tab -sqlTable=eponine.sql
    # Loaded 61013 elements of size 6
    # trackDb.ra entry and eponine.html already exist in trackDb directory.

###########################################################################
# ACEScan Track (DONE 2007-03-15  Andy)

ssh hgwdev
cd /cluster/data/hg17/bed
mkdir acescan
cd acescan/
cp ~/acescan.gff .
# drop the first line of the GFF (presumably a header line) before loading;
# "-n +2" is the POSIX spelling of the obsolete "tail +2"
tail -n +2 acescan.gff > acescan.nh.gff
# NOTE(review): -out=gp looks like it writes the genePred to a file named
# "gp", but the load below reads acescan.hg17.gp, which no step shown here
# creates -- confirm whether -out=acescan.hg17.gp was intended.
ldHgGene -out=gp hg17 acescan acescan.nh.gff
rm *.gff
ldHgGene -predTab hg17 acescan acescan.hg17.gp

###########################################################################
# augustusHints track (DONE 2007-4-5 Mario)

# Fetch the AUGUSTUS hint-based predictions (GFF + peptides) and load them
# as the augustusHints gene table and augustusHintsPep peptide table.
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
# NOTE(review): this URL ends in "R.pep.aa" while hgPepPred below reads
# "R.Xp.pep.aa"; one of the two names is presumably a typo -- confirm.
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.pep.aa
ldHgGene -bin hg17 augustusHints augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
hgPepPred hg17 generic augustusHintsPep augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa

###########################################################################
# augustus de novo track (DONE 2007-4-5 Mario)

mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.gff
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.pep.aa
ldHgGene -bin hg17 augustusXRA augustus.hg17.Xp.RA.it.gff
hgPepPred hg17 generic augustusXRAPep augustus.hg17.Xp.RA.it.pep.aa


###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)

ssh hgwdev
mkdir /cluster/data/hg17/bed/switchDbTss
cd /cluster/data/hg17/bed/switchDbTss
# (obtained from Nathan Trinklein <nathant@switchgeargenomics.com>)
cp ~/all_tss_switchdb_psgene.gz .
gunzip all_tss_switchdb_psgene.gz
cat << "EOF" > reformat.awk
BEGIN{FS="\t"}
{
if (NR > 1)
   {
   if ($9 !~ "^PSEUDO.*")
       {
       pseudo = "none";
       }
   else
       {
       pseudo = $9;
       }
   printf("%s\t%d\t%d\t%s\t1000\t%s\t%s\t%s\t%s\t%s\t%s\n", $2, $8, $8+1, $6, $5, $7, $1, $3, $4, pseudo); 
   }
}
EOF
awk -f reformat.awk all_tss_switchdb_psgene > switchDbTss.bed
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg17 switchDbTss switchDbTss.bed

############################################################################
# enable ORFeome track build.  (markd 2007-05-02)
    cd ~/kent/src/hg/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add 
        hg17.orfeomeTables.hgwdev = yes
        hg17.orfeomeTables.hgwbeta = yes
    # will need to enable for rr later.  In the future, this can just be enabled
    # as part of the normal genbank build.  Change above to:
        hg18.orfeomeTables.default = yes
    
###########################################################################
# Transcriptome Phase 3 tracks (Andy 2007-06-10)

ssh hgwdev
bash
cd /san/sanVol1/scratch/andy
mkdir transcriptome
cd transcriptome/
cp /var/ftp/encode/Affy_transcriptome_phase3.tar .
tar xfv Affy_transcriptome_phase3.tar 
find . -name '*.bz2' -exec bunzip2 '{}' \;
cat > processWig.sh << "EOF"
#!/bin/bash
# usage: processWig.sh path/to/NAME.sig.wig
# Splits the first line of the input (saved as TABLE.sig.track.txt) from the
# remaining lines, runs wigEncode on the remainder in a scratch directory,
# and copies TABLE.* back next to the input, where
# TABLE = affyTxnPhase3NAME.

theDir=$(dirname "$1")
theFile=$(basename "$1")
table=affyTxnPhase3${theFile%.sig.wig}
tmp=/scratch/tmp/trans3rdPhase.$$

# bail out rather than write into (and later rm -rf) the wrong directory
mkdir "$tmp" || exit 1
cp "$1" "$tmp"
pushd "$tmp" || exit 1
head -n1 "$theFile" > "$table.sig.track.txt"
# "-n +2" is the POSIX spelling of the obsolete "tail +2"
tail -n +2 "$theFile" > tmp; mv tmp "$theFile"
wigEncode "$theFile" "$table.wig" "$table.wib"
popd
cp "$tmp"/${table}.* "$theDir"
rm -rf "$tmp"
EOF
chmod +x processWig.sh
cat > gsub << "EOF"
#LOOP
./processWig.sh {check in line+ $(path1)} {check out exists $(dir1)/$(root1).track.txt}
#ENDLOOP
EOF
find . -name '*.sig.wig' > wig.lst
gensub2 wig.lst single gsub spec
ssh pk
cd /san/sanVol1/scratch/andy/transcriptome
para create spec
para push
exit
cd /cluster/data/hg17/bed
# -p is needed: transcriptome3rdPhase itself is not created by any earlier
# step in this section, so a plain mkdir of the three subdirectories fails.
mkdir -p transcriptome3rdPhase/{wig,wib,bed}
cd transcriptome3rdPhase/wib/
cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wib .
pushd /gbdb/hg17/wib
ln -s /cluster/data/hg17/bed/transcriptome3rdPhase/wib/* .
popd
cd ../wig/
cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wig .
for f in *; do
   hgLoadWiggle hg17 ${f%.wig} $f
done
cd ../bed
# For each transfrag file: decompress, drop its first line, and load the
# rest as table affyTxnPhase3Frags<NAME>, where <NAME> is the file name
# without its .bz2 and .bed suffixes.
for f in /san/sanVol1/scratch/andy/transcriptome/transfrags/human_{long,short}_rna/*; do
   newName=$(basename "$f")
   newName=${newName%.bz2}
   # "-n +2" is the POSIX spelling of the obsolete "tail +2"
   bzcat "$f" | tail -n +2 > "$newName"
   tab=affyTxnPhase3Frags${newName%.bed}
   hgLoadBed hg17 "$tab" "$newName"
done

#######################################################################
# CLEANUP OF DANRER1 BLASTZ SWAP (DONE, 2007-06-25, hartera)
   ssh kkstore02
   cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
   rm -r run1
   cd ../mafNet.new
   gzip *.maf
   # we don't tend to keep the Blastz PSLs anymore and this is an old 
   # zebrafish assembly so remove these.
   cd ../
   rm -r pslChrom
   # this removed 1.2 G of data.

#######################################################################
# CLEANUP OF ACEMBLY_050217 DATA (DONE, 2007-06-25, hartera)
   ssh kkstore02
   cd /cluster/store5/gs.18/build35/bed/acembly_050217
   rm GeneCheck.out GeneCheck2 acembly acembly.chk acembly.details \
      chrom.out genePred.tab hg16.name hg16Pep.name
   cd acembly_gene_lists
   rm test transcripts.names *.bak main_gene.list.IDsort mp.ids mp.sort ptest \
      maintest gid.tab gid.tab.sort genesGffs.ids genesGffs.ids.uniq 
   cd ../
   # remove fasta files as included in gzipped tar file
   rm -r acembly.ncbi_35.genes.proteins.fasta 
   cd acembly.ncbi_35.genes.gff
   gzip *.gff 

#######################################################################
# CLEANUP OF DANRER2 BLASTZ SWAP (DONE, 2007-06-25, hartera)
   ssh kkstore02
   cd /cluster/store5/gs.18/build35/bed/blastz.danRer2.2004-12-08
   # remove old axtChrom directory
   rm -r axtChrom.orig
   cd axtChain
   # chain directories can be recreated from all.chain files so remove
   rm -r chain chainAR
   # gzip net files
   gzip net/*.net
   # gzip .over files
   gzip over/*.over
   # removed ~1.3 G data 

#############################################################################
# Duke DNaseI HS (2007-06-26 kate)
#
#  Submitted by Terry Furey <terry.furey@duke.edu>
#  in collaboration with Greg Crawford
# Resubmitted 9/26/07 from FTP site
# Resubmitted 10/25/07 from FTP site

    ssh kkstore02
    cd /cluster/data/hg17/bed

    # download 19GB archive from Duke site, password protected,
    # user=ucsc, password=dnase
    mkdir -p dukeDnase/2007-10-25/lab
    cd dukeDnase/2007-10-25/lab

    sftp ucsc@sftp.igsp.duke.edu
    mget *
    # dukeDnaseHsCd4.bed
    # dukeDnaseHsCd4Wiggle.tgz

    # unpack and load wiggle (signal) data
    nice tar xvfz dukeDnaseHsCd4Wiggle.tgz
    # packaged as chr*_dukeDnaseHsCd4Wiggle.out
    # fixedStep 1 files

    # create wiggle and load into database
    cd ..
    cat lab/chr*.out | nice wigEncode stdin \
        dukeDnaseCd4Signal.wig dukeDnaseCd4Signal.wib >&! wigencode.log &
        # upper limit 25.74, lower limit -0.66

    ssh hgwdev
    cd /cluster/data/hg17/bed/dukeDnase/2007-10-25
    rm -f /gbdb/hg17/wib/dukeDnaseCd4Signal.wib
    ln -s /cluster/data/hg17/bed/dukeDnase/2007-10-25/dukeDnaseCd4Signal.wib \
            /gbdb/hg17/wib
    nice hgLoadWiggle hg17 dukeDnaseCd4Signal -pathPrefix=/gbdb/hg17/wib \
                dukeDnaseCd4Signal.wig

    # load bed file (sites)
    ssh hgwdev
    cd /cluster/data/hg17/bed/dukeDnase/2007-10-25/
    set table = dukeDnaseCd4Sites
    sed "s/bed5FloatScore/$table/" ~/kent/src/hg/lib/bed5FloatScore.sql > \
            $table.sql
    hgsql hg17 -e "DROP TABLE IF EXISTS $table"
    hgsql hg17 < $table.sql
    hgLoadBed -sqlTable=$table.sql hg17 $table lab/dukeDnaseHsCd4.bed
        # Loaded 95723 elements of size 6
        # min value:  0.000103164
        # max value: 25.7442
    #textHistogram -col=5 lab/dukeDnaseHsCd4.bed -binSize=50 
    300 ******************************** 11789
    350 ************************************************************ 22253
    400 ********************************************* 16854
    450 ********************************* 12333
    500 ************************* 9179
    550 ********************* 7870
    600 ************* 4987
    650 ********* 3271
    700 ******** 2789
    750 ****** 2153
    800 **** 1303
    850 ** 567
    900 * 219
    950  85
    1000  71


###########################################################################
# Stanford ChIP-seq (Apr - July 2007, Heather)
# Submitted 2007-03-14 by David Johnson <seasquirtdoctor@gmail.com>
# 25bp tags (Solexa sequencing of IP fragments)
# genome-wide, but funded by ENCODE, hence the location of the data

    ssh hgwdev
    cd /cluster/data/encode/stanford
    mkdir -p 2007-03-14/lab
    cd 2007-03-14/lab
    sort NRSF_chipseq_hg17.bed > data.bed
    sort NRSF_chipseq_control_hg17.bed > control.bed
    fix.pl < data.bed > fix.bed
    fix.pl < control.bed > control_fix.bed
    hgLoadBed hg17 stanfordNRSFEnriched fix.bed -tab
    hgLoadBed hg17 stanfordNRSFControl control_fix.bed -tab

############################################################################
# Stanford ChIP/chip
# Submitted 2007-07-11 by David Johnson (seasquirtdoctor@gmail.com)
# Replaces submission from 2007-03-23
# 12 subtracks
# genome-wide, but funded by ENCODE, hence the location of the data

    ssh hgwdev
    cd /cluster/data/encode/stanford/2007-07-11/lab
    # Dave gave us bed 5, we need bed 4
    ./shrink.sh
    ./load.sh

#########################################################################
# REGULATORY POTENTIAL 7X UPDATED (DONE - 2007-08-01 - Hiram)
    #	download data from "James Taylor" <james@bx.psu.edu>
    ssh kkstore02
    mkdir /cluster/data/hg17/bed/regPotential7X.update
    cd /cluster/data/hg17/bed/regPotential7X.update
    ## In theory, only chr4, chr8, chr9 and chrY have updated, fetch them
    ## all and verify with ../regPotential7X
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
echo "DONE - chr${C}.scores.bz2"
done

    #	create download gzip files from the bz2 files:
    time for F in chr*.scores.truncated.bz2
    do
	C=`echo $F | awk -F'.' '{print $1}'`
	echo -n "${C}.regPotential7X.hg17.gz working ... "
	bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz
	echo "done"
    done

    time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
    do
	zcat chr${C}.regPotential7X.hg17.gz
    done | wigEncode -noOverlap  stdin regPotential7X.wig regPotential7X.wib
    #	Converted stdin, upper limit 1.00, lower limit -0.00
    #	real    16m51.215s

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/hg17/bed/regPotential7X.update
    mkdir /gbdb/hg17/wib/061116
    ln -s /cluster/data/hg17/bed/regPotential7X.update/regPotential7X.wib \
	/gbdb/hg17/wib/061116/regPotential7X.wib
    #	using the tmpDir is faster since it is on local disk and it will
    #	clean up any temporary .tab file it creates there
    time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
	-pathPrefix=/gbdb/hg17/wib/061116 hg17 regPotential7X regPotential7X.wig
    #	real    0m40.523s

    #	How about a histogram of the data.
    ssh kolossus
    cd /cluster/data/hg17/bed/regPotential7X.update
    time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \
      -hBinCount=100 -hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
    #	real    3m3.829s
    #	73 % of the data values are zero


    # renaming file directory -- kuhn 08-17-2007
    cd /gbdb/hg17/wib
    mv 061116 regPot061116
    # point every row of the table at the renamed directory; the path is a
    # SQL string literal and must be quoted or the statement will not parse
    hgsql -e "update regPotential7X SET file = \
      '/gbdb/hg17/wib/regPot061116/regPotential7X.wib'" hg17
    # Query OK, 2366123 rows affected (31.46 sec)
    # Rows matched: 2366123  Changed: 2366123  Warnings: 0

###########################################################################
## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram)
    ssh kkstore02
    cd /cluster/data/hg17/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 \
	hg17 /cluster/data/hg17/hg17.2bit 2> /dev/null \
	| gzip > hg17.gc5Base.txt.gz
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
    cd /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
    ln -s /cluster/data/hg17/bed/gc5Base/hg17.gc5Base.txt.gz .

############################################################################
# INDEL-BASED CONSERVATION TRACK (DONE, 2007-10-02 - 2007-10-03, hartera)
# Data from Gerton Lunter (gerton.lunter@anat.ox.ac.uk), MRC 
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
# Lunter G, Ponting CP and Hein J Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
    ssh kkstore02 
    mkdir -p /cluster/data/hg17/bed/consIndels/data
    cd /cluster/data/hg17/bed/consIndels/
    # Add a README.indels with the e-mail from Gerton Lunter, copy over 
    # from hg18 consIndels
    cp /cluster/data/hg18/bed/consIndels/README.indels .
    # get the data
    cd data
    wget --timestamping \
         http://wwwfgu.anat.ox.ac.uk/~gerton/IPS/IPSs.zip
    # 15 Mb zip file in GFF format. This contains data for hg17
    # comparing it to mm5 (NCBI Build 33) and 
    # canFam1 (Broad Institute, July 2004). The chr*.mm5.GFF data is old
    # data that can be removed.
    unzip IPSs.zip    
    cd /cluster/data/hg17/bed/consIndels
    rm ./data/*mm5.GFF
    foreach f (./data/*.GFF)
       set r = $f:r
       echo $r
       grep -v "track" $f > ${r}NoHeader.gff
    end
  
    # strip off the end of the name e.g. IGS0001:p=.26
    # so that the name displayed is short - IGS0001.1. The score field
    # is used to determine colouring and this is calculated from FDR 
    ssh kkstore02 
    # work in the hg17 directory, where the *NoHeader.gff files were created
    # above (this path previously pointed at the hg18 build)
    cd /cluster/data/hg17/bed/consIndels
    # edit the GFFs in place, cutting the ":p=..." / ":p<..." suffix off the
    # IGS name; -pi.bak leaves a .bak copy beside each edited file in ./data
    perl -pi.bak -e \
         's/(IGS[0-9a-z]+\.?[0-9XY]*):p=?<?\.[0-9]+/$1/'  \
         ./data/chr*NoHeader.gff
    # check this looks ok then clean up
    rm ./data/*.bak
    # makes sense to store this as a BED5 table in order to use the score
    # for display.
    foreach f (./data/*NoHeader.gff)
       awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$4,$5,$9,$6}' $f \
       >> consIndelsHg17Mm5CanFam1.bed
    end
 
    # load data
    ssh hgwdev
    cd /cluster/data/hg17/bed/consIndels
    hgLoadBed hg17 consIndelsHg17Mm5CanFam1 consIndelsHg17Mm5CanFam1.bed
    # Loaded 593298 elements of size 5
    
    # Get the IDs, posterior probabilities (p) for the segment being neutral, 
    # and the FDR from the original GFFs for a separate table. Some items 
    # have p<.001. Can not do Table Browser queries restricting 
    # p to <, =, or > a specified value unless all values are floats.
    # Contacted the data contributor, Gerton Lunter, and he said it would be 
    # ok to change all p<.001 to p=0.0005
    ssh kkstore02
    
    cd /cluster/data/hg17/bed/consIndels/
    # chrom names without the "chr" prefix, skipping the *_random chroms
    awk '{if ($1 !~ /random/) print $1;}' /cluster/data/hg17/chrom.sizes \
        | sed -e 's/chr//' | sort -n > chrom.lst
    # drop the haplotype chroms
    grep -v 'hap' chrom.lst > tmp2
    # sort -n places the non-numeric names ahead of 1-22; skip those first
    # three lines (presumably M, X and Y -- confirm) to keep only 1-22
    tail -n +4 tmp2 > tmp3
    # put X and Y back at the end and install the filtered list; previously
    # tmp3 was discarded and X/Y were appended to the unfiltered chrom.lst
    printf "X\nY\n" >> tmp3
    mv tmp3 chrom.lst
    rm tmp2
    # chrom.lst has a list of chroms 1-22, then X and Y
    foreach c (`cat chrom.lst`)
       echo $c
       foreach f (./data/chr${c}.GFF)
          echo $f
          awk 'BEGIN {FS="\t"} {OFS="\t"}{if ($9 ~ /IGS/) print $9,$6;}' $f \
              | sed -e 's/:/\t/' \
              | sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
              >> consIndelsConf.txt
       end
    end
    # Add the FDR. 
    # For this set, there is no false discovery rate (FDR) field but it 
    # can be related to the score. If score is 999 then FDR is 1% (0.01) and
    # if score is 500 then FDR is 10% (0.10). Score is in column 6.
    # there are no GFF files for the haplotype chroms
   
    awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($3 ~ /500/) print $1, $2, "0.10"; else if ($3 ~ /999/) print $1, $2, "0.01";}' consIndelsConf.txt > consIndelsHg17Mm5CanFam1Conf.txt 

    # Create a table definition for the table of identifier,  posterior 
    # probability and false discovery rate (FDR). Already created for hg18
    # track (see hg18.txt). It is $HOME/kent/src/hg/lib/itemConf.as.
   
    ssh hgwdev
    cd /cluster/data/hg17/bed/consIndels
    hgLoadSqlTab hg17 consIndelsHg17Mm5CanFam1Conf \
         $HOME/kent/src/hg/lib/itemConf.sql \
         consIndelsHg17Mm5CanFam1Conf.txt
    # check that all items are in this table.
    hgsql -N -e 'select distinct(name) from consIndelsHg17Mm5CanFam1;' hg17 \
         | sort > consIndels.names.sort
    hgsql -N -e 'select distinct(id) from consIndelsHg17Mm5CanFam1Conf;' hg17 \
         | sort > consIndels.idsfromConf.sort
    wc -l *.sort
    # 593298 consIndels.idsfromConf.sort
    # 593298 consIndels.names.sort

    comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
    # 593298
    # so all element IDs are in both tables.
    # cleanup
    rm ./data/*.bak *.sort
    
    # add trackDb/human/hg17/trackDb.ra entry and add description that
    # was written by the data contributor. Add code to hgc.c to display
    # the posterior probability and the FDR on the details page for
    # track elements. Gerton Lunter provided a description for the data
    # on 2007-09-12.
    cd ~/kent/src/hg/makeDb/trackDb/human/hg17
    cp ../hg18/consIndelsHg18Mm8CanFam2.html consIndelsHg17Mm5CanFam1.html
    # check this is correct and add trackDb.ra track entry and search.  

##############################################################
# NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt)
#
#  See hg18.txt for details.
#############################################################

#############################################################
# CCC Genome Graphs (DONE 2007-Sept Andy)
# 
# See hg18 make doc.

###############################################################
# Affy Transcriptome Phase 3 chrY fix (DONE 2007-12-10, Andy)

ssh kkstore05
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | head -n256994656 | gzip -c >tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | grep -n chrY 
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | head -n256994656 | gzip -c > tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaTopStrand.wig.gz
ssh kolossus
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
wigEncode sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz affyTxnPhase3HeLaBottomStrand.{wig,wib}
wigEncode sRNA.affyTxnPhase3HeLaTopStrand.wig.gz affyTxnPhase3HeLaTopStrand.{wig,wib}
mv *.wig /cluster/data/hg17/bed/affyTxnPhase3/wig/
mv *.wib /cluster/data/hg17/bed/affyTxnPhase3/wib/
ssh hgwdev
cd /cluster/data/hg17/bed/affyTxnPhase3/wig
hgLoadWiggle hg17 affyTxnPhase3HeLaTopStrand{,.wig}
hgLoadWiggle hg17 affyTxnPhase3HeLaBottomStrand{,.wig}

###########################################################################
# Reload CCDS (2007-12-12 markd)
    # import ccds database as described in ccds.txt
    set db=hg17
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    # build initial version of ccdsMgcMap table, updated by nightly genbank update
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
        ccdsMgcMap
    # << emacs

############################################################################
# ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan)

# See hg18.txt for details.

############################################################################
# Reload CCDS (2008-02-01 markd)
    # import ccds database as described in ccds.txt
    set db=hg17
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
    # << emacs

############################################################################
# CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan)

# See the HuGE section in hg18.txt for details.
############################################################################


############################################################################
# DGV V10 (DATABASE OF GENOMIC VARIANTS) (DONE 11/10/10 angie - color change 2/22/11 #2917)
# DGV V9 done 3/26/10
# DGV V8 done 8/12/09 (changed color of inverted 11/05/09 kuhn)
# DGV V7 done 3/11/09
# DGV V6 thin regions dropped 2/23/09
# DGV V6 with useless thin regions done 11/12/08
# DGV V5 done 8/11/08
# DGV V4 done 5/9/08
# 11-04-2009 color change from brown to magenta:
# old color
# 6553700	Inversion  (100,0,100)
# new:
# 13107400	Inversion  (200,0,200)
# 2/22/11 color change (Bug #2917): swap blue and red; green -> brown
# Old DGV format is obsolete; see the following section.

#######################################################################
# DGV BETA (DATABASE OF GENOMIC VARIANTS) (DONE 2/11/13 angie)
    # DGV has changed their data format, and for the time being the data are
    # served by a beta web site, http://dgvbeta.tcag.ca/ ; in time that will
    # replace their current site.
    # Work in a directory named after today's date (yymmdd):
    set today = `date +%y%m%d`
    mkdir -p /hive/data/genomes/hg17/bed/dgv/$today
    cd /hive/data/genomes/hg17/bed/dgv/$today
    wget http://dgvbeta.tcag.ca/dgv/docs/NCBI35_hg17_2012-11-23.txt
    # Show the header line to see the column names:
    head -1 NCBI35_hg17*.txt
#variantaccession        chr     start   end     varianttype     variantsubtype  reference       pubmedid        method  platform        mergeid mergedorsample  frequency       samplesize      cohortdescription       genes
    # It's more complicated than Gain/Loss/Complex or Inversion now (+ stray commas):
    cut -f 5,6 NCBI35_hg17*.txt | sort | uniq -c | head -100
#  16978 CNV
#   1193 CNV     ""
#  27902 CNV     CNV
#   2179 CNV     Complex
#    319 CNV     Deletion
#   1203 CNV     Duplication
#  39279 CNV     Gain
#   1715 CNV     Gain+Loss
#  23850 CNV     Insertion
# 105187 CNV     Loss
#     78 OTHER
#      4 OTHER   ""
#   1396 OTHER   Inversion
#      1 varianttype     variantsubtype
    # shuffle fields into bed9+ w/itemRgb
    set purple = "200,0,200"
    set red = "200,0,0"
    set blue = "0,0,200"
    set brown = "139,69,19"
    # The perl below rewrites each data row (the header is skipped by
    # tail -n +2):
    #  - removes the first "" (empty quoted string) found in the row and
    #    trims a leading/trailing comma from the subtype;
    #  - decrements start unless it is already 0;
    #  - builds the type label "varType (varSubType)", falling back to the
    #    bare varType for CNV with an empty subtype and for OTHER with any
    #    subtype other than Inversion / Tandem Duplication;
    #  - picks itemRgb: blue for CNV Gain/Insertion/Duplication and for
    #    OTHER Tandem Duplication, red for CNV Loss/Deletion, brown for any
    #    other non-empty CNV subtype, purple for OTHER Inversion, and
    #    0,0,0 for everything else;
    #  - writes start into both column 7 and column 8 of the output;
    #  - reads mergeid, mergedorsample and frequency into undef (discarded).
    # The csh color variables are spliced into the perl text by closing and
    # reopening the single-quoted string around '$blue', '$red', etc.
    tail -n +2 NCBI35_hg17*.txt \
    | perl -wpe 'chomp; \
      s/""//; \
      ($id, $chr, $start, $end, $varType, $varSubType, $ref, $pmid, $method, $platform, \
       undef, undef, undef, $sampleSize, $sampleDesc, $genes) = split("\t"); \
      $start-- unless ($start == 0); \
      $landmark = $genes; \
      $landmark =~ s/,/, /g; \
      $varSubType =~ s/^,//;  $varSubType =~ s/,$//; \
      $varTypeOut = "$varType ($varSubType)"; \
      $ref =~ s/_/ /g; \
      $method =~ s/_/ /g;  $method =~ s/,/, /g; \
      $sample = $sampleDesc; \
      $sample .= " (sample size: $sampleSize)" if ($sampleSize); \
      $method .= " ($platform)" if ($platform && $platform ne "Not Provided"); \
      $rgb = "0,0,0"; \
      if ($varType eq "CNV") { \
        if ($varSubType eq "Gain" || $varSubType eq "Insertion" || $varSubType eq "Duplication") {\
          $rgb = "'$blue'"; \
        } elsif ($varSubType eq "Loss" ||$varSubType eq "Deletion") { \
          $rgb = "'$red'"; \
        } elsif ($varSubType eq "") { \
          $varTypeOut = $varType; \
        } else { \
          $rgb = "'$brown'"; \
        } \
      } elsif ($varType eq "OTHER") { \
        if ($varSubType eq "Inversion") { \
          $rgb =  "'$purple'"; \
        } elsif ($varSubType eq "Tandem Duplication") { \
          $rgb = "'$blue'"; \
        } else { \
          $varTypeOut = $varType; \
        } \
      } \
      $_ = join("\t", "chr$chr", $start, $end, $id, 0, "+", \
                $start, $start, $rgb, $landmark, $varTypeOut, \
                $ref, $pmid, $method, $sample) . "\n";' \
        > dgv.bed
    # Load using the dgv.sql schema from the source tree rather than a
    # generated bed schema:
    hgLoadBed hg17 dgv dgv.bed \
      -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -renameSqlTable -tab
#Read 221283 elements of size 15 from dgv.bed
    # 2/11/13: checkTableCoords caught some coords past the end of a few
    # chromosomes.  DGV is looking into it.  Truncate:
    hgsql hg17 -e 'update dgv set chromEnd = 154824264 where chrom = "chrX" && chromEnd > 154824264'
    hgsql hg17 -e 'update dgv set chromEnd = 199505740 where chrom = "chr3" && chromEnd > 199505740'
    hgsql hg17 -e 'update dgv set chromEnd = 170975699 where chrom = "chr6" && chromEnd > 170975699'
    hgsql hg17 -e 'update dgv set chromEnd = 158628139 where chrom = "chr7" && chromEnd > 158628139'
    hgsql hg17 -e 'update dgv set chromEnd = 138429268 where chrom = "chr9" && chromEnd > 138429268'
    checkTableCoords -verbose=2 hg17 dgv
    # No output, good.


############################################################################
# KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 6/10/08 angie)
# 8/11/08: Added kiddEichlerToNcbi (ID xref table).
    ssh kkstore02
    mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant
    cd /cluster/data/hg17/bed/kiddEichlerDiscordant
    # NOTE(review): uuuu / ppppppp are presumably placeholders for the real
    # credentials, which are not recorded here.
    wget --user=uuuu  --password=ppppppp \
      http://eichlerlab.gs.washington.edu/kiddj/downloads/fosmids.hg17.tgz
    tar xvzf fosmids.hg17.tgz
    cd bd35
    # 8 clone-end linkedFeaturesSeries tracks and one bed custom track.
    # bed has illegal coords (maybe for unplaced ends?).

    # Load the tracks (translate bacEndPairs format to bed12):
    ssh hgwdev
    cd /cluster/data/hg17/bed/kiddEichlerDiscordant/bd35
    # One table per abc*.txt file.  The table name is derived from the file
    # root ($f:r): a root beginning <abc|G><digits>discordant becomes
    # kiddEichlerDisc<Abc|G><digits> (\u upper-cases the first letter).
    # The perl dies if the root does not match, and the loop is meant to
    # stop in that case via the $status check.
    foreach f (abc*.txt)
      set track = `echo $f:r \
        | perl -wpe 's/^(G|abc)(\d+)discordant/kiddEichlerDisc\u$1$2/ || die;'`
      if ($status != 0) break
      # Per input line the perl below:
      #  - empties lines starting with '#', so nothing is emitted for them;
      #  - decrements start ($s), then handles three kinds of item:
      #      name contains "transchr": block count forced to 1, only the
      #        first block start/size kept, that block start and the end
      #        are each decremented;
      #      name contains "OEA": first block start decremented; dies
      #        unless it equals start and start+size equals end;
      #      exactly 2 blocks: both block starts decremented, blocks
      #        swapped into ascending order, and start/end reset to the
      #        extent actually covered by the blocks;
      #      anything else: die with the block count and name;
      #  - makes the block starts relative to start;
      #  - colors by name: deletion 224,0,0; insertion 0,0,224;
      #    inversion 0,224,0; OEA 240,160,64; otherwise 0,0,0;
      #  - emits 12 tab-separated columns with columns 7/8 set to the
      #    (possibly adjusted) start/end.
      perl -wpe 'next if s/^#.*\n$//; \
        ($c, $s, $e, $n, $sc, $st, undef, $bs, $bSt, $bSz)=split; \
        @bSts = split(",", $bSt);  @bSzs = split(",", $bSz); \
        $s--; \
        if ($n =~ /transchr/) { \
          $bs = 1; \
          $#bSts = 0;  $#bSzs = 0; \
          $bSts[0]--;  $e--; \
          $bSts[0] -= $s; \
        } elsif ($n =~ /OEA/) { \
          $bSts[0]--; \
          die "bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \
          $bE = $bSts[0] + $bSzs[0]; \
          die "bE $bE != e $e\n" if ($bE != $e); \
          $bSts[0] -= $s; \
        } elsif ($bs == 2) { \
          $bSts[0]--;  $bSts[1]--; \
          if ($bSts[0] > $bSts[1]) { \
            # warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
            $tmp = $bSts[0];  $bSts[0] = $bSts[1];  $bSts[1] = $tmp; \
            $tmp = $bSzs[0];  $bSzs[0] = $bSzs[1];  $bSzs[1] = $tmp; \
          } \
          if ($bSts[0] != $s) { \
            # warn "Tweaking $n start from $s to $bSts[0]\n"; \
            $s = $bSts[0]; \
          } \
          $bE0 = $bSts[0] + $bSzs[0]; \
          $bE1 = $bSts[1] + $bSzs[1]; \
          $bE = $bE0 > $bE1 ? $bE0 : $bE1; \
          if ($bE != $e) { \
            # warn "Tweaking $n end from $e to $bE\n"; \
            $e = $bE; \
          } \
          $bSts[0] -= $s;  $bSts[1] -= $s; \
        } else { die "#blks is $bs for $n\n"; } \
        $bSt = join(",", @bSts) . ",";  $bSz = join(",", @bSzs) . ","; \
        $rgb = ($n =~ /deletion/) ? "224,0,0" : \
               ($n =~ /insertion/) ? "0,0,224" : \
               ($n =~ /inversion/) ? "0,224,0" : \
               ($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \
        $_ = join("\t", $c, $s, $e, $n, $sc, $st, $s, $e, $rgb, \
                        $bs, $bSz, $bSt) . "\n";' $f \
      | hgLoadBed -tab hg17 $track stdin
    end
    # Load G248discordant.txt into kiddEichlerDiscG248.  This input has a
    # "track" line (emptied, so nothing is emitted for it) and is split into
    # 12 columns.  Its block starts are treated as relative to start: for
    # two-block items they are made absolute (+= $s), ordered, used to reset
    # start/end to the extent covered by the blocks, and made relative
    # again (-= $s).  Only OEA items have their start decremented, and they
    # must have a first block start of 0 and start+size equal to end, or
    # the perl dies.  "transchr" items are cut down to a single block.
    # Columns 7/8 are set to the final start/end, and the color is chosen
    # from the name: deletion 224,0,0; insertion 0,0,224; inversion
    # 0,224,0; OEA 240,160,64; otherwise 0,0,0.
    perl -pe 'next if s/^track .*\n$//; \
        ($c, $s, $e, $n, $sc, $st, $tS, $tE, $r, $bs, $bSz, $bSt) = split; \
        @bSts = split(",", $bSt);  @bSzs = split(",", $bSz); \
        if ($n =~ /transchr/) { \
          $bs = 1; \
          $#bSts = 0;  $#bSzs = 0; \
        } elsif ($n =~ /OEA/) { \
          $s--; # weird that this is required only for OEA here \
          die "$n: bSts[0] $bSts[0] != 0\n" if ($bSts[0] != 0); \
          $bE = $s + $bSts[0] + $bSzs[0]; \
          die "$n: bE $bE != e $e\n" if ($bE != $e); \
        } elsif ($bs == 2) { \
          $bSts[0] += $s;  $bSts[1] += $s; \
          if ($bSts[0] > $bSts[1]) { \
            # warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
            $tmp = $bSts[0];  $bSts[0] = $bSts[1];  $bSts[1] = $tmp; \
            $tmp = $bSzs[0];  $bSzs[0] = $bSzs[1];  $bSzs[1] = $tmp; \
          } \
          if ($bSts[0] != $s) { \
            # warn "Tweaking $n start from $s to $bSts[0]\n"; \
            $s = $bSts[0]; \
          } \
          $bE0 = $bSts[0] + $bSzs[0]; \
          $bE1 = $bSts[1] + $bSzs[1]; \
          $bE = $bE0 > $bE1 ? $bE0 : $bE1; \
          if ($bE != $e) { \
            # warn "Tweaking $n end from $e to $bE\n"; \
            $e = $bE; \
          } \
         $bSts[0] -= $s;  $bSts[1] -= $s; \
        } else { die "#blks is $bs\n"; } \
        $bSt = join(",", @bSts) . ",";  $bSz = join(",", @bSzs) . ","; \
        $tS = $s;  $tE = $e; \
        $rgb = ($n =~ /deletion/) ? "224,0,0" : \
               ($n =~ /insertion/) ? "0,0,224" : \
               ($n =~ /inversion/) ? "0,224,0" : \
               ($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \
        $_ = join("\t", $c, $s, $e, $n, $sc, $st, $tS, $tE, $rgb, \
                        $bs, $bSz, $bSt) . "\n";' G248discordant.txt \
    | hgLoadBed -tab hg17 kiddEichlerDiscG248 \
        stdin

    # 8/11/08: get clone ID -> NCBI acc mapping.
    ssh kkstore02
    mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
    cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
    # Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions;
    # get trace archive trace names for end reads:
    foreach n (7 9 10 11 12 13 14)
      wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz
    end
    # ABC8 has _a and _b files:
    wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz
    wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz
    # That file is not available for G248.
    gunzip *.gz
    # Combine the relevant data from the .conversion files; keep only those
    # IDs that are used in the tracks.
    # discIds.txt: column 4 of every *discordant.txt, minus lines whose
    # column 4 starts with #chrom, track or name, cut at the first comma,
    # sorted and de-duplicated.
    cut -f 4 ../bd35/*discordant.txt \
    | egrep -v '^(#chrom|track|name)' \
    | sed -e 's/,.*//' \
    | sort -u > discIds.txt
    # allEnds.tab: the perl deletes lines starting with "OurClone" and lines
    # starting <digits>_HUMAN or <digits>_<digits>_, and rewrites the
    # remaining end-read lines to three tab-separated columns:
    #   <ABC|G><number>_<plate/well with leading zeros stripped>,
    #   the direction token (F, FORWARD, R or REVERSE),
    #   the final \w+ column of the input line.
    # Lines that match none of the patterns are reported with warn.
    perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \
      s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \
      warn "Parse line $.:\n$_";' \
      *.conversion \
    | sort > allEnds.tab
    # Keep the allEnds.tab lines that contain any discIds.txt ID as a
    # fixed-string, whole-word match:
    grep -wFf discIds.txt allEnds.tab > discEnds.txt
    wc -l discIds.txt allEnds.tab discEnds.txt 
#   223051 discIds.txt
# 17498527 allEnds.tab
#   573974 discEnds.txt
    # discEnds.txt has 2 lines (forward & reverse) for most of its ids...
    # ideally we would see 2*(223051) lines in discEnds.txt.
    # Get a list of which discordant clone IDs don't have ends in *.conv*:
    # (comm -23 prints the lines found only in discIds.txt)
    cut -f 1 allEnds.tab | uniq > all.tmp
    comm -23 discIds.txt all.tmp > discNotInConv.txt
    wc -l discNotInConv.txt
#16318 discNotInConv.txt
    # Write combine.pl.  It takes two file arguments (clone accessions, end
    # reads) and prints one tab-separated line per clone ID:
    #   id, accession, forward trace name, reverse trace name
    # with '' / 0 / 0 substituted for values that were not seen.
    cat > combine.pl <<'_EOF_'
#!/usr/bin/perl -w
# Usage: combine.pl cloneAccessionsFile endsFile
use strict;
my ($cloneFile, $endsFile) = @ARGV;
open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n";
my %idInfo;
while(<CLONES>) {
  # Normalize the clone name to <ABC|G><number>_<plate/well>, or accept a
  # G248 name as-is; anything else is a fatal parse error.
  # No line-continuation backslash here: this is Perl source inside a
  # quoted here-document, where a trailing "\" would be taken as the
  # reference operator on the next match and keep the die from ever firing.
  (s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) ||
  m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_";
  my ($id, $acc) = ($1, $2);
  $idInfo{$id}->[0] = $acc;
}
close(CLONES);
open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n";
while (<ENDS>) {
  chomp; my ($id, $dir, $traceName) = split("\t");
  # Direction starting with F goes to slot 1, starting with R to slot 2.
  if ($dir =~ /^F/) {
    $idInfo{$id}->[1] = $traceName;
  } elsif ($dir =~ /^R/) {
    $idInfo{$id}->[2] = $traceName;
  } else { die "What is this \$dir: $dir ?\n"; }
}
close(ENDS);
# Emit every ID seen in either file, filling in defaults for missing slots.
foreach my $id (sort keys %idInfo) {
  my $infoRef = $idInfo{$id};
  $infoRef->[0] = '' if (! defined $infoRef->[0]);
  $infoRef->[1] = 0 if (! defined $infoRef->[1]);
  $infoRef->[2] = 0 if (! defined $infoRef->[2]);
  print join("\t", $id, @{$infoRef}) . "\n";
}
'_EOF_'
    # << emacs
    chmod a+x combine.pl
    # Run the script from the current directory explicitly (./) so that it
    # is found even when "." is not in the command search path.
    ./combine.pl clones_used_3nov.txt.accessions discEnds.txt \
    | sort > kiddEichlerToNcbi.txt
    # Load table:
    ssh hgwdev
    cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
    hgLoadSqlTab hg17 kiddEichlerToNcbi \
      $HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt
    # Add to makeDb/schema/all.joiner, then check:
    runJoiner.csh hg17 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema


############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################


############################################################################
# KIDD/EICHLER VALIDATED SITES (DONE 6/11/08 angie)
    ssh hgwdev
    mkdir /cluster/data/hg17/bed/kiddEichlerValid
    cd /cluster/data/hg17/bed/kiddEichlerValid
    wget http://hgsv.washington.edu/general/download/validated_sites/Kidd_2008_sample_level_valided_sites.xls
    # Open in Excel, save as Kidd_2008_sample_level_valided_sites.txt,
    # move first 9 lines to Kidd_2008_sample_level_valided_sites.header.
    # Split into one file per individual:
    # The table name keeps the mixed-case id (kiddEichlerValid$id), while
    # rows are selected by the upper-cased id followed by "_".
    foreach id (Abc7 Abc8 Abc9 Abc10 Abc11 Abc12 Abc13 Abc14 G248)
      set ID = `echo $id | tr 'a-z' 'A-Z'`
      # Per selected line the perl: strips a carriage return, splits into
      # five whitespace-separated fields, colors by the name (deletion
      # 224,0,0; insertion 0,0,224; inversion 0,224,0; otherwise 0,0,0),
      # turns ':' into ',' in the fifth field, removes the leading "<ID>_"
      # from the name and appends ",<fifth field>" to it, then emits nine
      # columns with score 0, strand + and columns 7/8 equal to start/end.
      grep ${ID}_ Kidd_2008_sample_level_valided_sites.txt \
      | perl -wpe 'chomp; s/\r//; ($c, $s, $e, $n, $t) = split; \
        $rgb = ($n =~ /deletion/) ? "224,0,0" : \
               ($n =~ /insertion/) ? "0,0,224" : \
               ($n =~ /inversion/) ? "0,224,0" : "0,0,0"; \
        $t =~ s/:/,/g; \
        $n =~ s/^'$ID'_//;  $n = "$n,$t"; \
        $_ = join("\t", $c, $s, $e, $n, "0", "+", $s, $e, $rgb) . \
                  "\n";' \
      | hgLoadBed -tab hg17 kiddEichlerValid$id stdin
    end


################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)

# Add an "expression" group (priority 4.5, closed by default) to the grp
# table, and give the existing "regulation" group the label "Regulation".
hgsql hg17 -e "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)"
hgsql hg17 -e "update grp set label='Regulation' where name='regulation'"

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
hg17.upstreamGeneTbl = refGene
hg17.upstreamMaf = multiz17way /hive/data/genomes/hg17/bed/multiz17way/species.lst


#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08 angie)
    ssh hgwdev
    # -p: do not fail if the directory is already there (e.g. on a rerun).
    mkdir -p /cluster/data/hg17/bed/mrnaPcr
    cd /cluster/data/hg17/bed/mrnaPcr
    # First, get consistent FA and PSL for UCSC Genes.
    genePredToBed /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp > ucscGenes.bed
    # idSub.txt maps each kgId to "<kgId>__<geneSymbol>"; subColumn then
    # applies that mapping to column 4 (the name) of the bed file.
    hgsql hg17 -NBe 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
      > idSub.txt
    subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
    # Extract the sequence for each renamed item and pack it into a 2bit:
    sequenceForBed -keepName -db=hg17 -bedIn=ucscGenesIdSubbed.bed \
      -fastaOut=stdout \
    | faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
    # Build the PSL from the first 10 genePred columns of the same gene set:
    cut -f 1-10 /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp \
    | genePredToFakePsl hg17 stdin kgTargetAli.psl /dev/null

    # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
    cd /cluster/data/hg17/bed/mrnaPcr
    hgLoadPsl hg17 kgTargetAli.psl
    mkdir -p /gbdb/hg17/targetDb
    ln -s /cluster/data/hg17/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg17/targetDb/

    # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
    # /gbdb/hg17/targetDb/kgTargetSeq.2bit .

    ssh hgwdev
    # Add records to hgcentraltest blatServers and targetDb:
    hgsql hgcentraltest -e \
      'INSERT into blatServers values ("hg17Kg", "blat13", 17797, 0, 1);'
    hgsql hgcentraltest -e \
      'INSERT into targetDb values("hg17Kg", "UCSC Genes", \
         "hg17", "kgTargetAli", "", "", \
         "/gbdb/hg17/targetDb/kgTargetSeq.2bit", 1, now(), "");'


#############################################################################
# fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram)
    mkdir /hive/data/genomes/hg17/bed/fox2ClipSeq
    cd /hive/data/genomes/hg17/bed/fox2ClipSeq
    #	fetch data
    #	(no -O here: wget does not honor --timestamping together with -O,
    #	and the default output name is already FOX2.rmsk.BED.gz)
    wget --timestamping \
'http://www.snl.salk.edu/~geneyeo/stuff/FOX2.rmsk.BED.gz'
    #	remove track line and sort
    zcat FOX2.rmsk.BED.gz | grep -v "^track" | sort -k1,1 -k2,2n \
        | gzip > sorted.bed.gz
    #	separate strand data, and turn the positive into blue
    #	(the first 255,0,0 on each + strand line becomes 0,0,255)
    zcat sorted.bed.gz | awk '$6 == "+"' | sed -e "s/255,0,0/0,0,255/" \
	| gzip > forwardStrand.bed.gz
    zcat sorted.bed.gz | awk '$6 == "-"' | gzip > reverseStrand.bed.gz
    #	turn into wiggle density plot
    zcat forwardStrand.bed.gz | bedItemOverlapCount hg17 stdin \
        | wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \
	fox2ClipSeqDensityForwardStrand.wib
    #	Converted stdin, upper limit 2401.00, lower limit 1.00
    zcat reverseStrand.bed.gz | bedItemOverlapCount hg17 stdin \
        | wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \
		fox2ClipSeqDensityReverseStrand.wib
    #	Converted stdin, upper limit 1406.00, lower limit 1.00
    #	and load tables
    #	(both strands go into the single fox2ClipSeq bed table)
    zcat forwardStrand.bed.gz reverseStrand.bed.gz \
	| hgLoadBed hg17 fox2ClipSeq stdin
    #	Loaded 4418298 elements of size 9
    #	the .wib files are linked from /gbdb/hg17/wib before loading the
    #	.wig tables
    ln -s `pwd`/*.wib /gbdb/hg17/wib
    hgLoadWiggle hg17 fox2ClipSeqDensityForwardStrand \
	fox2ClipSeqDensityForwardStrand.wig
    hgLoadWiggle hg17 fox2ClipSeqDensityReverseStrand \
	fox2ClipSeqDensityReverseStrand.wig
    #	add composite track definitions to makeDb/trackDb/human/trackDb.ra

#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-04-24 - Hiram )
    mkdir /hive/data/genomes/hg17/bed/blat.hg19.2009-04-24
    cd /hive/data/genomes/hg17/bed/blat.hg19.2009-04-24
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug hg17 hg19
    # Real run:
    # Started in the background at nice +19, with stdout and stderr both
    # captured in do.log.  Note that "2>&1" is Bourne-shell redirection
    # syntax, unlike the csh syntax used in many other sections of this file.
    time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
	 hg17 hg19 > do.log 2>&1 &
    #	real    84m19.022s

#############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)

mkdir -p /hive/data/genomes/hg17/bed/pathways/kegg
cd /hive/data/genomes/hg17/bed/pathways/kegg

wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab

# keggMapDesc.tab holds the first two columns of map_title.tab with "hsa"
# prepended to the first column.
sed -e 's/^/hsa/' map_title.tab | cut -f 1,2 > keggMapDesc.tab

# "if exists" lets these steps run cleanly whether or not the table is
# already present.
hgsql hg17 -e 'drop table if exists keggMapDesc'
hgsql hg17 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/hsa/hsa_pathway.list

# Remove the first "path:" on each line, then turn the first remaining ":"
# into a tab.
sed -e 's/path://' -e 's/:/\t/' hsa_pathway.list > j.tmp
hgsql hg17 -e 'drop table if exists keggPathway'
hgsql hg17 < ~/kent/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "j.tmp" into table keggPathway'

# The raw list was loaded above only so that it can be joined against
# knownToLocusLink; the join result then replaces the table contents.
hgsql hg17 -N -e \
'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
>keggPathway.tab

hgsql hg17 -e 'delete from keggPathway'

hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'

rm j.tmp
############################################################################
# Add KEGG column to hg17 Gene Sorter (Done, Fan, 6/18/2010)

mkdir -p /hive/data/genomes/hg17/bed/geneSorter
cd /hive/data/genomes/hg17/bed/geneSorter

# Each distinct keggPathway row becomes kgId, mapID, "mapID+locusID": the
# literal "+" column selected between the second mapID and locusID is glued
# to its neighbours by the sed.
hgsql hg17 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' \
| sort -u \
| sed -e 's/\t+\t/+/' \
  > knownToKeggEntrez.tab

# Recreate the table from its .sql definition and load the new rows.
hgsql hg17 -e 'drop table knownToKeggEntrez'
hgsql hg17 < ~/kent/src/hg/lib/knownToKeggEntrez.sql
hgsql hg17 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################
# LIFTOVER TO HG38 (DONE 10/8/14 angie)
    mkdir /hive/data/genomes/hg17/bed/blat.hg38.2014-10-08
    cd /hive/data/genomes/hg17/bed/blat.hg38.2014-10-08
    # -debug run to create run dir and scripts to verify this command
    # is OK
    # (the build directory is the current directory, and the 11.ooc file
    # from the hg17 genome directory is passed explicitly)
    doSameSpeciesLiftOver.pl -debug -buildDir=`pwd` \
          -ooc=/hive/data/genomes/hg17/11.ooc hg17 hg38
    # then running for real
    # (csh syntax: ">&!" sends stdout and stderr to do.log, overwriting any
    # existing file; the job runs in the background while tail -f follows
    # the log)
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` \
            -ooc=/hive/data/genomes/hg17/11.ooc hg17 hg38 >&! do.log & tail -f do.log
#0.238u 0.246s 31:25.58 0.0%     0+0k 0+0io 0pf+0w

    # verify the convert link on the browser is now active from hg17 to hg38

#############################################################################
# haplotype locations from BLAT/SelfChain (DONE - 2015-07-03 - Galt )
#
# I want to have the placement positions of the haplotypes for viewing
# with the new multi-window browser capability that I am developing.
# Haplotypes provided in hg17 were 2, hg18, 4.
# It seems these were provided by NCBI as sequence, but without placement information.
#
# haplotypePos is psl table made by Fan using BLAT and parasol. 
# It once had a trackDb entry, but that is gone now.
# It has several ambiguous results for most of the haplotypes,
# although choosing the row with the highest number of matches for each haplotype would probably work.
#
# Hiram made haplotypeLocations table by taking the hg19 haplotypes and using liftOver.
# It sort of works, but problems are that the hg18 chr22 haplo is not found in hg19 at all,
# and the hap# changes from hg18 to hg19, so having that in the ID is not very helpful.
# It also looks like the haplotypes in hg19 were made without subtracting 1 from the start for BED.
# Finally, they do not match exactly the results in haplotypePos table.
#
# hg18.haplotypeLocationsEnsembl
# Knowing that Ensembl has good browser support for haplotypes, I went to their archive browser
# equivalent to hg18, and got their 4 haplotype positions by clicking on the label in the chromosome ideogram.
# Since there are only 4, it is do-able manually.  
# Did they figure out the positions for themselves, or did they get official placement positions from NCBI?
#
# In any case, I cannot use the same method for hg17 because Ensembl does not keep any browsers that old in their archive.
# Does anybody know of some ancient Ensembl mirrors that might have it?
# Would anyone at NCBI know the old assembly haplotype placements for hg17 and hg18?
#
#hg17 haplotypes
#
#only 2:
#
#DR52       DR52 haplotype for the MHC region on chromosome 6
#DR53       DR53 haplotype for the MHC region on chromosome 6
#
#mysql> select * from chromInfo where chrom like '%hap%';
#+---------------+--------+----------------------------------+
#| chrom         | size   | fileName                         |
#+---------------+--------+----------------------------------+
#| chr6_hla_hap1 | 139182 | /gbdb/hg17/nib/chr6_hla_hap1.nib |
#| chr6_hla_hap2 | 150447 | /gbdb/hg17/nib/chr6_hla_hap2.nib |
#+---------------+--------+----------------------------------+
#
# I BLATed 200 bp from the left and right ends of chr6_hla_hap1 and chr6_hla_hap2
#
# The left side of hla_hap2 did not BLAT correctly because it is in a repeat-region.
# Do not ask me why the people defining these regions did not pick unique sequence
# for the left and right sides so that these could be easily placed.
# Using the information from the hg17 self-repeat track, I was able to
# narrow down the left-side location to within a few hundred bases by examining the chains.
#

cd /hive/data/genomes/hg17/bed
mkdir haplotypeLocations
cd haplotypeLocations

# The bed file is created by hand in an editor:
vi haplotypeLocations.bed
# paste in bed4 haplotype locations from BLAT/SelfChain
# NOTE: the two chr6 lines below the dashes are the text pasted into
# haplotypeLocations.bed (chrom, start, end, name), not shell commands.
--------------------------------
chr6 32511422 32667119 chr6_hla_hap1
chr6 32589475 32668709 chr6_hla_hap2

hgLoadBed -type=bed4 hg17 haplotypeLocations haplotypeLocations.bed
#Reading haplotypeLocations.bed
#Read 2 elements of size 4 from haplotypeLocations.bed
#Sorted
#Creating table definition for haplotypeLocations
#Saving bed.tab
#Loading hg17

vi README
# I wrote some notes about the source and reasons for the data

# I have not made any trackDb entry for this table at this time.

# I renamed the table to altLocations to match use in other assemblies.
# NOTE(review): the rename command itself is not recorded in this section.

#############################################################################
