# for emacs: -*- mode: sh; -*-


# This file describes browser build for the mouse
# genome, February 2006, ncbi mouse_36 - Mm8
#
#	"$Id: mm8.txt,v 1.76 2010/01/07 20:47:49 rhead Exp $"
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+-------------------------------------+
#  | tableName   | type                                |
#  +-------------+-------------------------------------+
#  | knownGene   | genePred knownGenePep knownGeneMrna |
#  | refGene     | genePred refPep refMrna             |
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna     |
#  | mgcGenes    | genePred                            |
#  | ensGene     | genePred ensPep                     |
#  | genscan     | genePred genscanPep                 |
#  +-------------+-------------------------------------+


#######################################################################
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2006-02-14 - Hiram)
#
#	Examine disk space issues, find some goodly amount of space
    ssh kkstore01
    mkdir /cluster/store9/mm8
    ln -s /cluster/store9/mm8 /cluster/data/mm8
    cd /cluster/data/mm8
    mkdir ncbi
    cd ncbi
    cp -p /cluster/data/mm7/ncbi/.wgetrc .
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=4 --no-parent \
	--no-host-directories --cut-dirs=1 \
	ftp://ftp-private.ncbi.nih.gov/mouse_36
    #	Downloaded: 2,201,934,141 bytes in 50 files
    #	real    44m48.975s

    #	The pre-release sequence, Feb 27th:
    mkdir /cluster/data/mm8/pre_release
    cd /cluster/data/mm8/pre_release
    #	The .wgetrc is the anonymous user
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=4 --no-parent \
	--no-host-directories --cut-dirs=3 \
	ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release

#  Fixup the agp and seq_contig.md files to add chrM
#  No chrM or chrMT was delivered.  Copy from previous assembly
    ssh kkstore01
    cd /cluster/data/mm8/ncbi/chrfasta
    cp -p /cluster/data/mm7/ncbi/chrfasta/chrM.fa.gz .
    cd ../contigfasta
    cp -p /cluster/data/mm7/ncbi/contigfasta/chrM.fa.gz .
#	with a fixed up header line to be like all the others:
#	>lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome

    cd /cluster/data/mm8
    zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
    echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
	allrefcontig.chr.agp
    gzip allrefcontig.chr.agp
    #	I don't see allcontig.agp being used anywhere else ?
    # zcat ncbi/allcontig.agp.gz > allcontig.agp
    # echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \
    #	    allcontig.agp
    # gzip allcontig.agp
    zcat ncbi/seq_contig.md.gz | egrep -v "Celera|129_substrain" \
	| sed -e "238i\
10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
    #	(curiously, this sed command would not work on hgwdev,
    #	only when logged into kkstore01 ?)
    #	The line number 238 was found by checking the contents of
    #	ncbi/seq_contig.md.gz (after the egrep filter) and it was
    #	the line starting with:
    #	10090   Un|NT_039877    1       35798
    #	Wanted this chrM information before that line.
    #   summarize sequence counts

    mkdir faCounts
    time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 &
    #	about 1.5 minutes
    time faCount ncbi/contigfasta/chr*.fa.gz > \
	faCounts/contigfasta.faCount 2>&1 &
    #	about 3 minutes
    time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \
	faCounts/chrfasta.headers 2>&1 &
    time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \
	faCounts/contigfasta.headers 2>&1 &
    #	about 2 minutes each for the above two zcat/greps

#############################################################################
#  BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
#			(DONE - 2006-02-14 - Hiram)
#########  Are these necessary ?  They may no longer be needed.
#########  TRF can run on full chroms on the kki kluster
#	The outputDir argument to splitFaIntoContigs is given as . (the
#	directory entered by the cd below) rather than an absolute path, so
#	this step is independent of specific locations, thus it works in .
    ssh kkstore01
    cd /cluster/data/mm8
    #	one pass for each chromosome fasta file in ncbi/chrfasta
    for F in ncbi/chrfasta/chr*.fa.gz
    do
	#	chrN.fa.gz -> N  (suffix and prefix anchored, dots literal)
	CHR=`basename ${F} | sed -e 's/\.fa\.gz$//; s/^chr//'`
	echo ${CHR} ${F}
	mkdir -p "${CHR}"
	#	keep only this chromosome's lines from the combined AGP
	zcat allrefcontig.chr.agp.gz | \
	    perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
		${CHR}/chr${CHR}.agp
	#	rewrite the ">lcl|chrN.fa ..." header line to ">chrN", then split
	zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
	    perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
		splitFaIntoContigs ${CHR}/chr${CHR}.agp \
		    stdin . -nSize=5000000
    done
    #	The above loop takes about 5 minutes
    #	Some of these in the chr1 directory got overwritten on 2006-02-27
    #	during an attempt to verify that the pre-release directory at
    #	NCBI was the same as what we worked with here.

#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2006-02-14 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/mm8/jkStuff
    cd /cluster/data/mm8
    mkdir Un tmp
    cp -p /cluster/data/mm7/jkStuff/ncbiFixAgp ./jkStuff
    zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin > \
	allrefcontig.chr.ordered.agp
    #	Set the appropriate release number here, this one is 36
    #	(the mm7 copy of the script being edited says 35)
    #	Fetch the script from the previous assembly
    sed -e "s/buildNum = 35/buildNum = 36/" \
	/cluster/data/mm7/jkStuff/ncbiToRandomAgps > \
	    jkStuff/ncbiToRandomAgps
    chmod +x jkStuff/ncbiToRandomAgps
    #	NOTE ! * ! This mm8 contig.idmap now includes the celera assembly
    #	Filter that out for use here.
    #	There were two broken lines that began _36 - they were removed
    #	after I reported them and the contig.idmap.gz file here was
    #	updated later.
    zcat ncbi/contig.idmap.gz | grep ref_strain | grep -v "^_36" \
	| ./jkStuff/ncbiToRandomAgps seq_contig.md \
		allrefcontig.chr.ordered.agp \
                        /dev/stdin . 2> dbg
    for C in ? ??
    do
	if [ -s ${C}/chr${C}_random.ctg.agp ]; then
	    echo "building ${C}/chr${C}_random.fa"
	    rm -f ./tmp.fa
	    zcat ncbi/contigfasta/chr${C}.fa.gz | \
		perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
	    agpToFa -verbose=2 -simpleMulti \
		${C}/chr${C}_random.ctg.agp chr${C}_random \
		    ${C}/chr${C}_random.fa ./tmp.fa
	    rm -f ./tmp.fa
	fi
    done > tmp/agpToFa.out 2>&1
    #	the above loop takes about 3 minutes, examine the tmp/agpToFa.out
    #	record for any errors

    #	We need the lift information from these random.ctg.agp files
    cp -p /cluster/data/mm7/jkStuff/agpToLift.pl ./jkStuff
    for AGP in ?/*_random.ctg.agp ??/*_random.ctg.agp
do
    CHR=`dirname ${AGP}`
    echo ${CHR}
    mkdir -p ${CHR}/lift
    ./jkStuff/agpToLift.pl ${AGP} > ${CHR}/lift/ctg_random.lft
done
    # Clean these up to avoid confusion later... they're easily rebuilt
    #   with the ncbiToRandomAgps script above
    rm ?/*_random.ctg.agp ??/*_random.ctg.agp
    gzip seq_contig.md allrefcontig.chr.ordered.agp

#############################################################################
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
#					(DONE 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    for C in ? ??
    do
	if [ -s ${C}/chr${C}_random.fa ]; then
	    splitFaIntoContigs  -nSize=5000000 ${C}/chr${C}_random.agp \
		${C}/chr${C}_random.fa .
	    mkdir -p ${C}/lift
	    rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
	    mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
	    mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
	    mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
	    rmdir ${C}_random/lift
	    rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
	    rm -rf ${C}/chr${C}_random_*
	    mv ${C}_random/chr${C}_random_* ${C}
	    rmdir ${C}_random
	fi
    done > tmp/split.out 2>&1
    #	the above loop takes less than a minute
    #	scan the tmp/split.out file for possible errors

#############################################################################
# MAKE LIFTALL.LFT (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft

#############################################################################
# CREATING DATABASE (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    #	pack every chromosome and _random fasta into a single 2bit file
    faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
	mm8.2bit
    #	chrom.sizes is sorted on the second column, numeric, descending.
    #	"-k2" is the POSIX spelling of the obsolete "+1" key, which
    #	POSIX-2001 and later sort(1) treat as a file name; the resulting
    #	order is the same.
    twoBitInfo mm8.2bit stdout | sort -rn -k2 > chrom.sizes
    #	chrom.lst: names from chrom.sizes without random, chr prefix removed
    grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
    #	chromInfo.tab: first two twoBitInfo columns plus the /gbdb 2bit path
    twoBitInfo mm8.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/mm8/mm8.2bit\n", $1,$2}' > chromInfo.tab

    ssh hgwdev
    cd /cluster/data/mm8
    hgsql -e "create database mm8;" mysql
    #	Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
    df -h | grep mysql
    #	/dev/sda1             472G  225G  223G  51% /var/lib/mysql2
    #	/dev/sdc1             1.8T  1.5T  190G  89% /var/lib/mysql

    # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2006-02-14 - Hiram)
    #   Use any of the newest databases to ensure that the organization
    #   of the grp table is up to date
    ssh hgwdev
    hgsql mm8 -e "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp"
    hgsql mm8 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql mm8 -e 'load data local infile "chromInfo.tab" into table chromInfo;'

    # Enter mm8 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("mm8", "Feb 2006", "/gbdb/mm8", "Mouse", \
        "chr6:28912411-28925620", 1, 22, "Mouse", \
        "Mus musculus", "/gbdb/mm8/html/description.html", 0, 0, \
        "NCBI Build 36");' -h localhost hgcentraltest
    #	Reset default position to be like Mm7, 2006-03-09 - Hiram
    hgsql -e \
'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm8";' \
	hgcentraltest
    #	Do *NOT* set default genome on genome-test until ready for release
    # hgsql hgcentraltest \
    #	-e 'update defaultDb set name="mm8" where genome="Mouse";'
    # start a new entry in the trackDb hierarchy
    cd $HOME/kent/src/hg/makeDb/trackDb/mouse
    mkdir mm8
    cvs add mm8
    cd mm8
    #	start from the mm7 description page
    cp ../mm7/description.html .
    #	manual step: fixup text for this assembly
    #	(the note is kept out of the command line so it is not passed to
    #	vi as extra file arguments)
    vi description.html
    cvs add description.html
    cvs commit
    cd ../..
    #	manual step: add mm8 to the list
    vi makefile
    mkdir /cluster/data/mm8/html
    mkdir /gbdb/mm8
    ln -s /cluster/data/mm8/html /gbdb/mm8/html
    ln -s /cluster/data/mm8/mm8.2bit /gbdb/mm8/mm8.2bit
    cp -p mouse/mm8/description.html /gbdb/mm8/html
    make DBS=mm8

#############################################################################
#  GOLD GAP tracks (DONE - 2006-02-14 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm8
    #	make sure these tmp contig agp files are gone, easily generated
    #	as above with jkStuff/ncbiToRandomAgps
    mkdir ffa
    zcat ncbi/sequence.inf.gz > ffa/sequence.inf
    hgGoldGapGl -chromLst=chrom.lst mm8 /cluster/data/mm8 .
    featureBits mm8 gold
    #	2567283971 bases of 2567283971 (100.000%) in intersection
    featureBits mm7 gold
    #	2583394090 bases of 2583394090 (100.000%) in intersection
    featureBits mm6 gold
    #	2597150411 bases of 2597150411 (100.000%) in intersection
    featureBits mm5 gold
    #	2615483787 bases of 2615483787 (100.000%) in intersection
    featureBits mm4 gold
    #	2627444668 bases of 2627444668 (100.000%) in intersection

    featureBits mm8 gap
    #	97171117 bases of 2567283971 (3.785%) in intersection
    featureBits mm7 gap
    #	264323239 bases of 2583394090 (10.232%) in intersection
    featureBits mm6 gap
    #	482483041 bases of 2597150411 (18.577%) in intersection
    featureBits mm5 gap
    #	549468286 bases of 2615483787 (21.008%) in intersection
    featureBits mm4 gap
    #	325167539 bases of 2627444668 (12.376%) in intersection

#############################################################################
# GC5BASE (DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    mkdir -p /cluster/data/mm8/bed/gc5Base
    cd /cluster/data/mm8/bed/gc5Base
    time hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm8 \
        /cluster/data/mm8 | wigEncode stdin gc5Base.wig gc5Base.wib

    #       Calculating gcPercent with window size 5
    #       Using twoBit: /cluster/data/mm8/mm8.2bit
    #       File stdout created
    #	Converted stdin, upper limit 100.00, lower limit 0.00

    #	runs for about 14 minutes

    #	load database
    ssh hgwdev
    cd /cluster/data/mm8/bed/gc5Base
    mkdir /gbdb/mm8/wib
    ln -s `pwd`/gc5Base.wib /gbdb/mm8/wib
    time hgLoadWiggle -pathPrefix=/gbdb/mm8/wib mm8 gc5Base gc5Base.wig
    #	29 second load time

    #	verify index is correct:
    hgsql mm8 -e "show index from gc5Base;"
    #	should see good numbers in Cardinality column

#############################################################################
#  DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
#	(DONE - 2006-02-14 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8

    # break up into 500,000 sized chunks for repeat masker runs
TOP=`pwd`
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
do
    ctg=`basename ${CTG_DIR}`
    cd ${CTG_DIR}
    faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
    cd ${TOP}
done > tmp/ctg_split.out 2>&1
    #	about 3 minutes, check the tmp/ctg_split.out for anything unusual

    #	make a list of the contigs
    #	One line per file named <contig>_* inside each contig directory,
    #	written as a path relative to the current directory.  The files are
    #	found with a glob instead of parsing ls output, so no cd into (and
    #	back out of) each contig directory is needed.
TOP=`pwd`
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
do
    ctg=`basename ${CTG_DIR}`
    for F in ${CTG_DIR}/${ctg}_*
    do
        #	an unmatched glob is left as the literal pattern, skip it
        [ -e "${F}" ] || continue
        echo ${F}
    done
done > contig500K.lst
    #	count 'em
    wc -l contig500K.lst
    #	5772   contig500K.lst

    mkdir /cluster/bluearc/scratch/hg/mm8
    mkdir /cluster/bluearc/scratch/hg/mm8/contigs
    rsync -a --progress --files-from=contig500K.lst . \
        /cluster/bluearc/scratch/hg/mm8/contigs/

    #	verify the contig copy above functioned OK
    cd /cluster/bluearc/scratch/hg/mm8
    find ./contigs -type f | wc -l
    #	 5772

#############################################################################
# SIMPLE REPEAT TRACK (DONE - 2006-02-14 Hiram)
    # TRF can be run in parallel with RepeatMasker
    #   since it doesn't require masked input sequence.
    ssh kkr1u00
    mkdir /iscratch/i/mm8
    cd /iscratch/i/mm8
    mkdir fa
    cd fa
    cp -p /cluster/data/mm8/?/*.fa .
    cp -p /cluster/data/mm8/??/*.fa .

    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
    done

    ssh kki 
    mkdir /cluster/data/mm8/bed/simpleRepeat
    cd /cluster/data/mm8/bed/simpleRepeat

    mkdir trf
    cat << '_EOF_' > runTrf
#!/bin/csh -fe 
#
set path1 = /iscratch/i/mm8/fa/$1
set inputFN = $1  
set outpath = $2
set outputFN = $2:t
mkdir -p /scratch/tmp/$outputFN
cp $path1 /scratch/tmp/$outputFN
pushd .
cd /scratch/tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/scratch/tmp
popd
rm -f $outpath
cp -p /scratch/tmp/$outputFN/$outputFN $outpath
rm -fr /scratch/tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /scratch/tmp/$outputFN
'_EOF_'
    # << happy emacs
    chmod +x runTrf

    cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
    # << keep emacs coloring happy

    ls -1S /iscratch/i/mm8/fa > genome.lst
    gensub2 genome.lst single template jobList
    para create jobList
    para try ... check ... push ... etc
    para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs:      14385s     239.75m     4.00h    0.17d  0.000 y
# IO & Wait Time:                   794s      13.24m     0.22h    0.01d  0.000 y
# Average job time:                 446s       7.44m     0.12h    0.01d
# Longest finished job:            1437s      23.95m     0.40h    0.02d
# Submission to last job:          1685s      28.08m     0.47h    0.02d

    # Load into the database
    ssh hgwdev
    cd /cluster/data/mm8/bed/simpleRepeat
    cat trf/chr*.bed > simpleRepeat.bed
    hgLoadBed -strict mm8 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    #	Loaded 1141941 elements of size 16

    featureBits mm8 simpleRepeat
    # 77752377 bases of 2567283971 (3.029%) in intersection
    featureBits mm7 simpleRepeat
    # 77021175 bases of 2583394090 (2.981%) in intersection
    featureBits mm6 simpleRepeat
    # 83220723 bases of 2597150411 (3.204%) in intersection
    featureBits mm5 simpleRepeat
    # 81414259 bases of 2615483787 (3.113%) in intersection
    featureBits mm4 simpleRepeat
    # 82600648 bases of 2627444668 (3.144%) in intersection
    featureBits mm3 simpleRepeat
    # 75457193 bases of 2505900260 (3.011%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
     ssh hgwdev
     cd /cluster/data/mm8/bed
     mkdir microsat
     cd microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
    /cluster/bin/i386/hgLoadBed mm8 microsat microsat.bed


#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-14 - Hiram)

    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/mm8/bed/simpleRepeat
    mkdir trfMask
    for F in trf/chr*.bed
    do
	echo "${F} -> ${F/trf\//}"
	awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//}
    done

#############################################################################
# REPEATMASKER RUN (after contigs have been distributed to bluearc FS)
#	(DONE - 2006-02-14 - 2006-02-15 - Hiram)
#	Record RM version used:
    cat /cluster/bluearc/RepeatMasker060120/Libraries/version
#   RM database version 20060120
    ssh pk

    #- Make the run directory and job list:
    mkdir /cluster/data/mm8/RMRun
    cd /cluster/data/mm8/RMRun
    cat << '_EOF_' > ../jkStuff/RMMouse
#!/bin/csh -fe
set C = $1:h
set F = $1:t
set R = $F:r
cd /cluster/data/mm8/$C
/bin/mkdir -p /scratch/tmp/mm8/$R
/bin/cp /cluster/bluearc/scratch/hg/mm8/contigs/$1 /scratch/tmp/mm8/$R
pushd /scratch/tmp/mm8/$R
/cluster/bluearc/RepeatMasker060120/RepeatMasker -ali -s -species mus $F
popd
/bin/cp /scratch/tmp/mm8/$R/$R.fa.out ./
if (-e /scratch/tmp/mm8/$R/$R.fa.align) /bin/cp /scratch/tmp/mm8/$R/$R.fa.align ./
if (-e /scratch/tmp/mm8/$R/$R.fa.tbl) /bin/cp /scratch/tmp/mm8/$R/$R.fa.tbl ./
if (-e /scratch/tmp/mm8/$R/$R.fa.cat) /bin/cp /scratch/tmp/mm8/$R/$R.fa.cat ./
/bin/rm -fr /scratch/tmp/mm8/$R/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8/$R
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8
'_EOF_'
    #	<< happy emacs
    chmod +x ../jkStuff/RMMouse

    cat << '_EOF_' > template
#LOOP
../jkStuff/RMMouse $(path1) {check out line ../$(dir1)/$(root1).fa.out}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 ../contig500K.lst single template jobList
    para create jobList
    wc -l jobList
    #	5772 jobList
    para try ... check ... push ... etc
# Completed: 6172 of 6172 jobs
# CPU time in finished jobs:   26381042s  439684.03m  7328.07h  305.34d  0.837 y
# IO & Wait Time:                 46088s     768.13m    12.80h    0.53d  0.001 y
# Average job time:                4282s      71.36m     1.19h    0.05d
# Longest finished job:            6370s     106.17m     1.77h    0.07d
# Submission to last job:        127318s    2121.97m    35.37h    1.47d

    #- Lift up the split-contig .out's to contig-level .out's
    ssh kkstore01
    cd /cluster/data/mm8
    for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
	??/chr??_random_[0-9]*
    do
	CONTIG=`basename ${D}`
	liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft error \
		${D}/${CONTIG}_[0-9]*.fa.out
    done > tmp/RM.lift.outs 2>&1
    #	real    2m32.275s
    #	scan tmp/RM.lift.outs for unusual errors or difficulties

    #	Write jkStuff/liftRM_out_to_chr.sh.  For every one- or two-character
    #	name in the current directory the script changes into it and runs
    #	liftUp to build chrN.fa.out from the files listed in lift/oOut.lst
    #	using lift/ordered.lft, and chrN_random.fa.out from the files listed
    #	in lift/rOut.lst using lift/random.lft.  Each liftUp is only run when
    #	its .lft file exists and is not empty; a missing lift/ordered.lft is
    #	reported with a WARNING line, a missing lift/random.lft is silent.
    cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
#!/bin/sh
for C in ? ??
do
    echo "lifting ${C}"
    cd ${C}
    if [ -s lift/ordered.lft ]; then
	liftUp chr${C}.fa.out lift/ordered.lft error `cat lift/oOut.lst`
    else
	echo "WARNING: Can not find ${C}/lift/ordered.lft"
    fi
    if [ -s lift/random.lft ]; then
	liftUp chr${C}_random.fa.out lift/random.lft error `cat lift/rOut.lst`
    fi
    cd ..
done
'_EOF_'
    # << happy emacs
    chmod +x jkStuff/liftRM_out_to_chr.sh
    ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
    #	real    0m24.873s
    #	scan the results tmp/liftRM_out_to_chr.out
    #	there is a single: WARNING: Can not find Un/lift/ordered.lft
    #	which is OK
    #	List the final .out files, nothing should be size 0:
    ls -og */*.fa.out | sort -k3,3nr

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/mm8
    hgLoadOut mm8 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
	??/chr??_random.fa.out > tmp/hgLoadOut.out 2>&1
    #	about 7 minutes, there are always a few errors reported;
    #	they are captured in tmp/hgLoadOut.out

    #	verify everything seems normal compared with previous builds

    featureBits mm8 rmsk
    #	1087735582 bases of 2567283971 (42.369%) in intersection
    featureBits mm7 rmsk
    #	1092611581 bases of 2583394090 (42.294%) in intersection
    featureBits mm6 rmsk
    #	1110222842 bases of 2597150411 (42.748%) in intersection
    featureBits mm5 rmsk
    #	1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits mm4 rmsk
    #	1130883581 bases of 2627444668 (43.041%) in intersection
    featureBits mm3 rmsk
    #	1080265553 bases of 2505900260 (43.109%) in intersection

#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
#		(DONE - 2006-02-16 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	FA=${CHR#*\/}
	C=${FA%.fa}
	echo -n "repeat masking ${C} ... "
	/cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
	echo -n "adding simpleRepeats ... "
	/cluster/bin/i386/maskOutFa -softAdd ${CHR} \
		bed/simpleRepeat/trfMask/${C}.bed ${CHR}
	echo "done - ${CHR}"
    done > tmp/addRM_and_Simple.out 2>&1
    #	about 4 minutes

    # you will note the usual warnings about troublesome coordinates
    # in the repeat masker outputs - even more than when they were lifted.

    #	and make the hard masked sequences from these soft masked sequences
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	echo "maskOutFa ${CHR} hard ${CHR}.masked"
	/cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
    done > tmp/hardMask.out 2>&1
    #	about 2 minutes

    #	rebuild the 2bit file from the soft-masked sequence
    time faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
	mm8Soft.2bit
    #	2 minutes
    #	verify the sequence is still the same size as before:
    #	the sorted twoBitInfo listing must give the same checksum as
    #	chrom.sizes.  "-k2" is the POSIX spelling of the obsolete "+1" sort
    #	key and produces the same order, so the checksums stay comparable.
    twoBitInfo mm8Soft.2bit stdout | sort -rn -k2 | sum -r
    #	20673     1
    sum -r chrom.sizes
    #	20673     1
    #	Let's see how much is masked:
    time twoBitToFa mm8Soft.2bit stdout | faSize stdin
    #	2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
    #	1089350685 lower) in 34 sequences in 1 files
    # and bc says:
    #	1089350685/2664455088 = .408845
    #	1089350685/2567283688 = .424320

    #	replace the former unmasked 2bit file with this new one:
    rm mm8.2bit; mv mm8Soft.2bit mm8.2bit
    #	check the browser, make sure it is functioning OK

    #	Generate fasta file for random contigs
    #	THIS IS OPTIONAL STUFF, not really needed, well, it is used in
    #	genscan to make the gene names there look pretty.  This script
    #	has been checked into the source tree in hg/utils/lft2BitToFa.pl
    #	use it from there next time
    cp -p /cluster/data/mm7/jkStuff/lft2BitToFa.pl ./jkStuff

    mkdir randomContigs
    for L in ?/lift/ctg_random.lft ??/lift/ctg_random.lft
do
    D=${L/\/lift*}
    echo $L $D
    ./jkStuff/lft2BitToFa.pl mm8.2bit ${L} \
	> randomContigs/chr${D}_random.ctg.fa
done
    #
    #	Verify these *.ctg.fa files have the same bases as the ordinary
    #	chr*_random.fa files:
    faSize ?/chr?_random.fa ??/chr??_random.fa
    # 20361100 bases (3250000 N's 17111100 real 7094373 upper 10016727 lower)
    #	in 12 sequences in 12 files

    faSize randomContigs/*.ctg.fa
    # 17111100 bases (0 N's 17111100 real 7094373 upper 10016727 lower)
    #	in 77 sequences in 12 files
    #	Note the number of real, upper and lower bases are the same

    #	This random contig business isn't actually needed
    #	Create a 2bit file with the full chrom sequences and these
    #	random contigs for use in blastz:
    # faToTwoBit ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
    #	    mm8Chroms_RandomContigs.2bit

    #	Copy to bluearc unit for kluster runs
    # cp -p mm8.2bit /cluster/bluearc/mm8
    # cp -p mm8Chroms_RandomContigs.2bit /cluster/bluearc/mm8
    #	And the lift file to go with it
    # cat ?/lift/ctg_random.lft ??/lift/ctg_random.lft \
    #	    > jkStuff/Chroms_RandomContigs.lft
    #	cp -p jkStuff/Chroms_RandomContigs.lft /cluster/bluearc/mm8

    #	create full chrom nibs for blastz SEQ1 target with Lin Spec Repeats
    mkdir nib
    for FA in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
    B=${FA/*\/}
    B=${B/.fa/}
    echo faToNib -softMask ${FA} nib/${B}.nib
    rm -f nib/${B}.nib
    faToNib -softMask ${FA} nib/${B}.nib
done
 
    mkdir /cluster/bluearc/scratch/hg/mm8/nib
    cp -p nib/*.nib /cluster/bluearc/scratch/hg/mm8/nib
    cp -p chrom.sizes /cluster/bluearc/scratch/hg/mm8
    cp -p mm8.2bit /cluster/bluearc/scratch/hg/mm8
    #	The contigs over there are no longer needed
    rm -fr /cluster/bluearc/scratch/hg/mm8/contigs
    #	after lineage specific repeats are created below, this business
    #	can be pushed to the kluster kk nodes and over to the Iservers

#############################################################################
# PREPARE "bigZips" files for public release
#	(DONE 2006-02-16 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/mm8/downloads
    mkdir /cluster/data/mm8/downloads/bigZips
    mkdir /cluster/data/mm8/downloads/chromosomes
    cd /cluster/data/mm8/downloads/chromosomes
    #	copy every chromosome and _random fasta into this directory; the
    #	final argument is the destination . and must be a separate word,
    #	otherwise cp takes the last glob as its destination
    cp -p ../../?/chr?.fa ../../??/chr??.fa \
	../../?/chr?_random.fa ../../??/chr??_random.fa .
    gzip chr*.fa
    #	12 minutes
    #	copy previous release (mm7) README.txt; the goldenPath/mm8 links
    #	are only created at the end of this section and point back here
    scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm7/chromosomes/README.txt .
    #	edit it to bring it up to date
    cd /cluster/data/mm8/downloads/bigZips
    #	copy previous release (mm7) README.txt
    scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm7/bigZips/README.txt .
    # edit README.txt to indicate proper version of sequence and
    #	RepeatMasker
    cd /cluster/data/mm8
    cp -p ?/chr*.fa ??/chr*.fa downloads/chromosomes
    tar cvzf downloads/bigZips/chromAgp.tar.gz ?/chr*.agp ??/chr*.agp
    tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa ??/chr*.fa
    #	12 minutes
    tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
	??/chr*.fa.masked
    tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out ??/chr*.fa.out
    cd /cluster/data/mm8/bed/simpleRepeat
    tar cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask

    # get GenBank native mRNAs and refGene (DONE 2006-02-23)
    #	after the genbank run was complete
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/i386/gbGetSeqs -db=mm8 -native GenBank mrna \
	/cluster/data/mm8/downloads/bigZips/mrna.fa
    #	2 minutes
    cd /cluster/data/mm8/downloads/bigZips
    gzip mrna.fa
    cd /cluster/data/mm8/downloads/bigZips
    for I in 1000 2000 5000
    do
	echo "upstream${I} working ... "
	featureBits mm8 refGene:upstream:${I} -fa=stdout \
		| gzip -c > upstream${I}.fa.gz
	echo "upstream${I} done"
    done
    #	real    11m25.493s

    ssh kkstore01
    cd /cluster/data/mm8/downloads/bigZips
    cp -p ../../mm8.2bit .
    md5sum *.gz *.2bit README.txt > md5sum.txt

    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm8
    ln -s /cluster/data/mm8/downloads/bigZips \
	/usr/local/apache/htdocs/goldenPath/mm8/bigZips
    ln -s /cluster/data/mm8/downloads/chromosomes \
	/usr/local/apache/htdocs/goldenPath/mm8/chromosomes

#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2006-02-16 - Hiram)

    ssh kkr1u00
    mkdir /iscratch/i/mm8/rmsk
    cd /cluster/data/mm8
    cp -p */chr*.fa.out /iscratch/i/mm8/rmsk
    cd /iscratch/i/mm8
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
    done
    cd rmsk

    ssh kki
    mkdir /cluster/data/mm8/linSpecRep
    cd /cluster/data/mm8/linSpecRep
    ls -1S /iscratch/i/mm8/rmsk > fa.list
    
    cat << '_EOF_' > mkLSR.csh
#!/bin/csh -fe
pushd /iscratch/i/mm8/rmsk
/cluster/bluearc/RepeatMasker060120/DateRepeats \
	    $1 -query mouse -comp human -comp rat -comp dog -comp cow \
		-comp rabbit
popd
/bin/cp -p /iscratch/i/mm8/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus .
'_EOF_'
    #	<< happy emacs
    chmod +x mkLSR.csh

    cat << '_EOF_' > template
#LOOP
./mkLSR.csh $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 fa.list single template jobList
    para try ... check ... push ... etc...
    para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       1338s      22.29m     0.37h    0.02d  0.000 y
# IO & Wait Time:                   112s       1.87m     0.03h    0.00d  0.000 y
# Average job time:                  43s       0.71m     0.01h    0.00d
# Longest finished job:              92s       1.53m     0.03h    0.00d
# Submission to last job:           181s       3.02m     0.05h    0.00d

    ssh kkstore01
    cd /cluster/data/mm8/linSpecRep
    mkdir notInHuman notInRat notInDog notInCow notInRabbit
    for F in chr*.out_homo-sapiens*
    do
	B=${F/.fa.out*/}
	echo $B 
        /cluster/bin/scripts/extractRepeats 1 ${F} > \
		notInHuman/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 2 ${F} > \
		notInRat/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 3 ${F} > \
		notInDog/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 4 ${F} > \
		notInCow/${B}.out.spec
        XXXXX /cluster/bin/scripts/extractRepeats 4 ${F} > \ XXXXX
		notInRabbit/${B}.out.spec XXXXX
    done
   # NOTE: rabbit should be column 5 instead of 4.
   # This isn't a problem, as we're not using rabbit anyway (see below)

    #	the notInHuman, notInDog, notInCow and notInRabbit ended up being
    #	identical.  Only the notInRat was different from them
    #	To check identical
    find . -name "*.out.spec" | \
	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
	| sort -k1,1n | sort -t"/" -k3,3
    #	Copy to scratch/hg for use in kluster runs
    mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep
    mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
    mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
    cp -p notInHuman/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
    cp -p notInRat/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
    #	Request this scratch/hg/mm8 directory push to the kk nodes

    #	and we can do the Iservers simply:
    ssh kkr1u00
    cd /iscratch/i/mm8
    #	no longer need these two directories
    rm -fr fa rmsk
    rsync -a --progress /cluster/bluearc/scratch/hg/mm8/ .
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
    done

############################################################################
#  BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm8", "blat17", "17784", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm8", "blat17", "17785", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

#########################################################################
# CPGISLANDS (DONE - 2006-02-16 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm8/bed/cpgIsland
    cd /cluster/data/mm8/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .
    
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    ssh kkstore01
    cd /cluster/data/mm8/bed/cpgIsland
    for F in ../../*/chr*.fa.masked
    do
	FA=${F/*\/}
	C=${FA/.fa.masked/}
	echo "./cpglh.exe ${FA} > ${C}.cpg"
	./cpglh.exe ${F} > ${C}.cpg
    done > cpglh.out 2>&1 &
    #	about 3 minutes 20 seconds

    #	Several chroms have 0 results:
    #	-rw-rw-r--  1     0 Feb 16 15:19 chr10_random.cpg
    #	-rw-rw-r--  1     0 Feb 16 15:20 chr15_random.cpg
    #	-rw-rw-r--  1     0 Feb 16 15:22 chr8_random.cpg
    #	-rw-rw-r--  1     0 Feb 16 15:22 chr9_random.cpg
    #	-rw-rw-r--  1     0 Feb 16 15:22 chrM.cpg
    #	-rw-rw-r--  1     0 Feb 16 15:22 chrX_random.cpg
    #	-rw-rw-r--  1     0 Feb 16 15:22 chrY.cpg

    # Transform cpglh output to bed +
    #	filter.awk: rewrites each cpglh.exe output line as a 10-column
    #	tab-separated row.  The start ($2) is decremented by one first, so
    #	width is the end minus the already-decremented start.  Output
    #	columns: $1, start, $3, fields 5 and 6 joined by one space, width,
    #	$6, width*$7*0.01 (printed with no decimals), 100*2*$6/width
    #	(one decimal), $7, $9.
    cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    #	<< happy emacs
    awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/mm8/bed/cpgIsland
    hgLoadBed -strict mm8 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 15963 elements of size 10
    featureBits mm8 cpgIslandExt
    #	10456823 bases of 2567283971 (0.407%) in intersection
    featureBits mm7 cpgIslandExt
    #	10439328 bases of 2583394090 (0.404%) in intersection
    featureBits mm6 cpgIslandExt
    #	10432360 bases of 2597150411 (0.402%) in intersection
    featureBits mm5 cpgIslandExt
    #	10422989 bases of 2615483787 (0.399%) in intersection
    featureBits mm4 cpgIsland
    #	11109692 bases of 2627444668 (0.423%) in intersection
    featureBits mm3 cpgIsland
    #	10102968 bases of 2505900260 (0.403%) in intersection

#########################################################################
# ANDY LAW CPGISLANDS (DONE - 2006-02-16 - Hiram)
    # See notes in makeGalGal2.doc and makeCanFam2.doc
    ssh kkstore01
    mkdir /cluster/data/mm8/bed/cpgIslandGgfAndy
    cd /cluster/data/mm8/bed/cpgIslandGgfAndy

    #	Build the preProcGgfAndy program in
    #	kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE

    # Use masked sequence since this is a mammal...
    #	For every chr*.fa.masked file: FA is the file name without its
    #	directory, C is FA without the .fa.masked suffix.  The file is run
    #	through preProcGgfAndy and ggf-andy-cpg-island.pl; the perl filter
    #	then splits each tab-separated line into $s,$e,$cpg,$n,$c,$g1,$oE,
    #	decrements $s, and prints:
    #	  C, $s, $e, "CpG: $cpg", $n, $cpg, $c+$g1,
    #	  100*2*$cpg/$n, 100*($c+$g1)/$n, $oE
    #	(presumably start, end, CpG count, length, C and G counts and
    #	obs/exp ratio -- confirm against ggf-andy-cpg-island.pl).
    #	The '${C}' in the perl text is the shell variable, spliced in by
    #	closing and reopening the single-quoted string.
    for F in ../../*/chr*.fa.masked
    do
	FA=${F/*\/}
	C=${FA/.fa.masked/}
	echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
	~/bin/$MACHTYPE/preProcGgfAndy ${F} \
	| /cluster/home/angie/ggf-andy-cpg-island.pl \
	| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
                   $gc=$c+$g1;  $pCpG=(100.0 * 2 * $cpg / $n);
                   $pGc=(100.0 * $gc / $n);
                   $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
                        "$pCpG\t$pGc\t$oE\n";'
    done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/mm8/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed -strict mm8 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
    #	Loaded 67442 elements of size 10
    featureBits mm8 cpgIslandExt
    #	10456823 bases of 2567283971 (0.407%) in intersection
    featureBits mm7 cpgIslandExt
    #	10439328 bases of 2583394090 (0.404%) in intersection
    featureBits mm8 cpgIslandGgfAndyMasked
    #	38850121 bases of 2567283971 (1.513%) in intersection
    featureBits mm7 cpgIslandGgfAndyMasked
    #	38774242 bases of 2583394090 (1.501%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
    #	15963 ../cpgIsland/cpgIsland.bed
    #	67442 cpgIslandGgfAndyMasked.bed

#########################################################################
# BLASTZ HUMAN Hg18 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastzHg18.2006-02-16
    cd /cluster/data/mm8/bed
    ln -s blastzHg18.2006-02-16 blastz.hg18
    cd blastzHg18.2006-02-16
    #	Started this before the rsync to /scratch/hg/mm8/ had completed,
    #	hence the /cluster/bluearc/scratch/hg/mm8/ location is used
    #	here.

    cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Human Hg18 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/hg18/nib
SEQ2_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzHg18.2006-02-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	Started 2006-02-16 16:15
    #	failed due to pk node difficulties, finish the run.blastz
    #	manually
# Completed: 3724 of 3724 jobs
# CPU time in finished jobs:    5190293s   86504.89m  1441.75h   60.07d  0.165 y
# IO & Wait Time:                259150s    4319.16m    71.99h    3.00d  0.008 y
# Average job time:                1463s      24.39m     0.41h    0.02d
# Longest finished job:           10621s     177.02m     2.95h    0.12d
# Submission to last job:         74153s    1235.88m    20.60h    0.86d

    #	continuing
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    #	Done 2006-02-17 15:02

    #	Then to swap over to Hg18
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > swap.out 2>&1 &
    #	Started 2006-02-17 15:30

    ssh hgwdev
    time nice -n +19 featureBits mm8 chainHg18Link
    #	984380268 bases of 2567283971 (38.343%) in intersection
    time nice -n +19 featureBits hg18 chainMm8Link
    #	994530182 bases of 2881515245 (34.514%) in intersection

#########################################################################
# BLASTZ RAT Rn4 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
    ssh kkr1u00
    cd /iscratch/i/rn4
    rsync -a --progress /cluster/data/rn4/linSpecRep.notInMouse/ \
	./linSpecRep.notInMouse
    rsync -a --progress /cluster/data/rn4/nib/ ./nib/
    cp -p /cluster/data/rn4/chrom.sizes .
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/rn4/ kkr${R}u00:/iscratch/i/rn4/
    done

    ssh kk
    mkdir /cluster/data/mm8/bed/blastzRn4.2006-02-16
    cd /cluster/data/mm8/bed
    ln -s blastzRn4.2006-02-16 blastz.rn4
    cd blastzRn4.2006-02-16
    #	Started this before the rsync to /scratch/hg/mm8/ had completed,
    #	hence the /cluster/bluearc/scratch/hg/mm8/ location is used
    #	here.

    cat << '_EOF_' > DEF
# mouse vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/iscratch/i/rn4/nib
SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
SEQ2_LEN=/iscratch/i/rn4/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzRn4.2006-02-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	Started 2006-02-16 16:15
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainRn4Link
    #	1770319811 bases of 2567283971 (68.957%) in intersection
    time nice -n +19 featureBits rn4 chainMm8Link
    #	1791093685 bases of 2571531505 (69.651%) in intersection

##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2006-02-17 - Hiram)
    ssh kkstore01
    cd /cluster/data/mm8
    # check disk space: 73Gb free
    df -h .
# Filesystem            Size  Used Avail Use% Mounted on
# /export/cluster/store5
#                       1.5T  1.3T   73G  95% /cluster/store5
    mkdir -p bed/cloneend/ncbi
    cd bed/cloneend/ncbi

    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*

    cd /cluster/data/mm8/bed/cloneend
    # seems like the *.mfa files were split just for convenience
    # concatenate
    for F in ncbi/*.mfa.gz
    do
	zcat ${F}
    done | gzip > all.mfa.gz

    # Convert the title line of the all.mfa file
    #	convert.pl: filter reading stdin (or named files), writing stdout.
    #	Lines that do not start with ">" are copied unchanged.  A ">"
    #	header line is split on "|"; the field right after the first "gb"
    #	or "dbj" field is split on "." and only the part before the first
    #	"." is kept, so the header becomes just ">name".  The script dies
    #	on any header that has neither a "gb" nor a "dbj" field.
    cat << '_EOF_' > convert.pl
#!/usr/bin/env perl

use strict;
use warnings;

while (my $line = <>) {
    if ($line !~ m/^>/) {
	print $line
    } else {
        my @fields = split('\|', $line);
	my $fieldCount = scalar(@fields);
        my $printed = 0;
        for (my $i = 0; $i < $fieldCount; $i++) {
                if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") {
                        (my $name, my $vers) = split(/\./,$fields[$i+1]);
                        print ">$name\n";
                        $i= $fieldCount;
                        $printed = 1;
                }
        }
        if (!$printed) {
                die("Failed for $line\n");
        }
    }
}
'_EOF_'
    # << happy emacs
    chmod +x convert.pl
    zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz

    #	make sure nothing got broken:
    faSize all.mfa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 1 files

    faSize cloneEnds.fa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 1 files
    #	identical numbers, curiously, these are exactly the same numbers
    #	as were seen during the build of Mm7.  Do these things not
    #	change with time ?

    # concatenate the text files, too
    for F in ncbi/*.txt.gz
    do
	zcat ${F}
    done | gzip > all.txt.gz

    # generate cloneEndPairs.txt and cloneEndSingles.txt
    cp -p /cluster/data/mm7/bed/cloneend/convertTxt.pl .
    zcat all.txt.gz | ./convertTxt.pl stdin
    # Reading in end info
    # Writing out pair info
    # Writing out singleton info
    # 354485 pairs and 78423 singles

    #	faSplit does not function correctly if given a .gz source file
    #	AND, we need the unzipped file for sequence loading below
    gunzip cloneEnds.fa.gz
    # split
    mkdir splitdir
    cd splitdir
    faSplit sequence ../cloneEnds.fa 100 cloneEnds
    #	Check to ensure no breakage:
    cat *.fa | faSize stdin
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 1 files
    #	same numbers as before

    #	Copy to san for cluster runs
    mkdir /san/sanvol1/scratch/mm8/cloneEnds
    cp -p *.fa /san/sanvol1/scratch/mm8/cloneEnds
    rm *
    cd ..
    rmdir splitdir
    #	may as well remove the previous assembly copy:
    rm -fr /san/sanvol1/scratch/mm7/cloneEnds

    # load sequences
    ssh hgwdev
    mkdir /gbdb/mm8/cloneend
    cd /gbdb/mm8/cloneend
    ln -s /cluster/data/mm8/bed/cloneend/cloneEnds.fa .
    cd /tmp
    hgLoadSeq mm8 /gbdb/mm8/cloneend/cloneEnds.fa
    #  Advisory lock created
    # Creating .tab file
    # Adding /gbdb/mm8/cloneend/cloneEnds.fa
    # 789466 sequences
    # Updating seq table
    # Advisory lock has been released
    # All done

############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2006-02-17 - 2006-02-22 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/mm8/noMask
    cd /cluster/data/mm8/
    #	Need an unmasked sequence for this work
    #	Build an all-upper-case copy of every chromosome fasta in noMask/:
    #	the first (header) line is copied as-is and every following line
    #	has its lower-case letters converted to upper case.
    for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	C=$(basename "${CHR}")
	echo -n "working ${C} ... "
	head -n 1 "${CHR}" > "noMask/${C}"
	#	"tail -n +2" replaces the obsolete "tail +2" form, and the tr
	#	classes are quoted so the shell cannot glob-expand them.
	tail -n +2 "${CHR}" | tr '[:lower:]' '[:upper:]' >> "noMask/${C}"
	echo "done"
    done
    mkdir /san/sanvol1/scratch/mm8/noMask
    time cp --verbose -p noMask/chr*.fa /san/sanvol1/scratch/mm8/noMask

    #	Size of mouse non-gap genome: 2567283971
    #	Size of  Hg18 non-gap genome: 2881515245
    #	Adjusting the 1024 number from typical human ooc generation:
    #	1024 * (2567283971 / 2881515245) = 912

    time blat mm8.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
    #	Wrote 29643 overused 11-mers to 11.ooc
    #	real    2m13.206
    # Copy over to the san
    cp -p 11.ooc /san/sanvol1/scratch/mm8

    
    #	and for the kluster run
    ssh pk
    mkdir /cluster/data/mm8/bed/bacends
    cd /cluster/data/mm8/bed/bacends
    mkdir out

    # allow blat to run politely in /tmp while it writes output, then
    # copy results to results file:
    #	runBlat.sh <root1> <root2> <result>
    #	  root1  - base name of a chromosome file in the san noMask dir
    #	  root2  - base name of a cloneEnds chunk in the san cloneEnds dir
    #	  result - final path for the psl output
    #	Runs blat in a private /scratch/tmp/<root1>_<root2> directory, then
    #	moves the psl to <result> and removes the work directory.
    #	The interpreter is bash: pushd/popd are bash builtins and are not
    #	available in a strictly POSIX /bin/sh.
    cat << '_EOF_' > runBlat.sh
#!/bin/bash
root1=$1
root2=$2
result=$3
rm -fr /scratch/tmp/${root1}_${root2}
mkdir /scratch/tmp/${root1}_${root2}
pushd /scratch/tmp/${root1}_${root2}
/cluster/bin/x86_64/blat /san/sanvol1/scratch/mm8/noMask/${root1}.fa \
	/san/sanvol1/scratch/mm8/cloneEnds/${root2}.fa \
	-ooc=/san/sanvol1/scratch/mm8/11.ooc ${root1}.${root2}.psl
popd
mkdir -p out/${root2}
rm -f ${result}
mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /scratch/tmp/${root1}_${root2}
'_EOF_'
    #	<< happy emacs
    chmod +x runBlat.sh

    cat << '_EOF_' > template
#LOOP
./runBlat.sh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << emacs happy

    ls -1S /san/sanvol1/scratch/mm8/cloneEnds/cloneEnds???.fa > bacEnds.lst
    ls -1S /san/sanvol1/scratch/mm8/noMask/chr*.fa > contig.lst
    gensub2 contig.lst bacEnds.lst template jobList
    para create jobList
    # 3332 jobs written to batch
    para try, check, push, etc ...
# Completed: 3332 of 3332 jobs
# CPU time in finished jobs:     649465s   10824.42m   180.41h    7.52d  0.021 y
# IO & Wait Time:                 11633s     193.88m     3.23h    0.13d  0.000 y
# Average job time:                 198s       3.31m     0.06h    0.00d
# Longest finished job:            1326s      22.10m     0.37h    0.02d
# Submission to last job:        429201s    7153.35m   119.22h    4.97d

    ssh kkstore01
    cd /cluster/data/mm8/bed/bacends
    screen

    mkdir temp
    time pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
    #	real    22m4.019s
    #	-rw-rw-r--    1 8422362557 Feb 22 15:35 raw.psl

    time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
	raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
    #	real    6m15.981s
    #	-rw-rw-r--    1  197029888 Feb 22 15:37 bacEnds.psl

    #	utilize the scripts from the previous build
    cp -p /cluster/data/mm7/bed/bacends/split.pl .
    cp -p /cluster/data/mm7/bed/bacends/header .

    time ./split.pl header < bacEnds.psl
    #	real    0m26.983s

    mv bacEnds.psl bacEnds.psl.save
    time pslSort dirs bacEnds.psl temp split
    #	real    2m19.131s
    #	-rw-rw-r--    1 1227866614 Feb 22 15:48 bacEnds.psl

    # Copy files to final destination and remove
    mkdir /cluster/data/mm8/bacends
    cp -p bacEnds.psl /cluster/data/mm8/bacends

############################################################################
# BACEND PAIRS TRACK (DONE - 2006-02-22 - Hiram)

    ssh kolossus
    cd /cluster/data/mm8/bacends

time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
	../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
    #	real    0m47.401s


    # create header required by "rdb" tools
    echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
    echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header

    cat header bacEnds.pairs | \
	/cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairs.bed
    #	-rw-rw-r--  1   23816801 Feb 22 15:52 bacEndPairs.bed


    cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
	bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
    #	-rw-rw-r--  1    6843775 Feb 22 15:54 bacEndPairsBad.bed

    /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
	bacEndPairsBad.bed >j1.out
    cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out
    cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
    #	-rw-rw-r--  1  983668200 Feb 22 16:04 bacEnds.load.psl

    rm j1.out j2.out

    #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
    awk '{print $5}' bacEndPairs.bed | sort -u
    #	result should be the scores, no extraneous strings:
#	1000
#	300
#	375
#	500
#	750
    #	edit the file and fix it if it has a bad name.

    # load into database
    ssh hgwdev
    cd /cluster/data/mm8/bacends
    hgLoadBed -strict -notItemRgb mm8 bacEndPairs bacEndPairs.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    #	Loaded 235440 elements of size 11

    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed -strict -notItemRgb mm8 bacEndPairsBad bacEndPairsBad.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    #	Loaded 95099 elements of size 11

    # NOTE: truncates file to 0 if -nobin is used
    time hgLoadPsl mm8 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 8132116 record(s), 0 row(s)
# skipped, 1 warning(s) loading psl.tab
#	skipped, 1 warning(s) loading psl.tab
#	real    20m45.055s

    featureBits mm8 all_bacends
# 327086559 bases of 2567283971 (12.741%) in intersection
    featureBits mm7 all_bacends
# 334161740 bases of 2583394090 (12.935%) in intersection
    featureBits mm6 all_bacends
# 336981828 bases of 2597150411 (12.975%) in intersection
    featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
    featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection

    featureBits mm8 bacEndPairs
# 2572527283 bases of 2567283971 (100.204%) in intersection
    featureBits mm7 bacEndPairs
# 2578837424 bases of 2583394090 (99.824%) in intersection
    featureBits mm6 bacEndPairs
# 2570768812 bases of 2597150411 (98.984%) in intersection
    featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
    featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection

    featureBits mm8 bacEndPairsBad
# 879222026 bases of 2567283971 (34.247%) in intersection
    featureBits mm7 bacEndPairsBad
# 954662115 bases of 2583394090 (36.954%) in intersection
    featureBits mm6 bacEndPairsBad
# 1006314997 bases of 2597150411 (38.747%) in intersection
    featureBits mm5 bacEndPairsBad
# 541027882 bases of 2615483787 (20.686%) in intersection
    featureBits mm4 bacEndPairsBad
# 1074505863 bases of 2627444668 (40.895%) in intersection

#########################################################################
# GENBANK auto update (DONE - 2006-02-17 - 2006-02-23 - Hiram)
    # align with revised genbank process. drop xeno ESTs.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvs update -d -P etc
    # edit etc/genbank.conf to add mm8, it is a copy of mm7 with changes:

# mm8
mm8.serverGenome = /cluster/data/mm8/mm8.2bit
mm8.clusterGenome = /scratch/hg/mm8/mm8.2bit
mm8.ooc = /cluster/data/mm8/11.ooc
mm8.align.unplacedChroms = chrUn_random
mm8.lift = /cluster/data/mm8/jkStuff/liftAll.lft
mm8.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
mm8.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
mm8.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
mm8.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
mm8.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
mm8.downloadDir = mm8
mm8.refseq.mrna.xeno.load  = yes
mm8.refseq.mrna.xeno.loadDesc = yes
mm8.mgcTables.default = full
mm8.mgcTables.mgc = all

    #	check that into CVS, then
    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore04
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial mm8 &
    #	var/build/logs/2006.02.17-16:10:17.mm8.initalign.log
    #	the parasol batch job on kk broke down in:
    #	/cluster/bluearc/genbank/work/initial.mm8/align
    #	go to kk and this directory and get the batch finished
    nice bin/gbAlignStep -continue=finish -initial mm8 &
    #	var/build/logs/2006.02.22-20:26:54.mm8.initalign.log

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad  mm8 &
    #	var/dbload/hgwdev/logs/2006.02.23-10:21:36.dbload.log
    #	real    228m59.734s

#########################################################################
# BLASTZ rheMac2 (DONE - 2006-02-17 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
    cd /cluster/data/mm8/bed
    ln -s blastz.rheMac2.2006-02-17 blastz.rheMac2
    cd blastz.rheMac2

    cat << '_EOF_' > DEF
# mouse vs macaca mulatta
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64

# TARGET - mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY - macaca mulatta - big enough chunk to do whole chroms at once
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes 
SEQ2_CHUNK=250000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	Started 2006-02-17 16:42
    #	crashed due to no copies of mm8 in /scratch/hg/mm8/ on the
    #	Iservers.  Fix that up and get the chain run done.  Continuing.
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap `pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
    #	failed during a san hiccup,  finish that off, then:
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load `pwd`/DEF > swap.load.out 2>&1 &

    time nice -n +19 featureBits mm8 chainRheMac2Link
    #	891310108 bases of 2567283971 (34.718%) in intersection
    time nice -n +19 featureBits rheMac2 chainMm8Link
    #	877906099 bases of 2646704109 (33.170%) in intersection

#########################################################################
# BLASTZ canFam2 (DONE - 2006-02-18 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.canFam2.2006-02-18
    cd /cluster/data/mm8/bed
    ln -s blastz.canFam2.2006-02-18 blastz.canFam2
    cd blastz.canFam2

    cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzCanFam2.2006-02-18
TMPDIR=/scratch/tmp
'_EOF_'
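    #	<< happy emacs
    #	NOTE(review): BASE in the DEF above names blastzCanFam2.2006-02-18,
    #	but the directory made for this run is blastz.canFam2.2006-02-18;
    #	confirm which path the run actually used before reusing this recipe.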

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainCanFam2Link
    #	828741604 bases of 2567283971 (32.281%) in intersection
    time nice -n +19 featureBits canFam2 chainMm8Link
    #	816262344 bases of 2384996543 (34.225%) in intersection

#########################################################################
# BLASTZ bosTau2 (DONE - 2006-02-18 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.bosTau2.2006-02-18
    cd /cluster/data/mm8/bed
    ln -s blastz.bosTau2.2006-02-18 blastz.bosTau2
    cd blastz.bosTau2

    cat << '_EOF_' > DEF
# mouse vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Cow (bosTau2)
#  large enough chunk to do chroms in one piece
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=150000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzBosTau.2006-02-18
TMPDIR=/scratch/tmp
'_EOF_'
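    #	<< happy emacs
    #	NOTE(review): BASE in the DEF above names blastzBosTau.2006-02-18,
    #	but the directory made for this run is blastz.bosTau2.2006-02-18;
    #	confirm which path the run actually used before reusing this recipe.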

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainBosTau2Link
    #	688859641 bases of 2567283971 (26.832%) in intersection
    time nice -n +19 featureBits bosTau2 chainMm8Link
    #	683178156 bases of 2812203870 (24.293%) in intersection

#########################################################################
# BLASTZ galGal2 (DONE - 2006-02-18 - Hiram)
    ssh kk
    mkdir /cluster/data/mm8/bed/blastz.galGal2.2006-02-18
    cd /cluster/data/mm8/bed
    ln -s blastz.galGal2.2006-02-18 blastz.galGal2
    cd blastz.galGal2

    cat << '_EOF_' > DEF
# mouse vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzGalGal2.2006-02-18
TMPDIR=/scratch/tmp
'_EOF_'
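    #	<< happy emacs
    #	NOTE(review): BASE in the DEF above names blastzGalGal2.2006-02-18,
    #	but the directory made for this run is blastz.galGal2.2006-02-18;
    #	confirm which path the run actually used before reusing this recipe.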

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net `pwd`/DEF > net.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainGalGal2Link
    #	65517358 bases of 2567283971 (2.552%) in intersection
    time nice -n +19 featureBits galGal2 chainMm8Link
    #	57074100 bases of 1054197620 (5.414%) in intersection

#########################################################################
# BLASTZ dasNov1 (DONE - 2006-02-19 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.dasNov1.2006-02-19
    cd /cluster/data/mm8/bed
    ln -s blastz.dasNov1.2006-02-19 blastz.dasNov1
    cd blastz.dasNov1

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000


# QUERY - Armadillo dasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzDasNov1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &

    time nice -n +19 featureBits mm8 chainDasNov1Link
    #	431944142 bases of 2567283971 (16.825%) in intersection

#########################################################################
# BLASTZ echTel1 (DONE - 2006-02-19 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.echTel1.2006-02-19
    cd /cluster/data/mm8/bed
    ln -s blastz.echTel1.2006-02-19 blastz.echTel1
    cd blastz.echTel1

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000


# QUERY - Tenrec echTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzEchTel1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun `pwd`/DEF > chain.out 2>&1 &

    time nice -n +19 featureBits mm8 chainEchTel1Link
    #	292970406 bases of 2567283971 (11.412%) in intersection

#########################################################################
# BLASTZ fr1 (DONE - 2006-02-19 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.fr1.2006-02-19
    cd /cluster/data/mm8/bed
    ln -s blastz.fr1.2006-02-19 blastz.fr1
    cd blastz.fr1

    cat << '_EOF_' > DEF
# mouse vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
 
BLASTZ=blastz.v7

# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400 
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Fugu - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0
 
BASE=/cluster/data/mm8/bed/blastzFr1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &

    time nice -n +19 featureBits mm8 chainFr1Link
    #	48949500 bases of 2567283971 (1.907%) in intersection
    time nice -n +19 featureBits fr1 chainMm8Link
    #	42671288 bases of 315518167 (13.524%) in intersection

#########################################################################
# BLASTZ loxAfr1 (DONE - 2006-02-19 - Hiram)
    ssh kk
    mkdir /cluster/data/mm8/bed/blastz.loxAfr1.2006-02-19
    cd /cluster/data/mm8/bed
    ln -s blastz.loxAfr1.2006-02-19 blastz.loxAfr1
    cd blastz.loxAfr1

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin

BLASTZ=blastz.v7

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY - Elephant loxAfr1
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzLoxAfr1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	failed during the cat, fixed the script
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun `pwd`/DEF > chain.out 2>&1 &

    time nice -n +19 featureBits mm8 chainLoxAfr1Link
    #	472168702 bases of 2567283971 (18.392%) in intersection

#########################################################################
# BLASTZ tetNig1 (DONE - 2006-02-19 - Hiram)
    ssh kk
    mkdir /cluster/data/mm8/bed/blastz.tetNig1.2006-02-19
    cd /cluster/data/mm8/bed
    ln -s blastz.tetNig1.2006-02-19 blastz.tetNig1
    cd blastz.tetNig1

    cat << '_EOF_' > DEF
# Mouse vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Tetraodon TetNig1 - single chunk big enough to run whole chroms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzTetNig1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net `pwd`/DEF > net.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=net `pwd`/DEF > swap-net.out 2>&1 &


    time nice -n +19 featureBits mm8 chainTetNig1Link
    #	50358792 bases of 2567283971 (1.962%) in intersection
    time nice -n +19 featureBits tetNig1 chainMm8Link
    #	47024263 bases of 342403326 (13.734%) in intersection

#########################################################################
# BLASTZ oryCun1 (DONE - 2006-02-21 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.oryCun1.2006-02-21
    cd /cluster/data/mm8/bed
    ln -s blastz.oryCun1.2006-02-21 blastz.oryCun1
    cd blastz.oryCun1

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY - Rabbit oryCun1
SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/hg/oryCun1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzOryCun1.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &

    time nice -n +19 featureBits mm8 chainOryCun1Link
    #	496060619 bases of 2567283971 (19.322%) in intersection

#########################################################################
# BLASTZ xenTro1 (DONE - 2006-02-21 - Hiram)
    ssh kk
    mkdir /cluster/data/mm8/bed/blastz.xenTro1.2006-02-21
    cd /cluster/data/mm8/bed
    ln -s blastz.xenTro1.2006-02-21 blastz.xenTro1
    cd blastz.xenTro1

    cat << '_EOF_' > DEF
# mouse vs. frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Frog xenTro1 - single chunk big enough to run two of the
#               largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/mm8/bed/blastzXenTro1.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainXenTro1Link
    #	62015601 bases of 2567283971 (2.416%) in intersection
    time nice -n +19 featureBits xenTro1 chainMm8Link
    #	59307185 bases of 1381238994 (4.294%) in intersection

#########################################################################
# BLASTZ monDom4 (DONE - 2006-02-23 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.monDom4.2006-02-23
    cd /cluster/data/mm8/bed
    ln -s blastz.monDom4.2006-02-23 blastz.monDom4
    cd blastz.monDom4

    cat << '_EOF_' > DEF
# Mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=20
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse (mm8)
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000

# QUERY: Opossum monDom4
SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzMonDom4.2006-02-23
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainMonDom4Link
    #	211663336 bases of 2567283971 (8.245%) in intersection
    time nice -n +19 featureBits monDom4 chainMm8Link
    #	210933035 bases of 3501643220 (6.024%) in intersection

    #	Something caused the loaded chains and nets on Mm8 to disappear.
    #	Reload them from the existing build files (DONE - Hiram - 2006-07-18)
    #	First recover the individual chain files: chainSplit writes the
    #	split-up chain files into the new directory "chain"
    ssh kkstore04
    cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
    nice chainSplit chain mm8.monDom4.all.chain.gz
    ssh hgwdev
    cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain/chain
    #	one table per split chain file, named <file minus .chain>_chainMonDom4
    foreach f (*.chain)
	set c = $f:r
	echo hgLoadChain mm8 ${c}_chainMonDom4 $f
	hgLoadChain mm8 ${c}_chainMonDom4 $f
    end

    #	back up to axtChain before loading the net: the chain/ directory
    #	only holds the chainSplit output, the net file is expected to be
    #	next to mm8.monDom4.all.chain.gz (NOTE: verify location)
    cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
    time netFilter -minGap=10 mm8.monDom4.net.gz \
	| hgLoadNet -verbose=0 mm8 netMonDom4 stdin

    #	clean up the temporary split chain files
    ssh kkstore04
    cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
    rm -fr chain

#########################################################################
# BLASTZ panTro1 (DONE - 2006-02-23 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.panTro1.2006-02-23
    cd /cluster/data/mm8/bed
    ln -s blastz.panTro1.2006-02-23 blastz.panTro1
    cd blastz.panTro1

    cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_M=50

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro1
SEQ2_DIR=/scratch/hg/panTro1/nib
SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzPanTro1.2006-02-23
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainPanTro1Link
    #	901276629 bases of 2567283971 (35.106%) in intersection
    time nice -n +19 featureBits panTro1 chainMm8Link
    #	901976621 bases of 2733948177 (32.992%) in intersection

#########################################################################
# BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-04-26
    cd /cluster/data/mm8/bed
    ln -s blastzDanRer4.2006-04-26 blastz.danRer4
    cd blastz.danRer4

    cat << '_EOF_' > DEF
# mouse vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer4)
#  large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer4/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chromNib.sizes
SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ2_CHUNK=100000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-04-26
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    cd /cluster/data/mm8/bed/blastzDanRer4.2006-04-26
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net `pwd`/DEF > net.out 2>&1 &

    #	swap, see also makeDanRer4.doc
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &

    time nice -n +19 featureBits mm8 chainDanRer4Link \
	> fb.mm8.chainDanRer4Link 2>&1 &
    cat fb.mm8.chainDanRer4Link
    #	54036008 bases of 2567283971 (2.105%) in intersection
    time nice -n +19 featureBits danRer4 chainMm8Link \
	> fb.danRer4.chainDanRer4Link 2>&1 &
    cat fb.danRer4.chainDanRer4Link
    #	58145856 bases of 1626093931 (3.576%) in intersection

#########################################################################
# BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram)
# REMAKE THIS USING ALL CHROMS FOR danRer4 (2006-05-22 -  ).
    ssh pk
    mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
    cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
   # ln -s blastzDanRer4.2006-04-26 blastz.danRer4

    cat << '_EOF_' > DEF
# mouse vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer4)
#  large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-05-22
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs
    chmod +x DEF
    cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF >& blastz.out &
    # 0.118u 0.107s 4:05:08.71 0.0%   0+0k 0+0io 0pf+0w
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net `pwd`/DEF >& net.out &
    # 0.121u 0.072s 4:48.04 0.0%      0+0k 0+0io 0pf+0w
    cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
    #	swap, see also makeDanRer4.doc
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF >& swap.out &
    # 0.129u 0.109s 5:02.55 0.0%      0+0k 0+0io 0pf+0w 
    ssh hgwdev
    cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
    featureBits mm8 chainDanRer4Link >& fb.mm8.chainDanRer4Link &
    cat fb.mm8.chainDanRer4Link
    # 55147954 bases of 2567283971 (2.148%) in intersection
    featureBits danRer4 chainMm8Link >& fb.danRer4.chainDanRer4Link &
    cat fb.danRer4.chainDanRer4Link
    # 60721886 bases of 1626093931 (3.734%) in intersection
    featureBits -chrom=chr1 mm8 refGene:cds chainDanRer4Link -enrichment
    # refGene:cds 0.856%, chainDanRer4Link 1.867%, both 0.584%, 
    # cover 68.16%, enrich 36.51x
    featureBits -chrom=chr1 mm8 refGene:cds chainDanRer3Link -enrichment
    # refGene:cds 0.856%, chainDanRer3Link 1.760%, both 0.492%, cover 57.49%, 
    # enrich 32.67x
    featureBits -chrom=chr1 danRer4 refGene:cds chainMm8Link -enrichment
    # refGene:cds 0.746%, chainMm8Link 3.807%, both 0.566%, cover 75.86%, 
    # enrich 19.93x
    featureBits -chrom=chr1 danRer3 refGene:cds chainMm8Link -enrichment
    # refGene:cds 0.786%, chainMm8Link 4.581%, both 0.612%, cover 77.88%, 
    # enrich 17.00x
    # Higher coverage than for danRer3 chains on mm8 and similar coverage
    # for mm8 chains on danRer4 as on danRer3 so that is good.
 
#########################################################################
# BLASTZ danRer3 (DONE - 2006-02-28 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.danRer3.2006-02-28
    cd /cluster/data/mm8/bed
    ln -s blastz.danRer3.2006-02-28 blastz.danRer3
    cd blastz.danRer3

    cat << '_EOF_' > DEF
# mouse vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Zebrafish (danRer3)
#  large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzDanRer3.2006-02-28
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    216m23.425s
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap `pwd`/DEF > swap.out 2>&1 &


    time nice -n +19 featureBits mm8 chainDanRer3Link
    #	53125783 bases of 2567283971 (2.069%) in intersection
    time nice -n +19 featureBits danRer3 chainMm8Link
    #	54831876 bases of 1630323462 (3.363%) in intersection

#############################################################################
# STS MARKERS DATA DOWNLOAD (DONE - 2006-02-23 - 2006-02-28 - Hiram)
###   *** PLEASE NOTE - STS markers redone 2006-08-29 - look for section:
##  redoing STS markers track to get them more correct
###	later in this file
    ssh kkstore01
    mkdir -p /cluster/data/mm8/bed/STSmarkers/downloads
    cd /cluster/data/mm8/bed/STSmarkers/downloads
    # these files appear to be new almost every day
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases

    #	The new feature in the .aliases file this time is names with
    #	spaces in them !  This changes our parsing business below,
    #	hopefully the spaces in the names won't cause trouble elsewhere.

    wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*

    # these reports from jax.org appear to be changing daily
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
    ls -ogrt
#	-rw-rw-r--  1      676 Mar 11  2004 README
#	-rw-rw-r--  1   396858 Jan 28  2005 10090.MGI.txt
#	-rw-rw-r--  1   390139 Mar 16  2005 10090.WI_MRC_RH.txt
#	-rw-rw-r--  1   240688 Mar 16  2005 10090.WI-YAC.txt
#	-rw-rw-r--  1   173344 Mar 16  2005 10090.WI-Genetic.txt
#	-rw-rw-r--  1 25691253 Jan 13 16:42 UniSTS.aliases
#	-rw-rw-r--  1  4140920 Feb 22 18:43 UniSTS_mouse.sts
#	-rw-rw-r--  1  4576611 Feb 23 02:22 MRK_Dump2.rpt
#	-rw-rw-r--  1  2549974 Feb 23 02:23 PRB_PrimerSeq.rpt
#	-rw-rw-r--  1  4531489 Feb 23 02:23 MRK_Sequence.rpt
    #	 I note the UniSTS.aliases file is over twice as big as was in
    #	 Mm7 build.  I wonder what got into it ...
    #	What got into it was that it was completely broken.  It appeared
    #	to have a vast section of itself duplicated again in the file.
    #	It was cleaned up via:
    echo -e "#Unique ID\tAliases" > uniqueSTS.aliases
    grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases
    mv UniSTS.aliases UniSTS.aliases.broken
    mv uniqueSTS.aliases UniSTS.aliases

    # back to our work area, update the bed file
    #	to do this we need a new UniSTS_mouse.alias file
    # it is created by a combination of information from several
    # of the above files ! AND ! the previous stsInfoMouse.bed file

    cd /cluster/data/mm8/bed/STSmarkers/downloads
    cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.sh .
    cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.pl .
    #	There is a line in the fetchAllAliases.sh script that needs to
    #	be updated, it must point to the previous bed file:
    #   BEDFile=/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed
    #	Next time, this should read:
    #   BEDFile=/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed

    #	This process has been captured in the script:
    #	/cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
    # which uses a couple of perl scripts in that same directory.
    # briefly it is:
    
    # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
    # grep MGI: UniSTS.aliases > MGI.aliases
    # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
    #	stsInfoAliases.txt
    # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
    # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
    #    | sort -n > UniSTS_mouse.alias

    time ./fetchAllAliases.sh > fetchAllAliases.out 2>&1

    #	Here is a normal set of errors:
# processing UniSTS_mouse.sts to find aliases
# #       ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
# #       2384
# processing MGI.aliases
# fetching existing aliases from previous stsInfoMouse.bed file
# found 27648 potential errors in
#	/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed
# to see the errors: grep ERROR stsInfoAliases.txt
# verify those stsInfoMouse.bed aliases with UniSTS.aliases

    #	those errors in the previous stsInfoMouse.bed file are an
    #	accumulation of errors from a long long time ago in this chain
    #	of processing.  Some day it might be nice to fix them, but they
    #	don't seem to bother anything, so they continue to be carried
    #	forward, and a couple of new ones are added with each assembly.

    # with that, we can create a new stsInfoMouse.bed file:
    #	Update the m m 7 directory name here to m m 8
    #	for the next build of m m 9
    cd /cluster/data/mm8/bed/STSmarkers
    /cluster/store5/mouseMarker/code/updateBed.pl \
	/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \
	downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
	downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
	downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile

    # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
    /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
	
    # copy the stsInfoMouse.bed file from working dir to the marker
    #	info storage folder.  added 2 new steps by Yontao
    #	be wary of the archive name here, check the directory and get
    #	the name right here.
    mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
    cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed

    # comparing to previous, numbers increase slightly each time
    wc /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
    #	60440   801181  6871232 /cluster/store5/mouseMarker/stsInfoMouse.bed
    #	59843   794642  6802825 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
    #	58980   784786  6690105 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
    #	58493   778055  6524821 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5

    # and from that, create new primer fa, epcr, etc:
    /cluster/store5/mouseMarker/code/luConvertPrimerToFa \
	stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
    # the mouseC.fa file will be empty; the mouseP.* counts should be
    # more than last time
    wc mouse?.*
    #	     0       0       0 mouseC.fa
    #	305991  305937 6910111 mouseP.fa
    #	 34475  172467 2195057 mouseP.info
    #	340466  478404 9105168 total

    #	the equivalent Mm7 files:
    #      0       0       0 mouseC.fa
    # 300968  300914 6798466 mouseP.fa
    #  33838  169275 2153113 mouseP.info
    # 334806  470189 8951579 total
    #	the equivalent Mm6 files:
    #	     0       0       0 mouseC.fa
    #	293305  293251 6624638 mouseP.fa
    #	 32890  164528 2087271 mouseP.info
    #	326195  457779 8711909 total
    #	the equivalent Mm5 files:
    #	     0       0       0 mouseC.fa
    #	286740  286686 6474893 mouseP.fa
    #	 32232  161234 2044810 mouseP.info
    #	318972  447920 8519703 total

    #	copy the primers over to some filesystem close to the klusters
    #	and split them up to have a small number of sequences in one file
    

    mkdir /cluster/bluearc/mm8/stsMarkers
    cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers
    cd /cluster/bluearc/mm8/stsMarkers
    cp -p /cluster/data/mm8/11.ooc .
    mkdir split
    #	400 files for 34,475 sequences, == about 80 sequences per file
    faSplit sequence mouseP.fa 400 split/mm_


    # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
    #	This process could convert to a modern version of blat with the
    #	filters as described, for example, in the STS markers build in Hg18

    #  CLUSTER RUN FOR THE STS PRIMERS
    ssh kk
    mkdir /cluster/data/mm8/bed/STSmarkers/primer
    mkdir /cluster/data/mm8/bed/STSmarkers/ePCR
    cd /cluster/data/mm8/bed/STSmarkers/primer
    mkdir out

    #	interestingly, this blat.2 binary did not function correctly
    #	when given nib files.  It has only about 1/4th of the number of
    #	alignments as it gets when it used fa files for the target
    #	sequence.

    ls -1S /cluster/bluearc/mm8/stsMarkers/split > primers.list
    ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list

    #	runBlat2.csh - one cluster job: align one primer split file
    #	against one target sequence file with blat.2
    #	  $1 - primer file name, found in .../stsMarkers/split/
    #	  $2 - target fa file name, found in .../stsMarkers/chroms/
    #	  $3 - output psl file name
    #	The script makes the directory out/<$2 without its extension>
    #	before running blat.2; presumably $3 is a path within that
    #	directory (see the gensub2 template that supplies the arguments).
    #	The csh -e flag is on the #! line so a failing command stops the job.
    cat << '_EOF_' > runBlat2.csh
#!/bin/csh -fe
set primer = /cluster/bluearc/mm8/stsMarkers/split/$1
set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2
set ooc = /cluster/bluearc/mm8/stsMarkers/11.ooc
set root2 = $2:r
mkdir -p out/${root2}
set out = $3

/cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \
        -minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out}
'_EOF_'
    #	<< happy emacs
    chmod +x runBlat2.csh

    cat << '_EOF_' > template
#LOOP
./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 primers.list chr.list template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 12104 of 12104 jobs
# CPU time in finished jobs:    1075037s   17917.28m   298.62h   12.44d  0.034 y
# IO & Wait Time:               7444257s  124070.95m  2067.85h   86.16d  0.236 y
# Average job time:                 704s      11.73m     0.20h    0.01d
# Longest finished job:           61869s    1031.15m    17.19h    0.72d
# Submission to last job:        168538s    2808.97m    46.82h    1.95d
    #	some of the jobs got stuck for unknown reasons.  Had to find
    #	them and kill them on their nodes.  Their blat.2 process was
    #	stuck and would not kill.  Don't know what happened there.

    # on the file server
    ssh kkstore01
    cd /cluster/data/mm8/bed/STSmarkers/primer
    time pslSort dirs primers.raw.psl temp out/chr*
    #	-rw-rw-r--   1 586124177 Feb 26 21:28 primers.raw.psl

    #	filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
    #	should not be more than 100 bases different.
    #	This filters out about 944,839 alignments, or
    #	%17.4 = 100.0 * 944839 / 5445367
    time pslSort dirs stdout temp out/chr* | awk -F"\t" '
{ if (((($13 - $12) - ($17 - $16)) > -100) &&
	((($13 - $12) - ($17 - $16)) < 100)) {print}
}
' > primers.psl.100

    rmdir temp

    wc -l *.100 *.psl
    #	5445367 primers.raw.psl
    #	4500528 primers.psl.100
    #	 944839 difference

    # a rough comparison with previous results:
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.100
    #	4893510  102763628  510563575 primers.psl.100
    wc primers.psl  (unfiltered, Mm7)
    #	5921712 124355891 636898117 primers.psl
    wc /cluster/data/mm7/bed/STSmarkers/primer/primers.psl
    #	5724127 120206606 615248041
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl
    #	5719969 120119288 590806241
    wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
    #	5745617 120657896 592135728 

    # another kluster run for the ePCR
    ssh pk
    cd /cluster/data/mm8/bed/STSmarkers/ePCR
    ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list

    #	pick up e-PCR source from
    #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
    #	version 2.3.1 11 Feb 2005
    #	Had to add the following to both re-PCR_main.cpp and
    #	e-PCR_main.cpp to get them to compile on kolossus:
// max and min Copied from /usr/include/mysql/my_global.h
#define max(a, b)       ((a) >? (b))
#define min(a, b)       ((a) <? (b))

    mkdir out
    cat << '_EOF_' > runPCR
#!/bin/csh -fe
/cluster/bin/x86_64/e-PCR /cluster/data/mm8/bed/STSmarkers/mouseP.info \
	/cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2
'_EOF_'
    # << happy emacs
    chmod +x runPCR

    cat << '_EOF_' > template
#LOOP
./runPCR $(path1) {check out line+ out/$(num1).epcr}
#ENDLOOP
'_EOF_'
    # the mouseP.info was created above
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check
    para push
    ... etc ...
    # STARTED 2006-02-27 16:24
    #	There is a single job that produces no output:
    ./runPCR chrX_random.fa out/30.epcr
    #	WARNING: 96 STSs have primer shorter than W
    #	WARNING: 21 STSs have ambiguities within W of 3' end
    #	Not sure what's up with that
# Completed: 33 of 34 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      67601s    1126.69m    18.78h    0.78d  0.002 y
# IO & Wait Time:                  1028s      17.13m     0.29h    0.01d  0.000 y
# Average job time:                2080s      34.66m     0.58h    0.02d
# Longest finished job:            5134s      85.57m     1.43h    0.06d
# Submission to last job:          5134s      85.57m     1.43h    0.06d

    ssh kkstore01
    cd /cluster/data/mm8/bed/STSmarkers/ePCR
    # all those results become all.epcr
    cat out/*.epcr > all.epcr

    # comparing to previous results:
    wc -l all.epcr
    #	58088 all.epcr
    wc -l /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr
    #	57709 /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr
    wc -l /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
    #	55871 /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
    wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
    #	55677  222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
    wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
    #	74705  298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
    #	Mm4 seems to be out of whack

    cd /cluster/data/mm8/bed/STSmarkers/primer

    /cluster/bin/scripts/filterSTSPrimers \
    -mouse ../stsInfoMouse.bed primers.psl.100 \
        ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat

    #  The output should show an increasing count:
    #	Reading name info
    #	Reading primer info
    #	Processing file
    #	100000
    #	200000
    #	300000
    #	...
    #	4500000
    #	Determining ePCR not found
    #
    wc -l primers.psl.filter.blat
    #	34026 primers.psl.filter.blat
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat
    #	33986 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat
    wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
    #	33128 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
    wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
    #	33476 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat

    # create accession_info.rdb
    touch empty_sequence.inf
    /cluster/bin/scripts/compileAccInfo -mouse \
	/cluster/data/mm8 empty_sequence.inf
    # works with errors on missing randoms, etc...:
    # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
    # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
    mv accession_info.rdb accession_info.rdb.tmp
    /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
	accession_info.rdb
    rm accession_info.rdb.tmp
    # comparing results to previous
    #	Continuing the trend that began with Mm7, the numbers in
    #	accession_info.rdb continue to decrease.  Mm8 has far fewer
    #	fragments than mm7 did:
    #	e.g.:
    [hiram@kkstore01 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
    #	21910 total
    [hiram@kkstore01 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
    #	70125 total
    [hiram@kkstore01 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
    #	170812 total

    wc -l accession_info.rdb
    #	20385 accession_info.rdb
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/accession_info.rdb
    #	44046  484510 3112816 accession_info.rdb
    wc /cluster/data/mm6/bed/STSmarkers/primer/accession_info.rdb
    #	93052 1023576 6824900 accession_info.rdb
    wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb
    #	131845 1450299 9681940
    wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb
    #	86935  956289 6374930 

    # creates epcr.not.found.nomatch and epcr.not.found.psl
    #	/cluster/bin/scripts/epcrToPsl
    #	Fixed this script (in mm7) so that it does not look for contigs in
    #	the usual manner, since we don't have those for this assembly.
    sed -e "s/mm7/mm8/g" /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl \
	> ./epcrToPsl
    chmod +x epcrToPsl
    ./epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm8

    # Comparing results to previous:
    wc -l epcr*
    #	 501 epcr.not.found
    #	   0 epcr.not.found.nomatch
    #	 501 epcr.not.found.psl
    #	 158 epcrToPsl
    #	1160 total

    # Mm7 wc epcr*
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
    #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
    #	   0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
    #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
    #	 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
    #	1106 total

    # Mm6 wc epcr*
    wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr*
    #	 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found
    #	  63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch
    #	 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl
    #	 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl
    #	1097 total

    cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
    wc -l primers.psl.filter
    #	34527 primers.psl.filter

    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
    #	34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter

    wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
    #	33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter

    wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
    # 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted

    # create primers.psl.filter.initial
    #	if you do not run with scripts in your path, add the PATH business
    PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \
	primers.psl.filter
    wc -l  primers.psl.filter.initial
    #	34513 primers.psl.filter.initial

    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
    #	34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
    wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
    #	33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
    wc -l \
       /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
    # 33689 

    # create primers.psl.filter.initial.acc
    /cluster/bin/scripts/findAccession -agp \
	-mouse primers.psl.filter.initial /cluster/data/mm8
    #	it complains about missing _random items, it is OK
    wc -l primers.psl.filter.initial.acc
    #	34513 primers.psl.filter.initial.acc

    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial.acc
    #	34443

    # this needs to be -rat as that specifies how to scan the
    # stsInfoMouse.bed file and it does not work if you use -mouse
    /cluster/bin/scripts/getStsId -rat \
	../stsInfoMouse.bed  primers.psl.filter.initial.acc \
	| sort -k4,4n > primers.final
    wc -l primers.final
    #	34513 primers.final
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.final
    #	34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.final

    cd /cluster/data/mm8/bed/STSmarkers
    # stsMarkers.final is empty for mouse
    touch stsMarkers.final dummy
    #	if you do not run with scripts in your path, add the PATH business
    PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \
	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
    wc -l stsMarkers_pos.rdb
    #	33075 stsMarkers_pos.rdb
    wc -l /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb
    #	32869 /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb

    wc -l /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
    #	31889 /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
    wc -l /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
    #	32085 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
    wc -l /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
    #	31270 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb

    /projects/cc/hg/ytlu/bin/script/perl/createStsBed \
	stsInfoMouse.bed  stsMarkers_pos.rdb 500 \
	| sort -k1,1 -k2,2n > stsMapMouse.bed
    #	Fixup --- 2006-04-12 - Hiram - it was found that column 12 had blanks
    #	as the first character of the field.  This isn't what is needed
    #	here.  Let's take those blanks out, turns out these were the
    #	only blanks in the file:
    mv stsMapMouse.bed stsMapMouse_withBlanks.bed
    sed -e "s/ //" stsMapMouse_withBlanks.bed > stsMapMouse.bed

    wc stsMapMouse.bed
    #	29888  308263 2087726 stsMapMouse.bed
    wc /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed
    #	29079  301678 2097544 stsMapMouse.bed
    wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
    #	29069  301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed

    #  loading STS markers tables
    ssh hgwdev
    cd /cluster/data/mm8/bed/STSmarkers
    cp -p /cluster/data/mm7/bed/STSmarkers/ucscAlias.pl .
    ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
    #	this does leave messages in ucscStsAlias.warnings but they seem
    #	to be very similar to Mm6 with just a few new ones
     
    wc ucscStsAlias.tab  (after applying filter to primers.psl above)
    #	144570  433667 3366815 ucscStsAlias.tab
    wc ucscStsAlias.tab  (before applying filter to primers.psl above)
    #	144570  433667 3366815 ucscStsAlias.tab
    wc /cluster/data/mm7/bed/STSmarkers/ucscStsAlias.tab
    #	141585  424725 3284106 ucscStsAlias.tab
    wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
    # 126624  379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
     
    #	Use the drop tables if reloading
    #	hgsql -e "drop table stsAlias;" mm8
    hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql
    hgsql -e \
	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8
    #   reloaded stsMapMouseNew 2006-04-12 to remove blanks in col 12 - Hiram
    #	hgsql -e "drop table stsMapMouseNew;" mm8
    hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql
    hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8
    #	hgsql -e "drop table stsInfoMouseNew;" mm8
    hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
    hgsql -e \
     'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm8

    hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter
    # load of all_sts_primer did not go as planned: 34527 record(s), 0
    # row(s) skipped, 19 warning(s) loading primer/primers.psl.filter

    # load primer sequences	
    mkdir /gbdb/mm8/stsMarker
    ln -s /cluster/data/mm8/bed/STSmarkers/mouseP.fa \
	/gbdb/mm8/stsMarker/mouseP.fa
    # PLEASE NOTE: if you are going to reload this business, use the
    #	-replace option on this hgLoadSeq
    #	hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
    # otherwise there will be a problem that the seq and extFile tables 
    # will be out of sync. 
    hgLoadSeq mm8 /gbdb/mm8/stsMarker/mouseP.fa
    #  Adding /gbdb/mm8/stsMarker/mouseP.fa
    #  33838 sequences

    featureBits mm8 all_sts_primer
    #	3746196 bases of 2567283971 (0.146%) in intersection
    featureBits mm7 all_sts_primer
    #	3757119 bases of 2583394090 (0.145%) in intersection
    featureBits mm6 all_sts_primer
    #	3677372 bases of 2597150411 (0.142%) in intersection
    featureBits mm8 stsMapMouseNew
    #	4801964 bases of 2567283971 (0.187%) in intersection
    featureBits mm7 stsMapMouseNew
    #	4805958 bases of 2583394090 (0.186%) in intersection
    featureBits mm6 stsMapMouseNew
    #	4638338 bases of 2597150411 (0.179%) in intersection

    hgsql -N mm8 -e "select count(*) from stsAlias;"
    #	141981
    hgsql -N mm7 -e "select count(*) from stsAlias;"
    #	140649
    hgsql -N mm6 -e "select count(*) from stsAlias;"
    #	137738
    hgsql -N mm5 -e "select count(*) from stsAlias;"
    #	122944
    hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
    #	60440
    hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
    #	59843
    hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
    #	58980
    hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
    #	58493

    #	compare old and new name lists:
    awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList
    awk '{print $4}' /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed | \
	sort -u > mm7.nameList
    comm -12 mm?.nameList | wc -l
    #	28253   <- 28,253 names in common
    comm -23 mm7.nameList mm8.nameList | wc -l
    #	174     <- 174 unique to mm7 list
    comm -13 mm7.nameList mm8.nameList | wc -l
    #	445     <- 445 unique to mm8 list

    #	previously, Mm6 vs Mm7:
    #	27320   <- 27,320 names in common
    #	188     <- 188 unique to mm6 list
    #	1107    <- 1,107 unique to mm7 list

####################################################################################
# BUILD KNOWN GENES TABLES (STARTED 2/25/06, PART I DONE 2/27/06 Fan)

# First build protein databases, sp060115 and proteins060115
# See makeProteins060115.doc for details.

# Create working subdirectories and temporary databases (kgMm8A)

  ssh hgwdev
  mkdir /cluster/store9/kg
  cd /cluster/store9/kg
  mkdir kgMm8A  
  ln -s /cluster/store9/kg/kgMm8A /cluster/store6/kgDB/bed/kgMm8A
  ln -s /cluster/store9/kg/kgMm8A /cluster/data/mm8/bed/kgMm8A

  hgsql mm8 -e "create database kgMm8A"   
  hgsql mm8 -e "create database kgMm8ATemp"

  mkdir /cluster/bluearc/kgDB/kgMm8A
  mkdir /cluster/bluearc/kgDB/kgMm8A/protBlat
  ln -s /cluster/bluearc/kgDB/kgMm8A/protBlat /cluster/store9/kg/kgMm8A/protBlat
  cd /cluster/store9/kg/kgMm8A/protBlat

# Get all mouse protein sequences

  hgsql -N sp060115 -e \
  'select p.acc, p.val from protein p, accToTaxon x where x.taxon=10090 and p.acc=x.acc'\
  |awk '{print ">" $1;print $2}' >mouseProt.fa

  hgsql -N sp060115 -e \
  'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=10090   and v.parAcc=x.acc'\
  |awk '{print ">" $1;print $2}' \
  >mouseVarProt.fa

# append var proteins to mouseProt.fa
  cat mouseVarProt.fa >>mouseProt.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh pk
  cd /cluster/data/mm8/bed/kgMm8A/protBlat
  mkdir prot
  faSplit sequence mouseProt.fa 2000 prot/prot
  ls /cluster/bluearc/kgDB/kgMm8A/protBlat/prot/* > prot.lis

  ssh hgwdev
  cd /cluster/data/mm8/bed/kgMm8A/protBlat
  hgsql mm8 -N -e 'select chrom from chromInfo' > chrom.lis
  exit
  
  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/mm8/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm8A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para try
  para check
  para push
  para check ...

# started 8:15 AM 2/25/06, done 3:12 AM 2/26/06.
# Two jobs crashed due to empty results; pushed again and they finished OK in a few minutes.

# Completed: 67354 of 67354 jobs
# CPU time in finished jobs:   12580047s  209667.46m  3494.46h  145.60d  0.399 y
# IO & Wait Time:                237270s    3954.49m    65.91h    2.75d  0.008 y
# Average job time:                 190s       3.17m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           19991s     333.18m     5.55h    0.23d
# Submission to last job:         68128s    1135.47m    18.92h    0.79d

# collect BLAT results

   pslSort -nohead dirs raw.psl temp result
   pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null

   ssh hgwdev
   cd /cluster/bluearc/kgDB/kgMm8A/protBlat
   hgLoadPsl mm8 protBlat.psl

# create all_mrna.psl and tight_mrna.psl

   hgsql mm8 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
           all_mrna.psl tight_mrna.psl /dev/null

# Save a copy of the following mm8 tables, to be used later to construct 
# kgMore and kgEvenmore

all_mrna
gbCdnaInfo
gbExtFile
gbLoaded
gbSeq
gbStatus
refFlat
refGene
refLink
refSeqAli
refSeqStatus
refSeqSummary
xenoMrna
xenoRefFlat
xenoRefGene
xenoRefSeqAli

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat.psl protMrna.stat

   overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl  protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis
   cp -p protein.lis /cluster/data/mm8/bed/kgMm8A

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgMm8ATemp < ~/src/hg/lib/spMrna.sql
   hgsql kgMm8ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgMm8ATemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   cd /cluster/data/mm8/bed/kgMm8A
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm8 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm8ATemp DB.

   faToTab mrna.fa mrnaSeq.tab

   hgsql kgMm8ATemp -e 'drop table mrnaSeq'
   hgsql kgMm8ATemp <~/src/hg/lib/mrnaSeq.sql
   hgsql kgMm8ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'

# Prepare files for cluster run

   cd /cluster/bluearc/kgDB/kgMm8A
   ~/src/hg/protein/KG2.sh kgMm8A mm8 060115

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG3.sh kgMm8A mm8 060115

# Collect cluster run results
   cd kgBestMrna

   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis

# Load BLAT results into temp DB.
   ssh hgwdev
   cd /cluster/store9/kg/kgMm8A/kgBestMrna
   hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgMm8ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgMm8ATemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm8ATemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
   cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
   cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

# move kgBestMrna/clusterRun to /san/sanvol1 (leaving a symlink behind) to save space on store9

   mv /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna
   ln -s /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna/clusterRun \
   /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun

# Prepare refGene and all_mrna gp files.

   cd ..
   cp -p base/refGene.tab ref.gp

   hgsql mm8 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl
   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgMm8ATemp -e 'drop table spRef'
   hgsql kgMm8ATemp <~/src/hg/lib/spRef.sql
   hgsql kgMm8ATemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgMm8A mm8 060115
   ~/src/hg/protein/KGRef3.sh kgMm8A mm8 060115

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis

   hgsql kgMm8ATemp -e 'drop table protRefBlat'
   hgsql kgMm8ATemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgMm8ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgMm8ATemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries
   cd /cluster/data/mm8/bed/kgMm8A
   cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/mm8/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgMm8ATemp -e 'drop table kgCandidate0'
   hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate0.sql 
   hgsql kgMm8ATemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm8ATemp -e 'drop table geneCheck'
   hgsql kgMm8ATemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm8ATemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgMm8ATemp mm8 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgMm8ATemp -e  'drop table kgCandidate'
   hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm8ATemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgMm8ATemp -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab

   hgsql kgMm8ATemp -e  'drop table kgCandidateX'
   hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgMm8ATemp -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

   kgResultBestMrna2 060115 kgMm8ATemp mm8 protMrnaBlat|sort -u >protMrnaBlatScore.tab
   kgResultBestRef2  060115 kgMm8ATemp mm8 protRefBlat|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm8ATemp -e 'drop table protMrnaScore'
   hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm8ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm8ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgMm8ATemp 060115 kgCandidateX jY.tmp
# G171564 does not have cds.
# G171565 does not have cds.
   cat jY.tmp |sort -u >kgCandidateY.tab
   rm jY.tmp
   hgsql kgMm8ATemp -e  'drop table kgCandidateY'
   hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgMm8ATemp -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgMm8ATemp kgCandidateZ.tab
   hgsql kgMm8ATemp -e  'drop table kgCandidateZ'
   hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgMm8ATemp -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgMm8ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgMm8ATemp mm8 sp060115 kg3.tmp dupSpMrna.tmp
   sort -u dupSpMrna.tmp >dupSpMrna.tab

# Create put back list

# gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.

   gbGetSeqs2 -gbRoot=/cluster/data/genbank db=mm8 -get=ra RefSeq mrna ref.ra
   cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab

   hgsql mm8 -e  'drop table refRa'
   hgsql mm8 < ~/src/hg/lib/refRa.sql
   hgsql mm8 -e  'load data local infile "refRa.tab" into table refRa ignore 1 lines'

    hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and  r3.val="Mus musculus"' \
    >kgPutBack2.tab

    hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

    hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

    hgsql mm8 -N -e \
    'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab 

    hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

   hgsql kgMm8ATemp -e 'drop table kgPutBack2'
   hgsql kgMm8ATemp < ~/src/hg/lib/kgPutBack2.sql
   hgsql kgMm8ATemp -e  'load data local infile "kgPutBack2.tab" into table kgPutBack2'

   kgPutBack kgMm8ATemp mm8 sp060115 kgPutBack2 kgPutBack2.gp
# No matching protein found for NM_008523.
# No matching protein found for NM_194444.
# No matching protein found for NM_206941.

# Sort KG genes to make the knownGene.tab table file.
   cat kgPutBack2.gp kg3.tmp > kg4.tmp
   ~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab

   hgsql kgMm8ATemp -e  'drop table knownGene'
   hgsql kgMm8ATemp < ~/src/hg/lib/knownGene.sql
   hgsql kgMm8ATemp -e  'load data local infile "knownGene.tab" into table knownGene'

# Load data into mm8 knownGene table.
   hgsql mm8 -e  'drop table knownGene'
   hgsql mm8 < ~/src/hg/lib/knownGene.sql
   hgsql mm8 -e  'load data local infile "knownGene.tab" into table knownGene'
  
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.

   hgsql mm8 -e  'drop table dupSpMrna'
   hgsql mm8 < ~/src/hg/lib/dupSpMrna.sql
   hgsql mm8 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Perform analysis on KG

nice featureBits mm8 knownGene
# 54684224 bases of 2567283971 (2.130%) in intersection
nice featureBits mm8 knownGene:cds
# 28459053 bases of 2567283971 (1.109%) in intersection
  
nice featureBits mm8 refGene
# 46256526 bases of 2567283971 (1.802%) in intersection
nice featureBits mm8 refGene:cds
# 27221018 bases of 2567283971 (1.060%) in intersection
  
nice featureBits mm8 refGene knownGene
# 43441486 bases of 2567283971 (1.692%) in intersection
nice featureBits mm8 refGene:cds knownGene:cds
# 25164531 bases of 2567283971 (0.980%) in intersection

nice featureBits mm7 knownGene
# 53165921 bases of 2583394090 (2.058%) in intersection
nice featureBits mm7 knownGene:cds
# 27531524 bases of 2583394090 (1.066%) in intersection
 
nice featureBits mm7 refGene
# 46425940 bases of 2583394090 (1.797%) in intersection
nice featureBits mm7 refGene:cds
# 27319308 bases of 2583394090 (1.057%) in intersection
 
nice featureBits mm7 refGene knownGene
# 41777202 bases of 2583394090 (1.617%) in intersection
nice featureBits mm7 refGene:cds knownGene:cds
# 24297646 bases of 2583394090 (0.941%) in intersection

# Build knownGeneMrna and knownGenePep tables.

   kgPepMrna kgMm8ATemp mm8 060115
   hgsql mm8 -e  'drop table knownGeneMrna'
   hgsql mm8 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql mm8 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql mm8 -e  'drop table knownGenePep'
   hgsql mm8 < ~/src/hg/lib/knownGenePep.sql
   hgsql mm8 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build kgXref table

   kgXref2 kgMm8ATemp 060115 mm8

   hgsql mm8 -e  'drop table kgXref'
   hgsql mm8 < ~/src/hg/lib/kgXref.sql
   hgsql mm8 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql mm8 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab

   hgsql mm8 -e  'drop table spMrna'
   hgsql mm8 <~/src/hg/lib/spMrna.sql
   hgsql mm8 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build kgProtMap table

    ssh hgwdev
    cd /cluster/store9/kg/kgMm8A
    ln -s protBlat/tight_mrna.psl .
    ~/src/hg/protein/kgProtMap2.sh kgMm8A mm8 060115

#####################################
# Build alias tables. (DONE 2/28/06, Fan)		

   ssh hgwdev
   cd /cluster/store9/kg/kgMm8A
   mkdir alias
   cd alias
   kgAliasM mm8 proteins060115

#	kgAliasKgXref reads from mm8.knownGene.proteinID,
#	mm8.knownGene.name, mm8.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm8

#	kgAliasRefseq reads from mm8.knownGene.name,
#	mm8.knownGene.proteinID, mm8.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm8

   hgsql sp060115 -N -e 'select name,gene.val from mm8.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql mm8 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql mm8 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm8 
   hgsql mm8 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm8 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm8.knownGene.name,
#	mm8.knownGene.proteinID, mm8.knownGene.alignID,
#	proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
#	to create kgProtAlias.tab

   kgProtAlias mm8 060115

   hgsql mm8 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql mm8 -N -e \
   'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql mm8 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql mm8 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm8 -e "drop table kgProtAlias;"
    hgsql mm8 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm8 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table
#
#	kgSpAlias collects (kgID, spID, alias) rows: every alias listed for
#	a gene in kgAlias or in kgProtAlias, joined to kgXref on kgID.

    # -N keeps the column-name header out of the output, so nothing has
    # to be filtered from the data afterwards (a grep -v 'kgID' filter
    # would also discard any real row that contains that string).
    hgsql mm8 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm8 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp >mm8.kgSpAlias.tab
    rm j.tmp

    # rebuild the table from its schema and load the de-duplicated rows
    hgsql mm8 -e 'drop table kgSpAlias'
    hgsql mm8 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm8 -e 'load data local infile "mm8.kgSpAlias.tab" into table kgSpAlias'

#############################################################################
# 17-WAY VAR_MULTIZ - ALIGNMENTS (DONE - 2006-02-28 - 2006-03-02 - Hiram)
#	Re-DONE with panTro2 in place of panTro1 - 2006-04-19 - Hiram)
#	And again with xenTro2 in place of xenTro1 - 2006-04-24
#	And again with danRer4 in place of danRer3 - 2006-05-02
    ssh kkstore04
    mkdir /cluster/data/mm8/bed/multiz17way
    cd /cluster/data/mm8/bed/multiz17way

    #	create tree diagram to guide work below.
    #	This tree was constructed from one that Adam is using for
    #	ENCODE work and a 27-way alignment.  Took that file and
    #	removed some of the entries, adding together the appropriate
    #	distances.

    cat << '_EOF_' > 17way.nh
(((((((((
(human_hg18:0.006690,chimp_panTro2:0.007571):0.024272,
  macaque_rheMac2:0.0592):0.023960,

  ((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273,
      rabbit_oryCun1:0.206767):0.1065):0.023026,

(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,

armadillo_dasNov1:0.149862):0.015994,

(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,

monodelphis_monDom4:0.371073):0.189124,

chicken_galGal2:0.454691):0.123297,

xenopus_xenTro2:0.782453):0.156067,

((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
    zebrafish_danRer4:0.782561):0.156067);
'_EOF_'
    #	<< happy emacs

    /cluster/bin/phast/draw_tree 17way.nh > 17way.ps
    /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
    grep -y mm8 17way.distances.txt | sort -k3,3n
    #	Print out that file for reference, and use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.  Zebrafish ends up before
    #	tetraodon and fugu on the browser despite its distance.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                           chainMm8Link   chain   linearGap
#    distance                       on Mm8      on other   minScore
#  1  0.1587 - rat rn4            (% 68.957)  (% 69.651)   3000     medium
#  2  0.4677 - human hg18         (% 38.343)  (% 34.514)   3000     medium
#  3  0.4686 - chimp panTro2      (% 37.549)  (% 33.614)   3000     medium
#  4  0.4960 - macaque rheMac2    (% 34.718)  (% 33.170)   3000     medium
#  5  0.5131 - rabbit oryCun1     (% 19.322)  (no swap )   3000     medium
#  6  0.6142 - armadillo dasNov1  (% 16.825)  (no swap )   3000     medium
#  7  0.6230 - dog canFam2        (% 32.281)  (% 34.255)   3000     medium
#  8  0.6256 - elephant loxAfr1   (% 18.392)  (no swap )   3000     medium
#  9  0.6344 - cow bosTau2        (% 26.832)  (% 24.293)   3000     medium
# 10  0.7805 - tenrec echTel1     (% 11.412)  (no swap )   5000     loose
# 11  1.0698 - opossum monDom4    (%  8.245)  (%  6.024)   5000     loose
# 12  1.3425 - chicken galGal2    (%  2.552)  (%  5.414)   5000     loose
# 13  1.7936 - frog xenTro2       (%  2.651)  (%  5.358)   5000     loose
# 14  2.0157 - tetraodon tetNig1  (%  1.962)  (% 13.734)   5000     loose
# 15  2.0562 - fugu fr1           (%  1.907)  (% 13.524)   5000     loose
# 16  2.1059 - zebrafish danRer4  (%  2.105)  (%  3.576)   5000     loose

    cd /cluster/data/mm8/bed/multiz17way
    #	bash shell syntax here ...
    #	Gather the per-chromosome net mafs of every pairwise alignment
    #	under mafLinks/<db>/ as symlinks.  H is the directory that holds
    #	the blastz.<db> pairwise alignment directories.
    export H=/cluster/data/mm8/bed
    mkdir -p mafLinks
    for G in rn4 hg18 panTro2 rheMac2 oryCun1 dasNov1 canFam2 \
	loxAfr1 bosTau2 echTel1 monDom4 galGal2 xenTro2 tetNig1 fr1 danRer4
    do
	#	verify the source before creating anything for this species,
	#	so a missing alignment does not leave an empty link directory
	if [ ! -d "${H}/blastz.${G}/mafNet" ]; then
	    echo "missing directory blastz.${G}/mafNet" 1>&2
	    exit 255
	fi
	mkdir -p "mafLinks/${G}"
	ln -s "${H}/blastz.${G}"/mafNet/*.maf.gz "./mafLinks/${G}"
    done

    #	Copy MAFs to some appropriate NFS server for kluster run
    ssh kkstore04
    mkdir /san/sanvol1/scratch/mm8/multiz17way
    cd /san/sanvol1/scratch/mm8/multiz17way
    time rsync -a --copy-links --progress \
	/cluster/data/mm8/bed/multiz17way/mafLinks/ .

    #	We have about 5.9 Gb of data here, takes ~ 10 minutes to copy

    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

    # the autoMultiz cluster run
    ssh pk
    cd /cluster/data/mm8/bed/multiz17way/

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	17way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    #	the maf directory here is a symlink to a /cluster/store8 
    #	directory to even out the data load on store9 and store8 on kkstore04
    mkdir /cluster/store8/mm8/bed/multiz17way/maf
    ln -s /cluster/store8/mm8/bed/multiz17way/maf ./maf
    mkdir run
    cd run

    #	NOTE: you need to set the db properly in this script

    #	autoMultiz <chrom> <result maf>: csh job script, run once per
    #	chromosome.  It expects to be started in this run directory (it
    #	copies ../tree.nh and ../species.lst), stages the pairwise mafs
    #	/san/sanvol1/scratch/$db/multiz17way/<species>/<chrom>.maf[.gz]
    #	into a private /scratch/tmp directory, runs autoMZ there with the
    #	tree read from tree.nh, copies <chrom>.maf to <result maf> and
    #	removes the scratch directory.  The reference db itself is skipped;
    #	a species with no pairwise maf for the chromosome gets a maf file
    #	that contains only the header line.
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = mm8
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz17way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz17way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/store8/mm8/bed/multiz17way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs

    awk '{print $1}' /cluster/data/mm8/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 34 jobs
    para try ... check ... push ... etc ...
# Completed: 34 of 34 jobs
# CPU time in finished jobs:     210573s    3509.55m    58.49h    2.44d  0.007 y
# IO & Wait Time:                  4870s      81.17m     1.35h    0.06d  0.000 y
# Average job time:                6337s     105.61m     1.76h    0.07d
# Longest finished job:           17786s     296.43m     4.94h    0.21d
# Submission to last job:         41755s     695.92m    11.60h    0.48d

    #	combine results into a single file for loading and gbdb reference
    ssh kkstore04
    cd /cluster/data/mm8/bed/multiz17way
    #	There used to be a mafFilter here with a minScore of 500, but it
    #	turns out that the scores in these maf files are pretty much
    #	useless.  They range from very large negatives to very large
    #	positives.
    time catDir maf > multiz17way.maf
    #	real    10m17.400s
    #	makes a 17 Gb file:
    #	-rw-rw-r--   1 17334936245 Apr 20 10:31 multiz17way.maf

    #	Create per-chrom individual maf files for downloads
    #	These are actually done after the annotation mafs are made
    ##	re-done with corrected annotated mafs 2007-03-28 - Hiram
    ssh kkstore04
    cd /cluster/data/mm8/bed/multiz17way
    mkdir mafDownloads
    time for M in anno/maf/chr*.maf
    do
	B=`basename $M`
	nice -n +19 cp -p ${M} mafDownloads/${B}
	nice -n +19 gzip mafDownloads/${B}
	echo ${B} done
    done
    #	real    59m16.415s
    cd mafDownloads
    md5sum *.gz > md5sum.txt

    #	deliver to downloads
    ssh hgwdev
    ln -s /cluster/data/mm8/bed/multiz17way/mafDownloads \
	/usr/local/apache/htdocs/goldenPath/mm8/multiz17way

    # Load into database, actually annotation mafs are loaded later
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way
    mkdir /gbdb/mm8/multiz17way
    ln -s /cluster/data/mm8/bed/multiz17way/multiz17way.maf \
	/gbdb/mm8/multiz17way
    time nice -n +19 hgLoadMaf mm8 multiz17way
    #	Loaded 11601035 mafs in 1 files from /gbdb/mm8/multiz17way
    #	real    27m29.960s

    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 mm8 multiz17waySummary multiz17way.maf
    #	Created 5782229 summary blocks from 65123362 components and
    #	11601035 mafs from multiz17way.maf
    #	real    32m34.791s

    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql mm8 -e "alter table multiz17waySummary drop index chrom_2"
    hgsql mm8 -e "alter table multiz17waySummary drop index chrom_3"

    #	This was done for Mm7, same image can be reused
    # create tree image:
    #	cat << '_EOF_' > species.nh
# ((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish))
# '_EOF_'
#    /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
    # photoshop to enhance, reduce the amount of whitespace to make it
    # smaller, then save as jpg
#    cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm7_17way.jpg

    # creating upstream mafs (DONE - 2006-07-31 - Hiram)
    ssh hgwdev
    #	data load balancing in the kkstore04 filesystems
    mkdir /cluster/store8/mm8/bed/multiz17way/upstreamMafs
    cd /cluster/data/mm8/bed/multiz17way
    ln -s /cluster/store8/mm8/bed/multiz17way/upstreamMafs ./upstreamMafs
    #	rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
    #	did not have mm8 as the first one
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 mm8 \
        refGene:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm8 multiz17way \
                stdin stdout -orgs=species.lst \
        | gzip -c > upstreamMafs/upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm8/multiz17way
    ln -s /cluster/data/mm8/bed/multiz17way/upstreamMafs/upstream*.maf.gz .
############################################################################
# ANNOTATE MULTIZ17WAY MAF AND LOAD TABLES (DONE - 2006-04-24 - Hiram)
#	RE-DONE 2006-05-03 with danRer4 in place of danRer3
## Redone to correct usage of nBeds and sizes file (2007-03-28 - Hiram)
    ssh kolossus
    mkdir /cluster/data/mm8/bed/multiz17way/anno
    cd /cluster/data/mm8/bed/multiz17way/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    twoBitInfo -nBed /cluster/data/mm8/mm8.{2bit,N.bed}
    #	For every species in the alignment: link its chrom.sizes and N.bed
    #	into this directory as <db>.len and <db>.bed, and append those two
    #	names to the "sizes" and "nBeds" list files respectively.
    for DB in $(cat /cluster/data/mm8/bed/multiz17way/species.lst); do
	dbDir=/cluster/data/${DB}
	ln -s ${dbDir}/chrom.sizes ${DB}.len
	ln -s ${dbDir}/${DB}.N.bed ${DB}.bed
	echo ${DB}.bed >> nBeds
	echo ${DB}.len >> sizes
	#	progress indicator
	echo $DB
    done

    echo '#!/bin/csh -ef' > jobs.csh
    echo date >> jobs.csh
    # do smaller jobs first so you can see some progress immediately:
    for F in `ls -1rS ../../maf/*.maf`
    do
      echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \
        /cluster/data/mm8/mm8.2bit ../maf/`basename $F` >> jobs.csh
      echo "echo $F" >> jobs.csh
    done 
    echo date >> jobs.csh
    chmod +x jobs.csh
    time ./jobs.csh > jobs.log 2>&1 &
    #	to watch progress;
    tail -f jobs.log
    #	real    218m16.272s

    # Load anno/maf
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way/anno/maf
    mkdir -p /gbdb/mm8/multiz17way/anno/maf
    ln -s /cluster/data/mm8/bed/multiz17way/anno/maf/*.maf \
      /gbdb/mm8/multiz17way/anno/maf
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/mm8/multiz17way/anno/maf mm8 multiz17way
    #	Loaded 12484442 mafs in 34 files from /gbdb/mm8/multiz17way/anno/maf
    #	real    8m14.757s

    # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
    # machine and then load on hgwdev:
    ssh hgwdev64
    cd /cluster/data/mm8/bed/multiz17way/anno/maf
    time cat *.maf | \
    	nice -n +19 hgLoadMafSummary mm8 -minSize=30000 -mergeGap=1500 \
	-maxSize=200000 -test multiz17waySummary stdin
    #	Created 3153839 summary blocks from 65123362 components
    #	and 12484442 mafs from stdin
    #	real    13m25.961s

    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way/anno/maf
    time nice -n +19 hgLoadSqlTab mm8 multiz17waySummary \
	~/kent/src/hg/lib/mafSummary.sql multiz17waySummary.tab
    #	real    0m53.525s
    rm *.tab

#######################################################################
# MULTIZ17WAY MAF FRAMES (DONE - 2006-04-24 - 2006-04-25 - Hiram)
#	RE-DONE 2006-05-03 to replace danRer3 with danRer4
    ssh hgwdev
    mkdir /cluster/data/mm8/bed/multiz17way/frames
    cd /cluster/data/mm8/bed/multiz17way/frames
    # The following is adapted from MarkD's Makefile used for mm7...

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using mrna table as genes:
    mkdir genes
    #	For each of these assemblies build a genePred from the mRNA
    #	alignments that have an annotated CDS:
    #	  1. one query fetches qName, the CDS string and the whole
    #	     all_mrna row for every such mRNA
    #	  2. columns 1-2 become the -cdsFile input of mrnaToGene and
    #	     columns 4-100 become its alignment input
    #	  3. the result is passed through genePredSingleCover, gzipped
    #	     into /scratch/tmp and then moved to genes/<db>.gp.gz
    #
    #	NOTE: only one "for" header may be active.  The 2006-05-03 re-run
    #	(danRer3 removed from the list, danRer4 added) processed danRer4
    #	alone; to repeat just that, use this header instead:
    #	    for qDB in danRer4
    for qDB in oryCun1 panTro2 rheMac2 canFam2 bosTau2 danRer4 loxAfr1 \
	tetNig1 fr1
    do
      # mktemp creates a uniquely named empty file in the current directory;
      # its name doubles as a unique suffix for the intermediate files
      tmpExt=$(mktemp temp.XXXXXX)
      tmpMrnaCds=${qDB}.mrna-cds.${tmpExt}
      tmpMrna=${qDB}.mrna.${tmpExt}
      tmpCds=${qDB}.cds.${tmpExt}
      echo $qDB
      hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                   from all_mrna,gbCdnaInfo,cds \
                   where (all_mrna.qName = gbCdnaInfo.acc) and \
                     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
       ${qDB} > ${tmpMrnaCds}
      cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
      cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
      mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
        stdout \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      # remove the placeholder file that mktemp created
      rm -f $tmpExt
    done
    #	tried to use monDom4 in the above loop, but got this error:
    #	(450211944 450214274) out of range (0 400000000) in binKeeperAdd
    #	Which is interesting.  This should be looked into to see why
    #	this is here.

    # using knownGene for rn4 mm8 hg18
    # using refGene for galGal2
    # using mgcGenes for xenTro2
    # no genes for monDom4 dasNov1 echTel1
    # genePreds; (must keep only the first 10 columns for knownGene)
    #	Dump the gene table chosen for each of these assemblies, keep the
    #	first 10 columns, pass them through genePredSingleCover and store
    #	the gzipped result as genes/<db>.gp.gz (staged in /scratch/tmp).
    for qDB in rn4 mm8 hg18 galGal2 xenTro2
    do
      # pick the gene table for this assembly
      case "$qDB" in
        xenTro2) geneTbl=mgcGenes ;;
        galGal2) geneTbl=refGene ;;
        *)       geneTbl=knownGene ;;
      esac
      echo hgsql -N -e 'select * from '"$geneTbl ${qDB}"
      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$qDB.tmp.gz
      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
      # this loop creates no mktemp file, so there is nothing to clean up
    done

    #------------------------------------------------------------------------
    # create frames
    #	beware, BASH syntax here ...
    # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
    clusterDir=/cluster/bluearc/mm8/multiz17wayFrames
    multizDir=/cluster/data/mm8/bed/multiz17way
    mafDir=$multizDir/mafDownloads
    geneDir=$multizDir/frames/genes
    clusterMafDir=${clusterDir}/maf
    clusterGeneDir=${clusterDir}/genes
    clusterFramesDir=${clusterDir}/mafFrames.kki

    # copy mafs to cluster storage
    mkdir $clusterDir
    ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"

    # copy genes to cluster storage
    ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"

    # run cluster jobs
    tmpExt=`mktemp temp.XXXXXX`
    paraDir=$multizDir/frames/para.${tmpExt}
    cd /cluster/data/mm8/bed/multiz17way/frames
    mkdir mafFrames $paraDir
    mkdir ${clusterFramesDir}
    #	Write one mkMafFrames.pl job per (species, chrom) pair to
    #	$paraDir/jobList.  Every species gets an output directory, but
    #	jobs are only written for species whose genePred exists on
    #	cluster storage.
    for qDB in $(cat /cluster/data/mm8/bed/multiz17way/species.lst); do
      mkdir ${clusterFramesDir}/${qDB}
      # no gene set for this species: directory only, no jobs
      [ -e ${clusterGeneDir}/${qDB}.gp.gz ] || continue
      for chrom in $(awk '{print $1;}' /cluster/data/mm8/chrom.sizes); do
        echo /cluster/bin/scripts/mkMafFrames.pl ${qDB} mm8 \
          ${clusterGeneDir}/${qDB}.gp.gz ${clusterMafDir}/$chrom.maf.gz \
          ${clusterFramesDir}/${qDB}/$chrom.mafFrames \
          >> $paraDir/jobList
      done
    done
    rm -f $tmpExt
    ssh -x kki "cd ${paraDir} && para make jobList && para time"
# Completed: 476 of 476 jobs
# CPU time in finished jobs:       6235s     103.91m     1.73h    0.07d  0.000 y
# IO & Wait Time:                 13538s     225.64m     3.76h    0.16d  0.000 y
# Average job time:                  42s       0.69m     0.01h    0.00d
# Longest finished job:             237s       3.95m     0.07h    0.00d
# Submission to last job:          1242s      20.70m     0.34h    0.01d

    # combine results from cluster
    for qDB in \
	`sed -e "s/ dasNov1//; s/ echTel1//; s/ monDom4//;" ../species.lst`
    do
      ssh -x kolossus "cat ${clusterFramesDir}/${qDB}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${qDB}.mafFrames.gz"
      echo "${qDB}"
    done

    #------------------------------------------------------------------------
    # load the database
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way/frames
    time nice -n +19 hgLoadMafFrames mm8 multiz17wayFrames \
	mafFrames/*.mafFrames.gz
    #	real    1m11.457s

    #------------------------------------------------------------------------
    # clean up
    rm -rf ${clusterDir}

    ###
    # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
    ssh kkstore04
    cd /cluster/data/mm8/bed/multiz17way/frames
    mv mafFrames/ mafFrames.old
    nice tcsh # easy way to get process niced
    #	One pass over all ../maf/*.maf files piped into genePredToMafFrames,
    #	with a "<db> genes/<db>.gp.gz" pair per species; output is gzipped
    #	to multiz17way.mafFrames.gz, messages go to "log", and the whole
    #	pipeline runs in the background.
    #	NOTE(review): the "bosTau2 genes/bosTau2.gp.gz" pair is listed twice
    #	(first and last) -- looks like a paste slip; confirm how
    #	genePredToMafFrames treats a repeated species before re-running.
    (cat  ../maf/*.maf | time genePredToMafFrames mm8 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz loxAfr1 genes/loxAfr1.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz tetNig1 genes/tetNig1.gp.gz xenTro2 genes/xenTro2.gp.gz  bosTau2 genes/bosTau2.gp.gz |  gzip >multiz17way.mafFrames.gz)>&log&
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way/frames

    hgLoadMafFrames mm8 multiz17wayFrames multiz17way.mafFrames.gz |&mail markd&


############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
#		(DONE - 2006-03-02 - Hiram)
#	(RE-DONE - 2006-04-25 with panTro2 and xenTro2 - Hiram)
#	(RE-DONE - 2006-05-03 with danRer4 instead of danRer3 - Hiram)

    #	Will skip this estimate for Mm8 since it was well done in Mm7
    #	and in Hg17, skip to the creation of the SS files
# Estimate phastCons parameters
    ssh kkstore01
    mkdir /cluster/data/mm8/bed/multiz17way/cons
    cd /cluster/data/mm8/bed/multiz17way/cons

    # Create a starting-tree.mod based on chr2 (the largest one)
    /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \
	--refseq ../../../2/chr2.fa --in-format MAF \
	--windows 100000000,1000 --out-format SS \
	--between-blocks 5000 --out-root s1
    #	10 minutes

    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "((((((((((hg18,panTro2),rheMac2),((rn4,mm8),oryCun1)),(bosTau2,canFam2)),dasNov1),(loxAfr1,echTel1)),monDom4),galGal2),xenTro2),((tetNig1,fr1),danRer4))" \
    --out-root starting-tree
    #	real    840m53.157s
    #	That is 14 hours !

    rm s1.*.ss
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.407
    #	This 0.407 is used in the --gc argument below

    #	CONTINUE HERE, no estimation required
    # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
    #	Increasing their size this time from 1,000,000 to 10,000,000 to
    #	slow down the phastCons pk jobs
    ssh kkstore04
    mkdir -p  /san/sanvol1/scratch/mm8/cons/ss
    cd  /san/sanvol1/scratch/mm8/cons/ss
    time for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes`
    do
      if [ -s /cluster/data/mm8/bed/multiz17way/maf/${C}.maf ]; then
	mkdir ${C}
	echo msa_split $C
	chrN=${C/chr/}
	chrN=${chrN/_random/}
	/cluster/bin/phast/$MACHTYPE/msa_split \
	    /cluster/data/mm8/bed/multiz17way/maf/${C}.maf \
	    --refseq /cluster/data/mm8/${chrN}/${C}.fa \
	    --in-format MAF --windows 4000000,0 --between-blocks 5000 \
	    --out-format SS --out-root ${C}/${C}
      fi
    done &
    #	real    94m49.273s

    #  Again, going to SKIP this tuning business this time and use the
    #  previous numbers.

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    cd /san/sanvol1/scratch/mm8/cons/ss
    ls -1l chr*/chr*.ss | grep -v random | \
	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/mm8/cons/treeRun1
    cd /san/sanvol1/scratch/mm8/cons/treeRun1
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate 
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse, will be
    #	tuned exactly below
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
    /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/mm8/bed/multiz17way/cons/starting-tree.mod \
      --gc 0.407 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-lengths 12 --target-coverage 0.17 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	<< happy emacs
    chmod a+x makeTree.csh

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs:     354644s    5910.74m    98.51h    4.10d  0.011 y
# IO & Wait Time:                   352s       5.86m     0.10h    0.00d  0.000 y
# Average job time:                7100s     118.33m     1.97h    0.08d
# Longest finished job:           29358s     489.30m     8.15h    0.34d
# Submission to last job:         29446s     490.77m     8.18h    0.34d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kkstore01
    cd /san/sanvol1/scratch/mm8/cons/treeRun1
    #	One list file per model type.  The leading '*' on the --read-mods
    #	argument is taken here to be the PHAST "read the names from this
    #	file" convention -- confirm against the phyloBoot help text.
    #	The conserved-model average runs in the background while the
    #	nonconserved one runs in the foreground.
    ls -1 tree/chr*/*.cons.mod > cons.list
    time /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ../ave.cons.mod > cons_summary.txt 2>&1 &
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ../ave.noncons.mod > noncons_summary.txt
    #	the background phyloBoot may still be writing ../ave.cons.mod;
    #	do not copy the results until it has finished
    wait
    cd ..
    cp -p ave.*.mod /cluster/data/mm8/bed/multiz17way/cons

    #	measuring entropy
    #	consEntropy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	never stops with the --NH argument
    /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
                        ave.cons.mod ave.noncons.mod
# XXXX - does not work:  2005-11-28
# [hiram@kkstore01 /san/sanvol1/scratch/mm8/cons] /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 ave.cons.mod ave.noncons.mod
# ERROR: with no separate source alignment, ss_from_msas expects sequences of positive length and no SS object.

#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits

# consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1
# Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
# Relative entropy: H=1.454874 bits/site
# Required length: N=6.629337 sites
# Total entropy: NH=9.644850 bits

# consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2
# Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.527815 bits/site
# Required length: N=7.205526 sites
# Total entropy: NH=11.008713 bits

# consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3
# Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250
# Relative entropy: H=1.654878 bits/site
# Required length: N=5.146793 sites
# Total entropy: NH=8.517313 bits

### !!! ***  This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits

    #	SKIP to here passing by the tuning numbers
    ssh pk
    # Create cluster dir to do main phastCons run
    mkdir /san/sanvol1/scratch/mm8/cons/consRun3
    cd /san/sanvol1/scratch/mm8/cons
    cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
    #	edit, change monDom2 to monDom4, hg17 to hg18, rheMac1 to
    #	rheMac2, rn3 to rn4, mm7 to mm8
    #	danRer3 to danRer4
    #	It looks like:
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
TRAINING_LNL: -988246.132962
BACKGROUND: 0.295 0.205 0.205 0.295
RATE_MAT:
  -1.165221    0.315494    0.589884    0.259843
   0.189778   -0.878194    0.208718    0.479698
   0.444622    0.261535   -0.885604    0.179447
   0.234867    0.720815    0.215191   -1.170872
TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,mm8:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer4:0.782561):0.156067);


    cd /san/sanvol1/scratch/mm8/cons/consRun3
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    #	These parameters:
    #	--rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
    #	were taken from Kate's 17-way in Hg17, removing the
    #	--not-informative panTro2 since that isn't relevant here, nor
    #	would be --not-informative rn4 - Jim says rn4 is far enough away
    #	from mm8 that it is informative.
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    #	doPhast <chrom> <ss file root>: csh job script.  Copies one SS
    #	window and the model to /scratch/tmp/<ss file root>, runs
    #	phastCons there, then moves the .pp and .bed results into
    #	ppRaw/<chrom> and bed/<chrom> under this directory and removes
    #	the scratch copy.
    cat > doPhast << '_EOF_'
#!/bin/csh -fe
# $1 = chrom name, $2 = ss file name without the .ss suffix
# mkdir -p: under -e a plain mkdir aborts the job when a scratch directory
# left behind by an earlier failed attempt already exists
mkdir -p /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
   --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/elliotsEncode.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # << happy emacs
    chmod a+x doPhast

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
    #	These jobs are very fast and very I/O intensive, even on the san
    #	they will hang it up as they work at full tilt.
# Completed: 689 of 689 jobs
# CPU time in finished jobs:      12806s     213.44m     3.56h    0.15d  0.000 y
# IO & Wait Time:                 16079s     267.98m     4.47h    0.19d  0.001 y
# Average job time:                  42s       0.70m     0.01h    0.00d
# Longest finished job:              94s       1.57m     0.03h    0.00d
# Submission to last job:           350s       5.83m     0.10h    0.00d

    # combine predictions and transform scores to be in 0-1000 interval
    #	it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /san/sanvol1/scratch/mm8/cons/consRun3
    #	The sed's and the sort get the file names in chrom,start order
    #	You might like to verify it is correct by first looking at the
    #	list it produces:
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
    #	if that looks right, then let it run:
    #	FOR NEXT TIME - the result file should be named:
    #	phastConsElements17way.bed since that is the name of the DB
    #	table that it is loaded into.  (instead of mostConserved.bed)
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/mm8/bed/multiz17way

    # Figure out how much is actually covered by the bed file as so:
    #	Get the non-n genome size from faSize on all chroms:
    ssh kkstore01
    cd /cluster/data/mm8
    faSize ?{,?}/chr*.fa
    #	2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
    #	1089350685 lower) in 34 sequences in 34 files

    cd /san/sanvol1/scratch/mm8/cons/consRun3
    #	The 2567283688 comes from the non-n genome as counted above.
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/2567283688\n",100.0*sum/2567283688,sum}' \
	mostConserved.bed
    #	--rho 0.28 --expected-length 14 --target-coverage 0.008
    #	% 5.40 = 100.0*138575691/2567283688 danRer4 instead of danRer3
    #	% 5.43 = 100.0*139309333/2567283688 panTro2 and xenTro2
    #	% 5.39 = 100.0*138300407/2567283688 panTro1 and xenTro1

    #	Aiming for %70 coverage in
    #	the following featureBits measurement on CDS:
    # Beware of negative scores when too high.  The lodToBedScore
    # will output an error on any negative scores.

    HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits mm8 \
	-enrichment refGene:cds mostConserved.bed
    #	--rho 0.28 --expected-length 14 --target-coverage 0.008
    #	with danRer4 instead of danRer3:
    #	refGene:cds 1.062%, mostConserved.bed 5.398%, both 0.743%, cover
    #	69.99%, enrich 12.97x
    #	with panTro2 and xenTro2:
    #	refGene:cds 1.060%, mostConserved.bed 5.426%, both 0.740%, cover
    #	69.85%, enrich 12.87x
    #	with panTro1 and xenTro1:
    #	refGene:cds 1.060%, mostConserved.bed 5.387%, both 0.739%, cover
    #	69.71%, enrich 12.94x

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way
    #	the copy was already done above
    # cp -p /san/sanvol1/scratch/mm8/cons/consRun3/mostConserved.bed .
    time nice -n +19 hgLoadBed -strict mm8 phastConsElements17way \
	mostConserved.bed
    #	Loaded 1883370 elements of size 5
    #	real    2m54.033s

    #	should measure the same as above
    time nice -n +19 featureBits mm8 -enrichment refGene:cds \
	phastConsElements17way
    #	with danRer4 in place of danRer3:
    #	refGene:cds 1.062%, phastConsElements17way 5.398%, both 0.743%,
    #	cover 69.99%, enrich 12.97x
    #	with panTro2 and xenTro2:
    #	refGene:cds 1.060%, phastConsElements 5.426%, both 0.740%, cover
    #	69.85%, enrich 12.87x
    #	with panTro1 and xenTro1:
    #	refGene:cds 1.060%, phastConsElements 5.387%, both 0.739%, cover
    #	69.71%, enrich 12.94x

    # Create merged posterior probability file and wiggle track data files
    ssh kkstore04
    cd /san/sanvol1/scratch/mm8/cons/consRun3
    # the sed business gets the names sorted by chromName, chromStart
    #	so that everything goes in numerical order into wigEncode
    #	This was verified above to be correct
    time nice -n +19 find ./ppRaw -type f \
	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	    | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
		phastCons17.wig phastCons17.wib
    #	real    15m59.846s
    #	-rw-rw-r--   1 1961998053 May  3 12:22 phastCons17.wib
    #	-rw-rw-r--   1  237229239 May  3 12:22 phastCons17.wig

    time nice -n +19 cp -p phastCons17.wi? /cluster/data/mm8/bed/multiz17way/
    #	real    1m21.329s

    #	prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/mm8/cons/consRun3
    #	gzipAscii.sh: for every ppRaw/<chrom> directory, concatenate its
    #	raw score files in sorted order and gzip the result into
    #	phastCons17Scores/<chrom>.data.gz
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons17Scores

for D in ppRaw/chr*
do
    # Chrom name = directory name minus the leading "ppRaw/".  POSIX
    # prefix removal is used because the ${D/ppRaw\/} pattern substitution
    # is a bash extension that a strict /bin/sh (dash, for one) rejects.
    C=${D#ppRaw/}
    out=phastCons17Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    # the sed/sort/sed sandwich splits each path into fields so that sort
    # can order the files by name, then numerically by start position
    find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
	    gzip > ${out}
done
'_EOF_'
    #	<< happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh
    #	real    18m15.212s
    #	real    18m15.212s

    #	copy them for downloads
    ssh kkstore04
    #	this directory is actually a symlink from store9 to store8 to
    #	avoid the data full problem on store9
    mkdir /cluster/data/mm8/bed/multiz17way/phastCons17Scores
    cd /cluster/data/mm8/bed/multiz17way/phastCons17Scores
    cp -p  /san/sanvol1/scratch/mm8/cons/consRun3/phastCons17Scores/* .

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm8
    ln -s /cluster/data/mm8/bed/multiz17way/phastCons17Scores .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way
    ln -s `pwd`/phastCons17.wib /gbdb/mm8/wib/phastCons17.wib
    time nice -n +19 hgLoadWiggle mm8 phastCons17 phastCons17.wig
    #	real    2m55.836s

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/mm8/bed/multiz17way
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm8 phastCons17 > histogram.data 2>&1
    #	real    28m24.388s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm8 Histogram phastCons17 track"
set xlabel " phastCons17 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

# QA NOTE: (ASZ: 5/1/2006) changed name of phastConsElements table to phastConsElements17way
# QA NOTE: (ASZ: 5/1/2006) changed name of phastCons17 table to phastCons17way
# Hiram Note:  phastCons17 never changed to phastCons17way at any time

#########################################################################
# MAKE FOLDUTR TABLES (DONE 2006-02-28, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/mm8/bed
    rm rnaStruct

    mkdir /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28
    ln -s /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm8 knownGene utr3 utr3/utr.fa
    utrFa mm8 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh pk
    cd /cluster/data/mm8/bed/rnaStruct
    faSplit sequence utr3/utr.fa 4000 utr3/split/s
    faSplit sequence utr5/utr.fa 4000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 3897 of 3897 jobs
# CPU time in finished jobs:     227530s    3792.17m    63.20h    2.63d  0.007 y
# IO & Wait Time:                 44046s     734.10m    12.23h    0.51d  0.001 y
# Average job time:                  70s       1.16m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1337s      22.28m     0.37h    0.02d
# Submission to last job:          1886s      31.43m     0.52h    0.02d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 3762 of 3762 jobs
# CPU time in finished jobs:      42244s     704.07m    11.73h    0.49d  0.001 y
# IO & Wait Time:                 10250s     170.83m     2.85h    0.12d  0.000 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            2014s      33.57m     0.56h    0.02d
# Submission to last job:          2083s      34.72m     0.58h    0.02d

# Load database
    ssh hgwdev
    cd /cluster/data/mm8/bed/rnaStruct/utr5
    hgLoadRnaFold mm8 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm8 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  (DONE 3/8/06.  Fan)
   ssh hgwdev
   cd /cluster/store9/kg/kgMm8A
   mkdir kegg
   cd kegg

   ~/src/hg/protein/KGpath.sh kgMm8A mm8 060115

   hgsql mm8 -e "drop table keggMapDesc"
   hgsql mm8 -e "drop table keggPathway"
   hgsql mm8 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm8 <~/src/hg/lib/keggPathway.sql
   hgsql mm8 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm8 -e 'load data local infile "keggPathway.tab" into table keggPathway'
  
# Build CGAP pathway tables

   cd ..
   ~/src/hg/protein/KGcgap.sh kgMm8A mm8 060115

   cat cgapBIOCARTAdesc.tab |sort -u > cgapBIOCARTAdescSorted.tab
   hgsql mm8 -e "drop table cgapAlias"
   hgsql mm8 -e "drop table cgapBiocDesc"
   hgsql mm8 -e "drop table cgapBiocPathway"
   hgsql mm8 <~/src/hg/lib/cgapAlias.sql
   hgsql mm8 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql mm8 <~/src/hg/lib/cgapBiocPathway.sql

   hgsql mm8 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
   hgsql mm8 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
   hgsql mm8 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'

####################################################################################

# BUILD PROTEOME BROWSER TABLES FOR mm8 (DONE 3/8/06, Fan) 

# These are instructions for building tables needed for the Proteome Browser. 
 
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 060115.

# Create the working directory

    ssh hgwdev
    mkdir /cluster/store9/kg/kgMm8A/pb-2006-03-08
    cd /cluster/data/mm8/bed
    rm pb
    ln -s /cluster/store9/kg/kgMm8A/pb-2006-03-08 pb
    cd pb

# Define pep* tables in mm8 DB

	cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

#  First edit out pepPred table definition, then

	hgsql mm8 < pepAll.sql

# Build the pepMwAa table

  hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm8 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'

# Build the pepPi table

    hgsql proteins060115 -e \
    "select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

    hgsql mm8 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis

    pbCalPi protAcc.lis sp060115 pepPi.tab
    hgsql mm8 -e 'delete from pepPi'
    hgsql mm8 -e 'load data local infile "pepPi.tab" into table mm8.pepPi'

# Calculate and load pep distributions

    pbCalDist sp060115 proteins060115 10090 mm8 >pbCalDist.out
    wc  pbCalDist.out

    hgsql mm8
    load data local infile "pepExonCntDist.tab" into table mm8.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm8.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm8.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm8.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm8.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm8.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm8.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp060115 10090 mm8

# Create pbAnomLimit and pbResAvgStd tables

   hgsql mm8 -e "drop table pbAnomLimit"
   hgsql mm8 -e "drop table pbResAvgStd"
   hgsql mm8 < ~/src/hg/lib/pbAnomLimit.sql
   hgsql mm8 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm8 -e 'load data local infile "pbResAvgStd.tab" into table mm8.pbResAvgStd;'
   hgsql mm8 -e 'load data local infile "pbAnomLimit.tab" into table mm8.pbAnomLimit;'

# Create pbStamp table for PB
  hgsql mm8 -e "drop table pbStamp"
  hgsql mm8 < ~/src/hg/lib/pbStamp.sql
  hgsql mm7 -N -e 'select * from pbStamp' > pbStamp.tab
  hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp'

# ENABLE PB FOR mm8 IN HGCENTRALTEST 

    echo " insert into gdbPdb values('mm8', 'proteins060115')" \
      | hgsql -h genome-testdb hgcentraltest

    echo "update dbDb set hgPbOk = 1 where name = 'mm8';" \
      | hgsql -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps

# Now invoke the Proteome Browser and adjust the drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table:

  hgsql mm8 -e "drop table pbStamp"
  hgsql mm8 < ~/src/hg/lib/pbStamp.sql
  hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp'

# Perform preliminary review of Proteome Browser for mm8, then
# notify QA for formal review.

# BUILD MISC STUFF FOR KG

# Build mrnaRefseq table

# First make sure the entrez DB is updated. (recently updated on 2/8/06).

    ssh hgwdev
    cd /cluster/store9/kg/kgMm8A

    hgsql entrez -N -e \
     'select mrna, refseq from entrezRefseq, entrezMrna, mm8.all_mrna where qName=mrna and    entrezRefseq.geneID=entrezMrna.geneID' \
      >mrnaRefseq1.tab

    hgsql mm8 -N -e 'select name, name from refGene' >mrnaRefseq2.tab

    cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab

    hgsql mm8 -e 'drop table mrnaRefseq'
    hgsql mm8 < ~/src/hg/lib/mrnaRefseq.sql
    hgsql mm8 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/8/06 Fan)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/store9/kg/kgMm8A
     mkdir index
     cd index
     hgKgGetText mm8 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ix  /gbdb/mm8/knownGene.ix
     ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ixx /gbdb/mm8/knownGene.ixx

# BUILD KNOWN GENE LIST FOR GOOGLE.  
# make knownGeneLists.html mm8GeneList.html mm5GeneList.html rm3GeneList.html

    cd /cluster/data/mm8/bed
    rm -rf knownGeneList/mm8

# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm8

    hgKnownGeneList mm8

# copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/mm8
    mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8
    cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8

##################################################################################
# Create description.html for mm8

mkdir -p ~/kent/src/hg/makeDb/trackDb/mouse/mm8
cd ~/kent/src/hg/makeDb/trackDb/mouse/mm8
cp ../hg17/description.html .

vi description.html
# Change release date and build number and change hg17 to mm8
# Check it into CVS

mkdir -p /cluster/data/mm8/html
cp -p description.html /cluster/data/mm8/html

ln -s /cluster/data/mm8/html/description.html /gbdb/mm8/html/description.html

# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-08, DONE 2006-02-14 - Fan)
#	This should be done after KG tables are complete from known genes build
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/mm8/bed/geneSorter.2006-03-08
# remove old symbolic link
rm /cluster/data/mm8/bed/geneSorter
ln -s /cluster/data/mm8/bed/geneSorter.2006-03-08 /cluster/data/mm8/bed/geneSorter
cd /cluster/data/mm8/bed/geneSorter
hgClusterGenes mm8 knownGene knownIsoforms knownCanonical

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/mm8/bed/geneSorter/blastp
cd /cluster/data/mm8/bed/geneSorter/blastp
pepPredToFa mm8 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	This command is in /projects/compbio/bin/$MACH/formatdb

# Copy over database to bluearc
rm -fr /cluster/bluearc/mm8/blastp
mkdir -p /cluster/bluearc/mm8/blastp
cp -p /cluster/data/mm8/bed/geneSorter/blastp/known.* /cluster/bluearc/mm8/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm8/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory
ssh pk
mkdir /cluster/data/mm8/bed/geneSorter/blastp/self
cd /cluster/data/mm8/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/mm8/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod +x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7730 of 7730 jobs
# CPU time in finished jobs:      35194s     586.56m     9.78h    0.41d  0.001 y
# IO & Wait Time:                 29033s     483.89m     8.06h    0.34d  0.001 y
# Average job time:                   8s       0.14m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              43s       0.72m     0.01h    0.00d
# Submission to last job:           206s       3.43m     0.06h    0.00d

# Load into database.  This takes about 20 minutes
ssh hgwdev
cd /cluster/data/mm8/bed/geneSorter/blastp/self/run/out
bash
time hgLoadBlastTab mm8 knownBlastTab *.tab
# Scanning through 7730 files
# Loading database with 5270545 rows
# real    13m30.534s

cd /cluster/data/mm8/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene mm8 refGene knownGene knownToRefSeq

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm8 > refToLl.txt
hgMapToGene mm8 refGene knownGene knownToLocusLink -lookup=refToLl.txt
hgsql -e "select count(*) from knownToLocusLink;" mm8
# 27636

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm8 knownGene name proteinID Pfam knownToPfam
hgsql -e "select count(*) from knownToPfam;" mm8
# 29479

############################################################################

### MAKE THE affyU74 TRACK - needed for the Gene Sorter (DONE
#                              
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
# target sequences. Recalculate alignments and load data
# ----------------------------------
# Load up semi-local disk with target sequences for Affy mouse U74 chips.
# ssh kkr1u00
# mkdir -p /iscratch/i/affy
#	This /projects filesystem is not available on kkr1u00
#	but it is on kk
# ssh kk
# cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy

ssh kkr1u00
iSync

# Run cluster job to do alignments
ssh kk
mkdir /cluster/data/mm8/bed/affyU74.2006-03-08
cd /cluster/data/mm8/bed/affyU74.2006-03-08
mkdir run
cd run
mkdir psl
#echo /scratch/mus/mm8/maskedContigs/*.fa | wordLine stdin > genome.lst
echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy

gensub2 genome.lst affy.lst gsub jobList
para create jobList
para try
para check
para push
# Completed: 102 of 102 jobs
# CPU time in finished jobs:       5846s      97.43m     1.62h    0.07d  0.000 y
# IO & Wait Time:                   367s       6.12m     0.10h    0.00d  0.000 y
# Average job time:                  61s       1.02m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             189s       3.15m     0.05h    0.00d
# Submission to last job:           200s       3.33m     0.06h    0.00d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
ssh kk
cd /cluster/data/mm8/bed/affyU74.2006-03-08/run
pslSort dirs raw.psl tmp psl

# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null

# Sort by chromosome and load into database.

ssh hgwdev
cd /cluster/data/mm8/bed/affyU74.2006-03-08
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl

# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table

mv affyU74.psl affyU74.psl.orig

cut -f 1-9 affyU74.psl.orig >j1.tmp
cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp
cut -f 11-21 affyU74.psl.orig >j3.tmp
paste j1.tmp j2.tmp j3.tmp >affyU74.psl

hgLoadPsl mm8 affyU74.psl
rm -rf chrom temp run

##   MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.

#This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm8/bed/affyGnf.2006-03-08
cd /cluster/data/mm8/bed/affyGnf.2006-03-08
#	may need to build this command in src/hg/affyGnf
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2

# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
mkdir sav
cp *.bed sav -p
cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed
cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed
cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed

# and reload data into table
hgLoadBed -strict mm8 affyGnfU74A affyGnfU74A.bed
hgLoadBed -strict mm8 affyGnfU74B affyGnfU74B.bed
hgLoadBed -strict mm8 affyGnfU74C affyGnfU74C.bed

# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [THE SYM LINKS WERE ALREADY DONE.]
#    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    # fix broken symlinks after directory structure changed
    # /projects/compbiodata ----> /projects/compbio/data
    rm U74*
    # make correct symlinks (hartera, 2005-05-03)
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .

    # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
    # reload sequences with prefix removed so acc matches name used in
    # other dependent tables
                                                    
    hgLoadSeq -abbr=U74Av2: mm8 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
    hgLoadSeq -abbr=U74Bv2: mm8 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
    hgLoadSeq -abbr=U74Cv2: mm8 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa

### GNF ATLAS 2  (DONE 3/9/06, Fan)
    # Align probes from GNF1M chip.
    ssh kk
    cd /cluster/data/mm8/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run

    echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst

    ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
    echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs:      53165s     886.08m    14.77h    0.62d  0.002 y
# IO & Wait Time:                   241s       4.02m     0.07h    0.00d  0.000 y
# Average job time:                1571s      26.18m     0.44h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3929s      65.48m     1.09h    0.05d
# Submission to last job:          3929s      65.48m     1.09h    0.05d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1m.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null

    #rm -r contig.psl raw.psl psl

    # Load probes and alignments from GNF1M into database.
    ssh hgwdev
    cd /cluster/data/mm8/bed/geneAtlas2
#    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
    hgLoadPsl mm8 affyGnf1m.psl
    hgLoadSeq mm8 /gbdb/hgFixed/affyProbes/gnf1m.fa

    # Load up track
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
    	affyGnf1m.psl
    # Note that the unmapped 5000 records are from all-N sequences.
    hgLoadBed -strict mm8 gnfAtlas2 gnfAtlas2.bed

# MOUSE AFFYMETRIX MOE430 TRACK (DONE  Fan  2006-03-09)
#    mkdir -p /projects/compbio/data/microarray/affyMouse
    # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
#    unzip MOE430*_consensus.zip

    # check for duplicate probes: there are none, all have unique names
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database

#    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
#    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
 
#    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
#       /cluster/bluearc/affy/

    # (THE ABOVE WAS ALREADY DONE.)

    # Set up cluster job to align MOE430 consensus sequences to mm8
    ssh kkr1u00
    cd /cluster/data/mm8/bed
    mkdir -p affyMOE430
    cd affyMOE430
#    mkdir -p /iscratch/i/affy
#    cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
#    iSync

    ssh kk
    cd /cluster/data/mm8/bed/affyMOE430
    ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
    echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst

    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    # Do the job with usual para try/check/push/time etc.
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       9196s     153.26m     2.55h    0.11d  0.000 y
# IO & Wait Time:                   362s       6.04m     0.10h    0.00d  0.000 y
# Average job time:                 281s       4.69m     0.08h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             639s      10.65m     0.18h    0.01d
# Submission to last job:           639s      10.65m     0.18h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyMOE430.psl
    pslSort dirs raw.psl tmp psl

    # only use alignments that cover 30% of sequence and have at least
    # 95% identity in aligned region. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null

    # Load alignments and sequences into database
    ssh hgwdev
    cd /cluster/data/mm8/bed/affyMOE430
    # shorten names in psl file
    sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
    mv affyMOE430.psl.bak affyMOE430.psl

    # load track into database

    hgLoadPsl mm8 affyMOE430.psl
 
    # Add consensus sequences for MOE430
    # Copy sequences to gbdb if they are not there already
#    mkdir -p /gbdb/hgFixed/affyProbes
#    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
#       /gbdb/hgFixed/affyProbes

    hgLoadSeq -abbr=MOE430 mm8 /gbdb/hgFixed/affyProbes/MOE430_all.fa
    
    # Clean up
#    rm batch.bak contig.psl raw.psl 
    
    # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
    # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
    # add affyMOE430.html file and then do make alpha to add to trackDb table

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes an hour.)

hgMapToGene mm8 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
# Got 22937 unique elements in hgFixed.gnfMouseAtlas2MedianRatio

# Create table that maps between known genes and RefSeq
hgMapToGene mm8 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm8 affyU74  knownGene knownToU74
hgMapToGene mm8 affyMOE430 knownGene knownToMOE430
hgMapToGene mm8 affyMOE430 -prefix=A: knownGene knownToMOE430A

# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm8/bed/rinnSex
cd /cluster/data/mm8/bed/rinnSex
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
../affyMOE430/affyMOE430.psl
hgLoadBed mm8 rinnSex rinnSex.bed

# Format and load the GNF data
mkdir /cluster/data/mm8/bed/affyGnf95
cd /cluster/data/mm8/bed/affyGnf95
~/src/hg/affyGnf/affyPslAndAtlasToBed -newType ../affyU95.psl \
/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm8 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  These commands take about 15 minutes each
#	The affyGnfU74?Exps arguments appear to be unused in 
# hgExpDistance
cd /cluster/data/mm8/bed/geneSorter
hgExpDistance mm8 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
hgExpDistance mm8 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
hgExpDistance mm8 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene mm8 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
#	hgsql -e "select count(*) from knownToGnfAtlas2;" mm8
#	row count changed to 22978

# Create expression distance table - takes about an hour
    hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \
    	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
	-lookup=knownToGnfAtlas2 &
#	hgsql -e "select count(*) from gnfAtlas2Distance;" mm8
#	row count changed to 22937000 

# HGNEAR PROTEIN BLAST TABLES (DONE 3/14/06 Fan)

    ssh hgwdev
    mkdir /cluster/data/mm8/bed/hgNearBlastp
    cd /cluster/data/mm8/bed/hgNearBlastp
    cat << _EOF_ > config.ra
# Latest mouse vs. other Gene Sorter orgs:
# human, rat, zebrafish, worm, yeast, fly

targetGenesetPrefix mouse
targetDb mm8
queryDbs hg18 rn4 danRer3 ce2 sacCer1 dm2

mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa 
hg18Fa /cluster/data/hg18/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa

buildDir /cluster/data/mm8/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/mm8HgNearBlastp
_EOF_

    doHgNearBlastp.pl config.ra >do.log 

# output was like this:
...
Scanning through 671 files^M
Loading database with 14470 rows^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.formatdb^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.formatdb^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.formatdb^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.formatdb^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.formatdb^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.formatdb^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.split^M
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.formatdb^M
# ssh -x pk rmdir /san/sanvol1/scratch/mm8HgNearBlastp^M
^M
 *** All done!^M
 *** Check these tables in mm8:^M
 *** mouseBlastTab hgBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab ^M
 *** and mmBlastTab in these databases:^M
 *** hg18 rn4 danRer3 ce2 sacCer1 dm2 ^M

# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES 
    cd ~/kent/src/hg/near/hgNear/hgNearData
    mkdir -p Mouse/mm8
    cd Mouse/mm8
    cp ../mm7/otherOrgs.ra .
# Edit otherOrgs.ra to reflect the latest genomes used in blastp jobs
    vi otherOrgs.ra
# then check it into CVS.

# ENABLE HGNEAR FOR mm8 IN HGCENTRALTEST
    echo "update dbDb set hgNearOk = 1 where name = 'mm8';" \
      | hgsql -h genome-testdb hgcentraltest

# END OF HGNEAR STUFF

#########################################################################
# BLASTZ panTro2 after chr9 re-masked (DONE - 2006-03-30 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastzPanTro2.2006-03-28
    cd /cluster/data/mm8/bed
    rm blastz.panTro2
    ln -s blastzPanTro2.2006-03-28 blastz.panTro2
    cd blastz.panTro2

    # Write the DEF parameter file that is handed to doBlastzChainNet.pl below.
    # The delimiter is quoted so nothing inside the here-doc is expanded.
    cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-28
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	broken during blastz run due to panassas failure
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    
    #	Do not have this measurement for the first time around, tables
    #	got loaded again before I thought of that.
    time nice -n +19 featureBits mm8 chainPanTro2Link \
	> fb.mm8.chainPanTro2Link
    #	963977790 bases of 2567283971 (37.549%) in intersection

    #	For panTro1 this was:
    time nice -n +19 featureBits mm8 chainPanTro1Link \
	> fb.mm8.chainPanTro1Link
    #	901276629 bases of 2567283971 (35.106%) in intersection

    ssh pk
    mv /cluster/data/panTro2/bed/blastz.mm8.swap \
	/cluster/data/panTro2/bed/blastz.mm8.swap.2006-03-21
    mkdir /cluster/data/panTro2/bed/blastz.mm8.swap
    cd /cluster/data/panTro2/bed/blastz.mm8.swap

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
	> blastz.out 2>&1 &
    #	completed the downloads manually since they failed due to the
    #	existing downloads.  Then cleanup:
    ssh hgwbeta
    cd /cluster/data/panTro2/bed/blastz.mm8.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cleanup /cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
	> cleanup.out 2>&1 &

    time nice -n +19 featureBits panTro2 chainMm8Link \
	> fb.panTro2.chainMm8Link 2>&1 &
    #	978002566 bases of 2909512873 (33.614%) in intersection
    # first time before the chr9 fix was:
    #	986978326 bases of 2909512873 (33.922%) in intersection

#########################################################################
# BLASTZ panTro2 (DONE - 2006-03-15 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.panTro2.2006-02-23
    cd /cluster/data/mm8/bed
    ln -s blastz.panTro2.2006-02-23 blastz.panTro2
    cd blastz.panTro2

    # Write the DEF parameter file that is handed to doBlastzChainNet.pl below.
    # The delimiter is quoted so nothing inside the here-doc is expanded.
    # NOTE(review): BASE below names blastzPanTro2.2006-03-15, while the
    # directory created for this run is blastz.panTro2.2006-02-23 -- confirm
    # which path was actually used before reusing this recipe.
    cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-15
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	broken during chain step due to missing files on the Iservers
    #	completed chain run manually, then continuing
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
    #	broken during loadUp due to script bug, ran loadUp.csh manually
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap `pwd`/DEF > swap.out 2>&1 &
    
    #	mistakenly did PanTro1 here ...  should have been PanTro2
    time nice -n +19 featureBits mm8 chainPanTro1Link
    #	901276629 bases of 2567283971 (35.106%) in intersection
    time nice -n +19 featureBits panTro2 chainMm8Link \
	> fb.panTro2.chainMm8Link 2>&1
    #	986978326 bases of 2909512873 (33.922%) in intersection


#############################################################################
# UPDATED mm8.knownToVisiGene (2006-03-15 galt)
ssh hgwdev
knownToVisiGene mm8

#############################################################################
# BLASTZ SELF (DONE - 2006-03-20 - 2006-03-22 - Hiram)
#	using chain min score of 10,000 to cut down on volume of data
    ssh pk
    mkdir /cluster/data/mm8/bed/blastzSelf.2006-03-20
    cd /cluster/data/mm8/bed
    ln -s blastzSelf.2006-03-20 blastz.mm8
    cd blastzSelf.2006-03-20

    cat << '_EOF_' > DEF
# mouse vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=200

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm8
SEQ2_DIR=/scratch/hg/mm8/nib
SEQ2_LEN=/scratch/hg/mm8/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzSelf.2006-03-20
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    cd /cluster/data/mm8/bed/blastzSelf.2006-03-20
    time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	broke during the load step due to doBlastz script changes,
    #	finished the load manually, then:
    time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-continue=download `pwd`/DEF > download.out 2>&1 &

    ssh kolossus
    cd /cluster/data/mm8/bed/blastzSelf.2006-03-20
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm8 \
	chainSelfLink >fb.mm8.chainSelfLink 2>&1
    cat fb.mm8.chainSelfLink
    #	362483673 bases of 2567283971 (14.119%) in intersection

#############################################################################
# UPDATED mm8.knownToVisiGene (2006-04-05 galt)
ssh hgwdev
knownToVisiGene mm8


############################################################################
# LIFTOVER (DROPUNDER) CHAINS TO MM7 (2006-04-06 kate)

    # Split (using makeLoChain-split) of mm7 is doc'ed in makeMm7.doc
    # Do what makeLoChain-split says to do next (start blat alignment)
    ssh kk
    cd /cluster/data/mm8/bed/liftOver
    makeLoChain-align mm8 /scratch/hg/mm8/nib mm7 \
        /iscratch/i/mm7/split10k \
        /cluster/bluearc/mm7/11.ooc >&! align.log &
    # Do what its output says to do next (start cluster job)
    cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/run
    para shove
    para time >&! run.time
#CPU time in finished jobs:     906023s   15100.39m   251.67h   10.49d  0.029 y
#IO & Wait Time:                 22074s     367.90m     6.13h    0.26d  0.001 y
#Average job time:                 343s       5.72m     0.10h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            4260s      71.00m     1.18h    0.05d
#Submission to last job:          4965s      82.75m     1.38h    0.06d

    # lift alignments
    ssh kkr1u00
    cd /cluster/data/mm8/bed/liftOver
    makeLoChain-lift mm8 mm7 >&! lift.log &

    # chain alignments
    ssh kki
    cd /cluster/data/mm8/bed/liftOver
    makeLoChain-chain mm8 /scratch/hg/mm8/nib \
                mm7 /scratch/hg/mm7/nib >&! chain.log &
    # Do what its output says to do next (start cluster job)
    cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/chainRun
    para shove
    para time >&! run.time
#CPU time in finished jobs:       3884s      64.73m     1.08h    0.04d  0.000 y
#IO & Wait Time:                   594s       9.91m     0.17h    0.01d  0.000 y
#Average job time:                  86s       1.44m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             245s       4.08m     0.07h    0.00d
#Submission to last job:           401s       6.68m     0.11h    0.00d

    # net alignment chains
    ssh kkstore03
    cd /cluster/data/mm8/bed/liftOver
    makeLoChain-net mm8 mm7 >&! net.log &

    # load reference to over.chain into database table, 
    # and create symlinks  /gbdb  and download area
    ssh hgwdev
    cd /cluster/data/mm8/bed/liftOver
    makeLoChain-load mm8 mm7 >&! load.log &

    # test by converting a region using the "convert" link on
    # the browser, and comparing to blat of the same region

#############################################################################
# Create Allen Brain Atlas mapping. (DONE 2006-04-12 galt)

# compile allenCollectSeq
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq
    make

# Set up directory
    ssh kk
    cd /cluster/data/mm8/bed
    mkdir allenBrain
    cd allenBrain

# In /san/sanvol1/visiGene/offline/allenBrain/probesAndData/
#  allen20051021.tab (converted from spreadsheet mailed by Susan Sunkin <SusanS@alleninstitute.org>)
#  probeSeq.20051027.fasta (also from Susan).

# Create a list of probe sequences, filling in ones missing from
# probeSeq.20051027.fasta with some NCBI and TIGR files, and some downloaded
# one at a time.
     allenCollectSeq /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab /san/sanvol1/visiGene/offline/allenBrain/probesAndData/probeSeq.20051027.fasta /cluster/data/mm7/bed/ncbiXm/ncbiNm.fa /cluster/data/mm7/bed/ncbiXm/ncbiXm.fa /cluster/data/mm6/bed/tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab
    
# Set up a blat run to align the probes.
    mkdir split
    faSplit sequence allProbes.fa 200 split/rp
    mkdir run
    cd run
    ls -1 ../split/*.fa > mrna.lst
    ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
    mkdir psl
    cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
# Then do the usual para try/push/time/check until the run is finished

# Then do sorting and near-best-in-genome step on file server
    ssh kkstore02
    cd /cluster/data/mm8/bed/allenBrain/run
    pslSort dirs raw.psl tmp psl
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
    sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
   rm raw.psl
   rm -r psl
   rm -r ../split

# Load up database
   ssh hgwdev
   cd /cluster/data/mm8/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
   hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql
   hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'

# Make probe alignment table, and load sequence.
   hgLoadPsl mm8 allenBrainAli.psl
   mkdir /gbdb/mm8/allenBrain
   ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa
   hgLoadSeq mm8 /gbdb/mm8/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain	
   hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain 

#########################################################################
# BLASTZ HUMAN Hg17 (DONE - 2006-04-13 - 2006-04-19 - Hiram)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastzHg17.2006-04-13
    cd /cluster/data/mm8/bed
    ln -s blastzHg17.2006-04-13 blastz.hg17
    cd blastzHg17.2006-04-13

    cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Human Hg17 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse
SEQ2_LEN=/cluster/data/hg17/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastzHg17.2006-04-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    cd /cluster/data/mm8/bed/blastzHg17.2006-04-13
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    656m20.633s

    #	Then to swap over to Hg17
    mkdir /cluster/data/hg17/bed/blastz.mm8.swap
    cd /cluster/data/hg17/bed
    ln -s blastz.mm8.swap blastz.mm8
    cd blastz.mm8.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap /cluster/data/mm8/bed/blastzHg17.2006-04-13/DEF \
	> swap.out 2>&1 &

    ssh hgwdev
    time nice -n +19 featureBits mm8 chainHg17Link
    #	984380268 bases of 2567283971 (38.343%) in intersection
    time nice -n +19 featureBits hg17 chainMm8Link
    #	994530172 bases of 2881515245 (34.514%) in intersection
    cd /cluster/data/mm8/bed/blastzHg17.2006-04-13
    time nice -n +19 featureBits mm8 chainHg17Link > fb.mm8.chainHg17Link 2>&1
    #	990554882 bases of 2567283971 (38.584%) in intersection
    time nice -n +19 featureBits hg17 chainMm8Link > fb.hg17.chainMm8Link 2>&1
    #	997368618 bases of 2866216770 (34.797%) in intersection

########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE - 2006-04-20 - Hiram)
    ssh kk
    mkdir /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
    cd /cluster/data/mm8/bed
    ln -s blastz.xenTro2.2006-04-20 blastz.xenTro2
    cd blastz.xenTro2.2006-04-20

    cat << '_EOF_' > DEF
# mouse vs. frog
BLASTZ=/cluster/bin/penn/blastz.v7

# Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/cluster/data/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Frog xenTro2 - single chunk big enough to run two of the
#               largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
'_EOF_'
    # << emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
    #	 XXX running 2006-04-20


    #	Then to swap over to xenTro2
    mkdir /cluster/data/xenTro2/bed/blastz.mm8.swap
    cd /cluster/data/xenTro2/bed
    ln -s blastz.mm8.swap blastz.mm8
    cd blastz.mm8.swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-swap /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20/DEF \
	> swap.out 2>&1 &

    ssh hgwdev
    cd /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
    time nice -n +19 featureBits mm8 chainXenTro2Link \
	> fb.mm8.chainXenTro2Link 2>&1 &
    #	68050843 bases of 2567283971 (2.651%) in intersection
    cd /cluster/data/xenTro2/bed/blastz.mm8.swap
    time nice -n +19 featureBits xenTro2 chainMm8Link \
	> fb.xenTro2.chainMm8Link 2>&1
    #	72840135 bases of 1359412157 (5.358%) in intersection

#######################################################################
## LIFTOVER To Mm7 (DONE - 2006-04-21 - 2006-04-24 - Hiram)
    ssh kkr1u00
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
      mm7 /cluster/data/mm7/nib
    # as it says, DO THIS NEXT:
    ssh kk
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
	mm8 /scratch/hg/mm8/nib mm7 /iscratch/i/mm7/split10k \
	/cluster/data/mm7/11.ooc
    # as it says, DO THIS NEXT:
    cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/run
    para try, check, push, check, ...
# Completed: 1360 of 1360 jobs
# CPU time in finished jobs:    3890058s   64834.31m  1080.57h   45.02d  0.123 y
# IO & Wait Time:                 13326s     222.09m     3.70h    0.15d  0.000 y
# Average job time:                2870s      47.84m     0.80h    0.03d
# Longest finished job:           27224s     453.73m     7.56h    0.32d
# Submission to last job:         80553s    1342.55m    22.38h    0.93d

    # as it says, DO THIS NEXT:
    #	this does the liftUp and makes the psl files
    ssh kkr1u00
    cd /cluster/data/mm8/bed
    ln -s blat.mm7.2006-04-21 blat.mm7
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm8 mm7
    #	real    16m5.091s
    # as it says, DO THIS NEXT:
    #	this prepares the batch to run for the chaining
    ssh kki
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
	mm8 /cluster/data/mm8/nib mm7 /cluster/data/mm7/nib
    # as it says, DO THIS NEXT:
    #	running the chain batch
    cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/chainRun
    para try, check, push, check, ...
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       5381s      89.68m     1.49h    0.06d  0.000 y
# IO & Wait Time:                  2119s      35.32m     0.59h    0.02d  0.000 y
# Average job time:                 188s       3.12m     0.05h    0.00d
# Longest finished job:             652s      10.87m     0.18h    0.01d
# Submission to last job:           685s      11.42m     0.19h    0.01d
    ssh kkstore04
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm8 mm7
    #	Created /cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz
    # as it says, DO THIS NEXT:
    ssh hgwdev
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm8 mm7
    #	It says this:
    # 	Now, add link for
    #	/usr/local/apache/htdocs/goldenPath/mm8/liftOver/mm8ToMm7.over.chain
    #	to hgLiftOver
    #	But I believe that link was already done:
    cd /gbdb/mm8/liftOver
    ls -og mm8ToMm7*
    #	lrwxrwxrwx    1       53 Apr 24 12:32 mm8ToMm7.over.chain.gz -> \
    #		/cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz

########################################################################
##  CYTOBAND - ideogram track (DONE - 2006-04-28 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm8/pre_release
    #	The .wgetrc is the anonymous user
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release/ideogram

    mkdir /cluster/data/mm8/cytoBand
    cd /cluster/data/mm8/cytoBand

    # Create bed file
    $HOME/kent/src/utils/createNcbiCytoBand.pl \
	/cluster/data/mm8/pre_release/ideogram
    # Load the bed file
    hgLoadBed -strict -noBin \
	-sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql mm8 \
	cytoBand cytoBand.bed
    # Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
    # "if exists" keeps the drop from erroring on a first build, when there
    # is no cytoBandIdeo table yet.
    hgsql -e "drop table if exists cytoBandIdeo;" mm8
    hgsql mm8 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"


#########################################################################
# GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram)
    ssh kkstore04
    #	The globs below are relative paths, so move to the assembly
    #	directory first (assumed to be /cluster/data/mm8, where the
    #	per-chrom directories and mm8.2bit are expected -- confirm).
    cd /cluster/data/mm8
    #	Create a 2bit file with the full chrom sequences and the
    #	random contigs, all hard masked
    cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
	| maskOutFa stdin hard stdout \
	    | faToTwoBit stdin mm8Chroms_RandomContigs.hard.2bit
    #	make sure it still has all the unmasked sequence in it:
    twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
	| faSize stdin
    #	2661205088 bases (1183272085 N's 1477933003 real 1477933003
    #	upper 0 lower) in 99 sequences in 1 files
    twoBitToFa mm8.2bit stdout | faSize stdin
    #	2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
    #	1089350685 lower) in 34 sequences in 1 files
    #	note the 'real' bases are the same, the lowers have become N's
    #	1089350685 + 97171400 = 1186522085 
    #	1186522085 - 1183272085  = 3250000 == N's in gaps between contigs

    #	And, make sure there aren't any sequences in this lot that have
    #	become all N's with no sequence left in them:
    twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
	| faCount stdin > chroms_randoms.faCount
    #	the lowest three are:
    egrep -v "^#|^total" chroms_randoms.faCount \
	| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
    #	MmUn_162590_36 1631
    #	Mm1_163269_36 1581
    #	MmUn_102813_36 1479

    #	creating 4,000,000 sized chunks, the chroms stay together as
    #	single pieces.  The contigs get grouped together into 4,000,000
    #	sized fasta files.  You don't want to break these things up
    #	because genscan will be doing its own internal 2.4 million
    #	window on these pieces, and the gene names are going to be
    #	constructed from the sequence name in these fasta files.  The
    #	gene names are much better when they are this simple chrN.M
    #	numbering scheme, or in the case of a contig: contig_name.M
    #	where the M is a sequence number that genscan will assign to
    #	each gene it discovers.
    mkdir hardChunks
    twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
	| faSplit about stdin 4000000 hardChunks/c_
    rsync -a --progress hardChunks/ /cluster/bluearc/mm8/hardChunks/

    ssh hgwdev
    mkdir /cluster/data/mm8/bed/genscan
    cd /cluster/data/mm8/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/mm8/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1S /cluster/bluearc/mm8/hardChunks/c_*.fa > genome.list

    # Create the gensub2 template file, named "template" here (3-line file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...
# Completed: 673 of 673 jobs
# CPU time in finished jobs:      76339s    1272.32m    21.21h    0.88d  0.002 y
# IO & Wait Time:                  2327s      38.78m     0.65h    0.03d  0.000 y
# Average job time:                 117s       1.95m     0.03h    0.00d
# Longest finished job:            1993s      33.22m     0.55h    0.02d
# Submission to last job:          7526s     125.43m     2.09h    0.09d

    #	There was a failed job, going to kolossus and running with a
    #	reduced window size:
    ssh kolossus
    cd /cluster/data/mm8/bed/genscan
    time /cluster/bin/x86_64/gsBig /cluster/bluearc/mm8/hardChunks/c_01.fa \
        gtf/c_01.gtf -trans=pep/c_01.pep -subopt=subopt/c_01.bed \
        -exe=hg3rdParty/genscanlinux/genscan \
        -par=hg3rdParty/genscanlinux/HumanIso.smat \
	-tmp=/scratch/tmp -window=2000000
    #	real    258m34.800s

    # cat and lift the results into single files
    ssh kkstore04
    cd /cluster/data/mm8/bed/genscan
    cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
	../../jkStuff/liftAll.lft carry stdin
    cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
	../../jkStuff/liftAll.lft carry stdin
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/mm8/bed/genscan
    ldHgGene mm8 -gtf genscan genscan.gtf
    #	Read 44899 transcripts in 323099 lines in 1 files
    #	44899 groups 34 seqs 1 sources 1 feature types
    #	44899 gene predictions

    hgPepPred mm8 generic genscanPep genscan.pep
    hgLoadBed -strict mm8 genscanSubopt genscanSubopt.bed
    #	Loaded 530201 elements of size 6

    #	check the numbers
    time nice -n +19 featureBits mm8 genscan
    #	54455852 bases of 2567283971 (2.121%) in intersection
    time nice -n +19 featureBits mm8 knownGene:cds
    #	28459053 bases of 2567283971 (1.109%) in intersection
    featureBits mm7 genscan
    #	54864694 bases of 2583394090 (2.124%) in intersection
    time nice -n +19 featureBits mm7 knownGene:cds
    #	27531524 bases of 2583394090 (1.066%) in intersection
    featureBits mm6 genscan
    #	54894283 bases of 2597150411 (2.114%) in intersection
    featureBits mm5 genscan
    #	55024722 bases of 2615483787 (2.104%) in intersection
    featureBits mm4 genscan
    #	56164126 bases of 2627444668 (2.138%) in intersection
    featureBits mm3 genscan
    #	51697165 bases of 2505900260 (2.063%) in intersection

    featureBits mm8 genscanSubopt
    #	57048581 bases of 2567283971 (2.222%) in intersection
    featureBits mm7 genscanSubopt
    #	57512333 bases of 2583394090 (2.226%) in intersection
    featureBits mm6 genscanSubopt
    #	57856316 bases of 2597150411 (2.228%) in intersection
    featureBits mm5 genscanSubopt
    #	58474899 bases of 2615483787 (2.236%) in intersection
    featureBits mm4 genscanSubopt
    #	59601009 bases of 2627444668 (2.268%) in intersection
    featureBits mm3 genscanSubopt
    #	56085184 bases of 2505900260 (2.238%) in intersection

##########################################################################
# BUILD NIBB IMAGE PROBES (in progress 2007-05-05 Jim)

# Make directory on san for cluster job and copy in sequence
    ssh pk
    mkdir /san/sanvol1/scratch/mm8/nibbPics
    cd /san/sanvol1/scratch/mm8/nibbPics
    cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .

# Make parasol job dir and sequence list files
    mkdir run
    cd run
    mkdir psl
    ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
    echo ../nibbImageProbes.fa > mrna.lst

# Create parasol gensub template file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'

# Create parasol batch
    gensub2 genome.lst mrna.lst gsub spec
    para create spec

# Do para try/push/time etc.
#Completed: 49 of 49 jobs
#CPU time in finished jobs:      12585s     209.74m     3.50h    0.15d  0.000 y
#IO & Wait Time:                   411s       6.86m     0.11h    0.00d  0.000 y
#Average job time:                 265s       4.42m     0.07h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            1145s      19.08m     0.32h    0.01d
#Submission to last job:          1195s      19.92m     0.33h    0.01d


# Make sort and filter
#   Gather the per-job psl files, keep the near-best alignments with pslReps,
#   sort by target name and start, then strip the nib directory prefix and
#   the ".nib" suffix.  The dot is escaped so that only a literal ".nib" is
#   removed; an unescaped "." would match any character in front of "nib",
#   and the first such match on a line need not be the target file suffix.
    catDir psl | sort -k 10 \
        | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
	| sort -k 14,14 -k 16,16n \
	| sed 's#/scratch/hg/mm8/nib/chr#chr#' \
	| sed 's/\.nib//' > ../nibbImageProbes.psl

# Make bed file and copy in stuff
    ssh hgwdev
    cd /cluster/data/mm8/bed
    mkdir nibbPics
    cd nibbPics
    cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
    cp /san/sanvol1/scratch/mm8/nibbPics/nibbImageProbes.psl .

# Load into database
    ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/mm8/nibbImageProbes.fa
    hgLoadSeq mm8 /gbdb/mm8/nibbImageProbes.fa
    hgLoadPsl mm8 nibbImageProbes.psl


#############################################################################
#  miRNA track (DONE - 2006-05-22 - Fan)
    #   data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    ssh hgwdev
    cd /cluster/data/mm8/bed
    mkdir miRNA-2006-05-22
    # cd into the directory just created (name must match the mkdir above)
    cd miRNA-2006-05-22
    # save the mm8_miRNA_track_may2006.txt file from email
    # turn every space into a tab to produce the file loaded below
    sed -e 's/ /\t/g' mm8_miRNA_track_may2006.txt > miRNA.tab

    hgLoadBed -strict mm8 miRNA miRNA.tab

# check previous release track before update
    featureBits mm8 miRNA
    #   28630 bases of 2567283971 (0.001%) in intersection
    featureBits mm7 miRNA
    #   20620 bases of 2583394090 (0.001%) in intersection
    featureBits mm6 miRNA
    #   21167 bases of 2597150411 (0.001%) in intersection
    featureBits mm5 miRNA
    #   17957 bases of 2615483787 (0.001%) in intersection


#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/24/06 angie)
    ssh pk
    mkdir /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
    cd /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
    cat << '_EOF_' > DEF
# mouse vs chicken
BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm8/linSpecRep/notInNonMammal
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.galGal3.2006-05-23
'_EOF_'
    # << emacs
    doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmm8 \
      -bigClusterHub=pk -smallClusterHub=pk \
      -chainMinScore=5000 -chainLinearGap=loose \
      >& do.log & tail -f do.log
    ln -s blastz.galGal3.2006-05-23 /cluster/data/mm8/bed/blastz.galGal3

#########################################################################
# ADD LINK TO GENENETWORK (DONE. 5/31/06 Fan).

# Copy geneNetwork ID list from mm7

    ssh hgwdev
    mkdir -p /cluster/data/mm8/bed/geneNetwork
    cd /cluster/data/mm8/bed/geneNetwork

    # Dump the mm7 table contents (no header row) to a tab file
    hgsql mm7 -N -e 'select * from geneNetworkId' > geneNetworkId.tab

    # "if exists" keeps the drop from erroring when mm8 has no table yet
    hgsql mm8 -e 'drop table if exists geneNetworkId'
    # Table definition from the source tree; path spelled like the other
    # ~/kent/src/hg/lib/*.sql references in this doc
    hgsql mm8 < ~/kent/src/hg/lib/geneNetworkId.sql
    hgsql mm8 -e \
    'load data local infile "geneNetworkId.tab" into table geneNetworkId'

############################################################################
# SGP GENES (DONE - 2006-06-12 - Hiram)
    ssh kkstore02
    cd  /cluster/data/mm8/bed
    ln -s /cluster/store8/mm8/bed/sgp .
    cd sgp
    #   No SGP predictions are offered for chrM, so every chrom.sizes name
    #   containing "chrM" is left out; fetch one GTF per remaining name.
    for chrom in $(awk '$1 !~ /chrM/ {print $1}' /cluster/data/mm8/chrom.sizes); do
        wget --timestamping -O "${chrom}.gtf" \
"http://genome.imim.es/genepredictions/M.musculus/mmMar2006/SGP/humangp200603/${chrom}.gtf"
    done

    ssh hgwdev
    cd /cluster/data/mm8/bed/sgp
    ldHgGene -gtf -genePredExt mm8 sgpGene chr*.gtf

    featureBits mm8 -enrichment refGene:CDS sgpGene
    #	refGene:CDS 1.063%, sgpGene 1.455%, both 0.918%, cover 86.32%,
    #	enrich 59.32x

#########################################################################
# BUILD KNOWN GENE LIST FOR GOOGLE. (DONE.  6/6/06 Fan).

    cd /cluster/data/mm8/bed
    rm -rf knownGeneList/mm8

# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm8

    hgKnownGeneList mm8

    # copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/mm8
    mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8
    cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8


#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-12 - angie)
### fasta added 2006-06-21
### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab

### NOTE -- as of 2007-03-01 the igtc track will be automatically 
### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and 
### updateIgtc.pl in kent/src/hg/utils/automation/ .

    ssh hgwdev
    mkdir /cluster/data/mm8/bed/igtc
    cd /cluster/data/mm8/bed/igtc
    wget http://www.genetrap.org/blattrack/genetrap_mm8.psl
    grep -v ^track genetrap_mm8.psl \
    | hgLoadPsl mm8 -table=igtc stdin
    # Probe fasta is shared by all assemblies:
    wget http://www.genetrap.org/blattrack/genetrap.fasta
    mkdir /gbdb/mm8/igtc
    ln -s /cluster/data/mm8/bed/igtc/genetrap.fasta /gbdb/mm8/igtc/
    hgLoadSeq -replace mm8 /gbdb/mm8/igtc/genetrap.fasta


#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-12 - Hiram)
    #	download data from "James Taylor" <james@bx.psu.edu>
    ssh kkstore04
    cd /cluster/data/mm8/bed
    mkdir /cluster/store8/mm8/bed/regPotential7X
    ln -s /cluster/store8/mm8/bed/regPotential7X .
    cd regPotential7X
    
    #	This is a lot of data
    time for C in 1 2 3 4 5 6 7 8 9 X 10 11 12 13 14 15 16 17 18 19
    do
    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/chr${C}.scores.truncated.bz2"
    done
    #	real    79m32.840s

    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/trackDb.html" -O description.html

    time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X
    do
	bzcat chr${C}.scores.truncated.bz2
    done | wigEncode -noOverlap  stdin regPotential7X.wig regPotential7X.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    22m28.583s

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/mm8/bed/regPotential7X
    ln -s /cluster/data/mm8/bed/regPotential7X/regPotential7X.wib \
	/gbdb/mm8/wib/regPotential7X.wib
    #	using the tmpDir is faster since it is on local disk and it will
    #	clean up any temporary .tab file it creates there
    time hgLoadWiggle -tmpDir=/scratch/tmp \
	mm8 regPotential7X regPotential7X.wig
    #	real    0m28.683s

    #	create a histogram
    ssh kolossus
    cd /cluster/data/mm8/bed/regPotential7X
    time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
	-hMinVal=0.0 -db=mm8 regPotential7X > histogram.data 2>&1
    #	real    18m29.167s

    #	create download gzip files from the bz2 files:
    ssh kkstore04
    cd /cluster/data/mm8/bed/regPotential7X

    for F in chr*.scores.truncated.bz2
    do
	C=`echo $F | awk -F'.' '{print $1}'`
	echo -n "${C}.regPotential7X.mm8.gz working ... "
	bzcat ${F} | gzip > ${C}.regPotential7X.mm8.gz
	echo
    done

#############################################################################
# SIB Transcriptome (DONE Aug 29, 2007 - JK)

   # Create working directory and download data from where Christian Iseli
   # (Christian.Iseli@licr.org) put it, and unpack.  The download takes about
   # ten minutes (161M file).
   cd /cluster/data/mm8/bed
   mkdir sibTranscriptome
   cd sibTranscriptome
   wget ftp://ftp.licr.org/pub/databases/trome/mouse/MTR.gtf.gz
   wget ftp://ftp.licr.org/pub/databases/trome/mouse/txg.tar.gz
   tar -zxvf txg.tar.gz

   # Load up sibGene table
   zcat MTR.gtf.gz | ldHgGene mm8 sibGene stdin

   # Do a little data cleanup and transformation and load splice graphs into database.
   sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql
   cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm8 sibTxGraph stdin

   # Create sibAltEvents track for analysed alt-splices.
   cat txg/*.txg | txgAnalyze stdin /cluster/data/mm8/mm8.2bit sibAltEvents.bed
   awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
   hgLoadBed mm8 sibAltEvents foo.bed


#########################################################################
# MAP CONTIGS TRACK (DONE - 2005-10-04 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm8/bed/ctgPos
    cd /cluster/data/mm8/bed/ctgPos
    # hgCtgPos uses the lift files... but mouse lift files are for the
    # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
    # from the assembly.  (In the future, we should go with the NT's!)
    # So... just for this release, go straight from the seq_contig.md
    # to the table def'n: contig, size, chrom, chromStart, chromEnd
    #	This script is an improvement from before, this is now doing the
    #	randoms properly.
    cat << '_EOF_' > seqContigToCtgPos.pl
#!/usr/bin/env perl
#
# seqContigToCtgPos.pl - read seq_contig.md lines (stdin or named files) and
#	print tab-separated ctgPos rows on stdout:
#	    contig, size, chrom, chromStart, chromEnd
#
# Whitespace-split fields used: [1] chromosome, [2] start (1-based),
#	[3] end, [5] contig name.
#	A chromosome field containing '|' (e.g. "N|...") marks a contig that
#	is placed on chrN_random; those contigs are laid out one after the
#	other starting at position 0, with a fixed gap after each one.
#	Any other line is used only when its contig name looks like NT_nnn
#	or NC_nnn; its coordinates are converted to 0-based half-open.

use warnings;
use strict;

# gap left after every contig placed on a random chrom (chrUn_random too)
my $randomGap = 50000;

my $prevRandom = "";	# chrom part of the previous random contig seen
my $randomPosition = 0;	# next free position on the current random chrom

while (my $line = <>)
{
chomp($line);
my @field = split('\s+', $line);
if ($field[1] =~ m/\|/)
    {
    my @chromPart = split('\|', $field[1]);
    #	a different random chrom: start laying out from zero again
    if ($chromPart[0] ne $prevRandom)
	{
	$randomPosition = 0;
	$prevRandom = $chromPart[0];
	}
    my $size = $field[3] - $field[2] + 1;
    my $start = $randomPosition;
    my $end = $randomPosition + $size;
    printf "%s\t%d\tchr%s_random\t%d\t%d\n",
	$field[5], $size, $chromPart[0], $start, $end;
    #	advance past this contig plus the gap that follows it
    $randomPosition += $size + $randomGap;
    }
elsif ($field[5] =~ m/^N[TC]_\d+$/)
    {
    #	placed contig: 1-based start becomes 0-based, end is unchanged
    my $start = $field[2] - 1;
    my $end = $field[3];
    my $size = $end - $start;
    printf "%s\t%d\tchr%s\t%d\t%d\n", $field[5], $size, $field[1], $start, $end;
    }
}
'_EOF_'
    #	<< emacs happy
    chmod +x seqContigToCtgPos.pl

    egrep "ref_strain|C57BL" ../../seq_contig.md \
	| ./seqContigToCtgPos.pl > ctgPos.tab

    cat ../../seq_contig.md | ./seqContigToCtgPos.pl > ctgPos.tab

    hgsql mm8 -e "drop table ctgPos;"
    hgsql mm8 < ~/kent/src/hg/lib/ctgPos.sql
    hgsql mm8 -e 'load data local infile "ctgPos.tab" into table ctgPos;'

    featureBits -countGaps mm8 ctgPos
    #	2573322222 bases of 2664455088 (96.580%) in intersection
    featureBits -countGaps mm7 ctgPos
    #	2608810329 bases of 2847717329 (91.611%) in intersection
    featureBits -countGaps mm6 ctgPos
    #	2638893452 bases of 3079633452 (85.689%) in intersection
    featureBits -countGaps mm5 ctgPos
    #	2557081173 bases of 3164952073 (80.794%) in intersection

#####################################################################
#### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
# ADDED PEPTIDE TABLE, ENSPEP (DONE, 2006-07-11, hartera)
# ADDED STABLE URL TO TRACKDB BLOCK (V39, JUN 2006) (2008-01-10, rhead)
   #	create the mm8 ensGene work directory and work in it (not the mm7 one)
   mkdir /cluster/data/mm8/bed/ensGene
   cd /cluster/data/mm8/bed/ensGene

        Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
        Choose Ensembl 39 and Mus musculus, click next
	It displays status in a window on the right, indicating how many
	entries are here, currently: 27,967
    	The next page is the "filter" step, we do not want any filters,
	nothing is changed on this page, click next
	Now we are on the "output" tab, the filter in the window on the right
	indicates that 27,967 passed the filter.  (there is no filter)
	Now, on this output page, change the pull-down menu item from
	its default of "features" to read "structures"
	All the check-boxes now change.
	Mark the check box GTF under output format
	Under Gene Ensemble Attributes,
	Unselect Biotype
	Select
	Ensembl Gene ID
	Ensembl Transcript ID
	External Gene ID

	gzip compression
	and give it a filename: ensGeneMm8
	it will add the .gff.gz suffix
	press "export"

#	The random coordinates are given in contig
#	coordinates, need to lift them to chroms, create a lift file.
#	Each output line is in liftUp format:
#	    offset, contig name, contig size, chrom name, chrom size
#	built from the ctgPos columns (contig, size, chrom, chromStart,
#	chromEnd) plus the chrom size from chrom.sizes.
#	The here-doc has to be fed to cat; echo does not read its stdin.
	cat << '_EOF_' > mkRandomNTLift.sh
#!/bin/sh
#
# mkRandomNTLift.sh - print a lift file (stdout) for the contigs on every
#	chr*_random listed in chrom.sizes, using the mm8.ctgPos table

grep random /cluster/data/mm8/chrom.sizes | while read R
do
        # chrom.sizes line: chrom name, chrom size
        chr=`echo $R | awk '{print $1}'`
        size=`echo $R | awk '{print $2}'`
        # one lift line per contig on this random chrom; the shell
        # variable $size is spliced into the awk program text
        hgsql -N -e "select * from ctgPos where chrom=\"$chr\";" mm8 | \
awk '
BEGIN {size="'$size'"}
{
        printf "%s\t%s\t%s\t%s\t%s\n", $4, $1, $2, $3, size
}
'
done
'_EOF_'
    # << happy emacs
    chmod +x ./mkRandomNTLift.sh
    ./mkRandomNTLift.sh > randomNT.lft

    # Add "chr" to front of each line (that is a normal chrom number)
    #	 in the gene data gtf file to make 
    # it compatible with ldHgGene and convert the chrMT name, and lift
    # the random coordinates

    zcat ensGeneMm8.gff.gz \
	| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
	| liftUp ensGene.gtf randomNT.lft carry stdin
    ldHgGene mm8 ensGene ensGene.gtf
# Read 34831 transcripts in 597575 lines in 1 files
#   34831 groups 34 seqs 1 sources 4 feature types
# 34831 gene predictions

    featureBits mm8 ensGene
    #	56159487 bases of 2567283971 (2.188%) in intersection
    featureBits mm7 ensGene
    #	57484684 bases of 2583394090 (2.225%) in intersection
    featureBits mm6 ensGene
    #	54791625 bases of 2597150411 (2.110%) in intersection

    # Load ensGtp table.
    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  

    # Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
    # Choose Ensembl 39 and Mus musculus, click next
    # Follow this sequence through the pages:
    # 1) No filters in the filter section, click next go to Output
    # 2) Select "Structures".
    # 3) select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. 
    # 4) select "Text, tab separated" gzip and name the output file as "ensGtp"
    # 5) download the output file "ensGtp.tsv.gz"
    #       the tsv.gz is added automatically to the ensGtp

    #	Something is unusual in this download.  The lines are duplicated
    #	about 8 times more than necessary
    zcat ensGtp.tsv.gz | wc -l
    #	284554
    zcat ensGtp.tsv.gz | sort -u | wc -l
    #	34832
    #	(leave ensGtp.tsv.gz compressed, the load below reads it with zcat)

    hgsql mm8 < ~/kent/src/hg/lib/ensGtp.sql
    #	The 'tail -n +2' skips the first line which is just column
    #	heading labels.  The sort -u will eliminate the duplicate lines:
    zcat ensGtp.tsv.gz | tail -n +2 | sort -u \
    	| hgsql mm8 -e \
	'load data local infile "/dev/stdin" into table ensGtp;' 
    hgsql -e "select count(*) from ensGtp;" mm8
    #	34831
    #	properly, one less than the count above

    #	clean up
    gzip ensGene.gtf
    rm genePred.tab

    #	Now, an experiment to determine if the Ensembl peptide sequences
    #	are the same thing we get here upon translation of the CDS coding
    #	sequence from the genome
    mkdir /cluster/data/mm8/bed/ensGene/testPeptides
    cd /cluster/data/mm8/bed/ensGene/testPeptides
    getRnaPred -cdsOnly mm8 ensGene all stdout | gzip > all.cdsOnly.gz
    #	Obtaining protein sequence from EnsMart
    #	Select "sequences" from the pull-down on the output page
    #	check Peptide in the "Sequences" selection area
    #	and "Ensembl Transcript ID (versioned) in the Transcript
    #	Attributes area
    #	Text,Fasta output, gzip, file name: ensPepMm8
    #	becomes ensPepMm8.fasta.gz

    #	A special faToTab.pl script to allow an exclude list, first need
    #	to obtain the exclude list from the ensembl set:
    zcat ensPepMm8.fasta.gz \
	| ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
	    | sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
    #	extract the exclude list from that
    grep "Sequence unavailable" ensPepMm8.fa.tab \
	| awk '{print $1}' > excludeList.txt
    #	now filter via that exclude list, remove the final '*' character
    #	from their protein sequence and sort by name
    zcat ensPepMm8.fasta.gz \
	| ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
	    | sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
    #	and then our peptides, same filter, remove the final 'Z' character
    #	from this protein sequence (the stop codon):
    zcat all.cdsOnly.gz | faTrans stdin stdout \
	| ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
	    | sed -e "/^$/d; s/Z$//" | sort > all.fa.tab
    #	do we have the same lists:
    awk '{print $1}' ensPepMm8.fa.tab > ensList		
    awk '{print $1}' all.fa.tab > ucscList
    diff ensList ucscList
    #	no differences in the name list, numbering:
    wc -l ensList ucscList
    #	31302 ensList
    #	31302 ucscList
    #	How many proteins different:
    diff ensPepMm8.fa.tab all.fa.tab | grep "^>" | awk '{print $2}' | wc -l
    #	37
    #	Taking a look at that difference, it is difficult to see the
    #	individual differences, some are single amino acid
    #	differences, others are more radically different:
    diff ensPepMm8.fa.tab all.fa.tab | less

    #	Conclusion, the 37 differences out of 31,302 are not worth the
    #	trouble to load up the entire Ensembl peptide table 
    # Add Ensembl peptide table - requested by a user (hartera, 2006-07-11)
    ssh hgwdev
    cd /cluster/data/mm8/bed/ensGene
cat << EOF > ensPep.sql
CREATE TABLE ensPep (
name varchar(255) not null,     # Name of gene - same as in genePred
seq longblob not null,     # Peptide sequence
#Indices
PRIMARY KEY(name(64))
);
EOF
    cp ./testPeptides/ensPepMm8.fa.tab.gz .
    gunzip ensPepMm8.fa.tab.gz
    hgLoadSqlTab mm8 ensPep ensPep.sql ensPepMm8.fa.tab -warn

###########################################################################
## MAKE SUPERFAMILY TRACK (DONE, 6/22/06, Fan)

# If mm8.superfamily already exists, drop it.
   cd /cluster/data/mm8/bed
   mkdir /cluster/data/mm8/bed/sf.20060622
   ln -s sf.20060622 sf
   cd sf
   hgSuperfam mm8 superfam060619 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If mm8.sfDescription exists, drop it.

   hgsql mm8 < ~/src/hg/lib/sfDescription.sql
   hgsql mm8 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table mm8.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed mm8 superfamily superfamily.tab -tab

# Create knownToEnsembl table
   hgMapToGene mm8 ensGene knownGene knownToEnsembl

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.

   cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
   | hgKnownToSuper mm8 mm stdin
# 26547 records output


###########################################################################
# dbSNP BUILD 126 (Heather, August 2006)

# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/126/mouse
mkdir mm8
cd mm8
mkdir data
mkdir schema
mkdir rs_fasta

# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/126/mouse/mm8/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/mouse_10090/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b126_SNPContigLoc_36_1.bcp.gz
# ContigLocusId has function
get b126_SNPContigLocusId_36_1.bcp.gz
get b126_ContigInfo_36_1.bcp.gz
# MapInfo has alignment weights
get b126_SNPMapInfo_36_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz

# Get schema from NCBI
cd /cluster/data/dbSNP/126/mouse/mm8/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/mouse_10090/database/organism_schema
get mouse_10090_table.sql.gz

# Get fasta files from NCBI
# using headers of fasta files for molType
# (the rs_fasta directory was created under mm8 in the set up step above)
cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
ftp ftp.ncbi.nih.gov
# the next three lines are typed at the ftp prompt
cd snp/organisms/mouse_10090/rs_fasta
prompt
mget *.gz

# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/126/mouse/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/126/mouse/rs_fasta
# concat into rsAll.fas
# the files copied in above are gzipped (rs_ch*.fas.gz), so glob on the
# .gz names and let zcat uncompress them
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas.gz)
    echo $file
    zcat $file >> rsAll.fas
end
'_EOF_'
# << happy emacs
# run it to produce rsAll.fas for the next step
chmod +x concat.csh
./concat.csh
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/mm8/snp
ln -s /cluster/store12/snp/126/mouse/rs_fasta/snp.fa /gbdb/mm8/snp/snp.fa
cd /cluster/store12/snp/126/mouse/rs_fasta
hgLoadSeq mm8 /gbdb/mm8/snp/snp.fa

# look up id in extFile
# move into separate table
hgsql mm8 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 9642470' mm8
hgsql -e 'delete from seq where extFile = 9642470' mm8
hgsql -e 'alter table snpSeq add index acc (acc)' mm8

# clean up after hgLoadSeq
rm seq.tab

# Simplify names of data files
cd /cluster/data/dbSNP/126/mouse/mm8/data
mv b126_ContigInfo_36_1.bcp.gz ContigInfo.gz
mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz
mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz
mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist

# edit table descriptions
cd /cluster/data/dbSNP/126/mouse/mm8/schema
# get CREATE statements from mouse_10090_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp

# Get updated UniVariation table
cd /cluster/data/dbSNP/126/shared
ftp ftp.ncbi.nih.gov
cd snp/database/shared_data
get UniVariation.bcp.gz
cd ../shared_schema
get dbSNP_main_table.sql.gz
# get UniVariation CREATE statement from dbSNP_main_table.sql
# use mssqlToMysql.sed to convert 

# get header lines from rs_fasta
cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
/bin/csh gnl.csh

# load on kkr5u00
ssh kkr5u00
# the SQL statement is the argument of -e; the database to connect to
# (mysql) comes after it
hgsql -e 'create database mm8snp126' mysql
cd /cluster/data/dbSNP/126/mouse/mm8/schema
# create the empty tables from the converted schema, then load the data
hgsql mm8snp126 < table.sql
cd ../data
/bin/csh load.csh

# note rowcount
# ContigLoc     23811983 
# SNP           10837184 
# MapInfo       23570302 
# ContigLocusId 10317095 

cd /cluster/data/dbSNP/126/shared
hgsql mm8snp126 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' mm8snp126

# create working /scratch dir
cd /scratch/snp/126
mkdir mouse
cd mouse

# get mm8 ctgPos, load into mm8snp126, compare contig list between ctgPos and ContigInfo
# No issues in non-random
# No PAR issues 

# get gnl files
cp /cluster/data/dbSNP/126/mouse/mm8/rs_fasta/*.gnl .

# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_strain" 

# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
# Note for mouse we also screen on assembly = "C57BL/6J" in MapInfo

mysql> desc ContigLocFilter;
#  +---------------+-------------+------+-----+---------+-------+
#  | Field         | Type        | Null | Key | Default | Extra |
#  +---------------+-------------+------+-----+---------+-------+
#  | snp_id        | int(11)     | NO   |     |         |       |
#  | ctg_id        | int(11)     | NO   |     |         |       |
#  | chromName     | varchar(32) | NO   |     |         |       |
#  | loc_type      | tinyint(4)  | NO   |     |         |       |
#  | start         | int(11)     | NO   |     |         |       |
#  | end           | int(11)     | YES  |     | NULL    |       |
#  | orientation   | tinyint(4)  | NO   |     |         |       |
#  | allele        | blob        | YES  |     | NULL    |       |
#  +---------------+-------------+------+-----+---------+-------+
 
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter mm8snp126 ref_strain C57BL/6J
# note rowcount
# ContigLocFilter  7923033
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 7779413
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# |     2144 |        1 |
# |  7903966 |        2 |
# |    13105 |        3 |
# |     1052 |        4 |
# |      523 |        5 |
# |     2243 |        6 |
# +----------+----------+


# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter mm8snp126 ref_strain
# note rowcount 
# ContigLocusIdFilter  3484757

# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense mm8snp126 
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# ContigLocusIdCondense 2789998 
# could delete ContigLocusIdFilter table here

# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta mm8snp126 

# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)

# split ContigLocFilter by chrom 
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom mm8snp126 ref_strain

# adjust coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType 
# Between with end != start + 1 
# Between with allele != '-' 
# Exact with end != start 
# Range with end < start 

# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize

# This run no errors, no exceptions

# morph chrN_snpTmp 

mysql> desc chr1_snpTmp;

#  +---------------+-------------+------+-----+---------+-------+
#  | Field         | Type        | Null | Key | Default | Extra |
#  +---------------+-------------+------+-----+---------+-------+
#  | snp_id        | int(11)     | NO   |     |         |       |
#  | ctg_id        | int(11)     | NO   |     |         |       |
#  | chromStart    | int(11)     | NO   |     |         |       |
#  | chromEnd      | int(11)     | NO   |     |         |       |
#  | loc_type      | tinyint(4)  | NO   |     |         |       |
#  | orientation   | tinyint(4)  | NO   |     |         |       |
#  | allele        | blob        | YES  |     | NULL    |       |
#  +---------------+-------------+------+-----+---------+-------+

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype mm8snp126 ref_strain

# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# 200? alleles expanded

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele mm8snp126 ref_strain

# the next few steps prepare for working in UCSC space

# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort mm8snp126 ref_strain

# rename MT --> M (pipeline.csh takes care of this)
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" mm8snp126

# get mm8 nib files
# get mm8 chromInfo, load into mm8snp126 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC mm8snp126

# morph chrN_snpTmp 

mysql> desc chr1_snpTmp;

#  +--------------------+-------------+------+-----+---------+-------+
#  | Field              | Type        | Null | Key | Default | Extra |
#  +--------------------+-------------+------+-----+---------+-------+
#  | snp_id             | int(11)     | NO   |     |         |       |
#  | ctg_id             | int(11)     | NO   |     |         |       |
#  | chromStart         | int(11)     | NO   |     |         |       |
#  | chromEnd           | int(11)     | NO   |     |         |       |
#  | loc_type           | tinyint(4)  | NO   |     |         |       |
#  | orientation        | tinyint(4)  | NO   |     |         |       |
#  | allele             | blob        | YES  |     | NULL    |       |
#  | refUCSC            | blob        | YES  |     | NULL    |       |
#  | refUCSCReverseComp | blob        | YES  |     | NULL    |       |
#  +--------------------+-------------+------+-----+---------+-------+

# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 9621   RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles mm8snp126

# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected: 
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 2890606 univar_id = 0 in SNP (strange, but okay)
# 0 no row in SNP for snp_id in chrN_snpTmp 
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved mm8snp126

# morph chrN_snpTmp
#  +--------------------+---------------+------+-----+---------+-------+
#  | Field              | Type          | Null | Key | Default | Extra |
#  +--------------------+---------------+------+-----+---------+-------+
#  | snp_id             | int(11)       | NO   |     |         |       |
#  | chromStart         | int(11)       | NO   |     |         |       |
#  | chromEnd           | int(11)       | NO   |     |         |       |
#  | loc_type           | tinyint(4)    | NO   |     |         |       |
#  | class              | varchar(255)  | NO   |     |         |       |
#  | orientation        | tinyint(4)    | NO   |     |         |       |
#  | allele             | blob          | YES  |     | NULL    |       |
#  | refUCSC            | blob          | YES  |     | NULL    |       |
#  | refUCSCReverseComp | blob          | YES  |     | NULL    |       |
#  | observed           | blob          | YES  |     | NULL    |       |
#  +--------------------+---------------+------+-----+---------+-------+

# generate exceptions for class and observed

# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType

# ObservedWrongFormat
# ObservedWrongSize 
# ObservedMismatch 

# RangeSubstitutionLocTypeExactMatch

# SingleClassTriAllelic
# SingleClassQuadAllelic

# This will also detect IUPAC symbols in allele

/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved mm8snp126

# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction mm8snp126

# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP mm8snp126

# add molType
# errors detected: missing or duplicate molType
# 57709 duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype mm8snp126

# generate chrN_snp126 and snp126Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable mm8snp126 126

# concat into snp126.tab
# cat chr*_snp126.tab >> snp126.tab
/bin/sh concat.sh

# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple mm8snp126
mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions;

# load on hgwdev
cp snp126.tab /cluster/home/heather/transfer/snp
hgsql mm8snp126 -e 'select * from snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp126.tab' into table snp126; 
mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions; 

# create indexes
mysql> alter table snp126 add index name (name);
mysql> alter table snp126 add index chrom (chrom, bin);
mysql> alter table snp126Exceptions add index name(name);

# create snp126ExceptionDesc table
cd /cluster/data/dbSNP
hgsql mm8 < snp126ExceptionDesc.sql
# add counts to exception.mouse.126, can start with exception.template
hgsql -e 'select count(*), exception from snp126Exceptions group by exception' mm8
mysql> load data local infile 'exception.mouse.126' into table snp126ExceptionDesc;

mysql> select count(*), exception from snp126Exceptions group by exception;
+----------+---------------------------+
| count(*) | exception                 |
+----------+---------------------------+
|    97271 | MultipleAlignments        |
|     1600 | ObservedMismatch          |
|       27 | ObservedWrongFormat       |
|      272 | ObservedWrongSize         |
|     9621 | RefAlleleNotRevComp       |
|    11169 | SingleClassBetweenLocType |
|      346 | SingleClassQuadAllelic    |
|     5023 | SingleClassRangeLocType   |
|     3905 | SingleClassTriAllelic     |
+----------+---------------------------+

####################################################################
##  redoing STS markers track to get them more correct
##		(DONE - 2006-09-15 - Hiram)
    #	Went into the updateBed.pl script, reworked it, made it safer,
    #	debugged a lot of things and placed it into the source tree.

    ssh hgwdev
    mkdir /cluster/data/mm8/bed/STSmarkers.2006-08-29
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
    # with that fixed script, create a new stsInfoMouse.bed file:
    #	Update the m m 7 directory name here to m m 8
    #	for the next build of m m 9,  ...etc... and so forth
    time ~/kent/src/hg/stsMarkers/updateBed.pl \
        /cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \
        ../STSmarkers/downloads/MRK_Dump2.rpt \
	../STSmarkers/downloads/PRB_PrimerSeq.rpt \
        ../STSmarkers/downloads/MRK_Sequence.rpt \
	../STSmarkers/downloads/UniSTS_mouse.alias \
        ../STSmarkers/downloads/UniSTS_mouse.sts \
        -g ../STSmarkers/downloads/10090.WI-Genetic.txt \
        -r ../STSmarkers/downloads/10090.WI_MRC_RH.txt \
        -verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile

    ~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \
        | sed -e "s/\t*$//" > mm8.stsInfoMouse.bed

    # copy the stsInfoMouse.bed file from working dir to the marker
    #	info storage folder.  Added 2 new steps by Yontao.
    #	Be wary of the archive name here, check the directory and get
    #	the name right here.
    mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime
    cp -p mm8.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed

    # comparing to previous, numbers increase slightly each time
    wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
    #	60631 /cluster/store5/mouseMarker/stsInfoMouse.bed
    #	60440 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime
    #	59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
    #	58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
    #	58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5

    # and from that, create new primer fa, epcr, etc:
    time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \
	mm8.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
    # the mouseC.fa file will be empty; the other counts should be more than last time
    wc -l mouse?.* ../STSmarkers/mouse?.*
    #	     0 mouseC.fa
    #	308384 mouseP.fa
    #	 34666 mouseP.info
    #	     0 ../STSmarkers/mouseC.fa
    #	305991 ../STSmarkers/mouseP.fa
    #	 34475 ../STSmarkers/mouseP.info

    #	the equivalent Mm7 files:
    #      0       0       0 mouseC.fa
    # 300968  300914 6798466 mouseP.fa
    #  33838  169275 2153113 mouseP.info
    # 334806  470189 8951579 total
    #	the equivalent Mm6 files:
    #	     0       0       0 mouseC.fa
    #	293305  293251 6624638 mouseP.fa
    #	 32890  164528 2087271 mouseP.info
    #	326195  457779 8711909 total
    #	the equivalent Mm5 files:
    #	     0       0       0 mouseC.fa
    #	286740  286686 6474893 mouseP.fa
    #	 32232  161234 2044810 mouseP.info
    #	318972  447920 8519703 total

    #	copy the primers over to some filesystem close to the klusters
    #	and split them up to have a small number of sequences in one file
    

    mkdir /cluster/bluearc/mm8/stsMarkers.2006-08-29
    cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers.2006-08-29
    cd /cluster/bluearc/mm8/stsMarkers.2006-08-29
    cp -p /cluster/data/mm8/11.ooc .
    mkdir split
    #	356 files for 34,666 sequences, == about 97 sequences per file
    faSplit sequence mouseP.fa 400 split/mm_

    # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
    #	This process could convert to a modern version of blat with the
    #	filters as described, for example, in the STS markers build in Hg18

    #  CLUSTER RUN FOR THE STS PRIMERS
    ssh kk
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
    mkdir primer
    mkdir ePCR
    cd primer
    mkdir out

    #	interestingly, this blat2.2 binary did not function correctly
    #	when given nib files.  It has only about 1/4th of the number of
    #	alignments as it gets when it used fa files for the target
    #	sequence.

    ls -1S /cluster/bluearc/mm8/stsMarkers.2006-08-29/split > primers.list
    #	re-using chrom sequences from first time
    ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list

    cat << '_EOF_' > runBlat2.csh
#!/bin/csh -fe
set primer = /cluster/bluearc/mm8/stsMarkers.2006-08-29/split/$1
set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2
set ooc = /cluster/bluearc/mm8/stsMarkers.2006-08-29/11.ooc
set root2 = $2:r
mkdir -p out/${root2}
set out = $3

/cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \
        -minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out}
'_EOF_'
    #	<< happy emacs
    chmod +x runBlat2.csh

    cat << '_EOF_' > template
#LOOP
./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 primers.list chr.list template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 12104 of 12104 jobs
# CPU time in finished jobs:    1078733s   17978.89m   299.65h   12.49d  0.034 y
# IO & Wait Time:              13537140s  225618.99m  3760.32h  156.68d  0.429 y
# Average job time:                1208s      20.13m     0.34h    0.01d
# Longest finished job:           11831s     197.18m     3.29h    0.14d
# Submission to last job:         20458s     340.97m     5.68h    0.24d

    # on the file server
    ssh kkstore04
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer
    time pslSort dirs primers.raw.psl temp out/chr*
    #	real    3m30.758s
    #	-rw-rw-r--   1 588001891 Sep 15 10:02 primers.raw.psl

    #	filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
    #	should not be more than 100 bases different.
    #	This filters out about 948,260 alignments, or
    #	17.4% = 100.0 * 948260 / 5462936
    time pslSort dirs stdout temp out/chr* | awk -F"\t" '
{ if (((($13 - $12) - ($17 - $16)) > -100) &&
	((($13 - $12) - ($17 - $16)) < 100)) {print}
}
' > primers.psl.100

    rmdir temp

    wc -l *.100 *.psl
    #	5462936 primers.raw.psl
    #	4514676 primers.psl.100
    #    948260 difference

    # a rough comparison with previous results:
    wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100
    #	4500528 /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100

    # another kluster run for the ePCR
    ssh pk
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR
    ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list

    #	pick up e-PCR source from
    #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
    #	version 2.3.1 11 Feb 2005
    #	Had to add the following to both re-PCR_main.cpp and
    #	e-PCR_main.cpp to get them to compile on kolossus:
// max and min Copied from /usr/include/mysql/my_global.h
#define max(a, b)       ((a) >? (b))
#define min(a, b)       ((a) <? (b))

    mkdir out
    cat << '_EOF_' > runPCR
#!/bin/csh -fe
/cluster/bin/x86_64/e-PCR \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.info \
	/cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2
'_EOF_'
    # << happy emacs
    chmod +x runPCR

    cat << '_EOF_' > template
#LOOP
./runPCR $(path1) {check out line+ out/$(num1).epcr}
#ENDLOOP
'_EOF_'
    # << the mouseP.info was created above
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check
    para push
    ... etc ...
    #	There is a single job that produces no output:
    ./runPCR chrX_random.fa out/30.epcr
    #	WARNING: 96 STSs have primer shorter than W
    #	WARNING: 21 STSs have ambiguities within W of 3' end
    #	Not sure what's up with that
# Completed: 33 of 34 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      64904s    1081.73m    18.03h    0.75d  0.002 y
# IO & Wait Time:                  1860s      31.00m     0.52h    0.02d  0.000 y
# Average job time:                2023s      33.72m     0.56h    0.02d
# Longest finished job:            4861s      81.02m     1.35h    0.06d
# Submission to last job:          4862s      81.03m     1.35h    0.06d

    ssh kkstore04
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR
    # all those results become all.epcr
    cat out/*.epcr > all.epcr

    # comparing to previous results:
    wc -l all.epcr
    #	58162 all.epcr
    wc -l /cluster/data/mm8/bed/STSmarkers/ePCR/all.epcr
    #	58088 all.epcr

    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer

    ~/kent/src/hg/stsMarkers/filterSTSPrimers \
    -mouse ../mm8.stsInfoMouse.bed primers.psl.100 \
        ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat

    #  The output should show an increasing count:
    #	Reading name info
    #	Reading primer info
    #	Processing file
    #	100000
    #	200000
    #	300000
    #	...
    #	4500000
    #	Determining ePCR not found from ePCR results
    #	Out of 25749 ePCR alignments examined, not found: 520
    #
    wc -l primers.psl.filter.blat
    #	34043 primers.psl.filter.blat
    wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.filter.blat
    #	34026 primers.psl.filter.blat

    # create file accession_info.rdb
    touch empty_sequence.inf
    ~/kent/src/hg/stsMarkers/compileAccInfo -mouse \
	/cluster/data/mm8 empty_sequence.inf
    #	20502 processed
    mv accession_info.rdb accession_info.rdb.tmp
    ~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \
	< accession_info.rdb.tmp > accession_info.rdb
    #	The -x prints the debug statement:
    #	sort arg:  -t"  " +0 -1 +1 -2g +2 -3g
    rm accession_info.rdb.tmp

    # comparing results to previous
    #	Continuing the trend that began with Mm7, the numbers in
    #	accession_info.rdb continue to decrease.  Even Mm8 has far fewer
    #	fragments than did mm7:
    #	e.g.:
    [hiram@kkstore04 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
    #	21910 total
    [hiram@kkstore04 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
    #	70125 total
    [hiram@kkstore04 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
    #	170812 total

    wc -l accession_info.rdb
    #	20385 accession_info.rdb
    wc -l ../../STSmarkers/primer/accession_info.rdb
    #	20385 ../../STSmarkers/primer/accession_info.rdb

    # creates epcr.not.found.nomatch and epcr.not.found.psl
    ~/kent/src/hg/stsMarkers/epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm8 2> dbg.epcrToPsl
    #	the dbg.epcrToPsl has a number of lines complaining about bad
    #	primers in ../mouseP.info - and indeed they are bad primers,
    #	they do not have a second primer.

    # Comparing results to previous:
    wc -l epcr*
    #	520 epcr.not.found
    #	  0 epcr.not.found.nomatch
    #	520 epcr.not.found.psl
    wc -l ../../STSmarkers/primer/epcr*
    #	501 ../../STSmarkers/primer/epcr.not.found
    #	  0 ../../STSmarkers/primer/epcr.not.found.nomatch
    #	501 ../../STSmarkers/primer/epcr.not.found.psl

    # Mm7 wc epcr*
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
    #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
    #	   0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
    #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
    #	 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
    #	1106 total

    # Mm6 wc epcr*
    wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr*
    #	 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found
    #	  63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch
    #	 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl
    #	 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl
    #	1097 total

    cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
    wc -l primers.psl.filter
    #	34563 primers.psl.filter
    wc -l ../../STSmarkers/primer/primers.psl.filter
    #	34527 primers.psl.filter

    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
    #	34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter

    wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
    #	33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter

    wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
    # 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted

    # create primers.psl.filter.initial
    #	The PATH setting allows extractPslInfo to find other programs that it
    #	is going to use.
    PATH=~/kent/src/hg/stsMarkers:$PATH \
	~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter

    wc -l *.initial
    #	34545 primers.psl.filter.initial
    wc -l ../../STSmarkers/primer/*.initial
    #	34513 ../../STSmarkers/primer/primers.psl.filter.initial
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
    #	34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
    wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
    #	33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
    wc -l \
       /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
    # 33689 

    # create primers.psl.filter.initial.acc
    PATH=~/kent/src/hg/stsMarkers:$PATH \
    ~/kent/src/hg/stsMarkers/findAccession -agp \
	-mouse primers.psl.filter.initial /cluster/data/mm8
    wc -l primers.psl.filter.initial.acc
    #	34545 primers.psl.filter.initial.acc
    wc -l ../../STSmarkers/primer/primers.psl.filter.initial.acc
    #	34513 primers.psl.filter.initial.acc


    # this needs to be -rat as that specifies how to scan the
    # stsInfoMouse.bed file and it does not work if you use -mouse
    # it is not clear what -mouse would mean to this script, some other file
    # format perhaps from the stsInfoMouse.bed format.
    ~/kent/src/hg/stsMarkers/getStsId -rat \
	../mm8.stsInfoMouse.bed  primers.psl.filter.initial.acc \
	| sort -k4,4n > primers.final
    wc -l primers.final
    #	34545 primers.final
    wc -l ../../STSmarkers/primer/primers.final
    #	34513 primers.final

    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
    # stsMarkers.final is empty for mouse
    touch stsMarkers.final dummy
    PATH=~/kent/src/hg/stsMarkers:$PATH \
    ~/kent/src/hg/stsMarkers/combineSeqPrimerPos \
	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
    wc -l stsMarkers_pos.rdb
    #	33048 stsMarkers_pos.rdb
    wc -l ../STSmarkers/stsMarkers_pos.rdb
    #	33075 stsMarkers_pos.rdb

    PATH=~/kent/src/hg/stsMarkers:$PATH \
    ~/kent/src/hg/stsMarkers/createStsBed \
	mm8.stsInfoMouse.bed  stsMarkers_pos.rdb 500 \
	| sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed
    #	The sed removes unneeded blanks
    #	verify score profile remains similar
    awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c
    #	  546 500
    #	 1650 750
    #	27705 1000
    awk -F'\t' '{print $5}' ../STSmarkers/stsMapMouse.bed | sort -n | uniq -c
    #	  546 500
    #	 1648 750
    #	27692 1000

    wc -l stsMapMouse.bed
    #	29901  stsMapMouse.bed
    wc -l ../STSmarkers/stsMapMouse.bed
    #	29888  stsMapMouse.bed

    #  loading STS markers tables
    ssh hgwdev
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
    ~/kent/src/hg/stsMarkers/ucscAlias.pl \
	mm8.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
    #	this does leave messages in ucscStsAlias.warnings but they seem
    #	to be very similar to Mm6 with just a few new ones
     
    wc -l ucscStsAlias.tab
    #	146767  ucscStsAlias.tab
    wc -l ../STSmarkers/ucscStsAlias.tab
    #	146064  ucscStsAlias.tab

    #	After extensive comparison with the currently existing STS markers, it
    #	appears that this new set only has a couple of new ones, and a couple
    #	of ones have been dropped.  It seems that the primary correction has
    #	been to the marker positions.


    ssh hgwdev
    cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
    #	Saving the existing tables for archival purposes
    hgsql -e "alter table stsInfoMouseNew rename as stsInfoMouseNewFeb2006;" mm8
    hgsql -e "alter table stsAlias rename as stsAliasFeb2006;" mm8
    hgsql -e "alter table all_sts_primer rename as all_sts_primerFeb2006;" mm8
    hgsql -e "alter table stsMapMouseNew rename as stsMapMouseNewFeb2006;" mm8

    hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql
    hgsql -e \
	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8
    hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql
    hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8
    hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
    hgsql -e \
     'load data local infile "mm8.stsInfoMouse.bed" into table stsInfoMouseNew;' mm8

    hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter
    # load of all_sts_primer did not go as planned: 34563 record(s), 0
    # row(s) skipped, 1 warning(s) loading primer/primers.psl.filter
    #	After warnings, checkTableCoords to find problems:
    checkTableCoords -verboseBlocks mm8 all_sts_primer
# mm8.all_sts_primer item 61999 chr10:62418012-62418048: blocks 0 and 1 overlap.
# mm8.all_sts_primer has 1 records with overlapping blocks.
    #	Strip the offending item from the load:
    #	Verify the grep takes out only one item:
    wc -l primer/primers.psl.filter
    #	34563 primer/primers.psl.filter
    grep -P "\t61999\t" primer/primers.psl.filter | wc -l
    #	1
    #	and thus leaves the rest
    grep -v -P "\t61999\t" primer/primers.psl.filter | wc -l
    #	34562
    grep -v -P "\t61999\t" primer/primers.psl.filter > fixed.primers.psl.filter
    hgLoadPsl -nobin -table=all_sts_primer mm8 fixed.primers.psl.filter

    # load primer sequences	
    rm /gbdb/mm8/stsMarker/mouseP.fa
    ln -s /cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.fa \
	/gbdb/mm8/stsMarker/mouseP.fa
    # PLEASE NOTE: If you are going to reload this business, use the
    #	-replace option on this hgLoadSeq
    #	hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
    # otherwise there will be a problem that the seq and extFile tables 
    # will be out of sync. 
    hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
    #  Adding /gbdb/mm8/stsMarker/mouseP.fa
    #  34666 sequences
    #	Warning: load of seq did not go as planned: 34666 record(s),
    #	0 row(s) skipped, 1 warning(s) loading ./seq.tab


    featureBits mm8 all_sts_primer
    #	3700897 bases of 2567283971 (0.144%) in intersection
    featureBits mm8 all_sts_primerFeb2006
    #	3746196 bases of 2567283971 (0.146%) in intersection
    featureBits mm7 all_sts_primer
    #	3757119 bases of 2583394090 (0.145%) in intersection
    featureBits mm6 all_sts_primer
    #	3677372 bases of 2597150411 (0.142%) in intersection
    featureBits mm8 stsMapMouseNew
    #	4812616 bases of 2567283971 (0.187%) in intersection
    featureBits mm8 stsMapMouseNewFeb2006
    #	4801964 bases of 2567283971 (0.187%) in intersection
    featureBits mm7 stsMapMouseNew
    #	4805958 bases of 2583394090 (0.186%) in intersection
    featureBits mm6 stsMapMouseNew
    #	4638338 bases of 2597150411 (0.179%) in intersection

    hgsql -N mm8 -e "select count(*) from stsAlias;"
    #	146767
    hgsql -N mm8 -e "select count(*) from stsAliasFeb2006;"
    #	141981
    hgsql -N mm7 -e "select count(*) from stsAlias;"
    #	140649
    hgsql -N mm6 -e "select count(*) from stsAlias;"
    #	137738
    hgsql -N mm5 -e "select count(*) from stsAlias;"
    #	122944
    hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
    #	60440
    hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
    #	59843
    hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
    #	58980
    hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
    #	58493

    #	compare old and new name lists, not much difference:
    awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList
    #	in common with previous version
    comm -12 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
    #	28687
    #	unique to previous version
    comm -23 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
    #	11
    #	unique to this new set
    comm -13 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
    #	20

##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
    cd /cluster/data/mm8/bed/nscan/

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    mv ardor.wustl.edu/jeltje/mm8/chr_ptx .
    rm -rf ardor.wustl.edu
    rm chr_*/index.html*
    gzip chr_*/*
    chmod a-w chr_*/*.gz

    # load tracks.  Note that these have *utr features, rather than
    # exon features.  currently ldHgGene creates separate genePred exons
    # for these.
    ldHgGene -bin -gtf -genePredExt mm8 nscanGene chr_gtf/chr*.gtf.gz

    # load protein, add .1 suffix to match transcript id
    hgPepPred -suffix=.1 mm8 generic nscanPep chr_ptx/chr*.fa.gz
    rm *.tab

    # update trackDb; need a mm8-specific page to describe informants
    mouse/mm8/nscanGene.html   (copy from hg18 and edit)
    mouse/mm8/trackDb.ra
    # changed search regex to
        termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 9/18/06 angie)
    # File emailed from Ginger Cheng <ginger2@u.washington.edu>
    mkdir /cluster/data/mm8/bed/genomicSuperDups
    cd /cluster/data/mm8/bed/genomicSuperDups
    awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm8_WGAC.tab \
    | hgLoadBed mm8 genomicSuperDups stdin \
      -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
    # 8/29/07 Gak!  Kayla found that the strand values were "+" and "_" -- fix:
    hgsql mm8 -e 'update genomicSuperDups set strand = "-" where strand = "_";'


#####################################################################
# CELERA COVERAGE (WSSD -- DEPTH OF COVERAGE) (DONE 10/16/06 angie)
    # File emailed from Ginger Cheng <ginger2@u.washington.edu>
    mkdir /cluster/data/mm8/bed/wssd
    cd /cluster/data/mm8/bed/wssd
    tail +2 mm8_WSSD_DOC.tab \
    | hgLoadBed mm8 wssdCoverage stdin


#####################################################################
## NIA Mouse Gene Index - (DONE, Fan, 10/6/06)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    ssh hgwdev 
    mkdir -p /cluster/data/mm8/bed/NIAGene061003
    cd /cluster/data/mm8/bed
    ln -s NIAGene061003 NIAGene
    cd NIAGene
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-fasta.fa.gz
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-psl.txt.gz
    gzip -d *.gz
    
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm8 NIAGene.tab

    mkdir /gbdb/mm8/NIAGene
    ln -s /cluster/data/mm8/bed/NIAGene/T-fasta.fa /gbdb/mm8/NIAGene/T-fasta.fa
    
    hgLoadSeq mm8 /gbdb/mm8/NIAGene/T-fasta.fa

# Create/edit/check in NIAGene.html and trackDb.ra under
    
        kent/src/hg/makeDb/trackDb/mouse/mm8

#####################################################################
# LOAD GENEID GENES (DONE - 2006-10-09 - Fan)
    ssh hgwdev
    mkdir -p /cluster/data/mm8/bed/geneid/download
    cd /cluster/data/mm8/bed/geneid/download

    bash
    awk '{print $1}' ../../../chrom.sizes | while read C
    do
      echo $C
      wget --timestamping \
      http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.gtf
      wget --timestamping \
      http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.prot
    done
    exit

    # Add missing .1 to protein id's

    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene -genePredExt -gtf mm8 geneid download/*.gtf

#Read 35954 transcripts in 284585 lines in 34 files
# 35954 groups 34 seqs 1 sources 3 feature types
# 35954 gene predictions

    hgPepPred mm8 generic geneidPep download/*-fixed.prot
    featureBits mm8 -enrichment refGene geneid
# refGene 1.842%, geneid 1.592%, both 0.883%, cover 47.95%, enrich 30.13x
    featureBits mm7 -enrichment refGene geneid
# refGene 1.835%, geneid 1.579%, both 0.866%, cover 47.18%, enrich 29.88x


#####################################################################
# RN4 RECIPROCAL BEST CHAINS/NETS (DONE - 2006-10-10 - Angie)
    doRecipBest.pl mm8 rn4 \
      >& /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log &
    tail -f /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log


##############################################################################

############################################################################
# Load CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-10-10 markd)

    cd /cluster/data/genbank/data/ccds/
    ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
    get CCDS.20061010.tar.gz
    mkdir /scratch/tmp/ccds
    cd /scratch/tmp/ccds
    tar -zxf /cluster/data/genbank/data/ccds/CCDS.20061010.tar.gz

    # import ccds database tables
    /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt

    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap
    checkTableCoords mm8 -verbose=2 ccdsGene
    joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
    rm -rf /scratch/tmp/ccds

    # build initial version of ccdsMgcMap table
    ./x86_64/mkCcdsGeneMap -db=mm8 -loadDb ccdsGene mgcGenes ccdsMgcMap

    
    # load trackDb
    cd kent/src/hg/makeDb/trackDb
    make alpha

    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap

    # << emacs


############################################################################
# JAX TRACKS (DONE 10/20/06 angie - UPDATED 7/18/07, 9/27/07)
# Table jaxQTL renamed to jaxQtl on 1/7/10 (see NOTE FOR NEXT TIME below)
    ssh kkstore04
    mkdir /cluster/data/mm8/bed/jax/2007_09
    cd /cluster/data/mm8/bed/jax/2007_09
    wget ftp://ftp.informatics.jax.org/pub/gbrowse/\*
    wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt

    # Jax Rep Transcript track
    # SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias}
    # -- names like AK016604_4933401J01Rik, NM_001011874_AY534250
    # -- aliases ~ MGI:\d+
    # Use simple perl script to uniquify transcript names and make alias.tab.
    # Inspired by the mm6 version, but format has changed.
    ../2007_07/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
      > jaxRepTranscript.gff

    # Jax Allele track
    # AL_*.gff --> jaxAllele{,Info}
    # -- bed12Source -- add type from filename
    # -- names like NM_011283_Rp1h<tm1Jnz>, XM_129721_Slc9a2<tm1Ges>
    # -- Info: name, mgiID, source {"Gene trapped", ...}
    cp ../2007_07/parseAllele.pl .
    # Edit to accommodate latest format tweaks.
    rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql
    foreach f (AL*.gff)
      set type = `echo $f:t:r \
        | sed -e 's/AL_//; s/GTRAP/GeneTrapped/; s/IND/Induced/; \
            s/OTHER/Other/; s/SPON/Spontaneous/; s/TARG/Targeted/; \
            s/TRANS/Transgenic/;'`
      parseAllele.pl $f \
      | ldHgGene mm8 placeholder stdin -nobin -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | sed -e 's/$/'"\t$type"'/' \
      >> jaxAllele.bed
    end
    # This round's formatting inconsistencies:
#source not given for NM_015770_a<jIs(17_In2)1Gso>
#source not given for NM_029931_Mllt3<T(4Mllt3_9Mll)1Thr>
#source not given for NM_009521_Wnt3<In(11Trp53_11Wnt3)8Brd>
#source not given for NM_011640_Trp53<In(11Trp53_11Wnt3)8Brd>
#source not given for NM_001081049_Mll1<T(4Mllt3_9Mll)1Thr>
#Missing > for mRNA name NM_001081193_Lemd3<Gt(XST167)Byg

    # Jax Phenotype track
    # MP_*.gff --> jaxPhenotype{,Alias}
    # -- bed12Source -- add type from filename
    # -- names like NM_001001488_Atp8b1
    rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
    foreach f (MP_*.gff)
      set type = `echo $f:t:r \
        | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
                    s@AdiposeTissue@Adipose@; \
                    s@BehaviorNeurological@Behavior@; \
                    s@CardiovascularSystem@Cardiovascular@; \
                    s@DigestiveAlimentary@Digestive@; \
                    s@EndocrineExocrineGland@Gland@; \
                    s@GrowthSize@Growth Size@; \
                    s@HearingEar@Hearing/Ear@; \
                    s@HematopoieticSystem@Hematopoietic@; \
                    s@HomeostasisMetabolism@Homeostasis@; \
                    s@ImmuneSystem@Immune@; \
                    s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \
                    s@LethalityPostnatal@Postnatal Lethal@; \
                    s@LifeSpanPostWeaningAging@Life Span@; \
                    s@LimbsDigitsTail@Limbs and Tail@; \
                    s@LiverBiliarySystem@Liver and Bile@; \
                    s@NervousSystem@Nervous System@; \
                    s@RenalUrinarySystem@Renal/Urinary@; \
                    s@ReproductiveSystem@Reproductive@; \
                    s@RespiratorySystem@Respiratory@; \
                    s@SkinCoatNails@Skin/Coat/Nails@; \
                    s@TasteOlfaction@Taste/Smell@; \
                    s@TouchVibrissae@Touch@; \
                    s@Tumorigenesis@Tumorigenesis@; \
                    s@VisionEye@Vision/Eye@;'`
      echo $type
      ../2006_10/parsePhenotype.pl $f \
      | ldHgGene mm8 placeholder stdin -nobin -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | sed -e 's@$@'"\t$type"'@' \
      >> jaxPhenotype.bed
    end
    sort -u jaxPhenotypeAlias.tab > tmp
    mv tmp jaxPhenotypeAlias.tab

    # Jax QTL track
    # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
    # and CM distance for 2, or those plus flanking markers for 3...
    perl -wpe 'chomp; s/\s*$//; \
      ($chr, undef, undef, $start, $end, undef, $strand, undef, $info) = \
        split("\t"); \
      if ($info =~ /QTL (\w+);  Dbxref "(MGI:\d+)"; Alias .*;  Note "([^"]+)"/) { \
        ($name, $mgiID, $desc) = ($1, $2, $3); \
      } else { die "parse\n$info"; } \
      $start--; \
      s/^.*$/$chr\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
      QTL_build36_03_alias.gff > jaxQtl.bed

    # Extract phenotype-allele relationships:
    # Make a file for the one code not already in a filename:
    cp /dev/null MP_0003012_no_phenotypic_analysis
    # Wrote a script to extract the phenotype-allele relationships --
    # it uses the filenames to map MP:* codes to our phenotype names.
    ../2007_07/parsePhenotypicAllele.pl MGI_PhenotypicAllele.rpt \
      > jaxAllelePheno.tab
    # The file "err" has messages about missing data (no gene name in 
    # PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo).

    # Load tables
    ssh hgwdev
    cd /cluster/data/mm8/bed/jax/2007_09
    # jaxRepTranscript
    ldHgGene mm8 jaxRepTranscript jaxRepTranscript.gff
    hgsql mm8 < fixJaxRepTranscript.sql
    sed -e 's/genericAlias/jaxRepTranscriptAlias/g' \
      ~/kent/src/hg/lib/genericAlias.sql > jaxRepTranscriptAlias.sql 
    hgLoadSqlTab mm8 jaxRepTranscriptAlias \
      jaxRepTranscriptAlias.sql jaxRepTranscriptAlias.tab
    # jaxAllele
    sed -e 's/bed12Source/jaxAllele/g' \
      $HOME/kent/src/hg/lib/bed12Source.sql > jaxAllele.sql
    hgLoadBed -sqlTable=jaxAllele.sql mm8 jaxAllele jaxAllele.bed
    hgsql mm8 < fixJaxAllele.sql
    hgLoadSqlTab mm8 jaxAlleleInfo \
      ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
    # jaxPhenotype
    sed -e 's/bed12Source/jaxPhenotype/g' \
      $HOME/kent/src/hg/lib/bed12Source.sql > jaxPhenotype.sql
    hgLoadBed -tab -sqlTable=jaxPhenotype.sql mm8 jaxPhenotype jaxPhenotype.bed
    hgsql mm8 < fixJaxPhenotype.sql
    sed -e 's/genericAlias/jaxPhenotypeAlias/' \
      ~/kent/src/hg/lib/genericAlias.sql > jaxPhenotypeAlias.sql
    hgLoadSqlTab mm8 jaxPhenotypeAlias \
      jaxPhenotypeAlias.sql jaxPhenotypeAlias.tab
### NOTE FOR NEXT TIME ###
### Call the table jaxQtl instead of jaxQTL -- QA doesn't like jaxQTL.
### (brooke) In fact, QA renamed the table to jaxQtl on 1/7/10 on hgwdev and
### mysqlbeta with this command:  mysql> alter table jaxQTL rename to jaxQtl;
### (to make trackDb load with a single trackDb.ra entry for mm8 and mm9)
### Use -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql .
    # jaxQTL
    hgLoadBed -tab -notItemRgb -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/jaxQTL.sql \
      mm8 jaxQTL jaxQtl.bed
    checkTableCoords -verbose=2 mm8 jaxQTL
#mm8.jaxQTL item Scpro11 chr18:131504376-131504512: chromEnd > chromSize 90736837
#mm8.jaxQTL item Tswt chr18:134822025-134822132: chromEnd > chromSize 90736837
#mm8.jaxQTL item Ath13 chr14:164794113-164794369: chromEnd > chromSize 123978870
#mm8.jaxQTL item Dob7 chr11:131434708-131434798: chromEnd > chromSize 121798632
    # Fix coords > chromSize:
    perl -wpe 's/^(\w+)\t(\d+)$/ \
      delete from jaxQTL where chrom="$1" and chromStart >= $2; \
      update jaxQTL set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
      ../../../chrom.sizes \
    | hgsql mm8
    checkTableCoords -verbose=2 mm8 jaxQTL
    # phenotype-allele relationships
    hgLoadSqlTab mm8 jaxAllelePheno \
      ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab

    # Check joiner:
    runJoiner.csh mm8 jaxAllele
    runJoiner.csh mm8 jaxPhenotype


##########################################################################
# SWAP/CHAIN/NET GASACU1 (DONE 10/23/06 angie)
    ssh kkstore04
    mkdir /cluster/data/mm8/bed/blastz.gasAcu1.swap
    cd /cluster/data/mm8/bed/blastz.gasAcu1.swap
    doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.mm8/DEF \
      -chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log
    ln -s blastz.gasAcu1.swap /cluster/data/mm8/bed/blastz.gasAcu1
    nice featureBits mm8 chainGasAcu1Link
#52781141 bases of 2567283971 (2.056%) in intersection


#########################################################################
# BLASTZ/CHAIN/NET FELCAT3 (Done Nov 15 2006 heather)
# working in /cluster/data/felCat3 because /cluster/data/mm8 is 94% full
    mkdir /cluster/data/felCat3/bed/blastz.mm8.2006-11-14
    ln -s /cluster/data/felCat3/bed/blastz.mm8.2006-11-14 /cluster/data/mm8/bed/blastz.felCat3
    cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-14
    cat << '_EOF_' > DEF

BLASTZ_M=50

# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Cat felCat3 
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/felCat3/bed/blastz.mm8.2006-11-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/felCat3/blastz.mm8 >& do.log &
    tail -f do.log

    nice featureBits -chrom=chr1 mm8 chainFelCat3Link
    # 36333124 bases of 191450312 (18.978%) in intersection


#########################################################################
# BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather)
    mkdir /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14
    ln -s /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14 /cluster/data/mm8/bed/blastz.bosTau3
    cd /cluster/data/mm8/bed/blastz.bosTau3
    cat << '_EOF_' > DEF

BLASTZ_M=50

# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.bosTau3.2007-03-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/bosTau3/blastz.mm8 >& do.log &
    tail -f do.log

    nice featureBits -chrom=chr1 mm8 chainBosTau3Link
    # 49896121 bases of 191450312 (26.062%) in intersection


#############################################################################
#  REBUILD miRNA TRACK (DONE - 2006-12-01 - Fan)
    #   updated data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    ssh hgwdev
    cd /cluster/data/mm8/bed
    mkdir miRNA-2006-12-01
    cd miRNA-2006-12-01
    # save the mmu8_miRNA.txt file from email

    # add the following line in mmu8_miRNA.txt per email from Michel.

    chrM      16114   16209   mmu-mir-805     480     -
    
    hgLoadBed -strict mm8 miRNA  mmu8_miRNA.txt 

# check previous release track before update
    featureBits mm8 miRNA
    # 33033 bases of 2567283971 (0.001%) in intersection
    featureBits mm7 miRNA
    # 20620 bases of 2583394090 (0.001%) in intersection

#############################################################################
# Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt)
# We are creating several things: a psl probe-track for the RR on mouse,
# a link out from kg to the probe to the ABA website, 
# and a set of gene/probe info which visiGene will use.
# (This needs to be done after having created sequences in
# ncbiXm and tigrMgiTc as above.)

# metadata.log and SRGEsequence.log were provided by
#  Susan Sunkin <SusanS@alleninstitute.org>
# this is an update to the visiGene with 6000 new images.

# See mm6.txt for steps not needing to be repeated.

# copy in the data files (directory already exists from previous build)
    ssh hgwdev
    cd /cluster/data/mm8/bed/allenBrain
    mkdir old
    mv * old/
    cp /cluster/data/mm6/bed/allenBrain/allen20061204.tab .
    cp /cluster/data/mm6/bed/allenBrain/probeSeq.20061204.fasta .
    cp /cluster/data/mm6/bed/allenBrain/allProbes.fa .
    cp /cluster/data/mm6/bed/allenBrain/allProbes.tab .
    cp /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .


# Set up a blat run to align the probes.
    ssh pk
    cd /cluster/data/mm8/bed/allenBrain
    mkdir split
    faSplit sequence allProbes.fa 200 split/rp
    mkdir run
    cd run
    ls -1 ../split/*.fa > mrna.lst
    ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
    mkdir psl
    cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 genome.lst mrna.lst gsub spec
    para create spec
# Then do the usual para try/push/time/check until the run is finished
#Completed: 6596 of 6596 jobs
#CPU time in finished jobs:      27258s     454.30m     7.57h    0.32d  0.001 y
#IO & Wait Time:                 19700s     328.33m     5.47h    0.23d  0.001 y
#Average job time:                   7s       0.12m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              39s       0.65m     0.01h    0.00d
#Submission to last job:           549s       9.15m     0.15h    0.01d

# Then do sorting and near-best-in-genome step on file server
    ssh kkstore
    cd /cluster/data/mm8/bed/allenBrain/run
    pslSort dirs raw.psl tmp psl
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
    sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
    rm raw.psl
    rm -r psl
    rm -r ../split

# Load up database
    ssh hgwdev
    cd /cluster/data/mm8/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.

    hgsql mm8 -e 'drop table allenBrainUrl'
    hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql
    hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'

# Make probe alignment table, and load sequence.
    hgLoadPsl mm8 allenBrainAli.psl
    rm /gbdb/mm8/allenBrain/allProbes.fa
    ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa
    hgLoadSeq -replace mm8 /gbdb/mm8/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain
    hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain 



##########################################################################
#  xxBlastTab - Help filter out unwanted paralogs  (Galt 2007-01-11)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# we want to update mm8 for human and rat, 
# so check ./hgGeneData/Mouse/mm8/otherOrgs.ra for current settings

ssh hgwdev

synBlastp.csh mm8 hg18 
#mm8.hgBlastTab
#new number of unique query values:
#25178
#new number of unique target values
#15328
#old number of unique query values:
#28286
#old number of unique target values
#15901


synBlastp.csh mm8 rn4
#mm8.rnBlastTab:
#new number of unique query values:
#11163
#new number of unique target values
#6573
#old number of unique query values:
#23183
#old number of unique target values
#6890



##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm8

##########################################################################
## WindowMasker (DONE - 2007-01-30 - Hiram)
    ssh kolossus
    mkdir /cluster/data/mm8/bed/WindowMasker.2007-01-29
    cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
    # copy *.csh scripts from
    # /cluster/data/danRer4/bed/WindowMasker.2006-12-04
    #	and fixup the db name and work directory in those scripts, then:
    time nice -n +19 ./doCount.csh > doCount.out 2>&1
    #	real    67m32.178s
    time nice -n +19 ./doSdust.csh >doSdust.out 2>&1
    #	real    477m24.667s
    ssh kkstore04
    cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
    gzip windowmasker.sdust.bed
    time nice -n +19 ./applyMask.csh > applyMask.out 2>&1
    time nice -n +19 ./addTrf.csh > addTrf.out 2>&1
    twoBitToFa mm8.sdTrf.2bit stdout | faSize stdin
    #	2664455088 bases (97171400 N's 2567283688 real 1644888505 upper
    #	922395183 lower) in 34 sequences in 1 files
    ssh hgwdev
    cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
    

##########################################################################
## AUGUSTUS ab initio predictions (DONE, 2007-01-30 - Mario)
    ssh hgwdev
    mkdir /cluster/data/mm8/bed/augustus
    cd /cluster/data/mm8/bed/augustus

    # get the program AUGUSTUS, e.g. from the web
    wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz
    # unpack
    tar xzf augustus.2.0.1.src.tar.gz

    # compile the binary if necessary
    cd augustus/src
    make augustus

    # create output directory
    cd /cluster/data/mm8/bed/augustus
    mkdir out err

    # create file with sequences and their sizes by modifying chrom.sizes
    cat ../../chrom.sizes | perl -e 'while(<>){s/chr([0-9a-zA-Z]+)(_random|)/\/cluster\/data\/mm8\/$1\/chr$1$2.fa.masked/; print;}' > seq.lst

    # create the job list (job.lst): one AUGUSTUS job per sequence chunk
    # listed in seq.lst, with output going to out/ and errors to err/
    # NOTE(review): the job command points at the augustus binary and config
    # from the panTro2 build and uses --species=human, although a copy was
    # unpacked and compiled in this directory above -- confirm that this was
    # intentional before reusing these steps.
    augustus/scripts/createAugustusJoblist.pl --sequences seq.lst \
	--chunksize 5300000 --overlap 300000 \
	--command "/cluster/data/panTro2/bed/augustus/augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/panTro2/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0" \
	--outputdir /cluster/data/mm8/bed/augustus/out/ \
	--errordir /cluster/data/mm8/bed/augustus/err/ --joblist job.lst

    # the batch has to be registered with parasol before "para try" can
    # run anything (this step was missing from the original notes)
    para create job.lst
    para try
    para check
    para push

# CPU time in finished jobs:    2984823s   49747.06m   829.12h   34.55d  0.095 y
# IO & Wait Time:                 19258s     320.96m     5.35h    0.22d  0.001 y
# Average job time:                5403s      90.05m     1.50h    0.06d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            7896s     131.60m     2.19h    0.09d
# Submission to last job:         15716s     261.93m     4.37h    0.18d

    # check the error files, should be no errors
    cat err/*.err

    cat out/*.gff | augustus/scripts/join_aug_pred.pl > augustus.pep.gff
    augustus/scripts/getAnnoFasta.pl augustus.pep.gff
    cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff

    # load into database
    # (The original notes were copied from the panTro2 build and still
    #  pointed at /cluster/data/panTro2/bed/augustus/ and loaded the peptide
    #  table into panTro2; the files produced above live in the mm8 augustus
    #  directory and both tables belong in mm8.)

    ssh hgwdev
    cd /cluster/data/mm8/bed/augustus/
    ldHgGene -bin mm8 augustus augustus.gff
    # 32377 gene predictions

    # augustus.pep.aa is presumably what getAnnoFasta.pl wrote above --
    # TODO confirm the file name
    hgPepPred mm8 generic augustusPep augustus.pep.aa

    featureBits mm8 augustus
    # 35380585 bases of 2567283971 (1.378%) in intersection

#########################################################################
## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-19 - 2007-02-20 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
    cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19

    cat << '_EOF_' > DEF
# Mouse vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm8
SEQ1_DIR=/san/sanvol1/scratch/mm8/mm8.sdTrf.2bit
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-verbose=2 -bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/mm8AnoCar1 > do.log 2>&1 &
    #	real    544m52.722s

    #	appears to have successfully finished
    ssh hgwdev
    cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
    time nice -n +19 featureBits mm8 chainAnoCar1Link \
	> fb.mm8.chainAnoCar1Link.txt 2>&1
    #	real    1m37.380s
    #	96286498 bases of 2567283971 (3.751%) in intersection

    #	running the swap to anoCar1 - instructions in anoCar1.txt
    cd /cluster/data/anoCar1/bed/blastz.mm8.swap
    time nice -n +19 featureBits anoCar1 chainMm8Link \
	> fb.anoCar1.chainMm8Link.txt 2>&1
    #	real    2m1.527s
    #	82784787 bases of 1741478929 (4.754%) in intersection

#############################################################################
# UPDATED mm8.knownToVisiGene (DONE galt 2007-02-15)


#########################################################################
# BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-03-02 angie)
    ssh kkstore04
    mkdir /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
    cd /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27

    cat << '_EOF_' > DEF
# mouse vs. platypus

# Use same params as used for hg18-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    # run the whole blastz/chain/net pipeline from the DEF file written
    # above; stdout and stderr both go to do.log (csh ">&"), the job is put
    # in the background and the log is followed with tail -f
    doBlastzChainNet.pl DEF \
      -workhorse kkr6u00 \
      -blastzOutRoot /cluster/bluearc/mm8.ornAna1 \
      >& do.log & tail -f do.log


############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)

    # see hg17.txt for build temporary ccds database for CCDS.20070228

    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap
    checkTableCoords mm8 -verbose=2 ccdsGene
    # update all.joiner to include mm8 in ccdsDb
    joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner

    # build initial version of ccdsMgcMap table, updated by nightly genbank update
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=mm8 ccdsGene mgcGenes ccdsMgcMap
    
    # load trackDb
    cd kent/src/hg/makeDb/trackDb
    make alpha
    # check in browser

    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
        ccdsMgcMap
    # << emacs


############################################################################
# CGAP SAGE (DONE Andy 2007-03-01)
ssh hgwdev
cd san/andy/mouseSage/
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_forward_v36.1.tar.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_reverse_v36.1.tar.gz
tar xvfz SAGE_mm_long_forward_v36.1.tar.gz 
tar xvfz SAGE_mm_long_reverse_v36.1.tar.gz 
rm *.tar.gz
chmod a+r -R mm_* 
chmod +x mm_*
cd mm_forward/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed
cd ../mm_reverse/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed
ctgPosToLft mm8 mm8.lft
liftUp lifted.bed mm8.lft warn unlifted.bed 
# Build mapping.bed (8 columns) from lifted.bed: "-" strand rows get
# thickStart = end; all other rows have start decremented by 1 and
# thickStart set 4 below the new start; thickEnd is always thickStart + 4.
# The awk program must stay on ONE line: the notes had been hard-wrapped in
# the middle of "start - 4", which makes awk end the assignment at the line
# break, and tcsh does not accept a newline inside single quotes.
awk 'BEGIN{OFS="\t"}{strand = $6; start = $2; end = $3; if (strand == "-") { thickStart = end; } else { start = start - 1; thickStart = start - 4; } thickEnd = thickStart + 4; print $1, start, end, $4, $5, strand, thickStart, thickEnd; }' lifted.bed > mapping.bed
gunzip *.gz
rm -rf mm_forward/ mm_reverse/ unlifted.bed lifted.bed mm8.lft 
# Rewrite column 13 (sex) of Mm.libraries as a comma-terminated list:
#   "unknown" -> "", "male and female" -> "male,female,", "male" -> "male,",
#   anything else -> "female,".  Columns 1-12 and 14-21 pass through as-is.
# The header row is dropped afterwards by tail.
# The awk program must stay on ONE line: the notes had been hard-wrapped
# inside the string "male and female" (so it could never match) and again
# inside the second for loop.  "tail +2" is the obsolete spelling of
# "tail -n +2" and is rejected by current GNU tail.
awk 'BEGIN{FS="\t"}{sex = $13; for (i=1; i<=12; i++) { printf("%s\t", $i); } if (sex == "unknown") { sex = ""; } else if (sex == "male and female") { sex = "male,female,"} else if (sex == "male") { sex = "male,"} else {sex = "female,"}; printf("%s\t", sex); for (i=14; i<=20; i++) { printf("%s\t", $i); } print $21}' Mm.libraries | tail -n +2 > massaged.Mm.libraries
cgapSageBedAddFreqs -noEmpty mapping.bed Mm_long.frequencies massaged.Mm.libraries cgapSage.bed
ln -s ~/hg/lib/cgapSage/cgapSageLib.sql 
ln -s ~/hg/lib/cgapSage/cgapSage.sql 
hgLoadBed -sqlTable=cgapSage.sql mm8 cgapSage cgapSage.bed
hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql massaged.Mm.libraries 

############################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-02)
    ssh kkstore04
    bash 

    mkdir /cluster/data/mm8/blastDb
    cd /cluster/data/mm8
    ls noMask/*.fa | grep -v random > temp.lst
    ls randomContigs/*.fa >> temp.lst
    cat `cat temp.lst` > temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    mkdir -p /san/sanvol1/scratch/mm8/blastDb
    cd /cluster/data/mm8/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /san/sanvol1/scratch/mm8/blastDb
    done

    mkdir -p /cluster/data/mm8/bed/tblastn.hg18KG
    cd /cluster/data/mm8/bed/tblastn.hg18KG
    echo  /san/sanvol1/scratch/mm8/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 2733 query.lst

   # we want around 150000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(150000/2733) = 669.165940

   mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa
   split -l 670 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/mm8/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
# blastSome <blastDb> <queryFa> <outPsl>
#   $1 - blast database prefix (one line of query.lst)
#   $2 - protein fasta chunk   (one line of kg.lst)
#   $3 - psl file this job has to leave behind
# Exit status: 0 once blastToPsl has succeeded, 1 otherwise.  If pslCheck
# rejects the lifted result the script still exits 0 but $3 is not created
# ($3.tmp is left for inspection), so the "check out exists" clause in
# blastGsub is what flags that job.
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename "$2"`
# scratch files live on the node-local /tmp: $f.8 raw blast, $f.1 accepted
# blast output, $f.2 psl, $f.3 psl lifted on the target side
f=/tmp/`basename "$3"`.$g
# walk the e-value list in order; keep the output of the first blastall
# invocation that exits successfully
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d "$1" -i "$2" -o "$f.8"
then
        mv "$f.8" "$f.1"
        break;
fi
done
if test -f "$f.1"
then
    if /cluster/bin/i386/blastToPsl "$f.1" "$f.2"
    then
        # lift target coordinates from blastDb pieces, then query
        # coordinates via the hg18 protein lift file
        liftUp -nosort -type=".psl" -nohead "$f.3" /cluster/data/mm8/blastDb.lft carry "$f.2"
        liftUp -nosort -type=".psl" -pslQ -nohead "$3.tmp" /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn "$f.3"

        if pslCheck -prot "$3.tmp"
        then
            mv "$3.tmp" "$3"
        fi
        # remove the /tmp scratch files whether or not pslCheck passed;
        # previously a pslCheck failure left them behind on the node
        rm -f "$f.1" "$f.2" "$f.3" "$f.4"
        exit 0
    fi
fi
rm -f "$f.1" "$f.2" "$3.tmp" "$f.8" "$f.3" "$f.4"
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit # back to bash
    
    ssh pk
    cd /cluster/data/mm8/bed/tblastn.hg18KG
    para create blastSpec

    para time
# Completed: 150315 of 150315 jobs
# CPU time in finished jobs:   24349624s  405827.07m  6763.78h  281.82d  0.772 y
# IO & Wait Time:               1825515s   30425.24m   507.09h   21.13d  0.058 y
# Average job time:                 174s       2.90m     0.05h    0.00d
# Longest finished job:             673s      11.22m     0.19h    0.01d
# Submission to last job:         79743s    1329.05m    22.15h    0.92d

    ssh kkstore04
    cd /cluster/data/mm8/bed/tblastn.hg18KG
    for i in blastOut/*
    do  
	echo "cd $i; cat *.psl | pslSortAcc nohead chrom /tmp/ stdin ; cd ../.."
    done > sort.jobs

    sh -x sort.jobs

    tcsh
    mkdir chainRun
    cd chainRun
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    # chainOne <in.psl>: run simpleChain on one psl and write the result
    # next to it as c.<name>.psl.  The command must be a single line: the
    # notes had it wrapped after "$1", which would make the output path on
    # the second line run as a separate command.
    cat << '_EOF_' > chainOne
/cluster/home/braney/bin/x86_64/simpleChain -prot -outPsl -maxGap=150000 $1 `dirname $1`/c.`basename $1`.psl
'_EOF_'
    chmod +x chainOne
    ls  ../blastOut/*/chrom/*.psl > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/mm8/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    # then the usual para try, check, push, check etc.

#two batches 

# Completed: 2574 of 2574 jobs
# CPU time in finished jobs:    3338223s   55637.04m   927.28h   38.64d  0.106 y
# IO & Wait Time:                 21934s     365.57m     6.09h    0.25d  0.001 y
# Average job time:                1305s      21.76m     0.36h    0.02d
# Longest finished job:           88204s    1470.07m    24.50h    1.02d
# Submission to last job:         92614s    1543.57m    25.73h    1.07d

# Completed: 2871 of 2871 jobs
# CPU time in finished jobs:    2495054s   41584.24m   693.07h   28.88d  0.079 y
# IO & Wait Time:                 47207s     786.78m    13.11h    0.55d  0.001 y
# Average job time:                 885s      14.76m     0.25h    0.01d
# Longest finished job:           59971s     999.52m    16.66h    0.69d
# Submission to last job:         78852s    1314.20m    21.90h    0.91d

    ssh kkstore04
    cd /cluster/data/mm8/bed/tblastn.hg18KG/blastOut
    bash 
    # For every kgNN output directory:
    #   c60.kgNN.psl - chained alignments where (col13 - col12) / col11
    #                  exceeds 0.6 (PSL: qEnd, qStart, qSize)
    #   u.kgNN.psl   - c60 sorted numerically descending, passed to pslUniq
    #   m60.kgNN.psl - c60 rows where col1 / col11 exceeds 0.60
    #                  (PSL: matches, qSize)
    for kgDir in kg??; do
       awk '($13 - $12) / $11 > 0.6' $kgDir/chrom/c.*.psl > c60.$kgDir.psl
       sort -rn c60.$kgDir.psl | pslUniq stdin u.$kgDir.psl
       awk '$1 / $11 > 0.60' c60.$kgDir.psl > m60.$kgDir.psl
       echo $kgDir
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/mm8/bed/tblastn.hg18KG/preLift.psl
    cd /cluster/data/mm8/bed/tblastn.hg18KG 
    liftUp -type=.psl -nohead stdout ../../jkStuff/liftAll.lft carry preLift.psl | sort -k 14,14 -k 16,16n -k 17,17n > blastHg18KG.psl
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/mm8/bed/tblastn.hg18KG
    hgLoadPsl mm8 blastHg18KG.psl

    # check coverage
    nice featureBits mm8 blastHg18KG 
# 40445290 bases of 2567283971 (1.575%) in intersection
# In comparison to cat and dog:
    nice featureBits felCat3  blastHg18KG
# 15218612 bases of 1642698377 (0.926%) in intersection
    nice featureBits canFam2 blastHg18KG
# 32565727 bases of 2384996543 (1.365%) in intersection

    featureBits mm8 refGene:cds blastHg18KG  -enrichment
# refGene:cds 1.157%, blastHg18KG 1.575%, both 0.927%, cover 80.15%, enrich
# 50.88x

    ssh kkstore04
    rm -rf /cluster/data/mm8/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
#end tblastn

# EXONIPHY MM8, lifted from hg18 (DONE acs 2007-04-08)

    ssh hgwdev
    cd /cluster/data/mm8/bed
    mkdir exoniphy
    cd exoniphy
    hgLoadGenePred -genePredExt mm8 exoniphy exoniphyMm8.gp

    # exoniphyMm8.gp was prepared at Cornell as follows
    hgsql hg18 -e "select * from exoniphy" --skip-column-names > exoniphyHg18.gp
    liftOver -genePred exoniphyHg18.gp /usr/data/hg18/dbDerived/netSynteny/hg18.mm8.syn.chain exoniphyMm8.gp unmapped
    # (where hg18.mm8.syn.chain represents the human/mouse syntenic net)

#########################################################################
# BLASTZ/CHAIN/NET HORSE (DONE 2/21/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
    cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
    cat << '_EOF_' > DEF
# Horse vs. Mouse

BLASTZ_M=50

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500     
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Mouse mm8
SEQ2_DIR=/scratch/hg/mm8/mm8.2bit
SEQ2_LEN=/cluster/data/mm8/chrom.sizes 
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.mm8.2007-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# Fix script coloring _EOF_ 
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab1/blastz.mm8 >& do.log &
    tail -f do.log

    ssh hgwdev
    cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
    ln -s blastz.mm8.2007-02-17 /cluster/data/equCab1/bed/blastz.mm8
    nice featureBits equCab1 -chrom=chr1 chainMm8Link
# 70800969 bases of 177498097 (39.888%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainMm8Link \
	> fb.equCab1.chainMm8Link.txt 2>&1
# 903993981 bases of 2421923695 (37.325%) in intersection

    ssh kkstore05
    mkdir /cluster/data/mm8/bed/blastz.equCab1.swap
    cd /cluster/data/mm8/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.mm8.2007-02-17/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
    tail -f swap.log
# real    76m34.873s

    ssh hgwdev
    cd /cluster/data/mm8/bed/blastz.equCab1.swap
    bash
    time nice -n 19 featureBits mm8 chainEquCab1Link \
	> fb.mm8.chainEquCab1Link.txt 2>&1
    # 906568751 bases of 2567283971 (35.312%) in intersection

#########################################################################
# CGAP SAGE (Done 2007-05-04)

ssh hgwdev
cd /san/sanVol1/scratch/andy
mkdir cgapSage.mm8
cd cgapSage.mm8
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz
hgsql -e 'select * from snp126 where class="single" and locType="exact"' mm8 \
    | tail +2 | cut -f2- > snps.txt
hgsql -e 'select name from snp126Exceptions where exception="ObservedWrongSize" 
    or exception="SingleClassBetweenLocType" or exception="SingleClassRangeLocType" 
    or exception="MultipleAlignment"' mm8 \
    | tail +2 > exceptions
tabGrep -v exceptions 4 snps.txt > tmp
mv tmp snps.txt
rm exceptions
hgsql -e 'select chrom,chromStart,chromEnd,name from simpleRepeat' mm8 | tail +2 > trf.bed
cut -f1-4 snps.txt > snps.bed
overlapSelect -nonOverlapping trf.bed snps.bed /dev/stdout | cut -f4 > goodSnps.txt
tabGrep goodSnps.txt 4 snps.txt > tmp
mv tmp snps.txt
rm trf.bed goodSnps.txt snps.bed
ln -s /cluster/data/mm8/mm8.2bit
ln -s /cluster/data/mm8/chrom.sizes
ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql
tail +2 Mm.libraries | awk -f cleanLibs.awk > libs.txt
hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql libs.txt
partitionSequence.pl -lstDir small 5000000 30 mm8.2bit chrom.sizes 0 > sequence.lst
grep -v small sequence.lst > seq.lst
cat small/* >> seq.lst
mv seq.lst sequence.lst
rm -rf small/
for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done
ssh pk
cd /san/sanVol1/scratch/andy/cgapSage.mm8
para create jobList
para try
para push
# takes like 5-10 min
exit # back to hgwdev
find output/ -name '*.bed' -exec cat '{}' >> output.bed \;
cgapSageDupeRemove output.bed tmp.bed
cgapSageDupeRemove -unique tmp.bed final.bed
ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql
hgLoadBed -sqlTable=cgapSage.sql -tab mm8 cgapSage final.bed 

#############################################################################
#  REBUILD miRNA TRACK (DONE - 2007-05-31 - Fan)
    #   updated data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    ssh hgwdev
    cd /cluster/data/mm8/bed
    mkdir miRNA-2007-05-31
    cd miRNA-2007-05-31
    # save the mouse_miRNA_track_may2007.txt file from email

    cat mouse_miRNA_track_may2007.txt|sed -e 's/ /\t/g' > miRNA.tab

    hgLoadBed mm8 miRNA miRNA.tab

# check previous release track before update
    featureBits mm8 miRNA
    #33398 bases of 2567283971 (0.001%) in intersection

    featureBits mm7 miRNA
    # 20620 bases of 2583394090 (0.001%) in intersection


#############################################################################
# LIFTOVER TO MM9 (DONE 7/25/07 angie)
    ssh kkstore04
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug mm8 mm9 \
      -ooc /san/sanvol1/scratch/mm8/11.ooc
    # Real run:
    cd /cluster/data/mm8/bed/blat.mm9.2007-07-24
    doSameSpeciesLiftOver.pl mm8 mm9 \
      -ooc /san/sanvol1/scratch/mm8/11.ooc \
      >& do.log & tail -f do.log


#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross@stanford.edu>

    cd /cluster/data/mm8/bed/contrastGene/
    wget http://www.stanford.edu/~ssgross/contrast.mm8.bed
    # this is a custom track, not a pure BED
    tail +2 contrast.mm8.bed | hgLoadBed -tab mm8 contrastGene stdin

    # verify 
    # load track db (ra and contrastGene.html are global)
    # request push of contrastGene

###########################################################################
#  loading affy mouse Exon probes and transcripts (DONE - 2007-10-04 - Hiram)
    # data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com
    #	dropped via FTP to genome-test
    ssh hgwdev
    mkdir /cluster/data/mm8/bed/affyMoEx1
    cd /cluster/data/mm8/bed/affyMoEx1
    # the files received:
# -rw-r--r--  1  8909954 Oct  3 10:48 transcript_cluster_mm.bed.gz
# -rw-r--r--  1 48178714 Oct  4 13:35 probe_mm_score.bed.gz
    # loading:
    hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Probe probe_mm_score.bed.gz
    #	Loaded 4549897 elements of size 6
    hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Transcript \
	transcript_cluster_mm.bed.gz
    # Loaded 270140 elements of size 12
    # working on description pages for these with Venu.

    #	I manually set the scores in the affyMoEx1Transcript track to
    #	1000 so it would work OK (not color) with the useScore 1 so that
    #	the affyMoEx1Probe would color itself on the score

###########################################################################
# LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie)
# Lifting of .align files is now automated by doRepeatMasker.pl, but we
# got a user request for .align files from this pre-automation db.
    ssh kkstore04
    cd /cluster/data/mm8
    mkdir downloads/RMalign
    # csh loop over every one- or two-character directory name in
    # /cluster/data/mm8 (the per-chromosome directories).
    foreach c (?{,?})
      echo linking/lifting to contigs of $c:t
      # contig directories for this chromosome, ordered and _random
      foreach ctgdir ($c/chr$c{,_random}_?{,?})
        set ctg = $ctgdir:t
        # build the contig-level .align only if it is not there already
        if (! -f $ctgdir/$ctg.fa.align) then
          pushd $ctgdir
          liftRMAlign.pl $ctg.lft > $ctg.fa.align
          popd
        endif
        # link target is relative to $c/, where the link is created
        ln -s $ctg/$ctg.fa.align $c/
      end
      set chr = chr$c:t
      # lift to chromosome level only when the lift file exists and is
      # not empty (-z is true for a zero-length file)
      if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then
        echo lifting contigs to chr$c
        liftRMAlign.pl $c/lift/ordered.lft \
        | gzip -c > downloads/RMalign/$chr.fa.align.gz
      endif
      # same again for the chrN_random pseudo-chromosome
      if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then
        echo lifting contigs to chr${c}_random
        liftRMAlign.pl $c/lift/random.lft \
        | gzip -c > downloads/RMalign/${chr}_random.fa.align.gz
      endif
    end
    # Got some messages like these for chunks that fall entirely
    # within gaps (e.g. centromere, huge unbridged...)
#FYI Couldn't open chr1_1_00.fa.align: No such file or directory
#...
#FYI Couldn't open chr1_1_05.fa.align: No such file or directory
#FYI Couldn't open chr1_17_02.fa.align: No such file or directory
#...

    md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt
    ssh hgwdev ln -s /cluster/data/mm8/downloads/RMalign \
      /usr/local/apache/htdocs/goldenPath/mm8/


############################################################################
# Reload CCDS (2007-12-12 markd)
    # import ccds database as described in ccds.txt
    set db=mm8
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    # build initial version of ccdsMgcMap table, updated by nightly genbank update
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
        ccdsMgcMap
    # << emacs

############################################################################
# Reload CCDS (2008-02-01 markd)
    # import ccds database as described in ccds.txt
    set db=mm8
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
    # << emacs

############################################################################
# Broad whole-genome ChIP-Seq in stem and progenitor cells
# Mikkelsen et al., Nature Aug. 2, 2007
# Requested by David Haussler
# 21 data sets, ~4M sequences/dataset
#     7 antibodies (histone meth & pol2), 
#     4 cell sources (ES, NP, MEF, ES+)
# alignments/ sequences and mappings for 27bp reads
#   format: chrom, start, end, strand, read_id, mismatches, sequence 
# densities/ indication of #reads near the base, 25bp fixed window, -1 if unalignable base
# Allele-specific fragment counts
#   format: chr start allele1  allele2  # # 
# Enriched intervals by HMM
#   BED3
# Enriched intervals by fixed-size windows
#   BED3
# Also, gene expression data

# Track organization:
# Broad ChIP ES supertrack, with tracks:
#       - Broad Stem ChIP Seq (read alignments)
#       - Broad Stem ChIP Sig (density in 25bp windows)
#       - Broad Stem ChIP Sites (regions from HMM, windowing)
# Each track has subtracks for different cell types and antibodies

# Also, a track for the expression data: Broad ES
#      

    ssh kkstore04
    cd /cluster/data/mm8/bed
    mkdir -p broadStemChip
    cd broadStemChip/
    wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/
    mv pub/papers/chipseq .
    rm -fr pub
    # original data
    ln -s chipseq lab
    cd lab

    ###############
    # Sites track

    # HMM Sites -- BED3
    mkdir -p hmmSites
    cd hmmSites
    tar xvfz ../HMMIntervals.tar.gz

    ssh hgwdev
    cd /cluster/data/mm8/bed/broadStemChip
# hmmSites.csh: load each Broad HMM interval file for ES cells as a BED3
# table named broadStemChipHmmSites<antibody>Es.
#   - run from /cluster/data/mm8/bed/broadStemChip; the inputs are the files
#     unpacked into hmmSites under chipseq (also reachable as lab/), so the
#     glob is relative to this directory rather than rooted at /
#   - the antibody label comes from the file name: HMM_ES_K4.txt -> H3K4me3
#   - tail +2 skips the first line of each file and sed prefixes every
#     remaining line with "chr" before hgLoadBed reads it from stdin
# NOTE(review): every mark gets an "H3" prefix here, including K20, which
# the signal tables further down name H4K20 -- confirm the intended name.
cat > hmmSites.csh << 'EOF'
    foreach f (chipseq/hmmSites/HMM_ES_*.txt)
        set b = $f:t
        set ab = `echo $b | perl -wpe 's/HMM_ES_(.+).txt/H3$1me3/'`
        echo $ab
        tail +2 $f | sed 's/^/chr/' | \
                hgLoadBed mm8 broadStemChipHmmSites${ab}Es stdin
    end
'EOF'
# Fix script coloring EOF
 
    csh hmmSites.csh >&! hmmSites.log
    # Loaded 1788 - 19523 elements in 5 tracks
    # H3K{20,27,36,4,9}me3

    mkdir -p WindowSites
    cd WindowSites
    tar xvfz ../WindowIntervals.tar.gz
    cd ..
    awk '{print $4}' *K*.txt | sort -n | head -1

    # Sites from Window algorithm -- BED3 plus float score
    # min: 2.75, max: 275.50
    # distribution of data values:
     awk '{print $4}' *K*.txt | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin
     0.000000 ************************************************************ 38346
     10.000000 ************************** 16385
     20.000000 ************** 9186
     30.000000 ********* 5705
     40.000000 ******* 4607
     50.000000 ****** 3686
     60.000000 **** 2243
     70.000000 ** 1094
     80.000000 * 382
     90.000000  112
     100.000000  31
     110.000000  10
     120.000000  3
     130.000000  2
     140.000000  0
     150.000000  0
     160.000000  0
     170.000000  0
     180.000000  0
     190.000000  1
     200.000000  0
     210.000000  0
     220.000000  2
     230.000000  0
     240.000000  0
     250.000000  0
     260.000000  0
     270.000000  1
    
     # To range score display from 300 to 1000, use:
     #  (x * 2) + 300

mkdir windowSites
# windowSites.csh: load the fixed-window enriched intervals as
# bed5FloatScore tables named broadStemChipWinSites<antibody><cell>.
# For each chipseq/windowSites/*.K*.txt file:
#   - ab   is built from the second dot-separated part of the file name,
#          e.g. ES.K4.txt -> H3K4me3
#   - cell is the first part with only its first letter capitalised,
#          e.g. ES -> Es, MEF -> Mef
#   - tail +2 skips the first line; awk then writes chrom, start, end,
#     a single-space name, the integer display score ($4 * 2) + 300
#     (see the score-range note above) and the original float value $4
#   - the .tab file is written to windowSites/ and loaded with the
#     bed5FloatScore.sql table definition
cat > windowSites.csh << 'EOF'
    foreach f (chipseq/windowSites/*.K*.txt)
        set b = $f:t
        set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).txt/H3\u$1\L$2me3/'`
        set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'`
        tail +2 $f |  awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab
        # using kate's version, testing -renameSqlTable option
        /cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \
            -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
                broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab
    end
'EOF'
# Fix script coloring EOF 
    csh windowSites.csh >&! windowSites.log

    ###############
    # Signal track
    # indication of #reads near the base, 25bp fixed window, -1 if unalignable base

    ssh kkstore04
    cd /cluster/data/mm8/bed/broadStemChip/lab/densities
    mkdir -p alignable
    cd alignable
    tar xvfz ../alignable.tar.gz
    cd ../..

    # Get a list of the datasets
    mkdir -p signal
    tar tfz chipseq/densities/chr1.tar.gz | \
        perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasets.txt
    # ignore control (whole-cell extract)
    grep -v WCE signal/datasets.txt > signal/subtracks.txt
    wc -l signal/subtracks.txt
        # 18

    # Extract datasets from by-chrom packaging
    # Weed out missing data which are represented as -1 values
# Convert to wiggle

cat > makeWig.csh << 'EOF'
    foreach s (`cat signal/subtracks.txt`)
        set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/H3\u$1\L$2/'`
        set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
        set table = broadStemChipSignal${ab}${cell}
        echo $table
        rm -f signal/$s.wigVar
        foreach f (chipseq/densities/chr*.tar.gz)
            set c = $f:t:r:r
            (echo "fixedStep chrom=$c start=1 step=25 span=25"; \
                tar xfzO $f $c.$s.txt) | \
            nice fixStepToBedGraph.pl | \
            nice grep -v '\-1$' | \
            nice wigBedToStep stdin stdout >> signal/$s.wigVar
        end
        nice wigEncode signal/$s.wigVar signal/$s.wig signal/$s.wib
    end
'EOF'
# Fix script coloring EOF 

# NEWER version of makeWig.csh -- this overwrites the script written just above

# makeWig.csh: build one variableStep wiggle per dataset.
# For every dataset name in signal/subtracks.txt:
#   - derive the antibody and cell labels from the name (e.g. ES.K4 gives
#     ab K4 and cell Es) and form the table name
#     broadStemChipSignal<ab><cell>
#   - remove any previous signal/$table.wigVar; this is the file the inner
#     loop appends to, so it must be the one cleared before a rerun
#   - for each per-chromosome tarball write a "variableStep" header, then
#     one "position value" line per input line, positions starting at 1
#     and advancing by 25; lines ending in -1 are dropped
#   - wigEncode the accumulated .wigVar into .wig/.wib inside signal/
cat > makeWig.csh << 'EOF'
    foreach s (`cat signal/subtracks.txt`)
        set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\L$2/'`
        set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
        set table = broadStemChipSignal${ab}${cell}
        echo $table
        rm -f signal/$table.wigVar
        foreach f (chipseq/densities/chr*.tar.gz)
            set c = $f:t:r:r
            echo "variableStep chrom=$c span=25" >> signal/$table.wigVar
            tar xfzO $f $c.$s.txt | \
                awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
                grep -v '\-1$' >> signal/$table.wigVar
        end
        cd signal
        nice wigEncode $table.wigVar $table.wig $table.wib
        cd ..
    end
'EOF'
    # Fix script coloring EOF 
    csh makeWig.csh >&! makeWig.log &
    # makeWig.csh runs in the background: wait for it to finish and check
    # makeWig.log before compressing the intermediate .wigVar files
    cd signal
    gzip *.wigVar

    ######## Load wiggles?
    ssh hgwdev
    mkdir /gbdb/mm8/broadStemChip

    cd /cluster/data/mm8/bed/broadStemChip
# loadWig.csh: load every wiggle in signal/ into mm8.
#   - csh is started with -fe (skip startup files, stop at the first
#     failing command)
#   - the table name is the .wib file name without its extension, and the
#     matching .wig file is the one handed to hgLoadWiggle
#   - -pathPrefix makes the loaded rows point at /gbdb/mm8/broadStemChip
# NOTE(review): the "ln -s" line is only echoed, not executed, so this
# script does not itself create the /gbdb symlinks to the .wib files --
# confirm they were made separately.
cat > loadWig.csh << \_EOF_
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/signal
    foreach f (*.wib)
        set wi = $f:t:r
        set wig = $wi.wig
        echo Start: $wig
        echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib"
        time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
        echo Finished: $wig
     end
_EOF_
    chmod +x loadWig.csh

    time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 &
    
    # Try it by hand.
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 broadStemChipSignalH3Es broadStemChipSignalH3Es.wig
    
    # Now Try it again.
cat > loadWig.csh << \_EOF_
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/signal
    foreach f (*.wib)
        set wi = $f:t:r
        set wig = $wi.wig
        time hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
        echo Finished: $wig
     end
_EOF_
    # Try it again.
    time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 &

# | broadStemChipSignalH3Es              |
# | broadStemChipSignalK20Es             |
# | broadStemChipSignalK27Es             |
# | broadStemChipSignalK27Mef            |
# | broadStemChipSignalK27Np             |
# | broadStemChipSignalK36Es             |
# | broadStemChipSignalK36Eshyb          |
# | broadStemChipSignalK36Mef            |
# | broadStemChipSignalK36Np             |
# | broadStemChipSignalK4Es              |
# | broadStemChipSignalK4Eshyb           |
# | broadStemChipSignalK4Mef             |
# | broadStemChipSignalK4Np              |
# | broadStemChipSignalK9Es              |
# | broadStemChipSignalK9Eshyb           |
# | broadStemChipSignalK9Mef             |
# | broadStemChipSignalK9Np              |
# | broadStemChipSignalRpolEs            |

    # Noticed tables badly named, renamed them and corresponding files
    hgsql mm8
    rename table broadStemChipSignalK4Es     to broadStemChipSignalH3K4Es    ;
    rename table broadStemChipSignalK4Eshyb  to broadStemChipSignalH3K4Eshyb ;
    rename table broadStemChipSignalK4Mef    to broadStemChipSignalH3K4Mef   ;
    rename table broadStemChipSignalK4Np     to broadStemChipSignalH3K4Np    ;
    rename table broadStemChipSignalK9Es     to broadStemChipSignalH3K9Es    ;
    rename table broadStemChipSignalK9Eshyb  to broadStemChipSignalH3K9Eshyb ;
    rename table broadStemChipSignalK9Mef    to broadStemChipSignalH3K9Mef   ;
    rename table broadStemChipSignalK9Np     to broadStemChipSignalH3K9Np    ;
    rename table broadStemChipSignalK20Es    to broadStemChipSignalH4K20Es   ;
    rename table broadStemChipSignalK27Es    to broadStemChipSignalH3K27Es   ;
    rename table broadStemChipSignalK27Mef   to broadStemChipSignalH3K27Mef  ;
    rename table broadStemChipSignalK27Np    to broadStemChipSignalH3K27Np   ;
    rename table broadStemChipSignalK36Es    to broadStemChipSignalH3K36Es   ;
    rename table broadStemChipSignalK36Eshyb to broadStemChipSignalH3K36Eshyb;
    rename table broadStemChipSignalK36Mef   to broadStemChipSignalH3K36Mef  ;
    rename table broadStemChipSignalK36Np    to broadStemChipSignalH3K36Np   ;

# | broadStemChipSignalH3K4Es         |
# | broadStemChipSignalH3K4Eshyb      |
# | broadStemChipSignalH3K4Mef        |
# | broadStemChipSignalH3K4Np         |

# | broadStemChipSignalH3K9Es         |
# | broadStemChipSignalH3K9Eshyb      |
# | broadStemChipSignalH3K9Mef        |
# | broadStemChipSignalH3K9Np         |

# | broadStemChipSignalH4K20Es        |

# | broadStemChipSignalH3K27Es        |
# | broadStemChipSignalH3K27Mef       |
# | broadStemChipSignalH3K27Np        |

# | broadStemChipSignalH3K36Es        |
# | broadStemChipSignalH3K36Eshyb     |
# | broadStemChipSignalH3K36Mef       |
# | broadStemChipSignalH3K36Np        |

# | broadStemChipSignalH3Es           |
# | broadStemChipSignalRpolEs         |

### ### ### Finished Signals 2008-05-08 


######### Alignments

    ### Sample from ES.H3.txt.gz
    # chr10   63848447        63848474        -       3084.4.1        0       GAGAGCCAATGGCTAGGCAGGGCATCA
    ### Convert to 
    #chr10  63848447  63848474  3084.4.1  0  -  63848447  63848474  0,255,0  0  GAGAGCCAATGGCTAGGCAGGGCATCA
    # convert to bed-9+   color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET

    ssh hgwdev
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments
    mkdir bed
    cd bed
cat << \_EOF_ > makeBed9PlusFromAlignments.csh
#!/usr/bin/perl
# Reorder Broad read-alignment lines
#     chrom  start  end  strand  readId  mismatches  sequence
# into 11-column bed9+ lines
#     chrom  start  end  readId  score  strand  thickStart  thickEnd
#     itemRgb  mismatches  sequence
# score is 1000 minus 100 per mismatch, thickStart/thickEnd repeat
# start/end, and itemRgb is a "0,0,0" placeholder to be set later.

use warnings;
use strict;

while (my $line = <>) {
    # track lines and comment lines are passed over silently
    next if ($line =~ /^track/ || $line =~ /^\s*\#/);
    chomp $line;
    my @in = split("\t", $line);
    if (@in < 7) {
      # too few tab-separated fields: retry, splitting on any whitespace
      @in = split(/\s+/, $line);
      die "Expecting at least 7 tab-sep fields but got fewer, line $.\n"
        if (@in < 7);
    }
    my ($chrom, $beg, $end, $strand, $name, $mismatches, $seq) = @in;
    my $score = 1000 - ($mismatches * 100);   # 0=1000 1=900 2=800
    print join("\t",
               $chrom, $beg, $end, $name, $score, $strand,
               $beg, $end, "0,0,0", $mismatches, $seq) . "\n";
}
_EOF_

cat << \_EOF_ > makeColoredBedOnStrand.csh
#!/usr/bin/perl
# Set the itemRgb field (column 9) of bed9+ lines from the strand
# (column 6) and the mismatch count (column 10):
#   "+" strand gets a blue, any other strand value gets a green;
#   the shade is picked by mismatch count 0, 1, or 2-and-more.
# Dies when a line has fewer than 10 fields or more than 9 mismatches.

use warnings;
use strict;

# palettes indexed by mismatch count (0, 1, 2 or more)
my @blues  = ("0,0,255","0,0,204","0,0,170");
my @greens = ("0,255,0","0,187,0","0,136,0");

while (<>) {
    next if (/^track/ || /^\s*\#/);
    chomp;
    my @words = split("\t");
    # the mismatch count is read from $words[9], so 10 fields are required
    if (scalar(@words) < 10) {
      @words = split(/\s+/);
      die "Expecting at least 10 tab-sep fields but got fewer, line $.\n"
        if (scalar(@words) < 10);
    }
    die "More than 9 mismatches found line $.\n"
        if ($words[9] > 9);
    # mismatch counts above 2 share the last palette entry
    my $shade = ($words[9] > 2) ? 2 : $words[9];
    my $palette = ($words[5] eq '+') ? \@blues : \@greens;
    $words[8] = $palette->[$shade];
    print join("\t", @words) . "\n";
}
_EOF_

# convertToBed.csh: run every gzipped alignment file one directory up
# through the two perl filters written above and gzip the result.
#   - $f:t:r:r strips the directory and the last two extensions, so
#     ../ES.H3.txt.gz is written to ES.H3.bed.gz in this directory
#   - csh is started with -fe, so the first failing command ends the run
cat << \_EOF_ > convertToBed.csh
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
    foreach f (../*.txt.gz)
        set root = `echo $f:t:r:r`
        zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
        echo $root.bed.gz done 
    end
_EOF_


chmod +x makeBed9PlusFromAlignments.csh
chmod +x makeColoredBedOnStrand.csh
chmod +x convertToBed.csh

    zcat ../ES.H3.txt.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh 
    ## How to make bash work ???
    #for f in ../*.txt.gz; do
    #  root=${f##*/}
    #  root=${root%.*}
    #  root=${root%.*}
    #  zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
    #  echo $root.bed.gz done 
    #done

    ssh kkstore04
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
    time nice -n +19 ./convertToBed.csh > convert.log 2>&1 &
    
    # failed because mismatches exceeded 2, so used the following to determine the max mismatches: 6 in ES.H3
    zcat ../ES.H3.txt.gz | head -100 | awk '{print $6}' | sort -n | uniq -c | wc -l 
    # real    55m8.275s

# Two were not gzipped!
cat << \_EOF_ > convertTxtToBed.csh
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
    foreach f (../ES.*.txt)
        set root = `echo $f:t:r`
        ./makeBed9PlusFromAlignments.csh < $f | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
        echo $root.bed.gz done 
    end
_EOF_
chmod +x convertTxtToBed.csh
    time nice -n +19 ./convertTxtToBed.csh >> convert.log 2>&1 &

#  Add comments:
# commentBedFiles.csh: put a one-line "# ..." description at the top of
# each converted bed file.
#   - descr1/descr2 are the readme.txt lines matching "Primary" and
#     "pluripotent"; the per-file comment is the readme.txt line(s)
#     matching the file's root name; tr strips carriage returns
#   - the result is written to new.<root>.bed and gzipped; the original
#     <root>.bed.gz is left in place
# NOTE(review): the glob only matches ES.*.bed.gz, yet new.MEF.*, new.NP.*
# and new.ESHyb.* files are renamed further down -- presumably the script
# was also run with other globs; confirm.
cat << \_EOF_ > commentBedFiles.csh
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
    set descr1 = `grep Primary ../readme.txt | tr -d "\r"`
    set descr2 = `grep pluripotent ../readme.txt | tr -d "\r"`
    foreach f (ES.*.bed.gz)
        set root = `echo $f:t:r:r`
        set comment = `grep $root ../readme.txt | tr -d "\r"`
        echo "# $comment - ${descr1} ${descr2}" > new.${root}.bed 
        zcat $f >> new.${root}.bed 
        gzip new.${root}.bed 
    end
_EOF_
# the script is invoked below as ./commentBedFiles.csh, so it must be executable
chmod +x commentBedFiles.csh

    ssh kkstore04
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
    time nice -n +19 ./commentBedFiles.csh > comment.log 2>&1 &
    
    # Rename to match other identifiers?
    # | broadStemChipHmmSitesH3K20me3Es  |
    # | broadStemChipHmmSitesH3K27me3Es  |
    # | broadStemChipHmmSitesH3K36me3Es  |
    # | broadStemChipHmmSitesH3K4me3Es   |
    # | broadStemChipHmmSitesH3K9me3Es   |
    # | broadStemChipWinSitesH3K27me3Es  |
    # | broadStemChipWinSitesH3K27me3Mef |
    # | broadStemChipWinSitesH3K27me3Np  |
    # | broadStemChipWinSitesH3K4me3Es   |
    # | broadStemChipWinSitesH3K4me3Mef  |
    # | broadStemChipWinSitesH3K4me3Np   |
    # | broadStemChipWinSitesH3K9me3Es   |
    # | broadStemChipWinSitesH3K9me3Mef  |
    # | broadStemChipWinSitesH3K9me3Np   |

     zcat new.ES.K9.bed.gz | head -1 | awk '{ print $5 }'
     head -1 new.*.bed | awk '{ print $5 }'
     for f in new.ES.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Es"}'; done
     for f in new.ES.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceEs"}'; done
     for f in new.ES.H3.*.gz; do zcat $f | head -1 | awk '{ print $2,"H3panEs"}'; done
     for f in new.ES.R*.gz; do zcat $f | head -1 | awk '{ print $2,"RPolEs"}'; done
     for f in new.ESHyb.*.gz; do zcat $f | head -1 | awk '{ print $2,"ES" $6 "EsHyb"}'; done
     for f in new.MEF.K*.gz; do zcat $f | head -1 | awk '{ print $2,$4 "Mef"}'; done
     for f in new.MEF.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceMef"}'; done
     for f in new.NP.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Np"}'; done
     for f in new.NP.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceNp"}'; done
     
mv new.ES.K20.bed.gz H4K20Me3Es.bed.gz
mv new.ES.K27.bed.gz H3K27Me3Es.bed.gz
mv new.ES.K36.bed.gz H3K36Me3Es.bed.gz
mv new.ES.K4.bed.gz H3K4Me3Es.bed.gz
mv new.ES.K9.bed.gz H3K9Me3Es.bed.gz
mv new.ES.WCE.bed.gz WceEs.bed.gz
mv new.ES.H3.bed.gz H3panEs.bed.gz
mv new.ES.RPol.bed.gz RPolEs.bed.gz
mv new.ESHyb.K36.bed.gz ESH3K36Me3EsHyb.bed.gz
mv new.ESHyb.K4.bed.gz ESH3K4Me3EsHyb.bed.gz
mv new.ESHyb.K9.bed.gz ESH3K9Me3EsHyb.bed.gz
mv new.MEF.K27.bed.gz H3K27Me3Mef.bed.gz
mv new.MEF.K36.bed.gz H3K36Me3Mef.bed.gz
mv new.MEF.K4.bed.gz H3K4Me3Mef.bed.gz
mv new.MEF.K9.bed.gz H3K9Me3Mef.bed.gz
mv new.MEF.WCE.bed.gz WceMef.bed.gz
mv new.NP.K27.bed.gz H3K27Me3Np.bed.gz
mv new.NP.K36.bed.gz H3K36Me3Np.bed.gz
mv new.NP.K4.bed.gz H3K4Me3Np.bed.gz
mv new.NP.K9.bed.gz H3K9Me3Np.bed.gz
mv new.NP.WCE.bed.gz WceNp.bed.gz


        #hgLoadBed mm8 broadStemChipAlign${root} ${f}

    time nice -n +19 hgLoadBed mm8 broadStemChipAlignmentsWceEs WceEs.bed.gz &
    
    ### Failed!  All that work to put a nice comment in the bed file, and hgLoadBed does not handle it!
    ### Fixed this in hgLoadBed.c
    
    
# myBedTbl.sql: table definition handed to hgLoadBed -sqlTable below for
# the alignment tables.  It is a bin column plus the nine leading bed
# columns (chrom .. reserved), followed by the two extra columns written
# by makeBed9PlusFromAlignments.csh: mismatchCount and seq.
# Indexed on the first 16 characters of name and on (chrom(5), bin).
# NOTE(review): "reserved" is an unsigned int while the bed files carry
# an "r,g,b" string in that column -- presumably hgLoadBed converts it on
# load; confirm.
cat << \_EOF_ > myBedTbl.sql
    CREATE TABLE myBedTbl (
      bin smallint unsigned not null,
      chrom varchar(255) not null,
      chromStart int unsigned not null,
      chromEnd int unsigned not null,
      name varchar(255) not null,
      score int unsigned not null,
      strand char(1) not null,
      thickStart int unsigned not null,
      thickEnd int unsigned not null,
      reserved int unsigned  not null,
      mismatchCount int unsigned not null,
      seq varchar(255) not null,
    #Indices
      INDEX(name(16)),
      INDEX(chrom(5),bin)
    )
_EOF_

# loadBedFiles.csh: load every *.bed.gz in the alignments/bed directory
# into a table named broadStemChipAlignments<root>, where <root> is the
# file name without ".bed.gz".  Uses the hgLoadBed build in ~/bin/x86_64
# with the myBedTbl.sql definition; -renameSqlTable is passed so the
# table is created under the requested name rather than "myBedTbl".
cat << \_EOF_ > loadBedFiles.csh
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
    foreach f (*.bed.gz)
        set root = `echo $f:t:r:r`
        ~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 broadStemChipAlignments${root} ${f} 
        echo broadStemChipAlignments${root} ${f} done 
    end
_EOF_
chmod +x loadBedFiles.csh
    time nice -n +19 ./loadBedFiles.csh &
    # real    62m46.504s

    # Noticed 3 tables badly named, renamed them and corresponding files
    hgsql mm8
    rename table broadStemChipAlignmentsESH3K36Me3EsHyb to broadStemChipAlignmentsH3K36Me3EsHyb;
    rename table broadStemChipAlignmentsESH3K4Me3EsHyb to broadStemChipAlignmentsH3K4Me3EsHyb;
    rename table broadStemChipAlignmentsESH3K9Me3EsHyb to broadStemChipAlignmentsH3K9Me3EsHyb;
    
    # edited trackDb.broadStem.ra
    
broadStemChipAlignmentsH3K4Me3Es
broadStemChipAlignmentsH3K4Me3Mef
broadStemChipAlignmentsH3K4Me3Np

broadStemChipAlignmentsH3K9Me3Es
broadStemChipAlignmentsH3K9Me3Mef
broadStemChipAlignmentsH3K9Me3Np

broadStemChipAlignmentsH4K20Me3Es

broadStemChipAlignmentsH3K27Me3Es
broadStemChipAlignmentsH3K27Me3Mef
broadStemChipAlignmentsH3K27Me3Np

broadStemChipAlignmentsH3K36Me3Es
broadStemChipAlignmentsH3K36Me3Mef
broadStemChipAlignmentsH3K36Me3Np

broadStemChipAlignmentsH3K9Me3EsHyb
broadStemChipAlignmentsH3K36Me3EsHyb
broadStemChipAlignmentsH3K4Me3EsHyb

broadStemChipAlignmentsWceEs
broadStemChipAlignmentsWceMef
broadStemChipAlignmentsWceNp

broadStemChipAlignmentsRPolEs
broadStemChipAlignmentsH3panEs

### ### ### Finished Alignments 2008-04-29 

### ### ### Edited mouse/mm8/trackDb.broadStem.ra to include new broadChromatinChIPSeq 
### ### ### track with 53 subtracks covering sites (HMM, Windowing), signal & alignments
### ### ### for ES, MEF, NP, ES_hybrid cell lines
### ### ### and H3K4me3 H3K9me3 H4K20me3 H3K27me3 H3K36me3 antibodies
### ### ### and WCE, RPOL-II and pan-H3 controls  

############################################################################
# Adding more tracks from Broad (Meissner2008)
# (Start 2008-7-14 Tim  Done: 2008-07-18)

    ssh kkstore04
    cd /cluster/data/mm8/bed/broadStemChip/chipseq
    mkdir -p Meissner2008
    cd Meissner2008/
    wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/Meissner2008/
    mv pub/papers/chipseq/Meissner2008 .
    rm -fr pub
    # original data
    ln -s chipseq lab
    cd lab

    ###############
    # Sites track
    mkdir windowSites/Meissner2008
    cd windowSites/Meissner2008
    tar xvfz ../../Meissner2008/WindowIntervals.tar.gz
    awk '{print $4}' *.sites | sort -n | head -1

    # Sites from Window algorithm -- BED3 plus float score
    # min: 2.50, max: 275.50
    # distribution of data values:
     awk '{print $4}' *.sites | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin
    # 0.000000 ************************************************************ 155307
    # 10.000000 **************** 42020
    # 20.000000 ****** 14576
    # 30.000000 **** 10408
    # 40.000000 ** 5717
    # 50.000000 * 2299
    # 60.000000  718
    # 70.000000  232
    # 80.000000  60
    # 90.000000  15
    # 100.000000  3
    # 110.000000  6
    # 120.000000  1
    # 130.000000  1
    # 140.000000  1
    mv Brain.H3K27me3.sites ../Brain.K27me3.sites
    mv Brain.H3K4me2.sites  ../Brain.K4me2.sites
    mv Brain.H3K4me3.sites  ../Brain.K4me3.sites
    mv ES.H3K4me1.sites     ../ES.K4me1.sites
    mv ES.H3K4me2.sites     ../ES.K4me2.sites
    mv NP.H3K4me1.sites     ../NP.K4me1.sites
    mv NP.H3K4me2.sites     ../NP.K4me2.sites
    mv readme.txt ../readme.Meissner2008.txt
    cd ..
    rmdir Meissner2008/
    # Continue to distinguish by .sites
    # Brain.K27me3.sites  ES.K27.txt      ES.K4me2.sites  MEF.K4.txt  NP.K4.txt       NP.K9.txt
    # Brain.K4me2.sites   ES.K4.txt       ES.K9.txt       MEF.K9.txt  NP.K4me1.sites  readme.Meissner2008.txt
    # Brain.K4me3.sites   ES.K4me1.sites  MEF.K27.txt     NP.K27.txt  NP.K4me2.sites  readme.txt
    
     # To range score display from 300 to 1000, use THE SAME CONVERSION AS for the whole group:
     #  (x * 2) + 300

    cd /cluster/data/mm8/bed/broadStemChip
# windowSites/ already exists from the first batch of window sites, so
# create it only if it is missing.
mkdir -p windowSites
# windowSites.Meissner2008.csh: load the Meissner2008 fixed-window
# intervals (the *.sites files) as bed5FloatScore tables named
# broadStemChipWinSites<antibody><cell>.
#   - ab   comes from the second dot-separated part of the file name with
#          "H3" put in front, e.g. Brain.K27me3.sites -> H3K27me3
#   - cell is the first part with only its first letter capitalised
#   - tail +2 skips the first line; awk writes chrom, start, end, a
#     single-space name, the display score ($4 * 2) + 300 and the
#     original float value $4
cat > windowSites.Meissner2008.csh << \_EOF_
    foreach f (chipseq/windowSites/*.sites)
        set b = $f:t
        set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).sites/H3$1\L$2/'`
        set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'`
        echo $cell $ab $b
        tail +2 $f |  awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab
        # using kate's version, testing -renameSqlTable option
        /cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \
            -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
                broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab
    end
_EOF_
# Fix script coloring EOF 
    chmod +x windowSites.Meissner2008.csh 
    csh windowSites.Meissner2008.csh > windowSites.Meissner2008.log 2>&1
    
    ###############
    # Signal track
    # indication of #reads near the base, 25bp fixed window, -1 if unalignable base

    ssh kkstore04
    cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/densities
    mkdir -p alignable
    cd alignable
    foreach f (../*.tar.gz)
        tar xvfz $f
    end

    cd ../..

    # Get a list of the datasets
    #mkdir -p signal
    tar tfz chipseq/Meissner2008/densities/chr1.tar.gz | \
        perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasetsMeissner2008.txt
    # ignore control (whole-cell extract)
    grep -v WCE signal/datasetsMeissner2008.txt > signal/subtracksMeissner2008.txt
    wc -l signal/subtracksMeissner2008.txt
        # 7

    # Extract datasets from by-chrom packaging
    # Weed out missing data which are represented as -1 values
# Convert to wiggle

# makeWigMeissner2008.csh: build one variableStep wiggle per Meissner2008
# dataset listed in signal/subtracksMeissner2008.txt.
#   - ab is the part of the dataset name after the first separator and
#     cell is the part before it with only its first letter capitalised,
#     e.g. Brain.H3K27me3 -> table broadStemChipSignalH3K27me3Brain
#   - any earlier signal/$table.wigVar is removed before appending
#   - per chromosome tarball: a "variableStep" header, then one
#     "position value" line per input line, positions starting at 1 and
#     advancing by 25; lines ending in -1 are dropped
#   - wigEncode turns the .wigVar into .wig/.wib inside signal/
cat > makeWigMeissner2008.csh << \_EOF_
    foreach s (`cat signal/subtracksMeissner2008.txt`)
        set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
        set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
        set table = broadStemChipSignal${ab}${cell}
        echo $table $s
        rm -f signal/$table.wigVar
        foreach f (chipseq/Meissner2008/densities/chr*.tar.gz)
            set c = $f:t:r:r
            echo "variableStep chrom=$c span=25" >> signal/$table.wigVar
            tar xfzO $f $c.$s.txt | \
                awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
                grep -v '\-1$' >> signal/$table.wigVar
        end
        cd signal
        nice wigEncode $table.wigVar $table.wig $table.wib
        cd ..
    end
_EOF_
    # Fix script coloring EOF
    chmod +x makeWigMeissner2008.csh 
    csh makeWigMeissner2008.csh > makeWigMeissner2008.log 2>&1 &
    # check output and cleanup
    cd signal
    gzip *.wigVar

    ######## Load wiggles?
    ssh hgwdev
    #mkdir /gbdb/mm8/broadStemChip

    cd /cluster/data/mm8/bed/broadStemChip
cat > loadWigMeissner2008.csh << \_EOF_
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/signal
    foreach f (*H3K*me*.wib)
        set wi = $f:t:r
        set wig = $wi.wig
        echo Start: $wig
        echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib"
        hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
        echo Finished: $wig
     end
_EOF_
    chmod +x loadWigMeissner2008.csh
    ./loadWigMeissner2008.csh
    time nice -n +19 ./loadWigMeissner2008.csh >> loadWigMeissner2008.log 2>&1 &
    
    # Noticed tables badly named, renamed them and corresponding files
    #     hgsql mm8
    #     rename table broadStemChipSignalH3Es          to broadStemChipSignalH3panEs      
    #     rename table broadStemChipSignalH3K27Es       to broadStemChipSignalH3K27me3Es   
    #     rename table broadStemChipSignalH3K27Mef      to broadStemChipSignalH3K27me3Mef  
    #     rename table broadStemChipSignalH3K27Np       to broadStemChipSignalH3K27me3Np   
    #     rename table broadStemChipSignalH3K36Es       to broadStemChipSignalH3K36me3Es   
    #     rename table broadStemChipSignalH3K36EsHyb    to broadStemChipSignalH3K36Esme3Hyb
    #     rename table broadStemChipSignalH3K36Mef      to broadStemChipSignalH3K36me3Mef  
    #     rename table broadStemChipSignalH3K36Np       to broadStemChipSignalH3K36me3Np   
    #     rename table broadStemChipSignalH3K4Es        to broadStemChipSignalH3K4me3Es    
    #     rename table broadStemChipSignalH3K4EsHyb     to broadStemChipSignalH3K4Esme3Hyb 
    #     rename table broadStemChipSignalH3K4Mef       to broadStemChipSignalH3K4me3Mef   
    #     rename table broadStemChipSignalH3K4Np        to broadStemChipSignalH3K4me3Np    
    #     rename table broadStemChipSignalH3K9Es        to broadStemChipSignalH3K9me3Es    
    #     rename table broadStemChipSignalH3K9EsHyb     to broadStemChipSignalH3K9Esme3Hyb 
    #     rename table broadStemChipSignalH3K9Mef       to broadStemChipSignalH3K9me3Mef   
    #     rename table broadStemChipSignalH3K9Np        to broadStemChipSignalH3K9me3Np    
    #     rename table broadStemChipSignalH4K20Es       to broadStemChipSignalH4K20me3Es   

######### Alignments

    ### Sample from Brain.H3K27me3.aligned.gz
    #chr10   63848447        63848474        -       3084.4.1        0       GAGAGCCAATGGCTAGGCAGGGCATCA
    ### Convert to 
    #chr10  63848447  63848474  3084.4.1  0  -  63848447  63848474  0,255,0  0  GAGAGCCAATGGCTAGGCAGGGCATCA
    # convert to bed-9+   color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET

    ssh hgwdev
    cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments
    mkdir bed
    cd bed
    
    # Reuse the two perl filters written for the first batch of alignments.
    # The source path is absolute because the cwd is now
    # .../Meissner2008/alignments/bed, where relative lab/... paths do not
    # resolve.
    cp /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed/make* .
    # cat << \_EOF_ > makeBed9PlusFromAlignments.csh
    # #!/usr/bin/perl
    # # replace "reserved" field of BED >=9 fields with RGB value from 8-scale 
    # # black->red palette, based on score value.
    # 
    # use warnings;
    # use strict;
    # 
    # while (<>) {
    #     next if (/^track/ || /^\s*\#/);
    #     chomp;
    #     my @words = split("\t");
    #     if (scalar(@words) < 7) {
    #       @words = split(/\s+/);
    #       die "Expecting at least 7 tab-sep fields but got fewer, line $.\n"
    #         if (scalar(@words) < 7);
    #     }
    #     my @newWordOrder = ("","","","","","","","","","","");
    #     $newWordOrder[0] = $words[0];  # chr
    #     $newWordOrder[1] = $words[1];  # beg
    #     $newWordOrder[2] = $words[2];  # end
    #     $newWordOrder[3] = $words[4];  # name
    #     #$newWordOrder[4] = "0";        # score
    #     $newWordOrder[4] = 1000 - ($words[5] * 100);        # score  0=1000 1=900 2=800 
    #     $newWordOrder[5] = $words[3];  # strand
    #     $newWordOrder[6] = $words[1];  # beg
    #     $newWordOrder[7] = $words[2];  # end
    #     $newWordOrder[8] = "0,0,0";    # color to be set later
    #     $newWordOrder[9] = $words[5];  # mismatch
    #     $newWordOrder[10] = $words[6]; # seq
    #     
    #     print join("\t", @newWordOrder) . "\n";
    # }
    # _EOF_
    # 
    # cat << \_EOF_ > makeColoredBedOnStrand.csh
    # #!/usr/bin/perl
    # # replace "reserved" field of BED >=9 fields with RGB value from 8-scale 
    # # black->red palette, based on score value.
    # 
    # use warnings;
    # use strict;
    # 
    # # palette consistes of red, green blue
    # my @blues  = ("0,0,255","0,0,204","0,0,170");
    # my @greens = ("0,255,0","0,187,0","0,136,0");
    # 
    # while (<>) {
    #     next if (/^track/ || /^\s*\#/);
    #     chomp;
    #     my @words = split("\t");
    #     if (scalar(@words) < 9) {
    #       @words = split(/\s+/);
    #       die "Expecting at least 9 tab-sep fields but got fewer, line $.\n"
    #         if (scalar(@words) < 9);
    #     }
    #     die "More than 9 mismatches found line $.\n"
    #         if ($words[9] > 9);
    #     my $strand = $words[5];
    #     if ($strand eq '+') {
    #         if( $words[9] > 2 ) { 
    #             $words[8] = $blues[2]; # green
    #         } else {
    #             $words[8] = $blues[$words[9]]; # green
    #         }
    #     } else {
    #         if( scalar($words[9]) > 2 ) { 
    #             $words[8] = $greens[2]; # blue
    #         } else {
    #             $words[8] = $greens[$words[9]]; # blue
    #         }
    #     }
    #     print join("\t", @words) . "\n";
    # }
    # _EOF_
    # chmod +x makeBed9PlusFromAlignments.csh
    # chmod +x makeColoredBedOnStrand.csh

# Write a csh driver that turns every ../*.aligned.gz file into
# <root>.bed.gz by piping it through the two make* filters.  The here-document
# delimiter is escaped (\_EOF_) so $f and $root are written into the script
# literally instead of being expanded now.
cat << \_EOF_ > convertToBed.csh
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
    foreach f (../*.aligned.gz)
        set root = `echo $f:t:r:r`
        zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
        echo $root.bed.gz done 
    end
_EOF_
chmod +x convertToBed.csh

    # Smoke-test the filter chain on the first few lines of one input file.
    zcat ../Brain.H3K27me3.aligned.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh
    # Example output line (thickEnd and color columns were fused in the paste; separated here):
    # chr14   12537326        12537362        205CY.7.1       1000    -       12537326        12537362        0,255,0 0       GGGATATGGACTGAAATAATTAGGAAAGAAATAACT 
    ## How to make bash work ???
    #for f in ../*.txt.gz; do
    #  root=${f##*/}
    #  root=${root%.*}
    #  root=${root%.*}
    #  zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
    #  echo $root.bed.gz done 
    #done

    ssh kkstore04
    cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
    # Run the conversion in the background at niceness +19; the script's
    # output is collected in convert.log.
    time nice -n +19 ./convertToBed.csh > convert.log 2>&1 &
    # real    25m22.762s
    # Brain.H3K27me3.bed.gz done
    # Brain.H3K4me2.bed.gz done
    # Brain.H3K4me3.bed.gz done
    # ES.H3K4me1.bed.gz done
    # ES.H3K4me2.bed.gz done
    # NP.H3K4me1.bed.gz done
    # NP.H3K4me2.bed.gz done
    # (in the sample lines below the thickStart/thickEnd columns, which were
    #  fused in the original paste, have been separated again)
    # zcat Brain.H3K27me3.bed.gz | head -2
    # chr14   12537326        12537362        205CY.7.1       1000    -       12537326        12537362        0,255,0 0       GGGATATGGACTGAAATAATTAGGAAAGAAATAACT
    # chr2    70236933        70236969        205CY.7.2       900     +       70236933        70236969        0,0,204 1       GAATCCTTGAACATATTTATAATCATTCTTTTTAAT
    # Compared to: zcat ../../../alignments/bed/ES.K20.bed.gz | head -2
    # chr8    77978889        77978916        3080.2.1        1000    +       77978889        77978916        0,0,255 0       GAAGGAAATCAGTCTTTGTTGAGCAGT
    # chr12   38598403        38598430        3080.2.2        1000    +       38598403        38598430        0,0,255 0       GATATTTCATTCCTTGGAGAAGGGTAA
    
# Reuse the table definition from the earlier alignments build; its contents
# are reproduced in the commented-out here-document that follows.
cp ../../../alignments/bed/myBedTbl.sql .
    # cat << \_EOF_ > myBedTbl.sql
    #     CREATE TABLE myBedTbl (
    #       bin smallint unsigned not null,
    #       chrom varchar(255) not null,
    #       chromStart int unsigned not null,
    #       chromEnd int unsigned not null,
    #       name varchar(255) not null,
    #       score int unsigned not null,
    #       strand char(1) not null,
    #       thickStart int unsigned not null,
    #       thickEnd int unsigned not null,
    #       reserved int unsigned  not null,
    #       mismatchCount int unsigned not null,
    #       seq varchar(255) not null,
    #     #Indices
    #       INDEX(name(16)),
    #       INDEX(chrom(5),bin)
    #     )
    # _EOF_

# Write a csh driver that loads every *.bed.gz in this directory into mm8.
# For a file named <Cell>.<Antibody>.bed.gz the two perl one-liners pull out
# the antibody part and the capitalized cell part, and the table is named
# broadStemChipAlignments<Antibody><Cell>
# (e.g. Brain.H3K27me3.bed.gz -> broadStemChipAlignmentsH3K27me3Brain).
# The delimiter is escaped (\_EOF_) so the $variables land in the script
# unexpanded.
cat << \_EOF_ > loadBedFiles.csh
#!/bin/csh -fe
    cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
    foreach f (*.bed.gz)
        set root = `echo $f:t:r:r`
        set ab = `echo $root | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
        set cell = `echo $root | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
        set table = broadStemChipAlignments${ab}${cell}
        ~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 ${table} ${f} 
        echo ${table} ${f} done 
    end
_EOF_
chmod +x loadBedFiles.csh
    # Run the load in the background at niceness +19, logging to load.log.
    time nice -n +19 ./loadBedFiles.csh > load.log 2>&1 &
    # real    28m9.939s
    # broadStemChipAlignmentsH3K27me3Brain
    # broadStemChipAlignmentsH3K4me1Es
    # broadStemChipAlignmentsH3K4me1Np
    # broadStemChipAlignmentsH3K4me2Brain
    # broadStemChipAlignmentsH3K4me2Es
    # broadStemChipAlignmentsH3K4me2Np
    # broadStemChipAlignmentsH3K4me3Brain
    #   
    # broadStemChipWinSitesH3K27me3Brain
    # broadStemChipWinSitesH3K4me1Es
    # broadStemChipWinSitesH3K4me1Np
    # broadStemChipWinSitesH3K4me2Brain
    # broadStemChipWinSitesH3K4me2Es
    # broadStemChipWinSitesH3K4me2Np
    # broadStemChipWinSitesH3K4me3Brain
    #   
    # broadStemChipSignalH3K27me3Brain
    # broadStemChipSignalH3K4me1Es
    # broadStemChipSignalH3K4me1Np
    # broadStemChipSignalH3K4me2Brain
    # broadStemChipSignalH3K4me2Es
    # broadStemChipSignalH3K4me2Np
    # broadStemChipSignalH3K4me3Brain

    # edited trackDb.broadStem.ra

############################################################################
#  mm8 - Mouse - Ensembl Genes (DONE - 2008-03-06 - hiram)
    # Work on kkstore04 in the mm8 assembly directory.
    ssh kkstore04
    cd /cluster/data/mm8
    # Write the configuration file that is handed to doEnsGeneUpdate.pl below.
    # NOTE(review): the quoted '_EOF_' terminator looks like the csh/tcsh
    # here-document form; sh/bash would expect a bare _EOF_ line -- confirm
    # which shell this is pasted into.
    cat << '_EOF_' > mm8.ensGene.ra
# required db variable
db mm8
# optional liftRandoms yes/no or absent
liftRandoms yes
# optional nameTranslation, the sed command that will transform
#       Ensemble names to UCSC names.  With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
# optionally update the knownToEnsembl table after ensGene updated
knownToEnsembl yes
'_EOF_'
#  << happy emacs

    # Run the Ensembl gene update for Ensembl version 46 with that config.
    doEnsGeneUpdate.pl -ensVersion=46 mm8.ensGene.ra
    # Then, on hgwdev, measure the coverage of the ensGene table.
    ssh hgwdev
    cd /cluster/data/mm8/bed/ensGene.46
    featureBits mm8 ensGene
    # 56654064 bases of 2567283971 (2.207%) in intersection
############################################################################
# Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
    # import ccds database as described in ccds.txt
    set db=mm8
    set ncbiBld=36.1
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene

    # ccdsKgMap: map between ccdsGene and knownGene
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    # sanity-check the coordinates of the freshly loaded table
    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of (table names are kept as comments so that pasting this
    # section into a shell does not try to execute them):
    #     ccdsGene
    #     ccdsInfo
    #     ccdsKgMap
    # << emacs
############################################################################
# AGILENT CGH PROBES (Done 2008-05-13, Andy)
# (see hg18.txt)
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################

#############################################################################
# MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)  
# (to build the affyExonTissues track, see the steps outlined in hg18.txt)
#############################################################################

########################################################################
## AFFY ALL EXON PROBESETS (MM8) (DONE 2009-01-29, Andy)
ssh hgwdev
mkdir /hive/data/genomes/mm8/bed/affyAllExonProbes
cd /hive/data/genomes/mm8/bed/affyAllExonProbes
# Download the probeset annotation archive.  affycookies.txt must already be
# in this directory (the download sits behind an Affymetrix login).
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na20/exon/MoEx-1_0-st-v1.r2.dt1.mm8.zip
# Unpack it so the .csv exists before it is read.
# NOTE(review): presumably the archive also holds the .cor/.ext/.full files
# that the cleanup step removes -- confirm against the archive listing.
unzip MoEx-1_0-st-v1.r2.dt1.mm8.zip
ln -s MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.csv
# Convert the CSV to BED 6:
#   - drop the first 12 lines, turn commas into tabs, keep columns 1,5-8,12
#   - strip double quotes and discard rows containing "---"
#   - score by the level in column 6 (core 1000, extended 700, full 300,
#     anything else 100); name is "<col1>|<level>"; start is shifted by -1
sed '1,12d' mm8.csv | tr ',' '\t' | cut -f 1,5-8,12 \
  | sed 's/\"//g' | grep -v "\-\-\-" \
  | awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \
  | bedSort stdin mm8.bed
hgLoadBed mm8 affyAllExonProbes mm8.bed
# Clean up: remove the archive, the other extracted files, the loader's
# bed.tab, the cookie file and the symlink; keep compressed csv and bed.
rm MoEx-1_0-st-v1.r2.dt1.mm8.{cor,ext,full,zip}* bed.tab affycookies.txt mm8.csv
gzip MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.bed

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
mm8.upstreamGeneTbl = refGene
mm8.upstreamMaf = multiz17way /hive/data/genomes/mm8/bed/multiz17way/species.lst

#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08)
    ssh hgwdev
    mkdir /cluster/data/mm8/bed/mrnaPcr
    cd /cluster/data/mm8/bed/mrnaPcr
    # Dump the knownGene table (no header, batch mode) and convert it to BED.
    hgsql mm8 -NBe 'select * from knownGene' > knownGene.gp
    genePredToBed knownGene.gp > ucscGenes.bed
    # Build a substitution table: each "kgId<tab>geneSymbol" row becomes
    # "kgId<tab>kgId__geneSymbol"; perl dies on any row that does not match.
    hgsql mm8 -NBe 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
      > idSub.txt
    # Apply the substitution to column 4 (the name column) of the BED file.
    subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
    # Extract sequence for the renamed BED items and pack it into a 2bit file.
    sequenceForBed -keepName -db=mm8 -bedIn=ucscGenesIdSubbed.bed \
      -fastaOut=stdout \
    | faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
    # Make a PSL file from the first 10 genePred columns; the last output
    # argument is discarded via /dev/null.
    cut -f 1-10 knownGene.gp \
    | genePredToFakePsl mm8 stdin kgTargetAli.psl /dev/null

    # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
    cd /cluster/data/mm8/bed/mrnaPcr
    hgLoadPsl mm8 kgTargetAli.psl
    mkdir /gbdb/mm8/targetDb
    ln -s /cluster/data/mm8/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm8/targetDb/

    # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
    # /gbdb/mm8/targetDb/kgTargetSeq.2bit .

    ssh hgwdev
    # Add records to hgcentraltest blatServers and targetDb:
    # NOTE(review): the backslash-newlines inside the single-quoted SQL look
    # like the csh/tcsh way of continuing a quoted string -- confirm the shell
    # before pasting elsewhere.
    hgsql hgcentraltest -e \
      'INSERT into blatServers values ("mm8Kg", "blat13", 17803, 0, 1);'
    hgsql hgcentraltest -e \
      'INSERT into targetDb values("mm8Kg", "UCSC Genes", \
         "mm8", "kgTargetAli", "", "", \
         "/gbdb/mm8/targetDb/kgTargetSeq.2bit", 1, now(), "");'


#############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)
mkdir -p /hive/data/genomes/mm8/bed/pathways/kegg
cd /hive/data/genomes/mm8/bed/pathways/kegg

wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab

# Prefix every map number with the organism code "mmu" to form the map id.
# NOTE(review): assumes map_title.tab is exactly two tab-separated columns
# (map number, title); for such lines this single sed yields the same
# "mmu<number><tab><title>" rows as a cut/paste shuffle through temp files.
sed -e 's/^/mmu/' map_title.tab > keggMapDesc.tab

# "if exists" keeps the rebuild from aborting when the table is not there yet.
hgsql mm8 -e 'drop table if exists keggMapDesc'
hgsql mm8 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql mm8 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/mmu/mmu_pathway.list

# Strip the "path:" prefix, then turn the first remaining ':' into a tab.
sed -e 's/path://' -e 's/:/\t/' mmu_pathway.list > j.tmp
hgsql mm8 -e 'drop table if exists keggPathway'
hgsql mm8 < ~/kent/src/hg/lib/keggPathway.sql
# First load: raw rows from the KEGG list, used only for the join below.
hgsql mm8 -e 'load data local infile "j.tmp" into table keggPathway'

# Join against knownToLocusLink to attach the known gene name to each row.
hgsql mm8 -N -e \
'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
>keggPathway.tab

# Replace the raw rows with the joined rows.
hgsql mm8 -e 'delete from keggPathway'

hgsql mm8 -e 'load data local infile "keggPathway.tab" into table keggPathway'

rm j.tmp
############################################################################
# Add KEGG column to mm8 Gene Sorter (Done, Fan, 6/18/2010)

mkdir -p /hive/data/genomes/mm8/bed/geneSorter
cd /hive/data/genomes/mm8/bed/geneSorter
# One row per distinct (kgId, mapID): the sed joins the second mapID and the
# locusID into a single "mapID+locusID" column by replacing "<tab>+<tab>".
hgsql mm8 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab

# "if exists" keeps the rebuild from aborting when the table is not there yet.
hgsql mm8 -e 'drop table if exists knownToKeggEntrez'

hgsql mm8 < ~/kent/src/hg/lib/knownToKeggEntrez.sql

hgsql mm8 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################
# LIFTOVER TO MM9 (RE-DONE 2010-07-24 galt)
    mkdir /hive/data/genomes/mm8/bed/blat.mm9.2010-07-24
    cd /hive/data/genomes/mm8/bed/blat.mm9.2010-07-24
    # -debug run to create run dir, preview scripts...
    #   verifies files can be found
    doSameSpeciesLiftOver.pl -debug mm8 mm9
    # Real run, started from inside a screen session;
    # ">& do.log" captures both stdout and stderr in do.log:
    screen
    nice doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         mm8 mm9 >& do.log

There was a question from jdidion on the mailing
list reporting that liftOver in the 9->8 direction
was working better than in the other direction.

We found that fastMap cannot be used with queries of size greater than 5000.
This limit was known to Kate back in 2004, but had been forgotten by Oct 2006,
when doSameSpecies.pl was being used with query chunks of 10k.

I contacted Jim, and he told me to make blat errAbort if the query size
exceeds this limit when fastMap is used.
This will prevent problems in the future.
I tested that this actually happens, so we know the check is working.
CONFIRMED: New blat 34x9 does errAbort with chunk size 10k and -fastMap.

Then I re-ran it with the smaller allowed sizes of 5k chunks
which should take care of the problem.
We will probably also need to re-run other same-species liftovers.

CONFIRMED: the problem lifting reported by the user jdidion is now fixed.

on hgwdev, liftOver from mm8
chr2:22766881-22766905
to mm9
chr2:22770754-22770778
now works perfectly.


#############################################################################
# LIFTOVER TO MM9 (DONE 2010-08-05 galt)
    # using the new doSameSpeciesLiftOver.pl
    # which uses chunks of max size 5k (to prevent blat -fastMap issues)
    # and an overlap of 500bp, which we did not have before.
    # Although we will use the new process in future, 
    # we are not changing old ones.  This particular liftover is an 
    # exception since it does fix a tiny problem found by user jdidion.
    mkdir /hive/data/genomes/mm8/bed/blat.mm9.2010-08-05
    cd /hive/data/genomes/mm8/bed/blat.mm9.2010-08-05
    # -debug run to create run dir, preview scripts...
    #   verifies files can be found
    doSameSpeciesLiftOver.pl -debug mm8 mm9
    # Real run (inside screen); both stdout and stderr are written to do.log:
    screen
    nice doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         mm8 mm9 >& do.log

#############################################################################
# construct liftOver to mm10 (DONE - 2012-05-01 - Hiram) 
    screen -S mm8	# manage this longish running job in a screen
    mkdir /hive/data/genomes/mm8/bed/blat.mm10.2012-05-01
    cd /hive/data/genomes/mm8/bed/blat.mm10.2012-05-01
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/scratch/data/mm8/11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev mm8 mm10
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/scratch/data/mm8/11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev mm8 mm10 > do.log 2>&1
    #   real    86m46.700s

    # verify this file exists
    # (-o and -g leave out the owner and group columns, -L dereferences
    #  symbolic links, which matches the listing recorded below):
    ls -og -L /gbdb/mm8/liftOver/mm8ToMm10.over.chain.gz
# -rw-rw-r-- 1 279647 May  1 10:41 /gbdb/mm8/liftOver/mm8ToMm10.over.chain.gz

    # and try out the conversion on genome-test from mm8 to mm10 
############################################################################
