# for emacs: -*- mode: sh; -*-


# This file describes browser build for the mouse
# genome, August 2005, ncbi mouse_35 - Mm7
#
#	"$Id: mm7.txt,v 1.12 2008/10/17 01:06:34 markd Exp $"
#
#######################################################################
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2005-08-26 - Hiram)
#
#	Examine disk space issues, summarize mm6 usage:
    ssh kkstore01
    cd /cluster/store10
    du -hsc mm6
#	188G    mm6
#    There is an amount of free disk space on kkstore02 that would be
#    appropriate for this assembly (instead of store10), thus:
    ssh kkstore02
    df -h | grep sd
# /dev/sdc1             1.5T  1.1T  330G  77% /export/cluster/store5
    #	So,
    mkdir /cluster/store5/mm7
    ln -s /cluster/store5/mm7 /cluster/data/mm7
    mkdir /cluster/data/mm7/ncbi
    cd /cluster/data/mm7/ncbi
#	set the login name and password in a .wgetrc file in this
#	directory, permissions 600, its format:
#	login = name
#	passwd = xxxx
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=4 --no-parent \
	--no-host-directories --cut-dirs=1 \
	ftp://ftp-private.ncbi.nih.gov/mouse_35
#Downloaded: 2,348,948,224 bytes in 55 files
#real    48m1.328s
#user    0m2.696s
#sys     0m21.548s

#  Fixup the agp and seq_contig.md files to add chrM
#  No chrM or chrMT was delivered.  Copy from previous assembly
    ssh kkstore02
    cd /cluster/data/mm7/ncbi/chrfasta
    cp -p /cluster/data/mm6/ncbi/chrfasta/chrM.fa.gz .
    cd ../contigfasta
    cp -p /cluster/data/mm6/ncbi/contigfasta/chrM.fa.gz .
#	with a fixed up header line to be like all the others:
#	>lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome

    cd /cluster/data/mm7
    zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
    echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
	allrefcontig.chr.agp
    gzip allrefcontig.chr.agp
    zcat ncbi/allcontig.agp.gz > allcontig.agp
    echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \
	allcontig.agp
    gzip allcontig.agp
    zcat ncbi/seq_contig.md.gz | sed -e "3863i\
10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
    #	The line number 3863 was found by checking the contents of
    #	ncbi/seq_contig.md.gz and it was the line starting with:
    #	10090^IUn|NT_039862^I1^I3190
    #	Wanted this chrM information before that line.
    gzip seq_contig.md

    #   summarize sequence counts

    mkdir faCounts
    time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 &
    #	about 1.5 minutes
    time faCount ncbi/contigfasta/chr*.fa.gz > \
	faCounts/contigfasta.faCount 2>&1 &
    #	about 3 minutes
    time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \
	faCounts/chrfasta.headers 2>&1 &
    time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \
	faCounts/contigfasta.headers 2>&1 &
    #	about 2 minutes each for the above two zcat/greps

#############################################################################
#  BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
#			(DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    for F in ncbi/chrfasta/chr*.fa.gz
    do
	CHR=`basename ${F} | sed -e "s/.fa.gz//; s/chr//"`
	echo ${CHR} ${F}
	mkdir -p "${CHR}"
	zcat allrefcontig.chr.agp.gz | \
	    perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
		${CHR}/chr${CHR}.agp
	zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
	    perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
		splitFaIntoContigs ${CHR}/chr${CHR}.agp \
		    stdin /cluster/data/mm7 -nSize=5000000
    done
    #	The above loop takes about 5 minutes

#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2005-08-26 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/jkStuff
    cd /cluster/data/mm7
    mkdir Un tmp
    cp -p /cluster/data/mm6/jkStuff/ncbiFixAgp ./jkStuff
    zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin | gzip > \
	allrefcontig.chr.ordered.agp.gz
    #	Set the appropriate release number here, this one is 35
    #	Fetch the script from the previous assembly
    sed -e "s/buildNum = 34/buildNum = 35/" \
	/cluster/data/mm6/jkStuff/ncbiToRandomAgps > \
	    jkStuff/ncbiToRandomAgps
    chmod +x jkStuff/ncbiToRandomAgps
    gunzip seq_contig.md.gz allrefcontig.chr.ordered.agp.gz
    #	NOTE ! * ! This mm7 contig.idmap now includes the celera assembly
    #	Filter that out for use here.
    grep ref_strain ncbi/contig.idmap \
	| ./jkStuff/ncbiToRandomAgps seq_contig.md \
		allrefcontig.chr.ordered.agp \
                        /dev/stdin .
    #  The chrUn_random.agp created by this is too large with the 5000
    #  gaps.  it will work with 1000 gaps, so fixup the chrUn_random
    #  agp:
    grep ref_strain ncbi/contig.idmap \
	| ./jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
		seq_contig.md allrefcontig.chr.ordered.agp /dev/stdin .
    #	Build ${C}/chr${C}_random.fa for every chrom directory that has a
    #	non-empty chr${C}_random.ctg.agp.  The contig fasta headers are
    #	reduced to the bare Mm* contig name before agpToFa sees them.
    #	Only one loop header can be active at a time; to redo a single
    #	chrom (e.g. chrUn_random after the -gapLen 1000 fixup above) use:
    #	    for C in Un
    for C in ? ??
    do
	if [ -s "${C}/chr${C}_random.ctg.agp" ]; then
	    echo "building ${C}/chr${C}_random.fa"
	    rm -f ./tmp.fa
	    zcat "ncbi/contigfasta/chr${C}.fa.gz" | \
		perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
	    agpToFa -verbose=2 -simpleMulti \
		"${C}/chr${C}_random.ctg.agp" "chr${C}_random" \
		    "${C}/chr${C}_random.fa" ./tmp.fa
	    rm -f ./tmp.fa
	fi
    done > tmp/agpToFa.out 2>&1
    #	the above loop takes about 6 minutes, examine the tmp/agpToFa.out
    #	record for any errors

    #	We need the lift information from these random.ctg.agp files
    cp -p /cluster/data/cb2/scripts/agpToLift.pl ./jkStuff
    #	For each random contig AGP, write the lift file produced by
    #	agpToLift.pl into the lift/ directory beside it.
    for ctgAgp in ?/*_random.ctg.agp ??/*_random.ctg.agp
    do
	chromDir=${ctgAgp%%/*}
	echo ${chromDir}
	mkdir -p ${chromDir}/lift
	./jkStuff/agpToLift.pl ${ctgAgp} > ${chromDir}/lift/ctg_random.lft
    done
    # Clean these up to avoid confusion later... they're easily rebuilt
    #   with the ncbiToRandomAgps script above
    rm ?/*_random.ctg.agp ??/*_random.ctg.agp
    gzip seq_contig.md allrefcontig.chr.ordered.agp

#############################################################################
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
#					(DONE 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    for C in ? ??
    do
	if [ -s ${C}/chr${C}_random.fa ]; then
	    splitFaIntoContigs  -nSize=5000000 ${C}/chr${C}_random.agp \
		${C}/chr${C}_random.fa .
	    mkdir -p ${C}/lift
	    rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
	    mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
	    mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
	    mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
	    rmdir ${C}_random/lift
	    rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
	    rm -rf ${C}/chr${C}_random_*
	    mv ${C}_random/chr${C}_random_* ${C}
	    rmdir ${C}_random
	fi
    done > tmp/split.out 2>&1
    #	the above loop takes less than a minute
    #	scan the tmp/split.out file for possible errors

#############################################################################
# MAKE LIFTALL.LFT (DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft

#############################################################################
# CREATING DATABASE (DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    #	Pack all chrom and _random fasta into one 2bit file, then derive:
    #	  chrom.sizes   - name<tab>size, largest first
    #	  chrom.lst     - non-random chrom names with the chr prefix removed
    #	  chromInfo.tab - name, size and the /gbdb path of the 2bit file
    faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
	mm7.2bit
    #	-k2 is the current spelling of the obsolete "+1" key, which newer
    #	versions of sort take to be a file name
    twoBitInfo mm7.2bit stdout | sort -rn -k2 > chrom.sizes
    grep -v random chrom.sizes | cut -f1 | sed -e "s/^chr//" > chrom.lst
    twoBitInfo mm7.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/mm7/mm7.2bit\n", $1,$2}' > chromInfo.tab

    ssh hgwdev
    cd /cluster/data/mm7
    hgsql -e "create database mm7;" mysql
    #	Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
    df -h | grep mysql
    #	/dev/sda1             472G  227G  222G  51% /var/lib/mysql2
    #	/dev/sdc1             1.8T 1010G  650G  61% /var/lib/mysql

    # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2005-08-26 - Hiram)
    #   Use any of the newest databases to ensure that the organization
    #   of the grp table is up to date
    ssh hgwdev
    hgsql mm7 -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp"
    hgsql mm7 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql mm7 -e 'load data local infile "chromInfo.tab" into table chromInfo;'

    # Enter mm7 into dbDb and defaultDb so test browser knows about it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("mm7", "Aug 2005", "/gbdb/mm7", "Mouse", \
        "chr6:28912411-28925620", 1, 23, "Mouse", \
        "Mus musculus", "/gbdb/mm7/html/description.html", 0, 0, \
        "NCBI Build 35");' -h localhost hgcentraltest
    #	It can be the default genome on genome-test
    hgsql hgcentraltest \
	-e 'update defaultDb set name="mm7" where genome="Mouse";'
    # start a new entry in the trackDb hierarchy
    #	Create the mm7 trackDb directory in the source tree, seed its
    #	description page from mm6, then link the html directory and the
    #	2bit file into /gbdb/mm7 and build trackDb for mm7.
    cd $HOME/kent/src/hg/makeDb/trackDb/mouse
    mkdir mm7
    cvs add mm7
    cd mm7
    cp ../mm6/description.html .
    vi description.html	# fixup text for this assembly
    cvs add description.html
    cvs commit
    cd ../..
    vi trackDb.ra	# add mm7 to the list
    mkdir /cluster/data/mm7/html
    mkdir /gbdb/mm7
    ln -s /cluster/data/mm7/html /gbdb/mm7/html
    ln -s /cluster/data/mm7/mm7.2bit /gbdb/mm7/mm7.2bit
    cp -p mouse/mm7/description.html /gbdb/mm7/html
    make DBS=mm7

#############################################################################
#  GOLD GAP tracks (DONE - 2005-08-26 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm7
    #	make sure these tmp contig agp files are gone, easily generated
    #	as above with jkStuff/ncbiToRandomAgps
    mkdir ffa
    zcat ncbi/sequence.inf.gz > ffa/sequence.inf
    hgGoldGapGl -chromLst=chrom.lst mm7 /cluster/data/mm7 .
    featureBits mm7 gold
    #	2583394090 bases of 2583394090 (100.000%) in intersection
    featureBits mm6 gold
    #	2597150411 bases of 2597150411 (100.000%) in intersection
    featureBits mm5 gold
    #	2615483787 bases of 2615483787 (100.000%) in intersection
    featureBits mm4 gold
    #	2627444668 bases of 2627444668 (100.000%) in intersection

    featureBits mm7 gap
    #	264323239 bases of 2583394090 (10.232%) in intersection
    featureBits mm6 gap
    #	482483041 bases of 2597150411 (18.577%) in intersection
    featureBits mm5 gap
    #	549468286 bases of 2615483787 (21.008%) in intersection
    featureBits mm4 gap
    #	325167539 bases of 2627444668 (12.376%) in intersection

#############################################################################
#  DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
#	(DONE - 2005-08-26 - Hiram)
    ssh kkstore02
    mkdir /cluster/bluearc/mm7
    cd /cluster/data/mm7

    # break up into 500,000 sized chunks for repeat masker runs
TOP=`pwd`
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
do
    ctg=`basename ${CTG_DIR}`
    cd ${CTG_DIR}
    faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
    cd ${TOP}
done > tmp/ctg_split.out 2>&1
    #	about 3 minutes, check the tmp/ctg_split.out for anything unusual

    #	make a list of the contigs
#	List every piece named <contig>_* inside each contig directory, one
#	path per line relative to this directory, into contig500K.lst.
#	The pieces are found by globbing from the top directory instead of
#	parsing 'ls' output after a cd, so a failed cd can not cause files
#	to be listed from the wrong directory.
#	TOP is no longer needed by this loop; it is still set and exported
#	in case anything later in the session relies on it.
TOP=$(pwd)
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
        ??/chr??_random_[0-9]*
do
    # a pattern that matched nothing is left as a literal string
    [ -d "${CTG_DIR}" ] || continue
    ctg=${CTG_DIR##*/}
    for F in "${CTG_DIR}/${ctg}"_*
    do
        [ -e "${F}" ] || continue
        echo "${F}"
    done
done > contig500K.lst
    #	count 'em
    wc contig500K.lst
    #	6172   6172 157096 contig500K.lst

    mkdir /cluster/bluearc/mm7/contigs
    rsync -a --progress --files-from=contig500K.lst . \
        /cluster/bluearc/mm7/contigs/

    #	verify the contig copy above functioned OK
    find /cluster/bluearc/mm7/contigs -type f | wc
    #	6172    6172  336084

#############################################################################
# SIMPLE REPEAT TRACK (DONE - 2005-08-29 Hiram)
    # TRF can be run in parallel with RepeatMasker
    #   since it doesn't require masked input sequence.
    ssh pk 
    mkdir /cluster/data/mm7/bed/simpleRepeat
    cd /cluster/data/mm7/bed/simpleRepeat
    mkdir trf
    #	runTrf: job wrapper around trfBig.  The here-document is closed by
    #	a bare _EOF_ line: under sh/bash the quoted form '_EOF_' never
    #	matches the delimiter and the rest of this file would be swallowed.
    cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
# usage: runTrf <input.fa> <output.bed>
# Copies the input fasta into /tmp/<output file name>, runs trfBig there
# and copies the resulting bed file back to <output.bed>.
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
_EOF_
    # happy emacs
    chmod +x runTrf

#	gensub2 template: one runTrf job line per input file, with the
#	output going to trf/<root of the input name>.bed
#	The here-document is closed by a bare _EOF_ line so that it also
#	terminates under sh/bash.
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
_EOF_
    # << keep emacs coloring happy

    ls -1S /cluster/data/mm7/?/chr*.fa /cluster/data/mm7/??/chr*.fa > genome.lst
    gensub2 genome.lst single gsub jobList
    para create jobList
    #	be gentle on the start up of these things since each starting
    #	job is a copy of the .fa file, a 'para try' starts 10 jobs
    #	there are only 40 total jobs
    #	three rounds of: start a batch, wait 30 seconds, check on it
    for round in 1 2 3
    do
	para try
	sleep 30
	para check
    done
    #	fourth and last batch
    para try
    para check
    #	all 40 are running at this point, some are already done
    para time
# Completed: 40 of 40 jobs
# CPU time in finished jobs:      15539s     258.98m     4.32h    0.18d  0.000 y
# IO & Wait Time:                   532s       8.87m     0.15h    0.01d  0.000 y
# Average job time:                 402s       6.70m     0.11h    0.00d
# Longest finished job:            1803s      30.05m     0.50h    0.02d
# Submission to last job:          1845s      30.75m     0.51h    0.02d

    # Load into the database
    ssh hgwdev
    cd /cluster/data/mm7/bed/simpleRepeat
    cat trf/chr*.bed > simpleRepeat.bed
    hgLoadBed -strict mm7 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    #	Loaded 1143549 elements of size 16

    featureBits mm7 simpleRepeat
    # 77021175 bases of 2583394090 (2.981%) in intersection
    featureBits mm6 simpleRepeat
    # 83220723 bases of 2597150411 (3.204%) in intersection
    featureBits mm5 simpleRepeat
    # 81414259 bases of 2615483787 (3.113%) in intersection
    featureBits mm4 simpleRepeat
    # 82600648 bases of 2627444668 (3.144%) in intersection
    featureBits mm3 simpleRepeat
    # 75457193 bases of 2505900260 (3.011%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
     ssh hgwdev
     cd /cluster/data/mm7/bed
     mkdir microsat
     cd microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
    /cluster/bin/i386/hgLoadBed mm7 microsat microsat.bed

#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-08-29 - Hiram)

    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore02
    cd /cluster/data/mm7/bed/simpleRepeat
    mkdir trfMask
    #	Copy each trf bed file into trfMask/ under the same file name,
    #	keeping only the lines whose fifth column is 12 or less.
    for trfBed in trf/chr*.bed
    do
	maskName="${trfBed#trf/}"
	echo "${trfBed} -> ${maskName}"
	awk '$5 <= 12' ${trfBed} > trfMask/${maskName}
    done

#############################################################################
# REPEATMASKER RUN (after contigs have been distributed to bluearc FS)
#	(DONE - 2005-08-26 - 2005-08-29 - Hiram)
#	Record RM version used:
    cat /cluster/bluearc/RepeatMasker050305/Libraries/version
#   RepBase Update 9.11, RM database version 20050112
#	/cluster/bluearc/RepeatMasker050305
    ssh pk

    #- Make the run directory and job list:
    cd /cluster/data/mm7
    #	RMMouse: job wrapper that runs RepeatMasker on one 500K piece.
    #	The here-document is closed by a bare _EOF_ line so that it also
    #	terminates under sh/bash.
    cat << '_EOF_' > jkStuff/RMMouse
#!/bin/csh -fe
#
# usage: RMMouse <contig directory> <piece.fa>
# Copies the piece from the bluearc contigs copy into /tmp/mm7/<piece.fa>,
# runs RepeatMasker there, and copies the .out file (plus the .align,
# .tbl and .cat files when they exist) into /cluster/data/mm7/<contig directory>.

cd /cluster/data/mm7/$1
pushd .
/bin/mkdir -p /tmp/mm7/$2
/bin/cp /cluster/bluearc/mm7/contigs/$1/$2 /tmp/mm7/$2
cd /tmp/mm7/$2
/cluster/bluearc/RepeatMasker050305/RepeatMasker -ali -s -species mus $2
popd
/bin/cp /tmp/mm7/$2/$2.out ./
if (-e /tmp/mm7/$2/$2.align) /bin/cp /tmp/mm7/$2/$2.align ./
if (-e /tmp/mm7/$2/$2.tbl) /bin/cp /tmp/mm7/$2/$2.tbl ./
if (-e /tmp/mm7/$2/$2.cat) /bin/cp /tmp/mm7/$2/$2.cat ./
/bin/rm -fr /tmp/mm7/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm7/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm7
_EOF_
    #	happy emacs
    chmod +x jkStuff/RMMouse

    mkdir -p RMRun
    rm -f RMRun/RMJobs
    #	mkRMJobs.pl: turn a list of piece paths into RMMouse job lines.
    #	The here-document is closed by a bare _EOF_ line so that it also
    #	terminates under sh/bash.
    cat << '_EOF_' > jkStuff/mkRMJobs.pl
#!/usr/bin/env perl
#
# Reads piece paths (<directory>/<file>), one per line, from stdin or from
# the files named on the command line, and prints one parasol job line
# per path: RMMouse <directory> <file>, with a check on the .out file.

use strict;
use warnings;

use File::Basename;


while (my $line=<>)
{
    chomp $line;
    my $basename = basename($line);
    my $dirname = dirname($line);
    # printf keeps the literal "{check ...}" text apart from the
    # variables, so it can not be read as a hash subscript
    printf "/cluster/data/mm7/jkStuff/RMMouse %s %s {check out line+ /cluster/data/mm7/%s/%s.out}\n",
	$dirname, $basename, $dirname, $basename;
}
_EOF_
    #	happy emacs
    chmod +x jkStuff/mkRMJobs.pl

    
    cat contig500K.lst | ./jkStuff/mkRMJobs.pl > RMRun/RMJobs
    wc RMRun/RMJobs
    # 6172  43204 770920 RMRun/RMJobs

    #- Do the run
    cd /cluster/data/mm7/RMRun
    para create RMJobs
    #	one para command per line; the original note gave the sequence as
    #	try, check, check, push, check, ...
    para try
    para check
    para check
    para push
    para check
    #	... repeat 'para check' as needed
# Completed: 6172 of 6172 jobs
# CPU time in finished jobs:   26381042s  439684.03m  7328.07h  305.34d  0.837 y
# IO & Wait Time:                 46088s     768.13m    12.80h    0.53d  0.001 y
# Average job time:                4282s      71.36m     1.19h    0.05d
# Longest finished job:            6370s     106.17m     1.77h    0.07d
# Submission to last job:        127318s    2121.97m    35.37h    1.47d

    #- Lift up the split-contig .out's to contig-level .out's
    ssh kkstore02
    cd /cluster/data/mm7
    for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
	??/chr??_random_[0-9]*
    do
	CONTIG=`basename ${D}`
	liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft warn \
		${D}/${CONTIG}_[0-9]*.fa.out
    done > tmp/RM.lift.outs 2>&1
    #	scan tmp/RM.lift.outs for unusual errors or difficulties

    #	liftRM_out_to_chr.sh: lift the contig-level RepeatMasker .out
    #	files up to chrom-level .out files, for each one or two character
    #	directory under the current directory.
    #	The here-document is closed by a bare _EOF_ line so that it also
    #	terminates under sh/bash.
    cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
#!/bin/sh
#
# For each chrom directory: lift the files listed in lift/oOut.lst to
# chr<C>.fa.out using lift/ordered.lft, and the files listed in
# lift/rOut.lst to chr<C>_random.fa.out using lift/random.lft.
for C in ? ??
do
    # Skip anything that is not a directory (a stray file, or a pattern
    # that matched nothing).  Without this, a failed cd followed by the
    # "cd .." below would move up out of the assembly directory.
    [ -d "${C}" ] || continue
    echo "lifting ${C}"
    cd "${C}" || continue
    if [ -s lift/ordered.lft ]; then
	# the .lst contents are deliberately word-split into file names
	liftUp "chr${C}.fa.out" lift/ordered.lft warn $(cat lift/oOut.lst)
    else
	echo "WARNING: Can not find ${C}/lift/ordered.lft"
    fi
    if [ -s lift/random.lft ]; then
	liftUp "chr${C}_random.fa.out" lift/random.lft warn $(cat lift/rOut.lst)
    fi
    cd ..
done
_EOF_
    # happy emacs
    chmod +x jkStuff/liftRM_out_to_chr.sh
    ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
    #	scan the results tmp/liftRM_out_to_chr.out
    #	there is a single: WARNING: Can not find Un/lift/ordered.lft
    #	which is OK
    #	List the final .out files, nothing should be size 0:
    ls -og */*.fa.out

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/mm7
    hgLoadOut mm7 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
	??/chr??_random.fa.out
    #	about 7 minutes, there are always a few of these errors:
    #	Strange perc. field -5.6 line 254091 of 1/chr1.fa.out
    #	Strange perc. field -3.8 line 77848 of 6/chr6.fa.out
    #	Strange perc. field -6.2 line 77848 of 6/chr6.fa.out
    #	Strange perc. field -12631.5 line 86222 of 14/chr14.fa.out
    #	Strange perc. field -14047.1 line 86222 of 14/chr14.fa.out
    #	Strange perc. field -2988.3 line 86222 of 14/chr14.fa.out
    #	Strange perc. field -2.1 line 151263 of 14/chr14.fa.out
    #	Strange perc. field -6327.2 line 29247 of Y/chrY_random.fa.out
    #	Strange perc. field -1122.4 line 29247 of Y/chrY_random.fa.out
    #	Strange perc. field -74.6 line 29247 of Y/chrY_random.fa.out
    #	And there are always some of these too:
    #	note: 402 records dropped due to repStart > repEnd


    #	verify everything seems normal compared with previous builds

    featureBits mm7 rmsk
    #	1092611581 bases of 2583394090 (42.294%) in intersection
    featureBits mm6 rmsk
    #	1110222842 bases of 2597150411 (42.748%) in intersection
    featureBits mm5 rmsk
    #	1137310280 bases of 2615483787 (43.484%) in intersection
    featureBits mm4 rmsk
    #	1130883581 bases of 2627444668 (43.041%) in intersection
    featureBits mm3 rmsk
    #	1080265553 bases of 2505900260 (43.109%) in intersection

    featureBits -countGaps mm7 rmsk
    #	1092611581 bases of 2847717329 (38.368%) in intersection
    featureBits -countGaps mm6 rmsk
    #	1110222842 bases of 3079633452 (36.050%) in intersection
    featureBits -countGaps mm5 rmsk
    #	1137310280 bases of 3164952073 (35.935%) in intersection
    featureBits -countGaps mm4 rmsk
    #	1130883581 bases of 2952612207 (38.301%) in intersection
    featureBits -countGaps mm3 rmsk
    #	1080265553 bases of 2708220133 (39.888%) in intersection


#############################################################################
# GC5BASE (DONE - 2005-08-29 - Hiram)
    ssh kkstore02
    mkdir -p /cluster/data/mm7/bed/gc5Base
    cd /cluster/data/mm7/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm7 \
        /cluster/data/mm7 | wigEncode stdin gc5Base.wig gc5Base.wib

    #       Calculating gcPercent with window size 5
    #       Using twoBit: /cluster/data/mm7/mm7.2bit
    #       File stdout created
    #	Converted stdin, upper limit 100.00, lower limit 0.00

    #	runs for about 17 minutes

    #	load database
    ssh hgwdev
    cd /cluster/data/mm7/bed/gc5Base
    mkdir /gbdb/mm7/wib
    ln -s `pwd`/gc5Base.wib /gbdb/mm7/wib
    hgLoadWiggle -pathPrefix=/gbdb/mm7/wib mm7 gc5Base gc5Base.wig

    #	verify index is correct:
    hgsql mm7 -e "show index from gc5Base;"
    #	should see good numbers in Cardinality column

#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
#		(DONE - 2005-08-29 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	FA=${CHR#*\/}
	C=${FA%.fa}
	echo -n "repeat masking ${C} ... "
	/cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
	echo -n "adding simpleRepeats ... "
	/cluster/bin/i386/maskOutFa -softAdd ${CHR} \
		bed/simpleRepeat/trfMask/${C}.bed ${CHR}
	echo "done - ${CHR}"
    done > tmp/addRM_and_Simple.out 2>&1
    #	about 4 minutes

    # you will note the usual warnings about troublesome coordinates
    # in the repeat masker outputs - even more than when they were lifted.

    #	and make the hard masked sequences from these soft masked sequences
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	echo "maskOutFa ${CHR} hard ${CHR}.masked"
	/cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
    done > /tmp/hardMask.out 2>&1

    #	rebuild the 2bit file from the soft-masked sequence
    time faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
	mm7Soft.2bit
    #	verify the sequence is still the same size as before:
    #	The checksum of the sorted size list from the new 2bit file must
    #	equal the checksum of chrom.sizes.  -k2 is the current spelling of
    #	the obsolete "+1" key, which newer versions of sort take to be a
    #	file name.
    twoBitInfo mm7Soft.2bit stdout | sort -rn -k2 | sum -r
    #	37784     1
    sum -r chrom.sizes
    #	37784     1
    #	replace the former unmasked 2bit file with this new one:
    rm mm7.2bit
    mv mm7Soft.2bit mm7.2bit
    #	check the browser, make sure it is functioning OK

    #	Generate fasta file for random contigs
    #	lft2BitToFa.pl: extract the contigs named in lift files from a
    #	2bit file, as fasta.  The here-document is closed by a bare _EOF_
    #	line so that it also terminates under sh/bash.
    cat << '_EOF_' > jkStuff/lft2BitToFa.pl
#!/usr/bin/env perl
#
# For every line of every lift file given, run twoBitToFa on the range
# <chrom>:<start>-<start+length> of the 2bit file and print the sequence
# to stdout with its header line replaced by the contig name.

use strict;
use warnings;
use File::Basename;

sub usage()
{
printf "usage: %s <file.2bit> <file.lft> [more_files.lft]\n",  basename($0);
printf "\tfasta output is to stdout, therefore route stdout to result file\n";
exit 255;
}

my $argc = scalar(@ARGV);

usage if ($argc < 2);

my $twoBitFile = shift;

# test with defined() so that a file named "0" does not end the loop
while (defined(my $liftFile = shift))
{
open (my $fh, "<", $liftFile) or die "Can not open $liftFile: $!";
while (my $line=<$fh>)
    {
    chomp $line;
    next if ($line =~ m/^\s*$/);	# a blank line names no contig
    # columns used: start offset, contig name, contig length, chrom name
    my ($start, $contig, $length, $chrom, $chrom_length) = split(/\s/,$line);
    my $cmd=sprintf("twoBitToFa %s:%s:%d-%d stdout",
        $twoBitFile, $chrom, $start, $start+$length);
    print `$cmd | sed -e "s#^>.*#>$contig#"`;
    }
close ($fh);
}
_EOF_
    #	happy emacs
    chmod +x jkStuff/lft2BitToFa.pl

    mkdir randomContigs
    for L in ?/lift/ctg_random.lft ??/lift/ctg_random.lft
do
    D=${L/\/lift*}
    echo $L $D
    ./jkStuff/lft2BitToFa.pl mm7.2bit ${L} \
	> randomContigs/chr${D}_random.ctg.fa
done
    #
    #	Verify these *.ctg.fa files have the same bases as the ordinary
    #	chr*_random.fa files:
    faSize ?/chr?_random.fa ??/chr??_random.fa
    #	225162013 bases (155337148 N's 69824865 real 32381553 upper 37443312 lower) in 18 sequences in 18 files
    faSize randomContigs/*.ctg.fa
    #	69825013 bases (148 N's 69824865 real 32381553 upper 37443312 lower) in 4827 sequences in 18 files
    #	Note the number of real, upper and lower bases are the same

    #	Create a 2bit file with the full chrom sequences and these
    #	random contigs for use in blastz:
    faToTwoBit ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
	mm7Chroms_RandomContigs.2bit

    #	Copy to bluearc unit for kluster runs
    cp -p mm7.2bit /cluster/bluearc/mm7
    cp -p mm7Chroms_RandomContigs.2bit /cluster/bluearc/mm7
    #	And the lift file to go with it
    cat ?/lift/ctg_random.lft ??/lift/ctg_random.lft \
	> jkStuff/Chroms_RandomContigs.lft
    cp -p jkStuff/Chroms_RandomContigs.lft /cluster/bluearc/mm7

    #	create full chrom nibs for blastz SEQ1 target
    mkdir nib
    #	Convert every chrom and _random fasta into nib/<name>.nib with
    #	faToNib -softMask, removing any existing nib of that name first.
    for chromFa in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	nibFile="nib/$(basename ${chromFa} .fa).nib"
	echo faToNib -softMask ${chromFa} ${nibFile}
	rm -f ${nibFile}
	faToNib -softMask ${chromFa} ${nibFile}
    done
    mkdir /cluster/bluearc/mm7/nib
    cp -p nib/*.nib /cluster/bluearc/mm7/nib

#############################################################################
# BLASTZ SELF experiment 3 (DONE - 2005-11-17 - 2005-11-22 - Hiram)
#	re-scoring to chain min score of 10,000 to cut down on volume of data
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzSelf.2005-11-17
    cd /cluster/data/mm7/bed/blastzSelf.2005-11-17

    #	DEF: the parameter file handed to doBlastzChainNet.pl below.
    #	Target and query are both mm7 (SEQ2_SELF=1).
    #	The here-document is closed by a bare _EOF_ line so that it also
    #	terminates under sh/bash.
    cat << '_EOF_' > DEF
# mouse vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=200

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm7
SEQ2_DIR=/scratch/hg/mm7/nib
SEQ2_LEN=/scratch/hg/mm7/chrom.sizes
SEQ2_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ2_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ2_SELF=1
SEQ2_LIMIT=30
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzSelf.2005-11-17
TMPDIR=/scratch/tmp
_EOF_
    #	happy emacs

    cd /cluster/data/mm7/bed/blastzSelf.2005-11-17
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-stop=net \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-continue=cat -stop=net \
	`pwd`/DEF > cat-to-net.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-continue=chainMerge -stop=net \
	`pwd`/DEF > chainMerge-to-net.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-continue=load -stop=load \
	`pwd`/DEF > load.out 2>&1 &

    ssh kolossus
    cd /cluster/data/mm7/bed/blastzSelf.2005-11-17
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainSelfLink >fb.mm7.chainSelfLink 2>&1
    cat fb.mm7.chainSelfLink
    #	409297278 bases of 2583394090 (15.843%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
	-continue=download \
	`pwd`/DEF > download.out 2>&1 &

#############################################################################
# BLASTZ SELF experiment 1 (DONE - 2005-09-14 - 2005-09-17 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzSelf
    cd /cluster/data/mm7/bed/blastzSelf
    #	DEF: the parameter file handed to doBlastzChainNet.pl below.
    #	Target and query are both mm7 (SEQ2_SELF=1); the *_LEN files it
    #	names under $BASE are generated right after this file is written.
    #	The here-document is closed by a bare _EOF_ line so that it also
    #	terminates under sh/bash.
    cat << '_EOF_' > DEF
# mouse vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse Mm7
SEQ1_DIR=/cluster/bluearc/mm7/nib
SEQ1_CTGDIR=/cluster/bluearc/mm7/mm7.2bit
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50

# QUERY: Mouse Mm7
SEQ2_DIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGDIR=/cluster/bluearc/mm7/mm7.2bit
SEQ2_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=3000000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzSelf

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
SEQ2_CTGLEN=$BASE/S2ctg.len
TMPDIR=/scratch/tmp
_EOF_
    # happy emacs

    cd /cluster/data/mm7/bed/blastzSelf
    nibSize /cluster/bluearc/mm7/nib/*.nib \
	| awk '{printf "%s\t%s\n", $2, $3}' | sort -k2,2nr > S1.len
    twoBitInfo /cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit stdout \
	| sort -k2,2nr > S2ctg.len
    twoBitInfo /cluster/bluearc/mm7/mm7.2bit stdout | sort -k2,2nr > S2.len

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk \
	`pwd`/DEF > blastz.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-stop=blastz \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    1573m31.130s
    #	user    0m0.081s
    #	sys     0m0.071s

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=cat -stop=cat \
	`pwd`/DEF > cat.out 2>&1 &
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=chainRun -stop=chainRun \
	`pwd`/DEF > chainRun.out 2>&1 &
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=chainMerge -stop=load \
	`pwd`/DEF > chainMergeToLoad.out 2>&1 &
    #	real    233m20.096s
    #	user    0m0.101s
    #	sys     0m0.088s

    #	The chainLink measurements need kolossus:
    ssh kolossus
    cd /cluster/data/mm7/bed/blastzSelf
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainSelfLink \
	> featBits.chainSelfLink.mm7 2>&1
    #	real    28m20.150s
    #	user    9m4.878s
    #	sys     2m53.059s
    #	434012375 bases of 2583394090 (16.800%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainSelfLink \
	> featBits.chainSelfLink.mm6 2>&1
    #	real    109m50.760s
    #	user    32m4.830s
    #	sys     11m42.215s
    #	417927047 bases of 2597150411 (16.092%) in intersection

    time featureBits mm7 netSelf
    # 2336281173 bases of 2597150411 (89.956%) in intersection
    time featureBits mm7 chainSelf
    #  2579948751 bases of 2597150411 (99.338%) in intersection
    time featureBits mm3 mouseChain
    # 889252994 bases of 2505900260 (35.486%) in intersection

    #	the chainLink measurements need kolossus:
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainSelfLink
    #	417927047 bases of 2597150411 (16.092%) in intersection
    #	244 minutes
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm3 mouseChainLink
    #	383345536 bases of 2505900260 (15.298%) in intersection

    #	Gill likes to see the blastzSelf track:
    ssh eieio
    cd /cluster/data/mm7/bed/blastzSelf
    #	cat the pslParts together, per-chrom, and in chromStart order:
    ls pslParts | sed -e "s/.nib.*//" | sort -u | while read C
    do
	echo -n "working: ${C} ... "
	zcat `ls pslParts/${C}.nib* | sort --field-separator=':' -k1,1 -k3,3n` \
            | gzip > pslChrom/${C}_blastzSelf.psl.gz
	echo "done"
    done

    # Load blastzSelf
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastzSelf/pslChrom
    for I in *.psl.gz
    do
	$HOME/bin/i386/hgLoadPsl -noTNameIx mm7 ${I}
	echo "done: ${I}"
    done
    #	STARTED - 2005-04-06 15:24
    #	4h 24m load time - chrUn_random failed to load
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 blastzSelf
    #	8h 34m job
    #	471978757 bases of 2597150411 (18.173%) in intersection

#############################################################################
# PREPARE "bigZips" files for public release
#	(DONE 2005-09-20 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/bigZips
    cd /cluster/data/mm7/bigZips
    scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm6/bigZips/README.txt .
    # edit README.txt to indicate proper version of sequence and
    #	RepeatMasker
    cd /cluster/data/mm7
    tar cvzf bigZips/chromAgp.tar.gz ?/chr*.agp ??/chr*.agp
    tar cvzf bigZips/chromFa.tar.gz ?/chr*.fa ??/chr*.fa
    tar cvzf bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked ??/chr*.fa.masked
    tar cvzf bigZips/chromOut.tar.gz ?/chr*.fa.out ??/chr*.fa.out
    cd /cluster/data/mm7/bed/simpleRepeat
    tar cvzf ../../bigZips/chromTrf.tar.gz ./trfMask

    # get GenBank native mRNAs
    ssh hgwdev
    cd /cluster/data/genbank
    ./bin/i386/gbGetSeqs -db=mm7 -native GenBank mrna \
	/cluster/data/mm7/bigZips/mrna.fa
    cd /cluster/data/mm7/bigZips
    gzip mrna.fa
    
    ssh kkstore02
    cd /cluster/data/mm7/bigZips
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm7/bigZips
    cd /usr/local/apache/htdocs/goldenPath/mm7/bigZips
    ln -s /cluster/data/mm7/bigZips/* .

#########################################################################
# GENBANK auto update (DONE 2005-09-29 markd)
    # align with revised genbank process. drop xeno ESTs.
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add mm7

# mm7
mm7.serverGenome = /cluster/data/mm7/mm7.2bit
mm7.clusterGenome = /cluster/bluearc/mm7/mm7.2bit
mm7.ooc = /cluster/bluearc/mm7/11.ooc
mm7.align.unplacedChroms = chrUn_random
mm7.lift = /cluster/data/mm7/jkStuff/liftAll.lft
mm7.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
mm7.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
mm7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
mm7.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
mm7.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
mm7.downloadDir = mm7
mm7.refseq.mrna.xeno.load  = yes
mm7.refseq.mrna.xeno.loadDesc = yes
mm7.mgcTables.default = full
mm7.mgcTables.mgc = all

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial mm7 &

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad  mm7&


    # enable daily alignment and update of hgwdev
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # add mm7 to:
        etc/align.dbs
        etc/hgwdev.dbs 
    cvs commit
    make etc-update




#########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE - 2005-08-31 - 2005-09-06 - Hiram)
    ssh kkstore02
    #	create hard masked sequence
    mkdir /cluster/data/mm7/hardMasked
    cd /cluster/data/mm7/hardMasked
    twoBitToFa ../mm7Chroms_RandomContigs.2bit stdout \
	| maskOutFa stdin hard stdout \
	> Chroms_RandomContigs.hardMasked.fa
    faCount Chroms_RandomContigs.hardMasked.fa > faCount.hardMasked
    #	show the "total" summary line of the counts just written
    tail -1 faCount.hardMasked
# total   2692380329      437167178       307449158       307469169       437328614       1202966210      13463576
    #	These counts are going to be different than the soft masked
    #	sequence since a lot of stuff has been turned into N's
    #	In fact, a lot of these are all N's, filter the name list to allow
    #	only sequences with at least 18 nucleotides to prevent genscan
    #	from going bonkers.
    egrep -v "^#|^total" faCount.hardMasked \
	| awk '{if ($2-$7 > 17) {print $1}}' > over18.lst
    faSomeRecords Chroms_RandomContigs.hardMasked.fa over18.lst over18.fa
    #	creating 10,000,000 sized chunks
    mkdir hardChunks
    time faSplit about over18.fa 10000000 hardChunks/c_
    mkdir /cluster/bluearc/mm7/hardChunks
    cp -p hardChunks/c_*.fa /cluster/bluearc/mm7/hardChunks

    ssh hgwdev
    mkdir /cluster/data/mm7/bed/genscan
    cd /cluster/data/mm7/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/mm7/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    ls -1S /cluster/bluearc/mm7/hardChunks/c_*.fa > genome.list

    # XXX There is an error in the following template, note the extra
    # space between the - and par=
    #	It turns out the default for the -par argument is this same
    #	matrix so the extra space had no effect on the end result.
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # happy emacs
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
    #	Had three jobs crash:
# Completed: 25 of 28 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:     269839s    4497.32m    74.96h    3.12d  0.009 y
# IO & Wait Time:                 14374s     239.57m     3.99h    0.17d  0.000 y
# Average job time:               11369s     189.48m     3.16h    0.13d
# Longest finished job:           29640s     494.00m     8.23h    0.34d
# Submission to last job:         29687s     494.78m     8.25h    0.34d

    # If there are crashes, diagnose with "para problems".  
    #	Three of them needed to be rerun, adjust window down to 2000000 to
    #	get them to complete.  Lower that number if the error persists.
    #	These jobs can take over 4 hours on kolossus.
    ssh kolossus
    cd /cluster/data/mm7/bed/genscan
    # XXX There is an error in the following commands, note the extra
    # space between the - and par=
    #	It turns out the default for the -par argument is this same
    #	matrix so the extra space had no effect on the end result.
    /cluster/bin/x86_64/gsBig /cluster/bluearc/mm7/hardChunks/c_03.fa gtf/c_03.gtf -trans=pep/c_03.pep -subopt=subopt/c_03.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000

    /cluster/bin/x86_64/gsBig /cluster/bluearc/mm7/hardChunks/c_12.fa gtf/c_12.gtf -trans=pep/c_12.pep -subopt=subopt/c_12.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000

    /cluster/bin/x86_64/gsBig /cluster/bluearc/mm7/hardChunks/c_01.fa gtf/c_01.gtf -trans=pep/c_01.pep -subopt=subopt/c_01.bed -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
    #	The last one on c_01.fa timing is:
    #	real    259m46.197s
    #	user    256m27.702s
    #	sys     2m35.809s

    # cat and lift the results into single files
    ssh kkstore02
    cd /cluster/data/mm7/bed/genscan
    cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
	/cluster/data/mm7/jkStuff/Chroms_RandomContigs.lft carry stdin
    cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
	/cluster/data/mm7/jkStuff/Chroms_RandomContigs.lft carry stdin
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/mm7/bed/genscan
    ldHgGene mm7 -gtf genscan genscan.gtf
    #	Read 46116 transcripts in 325432 lines in 1 files
    #	  46116 groups 40 seqs 1 sources 1 feature types
    #	46116 gene predictions
    hgPepPred mm7 generic genscanPep genscan.pep
    hgLoadBed -strict mm7 genscanSubopt genscanSubopt.bed
    #	Loaded 533197 elements of size 6

    #	check the numbers
    featureBits mm7 genscan
    #	54864694 bases of 2583394090 (2.124%) in intersection
    featureBits mm6 genscan
    #	54894283 bases of 2597150411 (2.114%) in intersection
    featureBits mm5 genscan
    #	55024722 bases of 2615483787 (2.104%) in intersection
    featureBits mm4 genscan
    #	56164126 bases of 2627444668 (2.138%) in intersection
    featureBits mm3 genscan
    #	51697165 bases of 2505900260 (2.063%) in intersection

    featureBits mm7 genscanSubopt
    #	57512333 bases of 2583394090 (2.226%) in intersection
    featureBits mm6 genscanSubopt
    #	57856316 bases of 2597150411 (2.228%) in intersection
    featureBits mm5 genscanSubopt
    #	58474899 bases of 2615483787 (2.236%) in intersection
    featureBits mm4 genscanSubopt
    #	59601009 bases of 2627444668 (2.268%) in intersection
    featureBits mm3 genscanSubopt
    #	56085184 bases of 2505900260 (2.238%) in intersection


#########################################################################
# BLASTZ HUMAN Hg17 second time (DONE - 2005-11-14 - 2005-11-21 - Hiram)
    #	After fixing a bug in the lineage specific repeat snip business
    #	in blastz-run-ucsc script
    ssh pk
    #	new working directory for this second run
    mkdir /cluster/data/mm7/bed/blastzHg17.2005-11-14
    cd /cluster/data/mm7/bed
    #	re-point the blastz.hg17 symlink (previously made for the
    #	2005-09-16 run) at the new run directory
    rm blastz.hg17
    ln -s blastzHg17.2005-11-14 blastz.hg17
    cd blastzHg17.2005-11-14

    cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes

# QUERY: Human Hg17 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ2_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInMouse
SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzHg17.2005-11-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-stop=net \
	`pwd`/DEF > blastz-to-net.out 2>&1 &
    #	Started 2005-11-14 11:15
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=load -stop=load \
	`pwd`/DEF > load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -swap -stop=net `pwd`/DEF > swap-to-net.out 2>&1 &
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -swap -continue=load -stop=load \
	`pwd`/DEF > swap-load.out 2>&1 &

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzHg17.2005-11-14
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzHg17.2005-11-14

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load -stop=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=download \
	`pwd`/DEF > rescoreDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzHg17.2005-11-14
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainHg17Link > fb.mm7.chainHg17Link.rescore 2>&1
    cat fb.mm7.chainHg17Link.rescore
    #	996434728 bases of 2583394090 (38.571%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \
	chainMm7Link > fb.hg17.chainMm7Link.rescore 2>&1
    cat fb.hg17.chainMm7Link.rescore
    #	994737081 bases of 2866216770 (34.706%) in intersection


#########################################################################
# BLASTZ HUMAN Hg17 experiment (DONE - 2005-09-16 - 2005-10-04 - Hiram)
    ssh kkstore02
    #	Create the composite chrom/contigs from randoms 2bit file
    #	for Hg17  ***!!! THIS IS NOT NEEDED  !!!***
    #	These can't be used with lineage specific repeats
    #	You can't use 2bit files with SMSK settings.
    mkdir /cluster/data/hg17/randomContigs
    cd /cluster/data/hg17
    rm jkStuff/hg17Chroms_RandomContigs.lft
for C in `cat chrom.lst`
do
    L="${C}/lift/random.lft"
    if [ -f "${L}" ]; then
        ls -og "${L}"
	cat ${L} >> jkStuff/hg17Chroms_RandomContigs.lft
        /cluster/data/mm7/jkStuff/lft2BitToFa.pl hg17.2bit ${L} \
                > randomContigs/chr${C}_random.ctg.fa
    fi
done
    #	Check that the sequence remains the same
    faSize randomContigs/*.ctg.fa
    # 14578373 bases  (110698 N's 14467675 real 7182813 upper 7284862
    #	lower) in 86 sequences in 19 files
    faSize */chr*_random.fa
    # 17928373 bases (3460698 N's 14467675 real 7182813 upper 7284862
    #	lower) in 19 sequences in 19 files
    #	Note, only the N's are different
    faToTwoBit ?/chr?.fa ??/chr??.fa 6_hla_hap?/chr*.fa \
	randomContigs/chr*.ctg.fa hg17Chroms_RandomContigs.2bit
    #	Verify sequence isn't broken:
    twoBitToFa hg17Chroms_RandomContigs.2bit stdout | faSize stdin
    # 3091666460 bases (225561672 N's 2866104788 real 1474041767
    # upper 1392063021 lower) in 113 sequences in 1 files
    twoBitToFa hg17.2bit stdout | faSize stdin
    # 3095016460 bases (228911672 N's 2866104788 real 1474041767
    # upper 1392063021 lower) in 46 sequences in 1 files
    #	Only difference is the N count. 228911672 - 225561672 = 3350000
    cp -p hg17Chroms_RandomContigs.2bit /cluster/bluearc/hg17
    cp -p jkStuff/hg17Chroms_RandomContigs.lft /cluster/bluearc/hg17

    mkdir /cluster/data/mm7/bed/blastzHg17.2005-09-16
    cd /cluster/data/mm7/bed
    ln -s blastzHg17.2005-09-16 blastz.hg17
    cd /cluster/data/mm7/bed/blastzHg17.2005-09-16
    cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/cluster/bluearc/mm7/nib
SEQ1_SMSK=/cluster/bluearc/mm7/linSpecRep/notInHumanDogCow
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes

# QUERY: Human Hg17
SEQ2_DIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ2_SMSK=/cluster/bluearc/hg17/linSpecRep.notInMouse
SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzHg17.2005-09-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << keep emacs coloring happy

    cd /cluster/data/mm7/bed/blastzHg17.2005-09-16
    #	establish a screen to control this job
    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-stop=blastz \
	`pwd`/DEF > blastz.out 2>&1 &
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=cat -stop=load \
	`pwd`/DEF > catToLoad.out 2>&1 &
    #	real    335m51.785s
    #	user    0m0.080s
    #	sys     0m0.051s

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kkstore02
    screen -d -r
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=download \
	`pwd`/DEF > download-cleanup.out 2>&1 &

    #	Special swap, the load scripts have been disabled so these can
    #	be loaded manually to give them special names for comparison
    #	experiments
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-swap -stop=load \
	`pwd`/DEF > swapLoadReady.out 2>&1 &
    #	Manually loaded tables chainHg17LSR, chainHg17LSRLink and
    #	netHg17LSR
    #	real    268m48.953s
    #	user    21m46.640s
    #	sys     5m0.510s

    featureBits mm7 chainHg17noLSRLink
    #	952987983 bases of 2583394090 (36.889%) in intersection
    time featureBits mm7 chainHg17Link
    #	961671385 bases of 2583394090 (37.225%) in intersection
    #	And, their intersection:
    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainHg17Link chainHg17noLSRLink
    #	950261805 bases of 2583394090 (36.783%) in intersection
    #	real    33m16.210s
    #	user    8m39.179s
    #	sys     2m33.198s

    #	Loading lineage specific repeats for measurement purposes:
    ssh kkstore02
    cd /cluster/bluearc/mm7/linSpecRep/notInHumanDogCow
    time grep -h chr chr*.out.spec | awk '{print $5,$6,$7,$10,$1}' \
	| sort -k1,1 -k2,2n \
	> /cluster/data/mm7/bed/blastzHg17.2005-09-16/mm7LSRhg.bed 
    cd /cluster/bluearc/hg17/linSpecRep.notInMouse
    time grep -h chr chr*.out.spec | awk '{print $5,$6,$7,$10,$1}' \
	| sort -k1,1 -k2,2n \
	> /cluster/data/mm7/bed/blastzHg17.2005-09-16/hg17LSRmm.bed 

    ssh hgwdev
    cd  /cluster/data/mm7/bed/blastzHg17.2005-09-16
    hgLoadBed mm7 linSpecRepNotHg -strict mm7LSRhg.bed
    #	Loaded 2816082 elements of size 5
    hgLoadBed hg17 linSpecRepNotMm -strict hg17LSRmm.bed
    #	had one broken line in the file, removed it
    #	Expecting 5 words line 298322 of hg17LSRmm.bed got 4
    #	Loaded 1639435 elements of size 5
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzHg17.2005-09-16
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainHg17Link linSpecRepNotHg \
	> fbMm7.chainHg17Link.linSpecRepNotHg 2>&1
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainHg17noLSRLink linSpecRepNotHg \
	> fbMm7.chainHg17noLSRLink.linSpecRepNotHg 2>&1


    #	Same two measurements on the hg17 side.  Output is captured with
    #	2>&1, matching the mm7 measurements just above.
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \
	chainMm7LSRLink linSpecRepNotMm \
	> fbHg17.chainMm7LSRLink.linSpecRepNotMm 2>&1
    time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \
	chainMm7Link linSpecRepNotMm \
	> fbHg17.chainMm7Link.linSpecRepNotMm 2>&1

##########################################################################
    #	A measurement script to do all featureBits combinations:
    cd /cluster/data/mm7/jkStuff
    cat << '_EOF_' >  netChainCheck.sh
#!/bin/sh

# usage - print the command synopsis on stdout.  Takes no arguments.
# printf is used instead of "echo -e" so the tab in the example line is
# produced regardless of which shell /bin/sh happens to be.
usage()
{
printf '%s\n' \
    "usage: netChainCheck.sh <db0> <db1> <targetDb>" \
    "    does: featureBits <db0> net<targetDb>" \
    "          featureBits <db1> net<targetDb>" \
    "    as well as the chain and chainLink tables," \
    "    and on the targetDb:" \
    "          featureBits <targetDb> net<db0>" \
    "          featureBits <targetDb> net<db1>" \
    "    and the chain and chainLink tables."
printf '\texample: netChainCheck.sh mm7 mm5 fr1\n'
}

# doOne <db> <table>
# Echo the featureBits command line for <table> in <db>, emit the
# prefix "    #<TAB>" without a newline, then run featureBits under
# "time".  printf replaces "echo -en", which is not portable under
# /bin/sh.
doOne()
{
db=$1
tbl=$2
echo "    featureBits $db $tbl"
printf '    #\t'
time featureBits "$db" "$tbl"
}

# ucFirstLetter <string>
# Print <string> with its first character mapped a-z -> A-Z and the
# remainder unchanged (e.g. mm7 -> Mm7).
ucFirstLetter()
{
word="$1"
# everything after the first character
remainder="${word#?}"
# what is left once that remainder is stripped: the first character
first="${word%"${remainder}"}"
initial=`echo "${first}" | tr '[a-z]' '[A-Z]'`
echo "${initial}${remainder}"
}

# Exactly three arguments are required: <db0> <db1> <targetDb>
if [ "$#" -ne 3 ]; then
    usage
    exit 255
fi

db0=$1
db1=$2
targetDb=$3

# Table names embed the other database's name with its first letter
# upper-cased (net${targetDB}, chain${DB0}Link, ...), so build those
# forms once.
targetDB=`ucFirstLetter "${targetDb}"`
DB0=`ucFirstLetter "${db0}"`
DB1=`ucFirstLetter "${db1}"`

export db0 db1 targetDb targetDB DB0 DB1
# echo "${db0} ${db1} ${targetDb} ${targetDB} ${DB0} ${DB1}"

# net, chain and chainLink tables for targetDb, on each of db0 and db1
doOne "${db0}" "net${targetDB}"
doOne "${db1}" "net${targetDB}"
doOne "${db0}" "chain${targetDB}"
doOne "${db1}" "chain${targetDB}"
doOne "${db0}" "chain${targetDB}Link"
doOne "${db1}" "chain${targetDB}Link"
# and the reverse direction: tables for db0 and db1, on targetDb
doOne "${targetDb}" "net${DB0}"
doOne "${targetDb}" "net${DB1}"
doOne "${targetDb}" "chain${DB0}"
doOne "${targetDb}" "chain${DB1}"
doOne "${targetDb}" "chain${DB0}Link"
doOne "${targetDb}" "chain${DB1}Link"
'_EOF_'
    # << keep emacs coloring happy

#########################################################################
# BLASTZ RAT Rn3 second time (WORKING - 2005-11-14 - Hiram)
    #	After fixing a bug in the lineage specific repeat snip business
    #	in blastz-run-ucsc script
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzRn3.2005-11-14
    cd /cluster/data/mm7/bed
    rm blastz.rn3
    ln -s blastzRn3.2005-11-14 blastz.rn3
    cd blastzRn3.2005-11-14

    #	DEF: the parameter file handed to doBlastzChainNet.pl below
    #	(mouse mm7 target, rat rn3 query)
    cat << '_EOF_' > DEF
# mouse vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzRn3.2005-11-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-stop=load \
	`pwd`/DEF > to-load.out 2>&1 &
    #	Started 2005-11-16 09:37

    ssh kolossus
    cd /cluster/data/mm7/bed/blastzRn3.2005-11-14
    #	capture stderr as well, as the rescore measurement later in this
    #	section does
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainRn3Link > \
	fb.mm7.chainRn3Link 2>&1
    #	1779807900 bases of 2583394090 (68.894%) in intersection
    #	real    65m22.546s

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-swap -stop=load \
	`pwd`/DEF > swap-to-load.out 2>&1 &

    #	re-scoring the chains
    #	Remove the previous chain/net results so they can be regenerated.
    #	NOTE(review): this section's run directory is
    #	blastzRn3.2005-11-14 (see its DEF BASE and the measurements
    #	below); confirm that is where the re-scoring was actually run.
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzRn3.2005-11-14
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzRn3.2005-11-14

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzRn3.2005-11-14
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainRn3Link > fb.mm7.chainRn3Link.rescore 2>&1
    cat fb.mm7.chainRn3Link.rescore
    #	1775924100 bases of 2583394090 (68.744%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 \
	chainMm7Link > fb.rn3.chainMm7Link.rescore 2>&1
    cat fb.rn3.chainMm7Link.rescore
    #	1781501323 bases of 2571104688 (69.289%) in intersection

#########################################################################
# BLASTZ RAT Rn3 (DONE - 2005-09-16 - 2005-10-18 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/bed/blastzRn3.2005-09-16
    cd /cluster/data/mm7/bed
    ln -s blastzRn3.2005-09-16 blastz.rn3
    cd blastzRn3.2005-09-16

    cat << '_EOF_' > DEF
# mouse vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/cluster/bluearc/mm7/nib
SEQ1_SMSK=/cluster/bluearc/mm7/linSpecRep/notInRat
SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/cluster/bluearc/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzRn3.2005-09-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > full.run.out 2>&1 &
    #	various debugging situations as the script is reorganized
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=load -stop=load -bigClusterHub=pk \
	`pwd`/DEF > load.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kkstore02
    screen -d -r
# Completed: 40713 of 40713 jobs
# CPU time in finished jobs:   18170174s  302836.24m  5047.27h  210.30d  0.576 y
# IO & Wait Time:               1770530s   29508.83m   491.81h   20.49d  0.056 y
# Average job time:                 490s       8.16m     0.14h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           28252s     470.87m     7.85h    0.33d
# Submission to last job:         69864s    1164.40m    19.41h    0.81d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:       1168s      19.46m     0.32h    0.01d  0.000 y
# IO & Wait Time:                  3047s      50.79m     0.85h    0.04d  0.000 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest job:                      119s       1.98m     0.03h    0.00d
# Submission to last job:           359s       5.98m     0.10h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:      12274s     204.56m     3.41h    0.14d  0.000 y
# IO & Wait Time:                  1719s      28.66m     0.48h    0.02d  0.000 y
# Average job time:                 350s       5.83m     0.10h    0.00d
# Longest job:                     1016s      16.93m     0.28h    0.01d
# Submission to last job:          1482s      24.70m     0.41h    0.02d

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=download -bigClusterHub=pk -fileServer=kolossus \
	`pwd`/DEF > download-clean.out 2>&1 &

    #	Measurements:
    time featureBits mm7 netRn3
    #	2644171540 bases of 2583394090 (102.353%) in intersection
    time featureBits mm6 netRn3
    #	expect ~ 2m 12s
    #	2720144602 bases of 2597150411 (104.736%) in intersection
    time featureBits mm5 netRn3
    #	2638255333 bases of 2615483787 (100.871%) in intersection

    time featureBits mm7 chainRn3
    #	2690727913 bases of 2583394090 (104.155%) in intersection
    time featureBits mm6 chainRn3
    #	expect ~ 10m 30s to 13m 25s
    #	2768422449 bases of 2597150411 (106.595%) in intersection
    time featureBits mm5 chainRn3
    #	2646682349 bases of 2615483787 (101.193%) in intersection

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainRn3Link
    #	1798351560 bases of 2583394090 (69.612%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainRn3Link
    #	1802980225 bases of 2597150411 (69.421%) in intersection
    #	real    94m48.021s
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm5 chainRn3Link
    #	1798705001 bases of 2615483787 (68.771%) in intersection
    #	real    76m44.580s

    ssh kkstore02
    screen -r -d
    cd /cluster/data/mm7/bed/blastzRn3.2005-09-16
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk \
	`pwd`/DEF > swap.out 2>&1 &
    #	STARTED - 2005-10-04 16:29
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -continue=download -bigClusterHub=pk -fileServer=kolossus \
	`pwd`/DEF > swap-download-clean.out 2>&1 &

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm7Link
    #	1802674646 bases of 2571104688 (70.113%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm6Link
    #	1812992492 bases of 2571104688 (70.514%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits rn3 chainMm5Link
    #	1673171206 bases of 2571104688 (65.076%) in intersection

#########################################################################
# BLASTZ FUGU fr1 (DONE - 2005-09-16 - 2005-11-21 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzFr1.2005-09-16
    cd /cluster/data/mm7/bed
    ln -s blastzFr1.2005-09-16 blastz.fr1
    cd blastzFr1.2005-09-16

    cat << '_EOF_' > DEF
# mouse vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm7 - testing 100,000,000 sized chunk on pk kluster
SEQ1_DIR=/cluster/bluearc/mm7/nib
SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000

# QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzFr1.2005-09-16
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    ssh pk
    cd /cluster/data/mm7/bed/blastzFr1.2005-09-16
    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -stop=load \
	`pwd`/DEF > thruLoad.out 2>&1 &
    #	With 100,000,000 and 400,000,000 chunk sizes, makes only 56 jobs
    #	for the blastz run
    #	STARTED - 2005-09-16 16:21
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r
    #	real    708m20.607s
    #	user    0m0.106s
    #	sys     0m0.107s
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -continue=net -stop=load \
	`pwd`/DEF > netThruLoad.out 2>&1 &
    #	real    11m37.061s
    #	user    0m0.054s
    #	sys     0m0.052s
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -continue=download \
	`pwd`/DEF > download.clean.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -swap \
	`pwd`/DEF > swap.out 2>&1 &


    #	measurements
    featureBits mm7 chainFr1Link
    #	49240999 bases of 2583394090 (1.906%) in intersection
    featureBits mm6 chainFr1Link
    #   55355465 bases of 2597150411 (2.131%) in intersection
    featureBits mm7 netFr1
    #	611679163 bases of 2583394090 (23.677%) in intersection
    featureBits mm6 netFr1
    #   618129802 bases of 2597150411 (23.800%) in intersection
    featureBits mm7 chainFr1
    #	659036967 bases of 2583394090 (25.511%) in intersection
    featureBits mm6 chainFr1
    #   666835089 bases of 2597150411 (25.676%) in intersection

    featureBits fr1 chainMm7Link
    #	42546014 bases of 315518167 (13.484%) in intersection
    featureBits fr1 chainMm6Link
    #   46266090 bases of 315518167 (14.664%) in intersection
    featureBits fr1 netMm7
    #	143256422 bases of 315518167 (45.404%) in intersection
    featureBits fr1 netMm6
    #   146828640 bases of 315518167 (46.536%) in intersection
    featureBits fr1 chainMm7
    #	155565064 bases of 315518167 (49.305%) in intersection
    featureBits fr1 chainMm6
    #   160874127 bases of 315518167 (50.987%) in intersection

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzFr1.2005-09-16
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzFr1.2005-09-16

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    #	real    403m27.362s
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzFr1.2005-09-16
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainFr1Link > fb.mm7.chainFr1Link.rescore 2>&1
    cat fb.mm7.chainFr1Link.rescore
    #	49240999 bases of 2583394090 (1.906%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits fr1 \
	chainMm7Link > fb.fr1.chainMm7Link.rescore 2>&1
    cat fb.fr1.chainMm7Link.rescore
    #	42546014 bases of 315518167 (13.484%) in intersection

#########################################################################
# BLASTZ TETRAODON tetNig1 (DONE - 2005-09-20 - 2005-10-11 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzTetNig1.2005-09-20
    cd /cluster/data/mm7/bed
    ln -s blastzTetNig1.2005-09-20 blastz.tetNig1
    cd blastzTetNig1.2005-09-20
    # use same parameters as for danRer1-mm5
    #	NOTE: The BLASTZ_Q score matrix should have been the Tuned.gap
    #	one which is recreated below during the re-score
    cat << '_EOF_' > DEF
# mouse (mm7) vs Tetraodon tetNig1
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Reuse parameters from hg16-fr1 and danRer1-hg17.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q

# TARGET: Mouse (mm7)
#	small enough chunk to get reasonable running time
SEQ1_DIR=/scratch/hg/mm7/mm7.2bit
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=5000

# QUERY: Tetraodon tetNig1
#  large enough chunk to do all genome in one piece
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=410000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzTetNig1.2005-09-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r

# Completed: 435 of 435 jobs
# CPU time in finished jobs:    3994116s   66568.60m  1109.48h   46.23d  0.127 y
# IO & Wait Time:                180402s    3006.70m    50.11h    2.09d  0.006 y
# Average job time:                9597s     159.94m     2.67h    0.11d
# Longest finished job:           20697s     344.95m     5.75h    0.24d
# Submission to last job:        108662s    1811.03m    30.18h    1.26d

# Completed: 435 of 435 jobs
# CPU time in finished jobs:         16s       0.27m     0.00h    0.00d  0.000 y
# IO & Wait Time:                  1403s      23.38m     0.39h    0.02d  0.000 y
# Average job time:                   3s       0.05m     0.00h    0.00d
# Longest finished job:               7s       0.12m     0.00h    0.00d
# Submission to last job:            92s       1.53m     0.03h    0.00d

# Completed: 183 of 183 jobs
# CPU time in finished jobs:       2603s      43.38m     0.72h    0.03d  0.000 y
# IO & Wait Time:                  1175s      19.59m     0.33h    0.01d  0.000 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest finished job:             321s       5.35m     0.09h    0.00d
# Submission to last job:        232082s    3868.03m    64.47h    2.69d

    #	The chaining run broke on empty result files; finished that off
    #	manually, then continued (mistakenly overwrote blastz.out here)
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue chainMerge -bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &

    #	swap results to place mm7 alignments onto TetNig1
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -fileServer kolossus \
	`pwd`/DEF > swap.out 2>&1 &

    #	Measurements:
    cat << '_EOF_' > measurements.sh
#!/bin/bash
# Print featureBits coverage for the tetNig1 chain/net tables, formatted so
# the output can be pasted into this make doc as command + "#<tab>result".
# bash is named explicitly because the 'time' keyword used below is a bash
# feature, not something a plain POSIX /bin/sh is required to provide.

# doOne <db> <table>
#	echo the featureBits command line, start a "    #<tab>" comment
#	prefix, then run (and time) featureBits on that db and table.
doOne()
{
db=$1
tbl=$2
echo "    featureBits $db $tbl"
printf '    #\t'
time featureBits "$db" "$tbl"
}

# mouse side: mm7 next to the mm6 and mm5 tracks for comparison
doOne mm7 netTetNig1
doOne mm6 netTetNig1
doOne mm5 netTetNig1
doOne mm7 chainTetNig1
doOne mm6 chainTetNig1
doOne mm5 chainTetNig1
doOne mm7 chainTetNig1Link
doOne mm6 chainTetNig1Link
doOne mm5 chainTetNig1Link
# swapped side: the mouse tracks on tetNig1, including the new Mm7 tables
doOne tetNig1 netMm7
doOne tetNig1 netMm6
doOne tetNig1 netMm5
doOne tetNig1 chainMm7
doOne tetNig1 chainMm6
doOne tetNig1 chainMm5
doOne tetNig1 chainMm7Link
doOne tetNig1 chainMm6Link
doOne tetNig1 chainMm5Link
'_EOF_'
    # << keep emacs happy
    chmod +x measurements.sh
    time ./measurements.sh

    featureBits mm7 netTetNig1
    #	705879667 bases of 2583394090 (27.324%) in intersection
    featureBits mm6 netTetNig1
    #   720943295 bases of 2597150411 (27.759%) in intersection
    featureBits mm5 netTetNig1
    #   618111072 bases of 2615483787 (23.633%) in intersection
    featureBits mm7 chainTetNig1
    #	750370420 bases of 2583394090 (29.046%) in intersection
    featureBits mm6 chainTetNig1
    #   771732145 bases of 2597150411 (29.715%) in intersection
    featureBits mm5 chainTetNig1
    #   652622662 bases of 2615483787 (24.952%) in intersection
    featureBits mm7 chainTetNig1Link
    #	51363209 bases of 2583394090 (1.988%) in intersection
    featureBits mm6 chainTetNig1Link
    #   62346107 bases of 2597150411 (2.401%) in intersection
    featureBits mm5 chainTetNig1Link
    #   43905129 bases of 2615483787 (1.679%) in intersection
    featureBits tetNig1 netMm7
    #	160849376 bases of 342403326 (46.977%) in intersection
    featureBits tetNig1 netMm6
    #   176451958 bases of 342403326 (51.533%) in intersection
    featureBits tetNig1 netMm5
    #   152232538 bases of 342403326 (44.460%) in intersection
    featureBits tetNig1 chainMm7
    #	175481308 bases of 342403326 (51.250%) in intersection
    featureBits tetNig1 chainMm6
    #   197657323 bases of 342403326 (57.726%) in intersection
    featureBits tetNig1 chainMm5
    #   163683179 bases of 342403326 (47.804%) in intersection
    featureBits tetNig1 chainMm7Link
    #	47408068 bases of 342403326 (13.846%) in intersection
    featureBits tetNig1 chainMm6Link
    #   55282376 bases of 342403326 (16.145%) in intersection
    featureBits tetNig1 chainMm5Link
    #   41736750 bases of 342403326 (12.189%) in intersection

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzTetNig1.2005-09-20

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    #	real    155m15.610s
    #	The swap run gets its own log file so that the rescoreChain.out
    #	log written by the chainRun step above is not overwritten.
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 &
 
    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzTetNig1.2005-09-20
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainTetNig1Link > fb.mm7.chainTetNig1Link.rescore 2>&1
    cat fb.mm7.chainTetNig1Link.rescore
    #	50521413 bases of 2583394090 (1.956%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits tetNig1 \
	chainMm7Link > fb.tetNig1.chainMm7Link.rescore 2>&1
    cat fb.tetNig1.chainMm7Link.rescore
    #	46775251 bases of 342403326 (13.661%) in intersection


#########################################################################
# CPGISLANDS (DONE - 2005-09-20 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm7/bed/cpgIsland
    cd /cluster/data/mm7/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    mv cpglh.exe ../..
    
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    ssh kkstore02
    cd /cluster/data/mm7/bed/cpgIsland
    #	Run cpglh.exe on each masked chromosome fasta matched by the glob,
    #	writing one <chrom>.cpg file per input.  Everything the loop prints
    #	(the echoed command lines and any stderr output) goes to cpglh.out,
    #	and the whole loop runs in the background.
    for F in ../../*/chr*.fa.masked
    do
	#	an unmatched glob is left as the literal pattern; skip it
	#	rather than creating a bogus "chr*.cpg"
	[ -e "${F}" ] || continue
	#	FA: file name without its directory; C: name without .fa.masked
	FA=${F##*/}
	C=${FA%.fa.masked}
	echo "./cpglh.exe ${FA} > ${C}.cpg"
	./cpglh.exe "${F}" > "${C}.cpg"
    done > cpglh.out 2>&1 &

    #	Bad char 0x52 = 'R' at line 164245, base 8212187, sequence chr14
    #	Bad char 0x53 = 'S' at line 167424, base 8371114, sequence chr14
    #	Bad char 0x53 = 'S' at line 167426, base 8371198, sequence chr14
    #	Several chroms have 0 results:
    #	-rw-rw-r--  1     0 Sep 20 11:52 chrM.cpg
    #	-rw-rw-r--  1     0 Sep 20 11:52 chrY.cpg
    #	-rw-rw-r--  1     0 Sep 20 11:52 chrY_random.cpg

# XXX - it is interesting that neither chrY nor chrY_random has any results.
#	The previous mm5 release did have some on chrY.
#	Evidently the new chrY is too short - this chrY is being
#	reconstructed and only a small part of it is known in this
#	assembly.  The bulk of chrY from previous assemblies is now in
#	chrY_random.

    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
# Rewrite each input line as one tab-separated row of ten columns.
# Column 2 is decremented by one before anything else is computed, so the
# span length below is measured from the decremented start.
{
    $2 -= 1;
    len = $3 - $2;
    label = $5 " " $6;
    scaled = len * $7 * 0.01;
    perLen = 100.0 * 2 * $6 / len;
    printf("%s\t%d\t%s\t%s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
	   $1, $2, $3, label, len, $6, scaled, perLen, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/mm7/bed/cpgIsland
    hgLoadBed -strict mm7 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 16030 elements of size 10
    #	Sorted
    #	Saving bed.tab
    #	Loading mm7
    featureBits mm7 cpgIslandExt
    #	10439328 bases of 2583394090 (0.404%) in intersection
    featureBits mm6 cpgIslandExt
    #	10432360 bases of 2597150411 (0.402%) in intersection
    featureBits mm5 cpgIslandExt
    #	10422989 bases of 2615483787 (0.399%) in intersection
    featureBits mm4 cpgIsland
    #	11109692 bases of 2627444668 (0.423%) in intersection
    featureBits mm3 cpgIsland
    #	10102968 bases of 2505900260 (0.403%) in intersection

#########################################################################
# ANDY LAW CPGISLANDS (DONE - 2005-09-20 - Hiram)
    # See notes in makeGalGal2.doc and makeCanFam2.doc
    ssh kkstore02
    mkdir /cluster/data/mm7/bed/cpgIslandGgfAndy
    cd /cluster/data/mm7/bed/cpgIslandGgfAndy

    #	Build the preProcGgfAndy program in
    #	kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE

    # Use masked sequence since this is a mammal...
    #	For each masked chromosome fasta: pipe it through preProcGgfAndy and
    #	ggf-andy-cpg-island.pl, then reformat each result line with perl
    #	into ten tab-separated columns.  The combined output of all
    #	chromosomes is sorted by column 1, then numerically by column 2,
    #	into cpgIslandGgfAndyMasked.bed.
    for F in ../../*/chr*.fa.masked
    do
	#	FA: file name with the directory part removed
	#	C:  FA with ".fa.masked" removed; used as the first output column
	FA=${F/*\/}
	C=${FA/.fa.masked/}
	#	progress message goes to stderr so it stays out of the bed file
	echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
	#	The perl step splits each line on tabs into
	#	(s, e, cpg, n, c, g1, oE), decrements s by one, and computes
	#	gc = c + g1, pCpG = 100*2*cpg/n and pGc = 100*gc/n.  The shell
	#	value of ${C} is spliced into the perl code by closing and
	#	reopening the single-quoted string around it.
	~/bin/$MACHTYPE/preProcGgfAndy ${F} \
	| /cluster/home/angie/ggf-andy-cpg-island.pl \
	| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
                   $gc=$c+$g1;  $pCpG=(100.0 * 2 * $cpg / $n);
                   $pGc=(100.0 * $gc / $n);
                   $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
                        "$pCpG\t$pGc\t$oE\n";'
    done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/mm7/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed mm7 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
    #	Loaded 67616 elements of size 10
    featureBits mm7 cpgIslandExt
    #	10439328 bases of 2583394090 (0.404%) in intersection
    featureBits mm7 cpgIslandGgfAndyMasked
    #	38774242 bases of 2583394090 (1.501%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
    #	16030 ../cpgIsland/cpgIsland.bed
    #	67616 cpgIslandGgfAndyMasked.bed

#########################################################################
# BLASTZ Cow bosTau2 (DONE - 2005-09-20 - 2005-11-22 - Hiram)
    ssh kkstore02
    #	There is no need to use the chrBin0 in this alignment
    #	Create a bosTau2 2bit file without the chrBin0 sequence
    mkdir /cluster/data/bosTau2/noBin0
    cd /cluster/data/bosTau2/noBin0
    twoBitInfo ../bosTau2.2bit stdout | grep -v chrBin0 \
        | awk '{print $1}' > chrList
    twoBitToFa ../bosTau2.2bit stdout -seqList=chrList \
	| faToTwoBit stdin bosTau2.noBin0.2bit
    twoBitInfo bosTau2.noBin0.2bit noBin0.sizes
    rm chrList

    mkdir /cluster/data/mm7/bed/blastzBosTau2.2005-09-20
    cd /cluster/data/mm7/bed
    ln -s blastzBosTau2.2005-09-20 blastz.bosTau2
    cd blastzBosTau2.2005-09-20

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.x86_64
BLASTZ_M=50

# mouse vs. cow

# TARGET: Mouse (mm7)
#	small enough chunk to get reasonable running time
SEQ1_DIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit
SEQ1_CTGDIR=/cluster/bluearc/mm7/mm7.2bit
SEQ1_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft
SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ1_CTGLEN=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.sizes
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=5000

# QUERY: Cow bosTau2
#  large enough chunk to do all the genome in one piece
SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau2/noBin0.sizes
SEQ2_CHUNK=3200000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzBosTau2.2005-09-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << keep emacs coloring happy

    #	establish a screen to control this job
    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-continue=cat -bigClusterHub=pk \
	`pwd`/DEF > continue.cat.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-continue=chainMerge -bigClusterHub=pk \
	`pwd`/DEF > continue.chainMerge.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-swap -bigClusterHub=pk \
	`pwd`/DEF > swap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kkstore02
    screen -d -r
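    #	NOTE(review): the notes from here through "looks good, done."
    #	appear to be carried over from an earlier bosTau1 alignment
    #	write-up and not from this bosTau2 run: the dates (2005-03) precede
    #	the mm7 sequence download (2005-08), the paths name
    #	blastzBosTau1.2005_03_18, and the "mm7" featureBits denominators
    #	(2597150411) are the mm6 size used elsewhere in this file.
    #	The bosTau2 numbers are in the re-scoring measurements below.
    #	STARTED - 2005-03-18 13:20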
    #	BROKEN - 2005-03-20 - 22:03 - power failure to all machines
    #	RESTARTED - 2005-03-30 14:35
    #	After several reruns of the batch, believe it may be finished.
    #	establish check point marker in the run.time file:
    para time > run.time
    #	Now to the rest of the story:
    ssh eieio
    cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-continue cat `pwd`/DEF > cat.run.out 2>&1 &
# Completed: 40 of 40 jobs
# CPU time in finished jobs:        834s      13.90m     0.23h    0.01d  0.000 y
# IO & Wait Time:                  2421s      40.35m     0.67h    0.03d  0.000 y
# Average job time:                  81s       1.36m     0.02h    0.00d
# Longest job:                      334s       5.57m     0.09h    0.00d
# Submission to last job:           365s       6.08m     0.10h    0.00d

    #	measurements:
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18
    time ../../jkStuff/netChainCheck.sh mm7 mm5 bosTau1 >measurements.out 2>&1 &
    featureBits mm7 netBosTau1
    #   1483158691 bases of 2597150411 (57.107%) in intersection
    featureBits mm5 netBosTau1
    #   1491250043 bases of 2615483787 (57.016%) in intersection
    featureBits mm7 chainBosTau1
    #   1551920940 bases of 2597150411 (59.755%) in intersection
    featureBits mm5 chainBosTau1
    #   1557897465 bases of 2615483787 (59.564%) in intersection
    featureBits mm7 chainBosTau1Link
    #   603091864 bases of 2597150411 (23.221%) in intersection
    featureBits mm5 chainBosTau1Link
    #   606973993 bases of 2615483787 (23.207%) in intersection

    #	Looking OK, so do the swap
    ssh eieio
    cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-swap `pwd`/DEF > swap.run.out 2>&1 &
    #	308 m = 5h 8m
    #	failed on kolossus due to NFS problems
    ssh kolossus
    cd /cluster/data/bosTau1/bed/blastz.mm7.swap/axtChain
    #	extract the unfinished portion of netChains.csh into
    #	finiChains.csh and run it:
    time ./finiChains.csh
    #	STARTED - 2005-04-06
    #	13h 50m
    #	continuing
    ssh eieio
    cd /cluster/data/mm7/bed/blastzBosTau1.2005_03_18
    time /cluster/bin/scripts/doBlastzChainNet.pl -fileServer eieio \
	-swap -continue load `pwd`/DEF > load.run.out 2>&1 &
    #	5h 6min load time
    #	checking measurements:
    featureBits bosTau1 netMm6
    #	1317934269 bases of 2261116798 (58.287%) in intersection
    featureBits bosTau1 netMm5
    #   1317539731 bases of 2261116798 (58.269%) in intersection
    featureBits bosTau1 chainMm6
    # 1325743373 bases of 2261116798 (58.632%) in intersection
    featureBits bosTau1 chainMm5
    #   1325445280 bases of 2261116798 (58.619%) in intersection
    featureBits bosTau1 chainMm6Link
    # 589779558 bases of 2261116798 (26.084%) in intersection
    featureBits bosTau1 chainMm5Link
    #   588460684 bases of 2261116798 (26.025%) in intersection

    #	looks good, done.

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzBosTau2.2005-09-20
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzBosTau2.2005-09-20

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    #	Work in the bosTau2 directory created and re-scored above
    #	(2005-09-20); the featureBits results are saved there in
    #	fb.<db>.<table>.rescore files and then displayed.
    cd /cluster/data/mm7/bed/blastzBosTau2.2005-09-20
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainBosTau2Link > fb.mm7.chainBosTau2Link.rescore 2>&1
    cat fb.mm7.chainBosTau2Link.rescore
    #	692466702 bases of 2583394090 (26.805%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits bosTau2 \
	chainMm7Link > fb.bosTau2.chainMm7Link.rescore 2>&1
    cat fb.bosTau2.chainMm7Link.rescore
    #	678966583 bases of 2812203870 (24.144%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainSwapDownload.out 2>&1 &

#############################################################################
# BLASTZ CHICKEN galGal2 - second time (DONE - 2005-11-14 - 2005-11-21 - Hiram)
    #	After fixing a bug in the lineage specific repeat snip business
    #	in blastz-run-ucsc script
    ssh kk
    mkdir /cluster/data/mm7/bed/blastzGalGal2.2005-11-14
    cd /cluster/data/mm7/bed
    rm blastz.galGal2
    ln -s blastzGalGal2.2005-11-14 blastz.galGal2
    cd blastzGalGal2.2005-11-14

    #	Write the DEF settings file for the mouse vs. chicken run.
    #	"export PATH=..." is kept on a single line: a bare "export" on its
    #	own line only prints the exported environment.
    cat << '_EOF_' > DEF
# mouse vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin:/cluster/home/angie/schwartzbin

BLASTZ=blastz.v7

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse (mm7)
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInChicken
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzGalGal2.2005-11-14
TMPDIR=/scratch/tmp
'_EOF_'
    # happy emacs

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk \
	-stop=net \
	`pwd`/DEF > blastz-to-net.out 2>&1 &
    #	Started 2005-11-14 15:03
# Completed: 16524 of 16524 jobs
# CPU time in finished jobs:   10883816s  181396.93m  3023.28h  125.97d  0.345 y
# IO & Wait Time:                259868s    4331.13m    72.19h    3.01d  0.008 y
# Average job time:                 674s      11.24m     0.19h    0.01d
# Longest finished job:           10058s     167.63m     2.79h    0.12d
# Submission to last job:         43463s     724.38m    12.07h    0.50d

# Completed: 306 of 306 jobs
# CPU time in finished jobs:         33s       0.55m     0.01h    0.00d  0.000 y
# IO & Wait Time:                  1244s      20.73m     0.35h    0.01d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest finished job:              11s       0.18m     0.00h    0.00d
# Submission to last job:            82s       1.37m     0.02h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:        976s      16.27m     0.27h    0.01d  0.000 y
# IO & Wait Time:                   229s       3.81m     0.06h    0.00d  0.000 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest finished job:              82s       1.37m     0.02h    0.00d
# Submission to last job:           105s       1.75m     0.03h    0.00d
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk \
	-continue=net -stop=load \
	`pwd`/DEF > net-to-load.out 2>&1 &

    #	measurements are looking good:
    featureBits mm7 chainGalGal2Link
    #	81103709 bases of 2583394090 (3.139%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk \
	-swap -stop=load \
	`pwd`/DEF > swap-to-load.out 2>&1 &

    #	72202388 bases of 1054197620 (6.849%) in intersection

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzGalGal2.2005-11-14
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzGalGal2.2005-11-14

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    #	real    72m11.827s

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &
    #	real    13m42.163s

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=download \
	`pwd`/DEF > rescoreDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzGalGal2.2005-11-14
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainGalGal2Link > fb.mm7.chainGalGal2Link.rescore 2>&1
    cat fb.mm7.chainGalGal2Link.rescore
    #	79325073 bases of 2583394090 (3.071%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits galGal2 \
	chainMm7Link > fb.galGal2.chainMm7Link.rescore 2>&1
    cat fb.galGal2.chainMm7Link.rescore
    #	70793655 bases of 1054197620 (6.715%) in intersection

#############################################################################
# BLASTZ CHICKEN - (DONE - 2005-10-03 - Hiram)

    # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN
    # In an email 2/13/04, Arian said we could treat all human repeats as 
    # lineage-specific for human-chicken blastz.  Do the same for mouse.  
    
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzGalGal2.2005-09-29
    cd /cluster/data/mm7/bed/
    ln -s blastzGalGal2.2005-09-29 blastz.galGal2
    cd blastzGalGal2.2005-09-29

    cat << '_EOF_' > DEF
# mouse vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse (mm7)
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInChicken
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal2
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzGalGal2.2005-09-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    #	Probably should have done this with chainMinScore=5000
    #	But it doesn't seem to have made much difference according to the
    #	featureBits output measurements.
    ssh pk
    screen
    cd /cluster/data/mm7/bed/blastzGalGal2.2005-09-29
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r
# Completed: 24480 of 24480 jobs
# CPU time in finished jobs:    4362447s   72707.44m  1211.79h   50.49d  0.138 y
# IO & Wait Time:                 80948s    1349.14m    22.49h    0.94d  0.003 y
# Average job time:                 182s       3.03m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             645s      10.75m     0.18h    0.01d
# Submission to last job:         13853s     230.88m     3.85h    0.16d

# Completed: 306 of 306 jobs
# CPU time in finished jobs:         34s       0.56m     0.01h    0.00d  0.000 y
# IO & Wait Time:                  1060s      17.67m     0.29h    0.01d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:               7s       0.12m     0.00h    0.00d
# Submission to last job:            72s       1.20m     0.02h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:        935s      15.58m     0.26h    0.01d  0.000 y
# IO & Wait Time:                   182s       3.04m     0.05h    0.00d  0.000 y
# Average job time:                  28s       0.47m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              82s       1.37m     0.02h    0.00d
# Submission to last job:           104s       1.73m     0.03h    0.00d


    #	measurements are looking good:
    featureBits mm7 chainGalGal2Link
    #	80992755 bases of 2583394090 (3.135%) in intersection
    featureBits mm6 chainGalGal2Link
    #   82018349 bases of 2597150411 (3.158%) in intersection
    featureBits mm5 chainGalGal2Link
    #   78951466 bases of 2615483787 (3.019%) in intersection

    featureBits mm7 netGalGal2
    #	1946029450 bases of 2583394090 (75.328%) in intersection
    featureBits mm6 netGalGal2
    #   1937053597 bases of 2597150411 (74.584%) in intersection
    featureBits mm5 netGalGal2
    #   1958796258 bases of 2615483787 (74.892%) in intersection

    featureBits mm7 chainGalGal2
    #	1975402545 bases of 2583394090 (76.465%) in intersection
    featureBits mm6 chainGalGal2
    #   1969505681 bases of 2597150411 (75.833%) in intersection
    featureBits mm5 chainGalGal2
    #   1990102297 bases of 2615483787 (76.089%) in intersection

    #	Since those are OK, now do the swap:
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -swap -chainMinScore=5000 \
	`pwd`/DEF > swap.out 2>&1 &
    #	Had a failure during loading, finished it manually, then
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -continue=download -swap -chainMinScore=5000 \
	`pwd`/DEF > swap.download-clean.out 2>&1 &

    #	and measure:
    featureBits galGal2 netMm6
    #   832583709 bases of 1054197620 (78.978%) in intersection
    featureBits galGal2 netMm5
    #   835277984 bases of 1054197620 (79.234%) in intersection
    featureBits galGal2 chainMm6
    #   843746491 bases of 1054197620 (80.037%) in intersection
    featureBits galGal2 chainMm5
    #   846905330 bases of 1054197620 (80.336%) in intersection
    featureBits galGal2 chainMm7Link
    #	72046938 bases of 1054197620 (6.834%) in intersection
    featureBits galGal2 chainMm6Link
    #   72687426 bases of 1054197620 (6.895%) in intersection
    featureBits galGal2 chainMm5Link
    #   70542788 bases of 1054197620 (6.692%) in intersection

#############################################################################
# BLASTZ OPOSSUM second time (WORKING - 2006-01-24 - Hiram)
    #	With opossum sequence now properly repeat masked

    ssh pk
    mkdir /cluster/data/mm7/bed/blastzMonDom2.2006-01-24
    cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24

    cat << '_EOF_' > DEF
# Mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse (mm7)
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom2
SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit
SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzMonDom2.2006-01-24
'_EOF_'
    #	happy emacs

    cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF >blastz.out 2>&1
    #	real    570m27.849s

    #	ran into difficulty during install downloads because the
    #	previous results exists.  So, go to hgwdev
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm7
    rm -fr vsMonDom2
    rm liftOver/mm7ToMonDom2.over.chain.gz

    #	then, repeat the downloads step
    cd /cluster/data/mm7/bed/blastzMonDom2.2006-01-24
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=download \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > download.out 2>&1

    #	And a featureBits measurement:
    ssh kolossus
    time featureBits mm7 chainMonDom2Link > fb.mm7.chainMonDom2Link 2>&1 &
    #	real    61m59.651s
    #	215440797 bases of 2583394090 (8.339%) in intersection

    #	before you can do the swap, you need to remove the existing swap
    

#############################################################################
# BLASTZ OPOSSUM (DONE - 2005-10-03 - 2005-11-24 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzMonDom2.2005-10-03
    cd /cluster/data/mm7/bed
    ln -s blastzMonDom2.2005-10-03 blastz.monDom2
    cd blastzMonDom2.2005-10-03

    cat << '_EOF_' > DEF
# mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse (mm7)
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom2
SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit
SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzMonDom2.2005-10-03
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    #	This should have been done with chainMinScore=5000
    #	After this was all finished, went back manually and reran the
    #	chaining step to get a -minScore=5000 in there.

    #	Probably should have done this with chainMinScore=5000
    #	But it doesn't seem to have made much difference according to the
    #	featureBits output measurements.
    ssh pk
    screen
    cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	STARTED 2005-10-03 14:30
    #	real    579m33.438s
    #	user    0m0.064s
    #	sys     0m0.043s
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainMonDom2Link

    ssh pk
    screen -d -r
# Completed: 44370 of 44370 jobs
# CPU time in finished jobs:    7966937s  132782.29m  2213.04h   92.21d  0.253 y
# IO & Wait Time:                195416s    3256.93m    54.28h    2.26d  0.006 y
# Average job time:                 184s       3.07m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6810s     113.50m     1.89h    0.08d
# Submission to last job:         60543s    1009.05m    16.82h    0.70d

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=cat \
	`pwd`/DEF > continue.cat.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=5000 -bigClusterHub=pk \
	-swap -fileServer=kolossus \
	`pwd`/DEF > swap.out 2>&1 &

    #	The load of the swapped chains can not complete, out of memory
    #	So, try to split them up into 1000 lumps:
    ssh kkstore01
    cd /cluster/data/monDom2/bed/blastz.mm7.swap/axtChain
    chainSplit -lump=1000 lump1000 monDom2.mm7.all.chain.gz

    ssh kolossus
    #	Work in the run directory that was created for this monDom2
    #	alignment (blastzMonDom2.2005-10-03).
    cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03

    #	Coverage of mm7 by the monDom2 chain link table, using the
    #	read-only database configuration.
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainMonDom2Link

    #	Measurements from earlier assemblies against monDom1, kept for
    #	comparison.  The mm6 rows are the ones whose output reports a
    #	genome size of 2597150411 bases; the mm5 rows report 2615483787.
    featureBits mm6 netMonDom1
    #   2082064216 bases of 2597150411 (80.167%) in intersection
    featureBits mm5 netMonDom1
    #   2094316044 bases of 2615483787 (80.074%) in intersection
    featureBits mm6 chainMonDom1
    #   2109438148 bases of 2597150411 (81.221%) in intersection
    featureBits mm5 chainMonDom1
    #   2121448151 bases of 2615483787 (81.111%) in intersection
    featureBits mm6 chainMonDom1Link
    #   249576105 bases of 2597150411 (9.610%) in intersection
    featureBits mm5 chainMonDom1Link
    #   248180346 bases of 2615483787 (9.489%) in intersection

    #	measurements:
    time HGDB_CONF=~/.hg.conf.read-only featureBits monDom2 chainMm7Link

    featureBits monDom1 netMm6
    # 2884735370 bases of 3492108230 (82.607%) in intersection
    featureBits monDom1 netMm5
    #   2889580530 bases of 3492108230 (82.746%) in intersection
    featureBits monDom1 chainMm6
    # 2908045004 bases of 3492108230 (83.275%) in intersection
    featureBits monDom1 chainMm5
    #   2913812625 bases of 3492108230 (83.440%) in intersection
    featureBits monDom1 chainMm6Link
    # 253105698 bases of 3492108230 (7.248%) in intersection
    featureBits monDom1 chainMm5Link
    #   249594220 bases of 3492108230 (7.147%) in intersection

    #	looks OK, done

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzMonDom2.2005-10-03

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=net -stop=load \
	`pwd`/DEF > rescoreChainSwap.net-to-load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

##############################################################################
# BLASTZ FROG Xenopus tropicalis (DONE - 2005-10-03 - 2005-11-21 - Hiram)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzXenTro1.2005-10-03
    cd /cluster/data/mm7/bed
    ln -s blastzXenTro1.2005-10-03 blastz.xenTro1
    cd blastzXenTro1.2005-10-03

    cat << '_EOF_' > DEF
# mouse vs. frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Mouse (mm7)
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Frog xenTro1
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzXenTro1.2005-10-03
TMPDIR=/scratch/tmp
'_EOF_'
    # << keep emacs coloring happy

    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 -chainMinScore=5000 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    #	STARTED 2005-10-03 14:30
    #	real    918m58.309s
    #	user    0m0.063s
    #	sys     0m0.056s

    #	Had some trouble, finished the blastz manually, then continuing:
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 -chainMinScore=5000 \
	-bigClusterHub=pk -continue=cat \
	`pwd`/DEF > continue.cat.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kkstore02
    screen -d -r

    #	measurements
    ssh hgwdev
    #	Run from this build's xenTro1 run directory so that measures.out
    #	lands next to the other results and the relative path to
    #	jkStuff resolves under /cluster/data/mm7.
    cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03
    time ../../jkStuff/netChainCheck.sh mm7 mm5 xenTro1 > measures.out 2>&1 &
    featureBits mm7 netXenTro1
    #	1033982128 bases of 2583394090 (40.024%) in intersection
    featureBits mm6 netXenTro1
    #   1033071781 bases of 2597150411 (39.777%) in intersection
    featureBits mm5 netXenTro1
    #   1042210258 bases of 2615483787 (39.848%) in intersection
    featureBits mm7 chainXenTro1
    #	1065396669 bases of 2583394090 (41.240%) in intersection
    featureBits mm6 chainXenTro1
    #   1063392793 bases of 2597150411 (40.945%) in intersection
    featureBits mm5 chainXenTro1
    #   1078618413 bases of 2615483787 (41.240%) in intersection
    featureBits mm7 chainXenTro1Link
    #	62465913 bases of 2583394090 (2.418%) in intersection
    featureBits mm6 chainXenTro1Link
    #   67119684 bases of 2597150411 (2.584%) in intersection
    featureBits mm5 chainXenTro1Link
    #   73115446 bases of 2615483787 (2.795%) in intersection

    #	Those are looking good, now to the swap:
    ssh pk
    cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 -chainMinScore=5000 \
	-swap -bigClusterHub=pk -continue=cat \
	`pwd`/DEF > swap.out 2>&1 &
    #	70 minutes
    #	Measurements:

    featureBits xenTro1 netMm7
    #	681240580 bases of 1381238994 (49.321%) in intersection
    featureBits xenTro1 netMm6
    #   683225633 bases of 1381238994 (49.465%) in intersection
    featureBits xenTro1 netMm5
    #   697384254 bases of 1381238994 (50.490%) in intersection
    featureBits xenTro1 chainMm7
    #	697097012 bases of 1381238994 (50.469%) in intersection
    featureBits xenTro1 chainMm6
    #   700638086 bases of 1381238994 (50.725%) in intersection
    featureBits xenTro1 chainMm5
    #   721494705 bases of 1381238994 (52.235%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits xenTro1 chainMm7Link
    #	59339777 bases of 1381238994 (4.296%) in intersection
    featureBits xenTro1 chainMm6Link
    #   64584213 bases of 1381238994 (4.676%) in intersection
    featureBits xenTro1 chainMm5Link
    #   76415718 bases of 1381238994 (5.532%) in intersection

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzXenTro1.2005-10-03

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    #	real    101m3.953s
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzXenTro1.2005-10-03
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainXenTro1Link > fb.mm7.chainXenTro1Link.rescore 2>&1
    cat fb.mm7.chainXenTro1Link.rescore
    #	62465913 bases of 2583394090 (2.418%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits xenTro1 \
	chainMm7Link > fb.xenTro1.chainMm7Link.rescore 2>&1
    cat fb.xenTro1.chainMm7Link.rescore
    #	59339777 bases of 1381238994 (4.296%) in intersection
    
#############################################################################
# BLASTZ CHIMP PanTro1 (DONE - 2005-10-04 - 2005-10-20 - Hiram)
#	This sequence didn't function correctly in the multiple
#	alignment.  The blastz run was done a second time.
    ssh kkstore03
    #	Create the composite chrom/contigs from randoms 2bit file
    #	for PanTro1
    mkdir /cluster/data/panTro1/randomContigs
    cd /cluster/data/panTro1/randomContigs
    cp -p /cluster/data/cb2/scripts/agpToLift.pl ../jkStuff
    #	For every chimp *_random AGP file: derive the chromosome name by
    #	dropping the directory, the "ptr" prefix and the ".agp" suffix,
    #	rewrite the leading "ptr" of each AGP line to "chr", and feed that
    #	to agpToLift.pl to write chr<name>.lft.  lft2BitToFa.pl is then
    #	given the panTro1 2bit and that lift file and its output is saved
    #	as chr<name>.ctg.fa (presumably the per-contig sequence -- the
    #	script itself is not shown here).
    for AGP in ../pan_troglodytes_agp/ptr*_random.agp
    do
	CHR=$(basename ${AGP})
	CHR=${CHR#ptr}
	CHR=${CHR/.agp}
	echo ${CHR}
	sed -e "s/^ptr/chr/" ${AGP} \
	    | ../jkStuff/agpToLift.pl /dev/stdin > chr${CHR}.lft
	/cluster/data/mm7/jkStuff/lft2BitToFa.pl \
	    ../panTro1.2bit chr${CHR}.lft > chr${CHR}.ctg.fa
    done
    cat chr*.lft > ../jkStuff/panTro1Chroms_RandomContigs.lft

    #	Check that the sequence remains the same
    cd /cluster/data/panTro1
    faSize randomContigs/*.ctg.fa
    #	371582039 bases (50043664 N's 321538375 real 158541165
    #	upper 162997210 lower) in 31264 sequences in 26 files
    faSize */chr*_random.fa
    #	1336382039 bases (1014843664 N's 321538375 real 158541165
    #	upper 162997210 lower) in 26 sequences in 26 files
    #	Note, only the N's are different
    faToTwoBit ?/chr?.fa ??/chr??.fa \
	randomContigs/chr*.ctg.fa panTro1Chroms_RandomContigs.2bit
    #	Verify sequence isn't broken:
    twoBitToFa panTro1Chroms_RandomContigs.2bit stdout | faSize stdin
    #	3455575440 bases (721627263 N's 2733948177 real 1461884170
    #	upper 1272064007 lower) in 31290 sequences in 1 files

    twoBitToFa panTro1.2bit stdout | faSize stdin
    # 4420375440 bases (1686427263 N's 2733948177 real 1461884170
    #	upper 1272064007 lower) in 52 sequences in 1 files
    #	Only difference is the N count. 1686427263 - 721627263 = 964800000

#	XXX something is broken for the chrUn lift file:
faSize chrUn_random.fa
#	240967748 bases (159721331 Ns 81246417 real 36386260 upper 44860157 lower)
#	in 1 sequences in 1 files
#	The agp file for chrUn has a contig gap as the last entry in the file:
#	ptrUn_random    240945901       240957748       29919   W       scaffold_9998   1       11848   +
#	ptrUn_random    240957749       240967748       29920   N       10000   contig

    twoBitInfo panTro1Chroms_RandomContigs.2bit \
	panTro1Chroms_RandomContigs.sizes

    ssh pk
    cd /cluster/data/panTro1
    mkdir /san/sanvol1/scratch/panTro1/
    cp -p panTro1.2bit /san/sanvol1/scratch/panTro1
    cp -p chrom.sizes /san/sanvol1/scratch/panTro1
    cp -p panTro1Chroms_RandomContigs.sizes /san/sanvol1/scratch/panTro1
    cp -p panTro1Chroms_RandomContigs.2bit /san/sanvol1/scratch/panTro1
    cp -p jkStuff/panTro1Chroms_RandomContigs.lft /san/sanvol1/scratch/panTro1

    mkdir /cluster/data/mm7/bed/blastzPanTro1.2005-10-05
    cd /cluster/data/mm7/bed
    ln -s blastzPanTro1.2005-10-05 blastz.panTro1
    cd blastzPanTro1.2005-10-05

    #	same parameters as Human alignment, except for the use of the
    #	SMSK linSpecRepeats - in this case, using none.  Should be an
    #	interesting comparison if the lineage specific repeats make much
    #	difference in the result.
    cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin

BLASTZ=blastz.x86_64
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: Mouse (mm7)  small enough chunk to get reasonable run time
SEQ1_DIR=/scratch/hg/mm7/mm7.2bit
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=5000


# QUERY: Chimp panTro1 large enough chunk to go all at once
SEQ2_DIR=/san/sanvol1/scratch/panTro1/panTro1.2bit
SEQ2_LEN=/san/sanvol1/scratch/panTro1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/panTro1/panTro1Chroms_RandomContigs.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/panTro1/panTro1Chroms_RandomContigs.sizes
SEQ2_LIFT=/san/sanvol1/scratch/panTro1/panTro1Chroms_RandomContigs.lft
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=4421000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzPanTro1.2005-10-05
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    screen
    time $HOME/bin/scripts/doBlastzChainNet.pl \
	-bigClusterHub=pk \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-10-07 12:48

    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-continue=cat -bigClusterHub=pk \
	`pwd`/DEF > continue.cat.out 2>&1 &

    #	Had some problems due to chrom.sizes being incorrect because all
    #	of the *_random chroms *end* with a gap !  Manually edited the
    #	chrom.sizes file in /san/sanvol1/ to take off 50000 from the
    #	length (or 10000 for the chrUn_random) to get the nets to
    #	complete.
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-continue=chainMerge -bigClusterHub=pk \
	`pwd`/DEF > continue.chainMerge.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=net -bigClusterHub=pk \
	`pwd`/DEF > continue.net.out 2>&1 &

    #	check feature bits
    ssh kolossus
    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainPanTro1Link
    #	917177798 bases of 2583394090 (35.503%) in intersection

    #	Looks OK, now to swap
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -fileServer=kolossus \
	`pwd`/DEF > swap.out 2>&1 &
    #	Had some difficulties with the netting - after fixing, then load:
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=load -swap -bigClusterHub=kk \
	`pwd`/DEF > load.swap.out 2>&1 &

    #	489 minutes = 8h 09m
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kkstore02
    screen -d -r
# Completed: 155570 of 155570 jobs
# CPU time in finished jobs:   14707939s  245132.32m  4085.54h  170.23d  0.466 y
# IO & Wait Time:                609798s   10163.29m   169.39h    7.06d  0.019 y
# Average job time:                  98s       1.64m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5146s      85.77m     1.43h    0.06d
# Submission to last job:         20972s     349.53m     5.83h    0.24d

# Completed: 331 of 331 jobs
# CPU time in finished jobs:        260s       4.33m     0.07h    0.00d  0.000 y
# IO & Wait Time:                  1135s      18.92m     0.32h    0.01d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              16s       0.27m     0.00h    0.00d
# Submission to last job:           234s       3.90m     0.07h    0.00d

# Completed: 40 of 40 jobs
# CPU time in finished jobs:       7229s     120.48m     2.01h    0.08d  0.000 y
# IO & Wait Time:                   207s       3.46m     0.06h    0.00d  0.000 y
# Average job time:                 186s       3.10m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             597s       9.95m     0.17h    0.01d
# Submission to last job:          1287s      21.45m     0.36h    0.01d

    #	NOTE(review): every measurement recorded below reports a genome
    #	size of 2597150411 bases, which is the size this file shows for
    #	mm6; the mm7 measurement of chainPanTro1Link recorded earlier in
    #	this section reports 2583394090.  These rows, and the 2005_04_08
    #	directory name, look carried over from the mm6 build notes rather
    #	than measured on mm7 -- TODO confirm before relying on them.
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastzPanTro1.2005_04_08
    featureBits mm7 netPanTro1
    #   2569701404 bases of 2597150411 (98.943%) in intersection
    time featureBits mm7 netHg17
    #	2579747741 bases of 2597150411 (99.330%) in intersection
    featureBits mm7 chainPanTro1
    #   2585896564 bases of 2597150411 (99.567%) in intersection
    time featureBits mm7 chainHg17
    #	2596946329 bases of 2597150411 (99.992%) in intersection
    featureBits mm6 chainPanTro1Link
    #   924893452 bases of 2597150411 (35.612%) in intersection
    featureBits mm7 chainHg17Link (on kolossus)
    #	966916309 bases of 2597150411 (37.230%) in intersection

    # Looks about correct, now for the swap
    ssh eieio
    cd /cluster/data/mm7/bed/blastzPanTro1.2005_04_08
    time /cluster/bin/scripts/doBlastzChainNet.pl \
	-swap -fileServer eieio `pwd`/DEF > swap.run.out 2>&1 &
    #	107 minutes
    featureBits panTro1 netMm6
    #	3306360710 bases of 2733948177 (120.937%) in intersection
    featureBits panTro1 chainMm6
    #	3363239156 bases of 2733948177 (123.018%) in intersection
    featureBits panTro1 chainMm6Link
    #	922583825 bases of 2733948177 (33.745%) in intersection
    featureBits -countGaps panTro1 netMm6
    #	3306360710 bases of 4420375440 (74.798%) in intersection
    featureBits -countGaps panTro1 netHg16
    #	4015411490 bases of 4420375440 (90.839%) in intersection
    featureBits -countGaps panTro1 chainMm6
    #	3363239156 bases of 4420375440 (76.085%) in intersection
    featureBits -countGaps panTro1 chainHg16
    #	4056193816 bases of 4420375440 (91.761%) in intersection
    #	on kolossus:
    HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainHg16Link
    #	2611490291 bases of 4420375440 (59.078%) in intersection
    HGDB_CONF=~/.hg.conf.read-only featureBits -countGaps panTro1 chainMm6Link
    #	922583825 bases of 4420375440 (20.871%) in intersection

    #	Appears to be reasonable, check the genome-test browser on both
    #	the Mm6 assembly and the PanTro1 assembly to see if the net and
    #	chain tracks appear and are in the proper order.

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzPanTro1.2005-10-05
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzPanTro1.2005-10-05

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &
    
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load -stop=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &
    

#############################################################################
# STS MARKERS DATA DOWNLOAD (DONE - 2005-09-29 - 2005-10-07 - Hiram)
#	Applied a filter to primers.psl - 2005-10-20 - Hiram
    ssh kkstore02
    mkdir -p /cluster/data/mm7/bed/STSmarkers/downloads
    cd /cluster/data/mm7/bed/STSmarkers/downloads
    # these files appear to be new almost every day
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases

    #	The new feature in the .aliases file this time are names with
    #	spaces in them !  This changes our parsing business below,
    #	hopefully the spaces in the names won't cause trouble elsewhere.

    wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*

    # these reports from jax.org appear to be changing daily
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
    wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
    ls -ogrt
#	-r--r--r--  1      676 Mar 11  2004 README
#	-r--r--r--  1   396858 Jan 28  2005 10090.MGI.txt
#	-r--r--r--  1   240688 Mar 16  2005 10090.WI-YAC.txt
#	-r--r--r--  1   390139 Mar 16  2005 10090.WI_MRC_RH.txt
#	-r--r--r--  1   173344 Mar 16  2005 10090.WI-Genetic.txt
#	-rw-rw-r--  1  4540721 Sep 29 02:18 MRK_Dump2.rpt
#	-rw-rw-r--  1  2520760 Sep 29 02:19 PRB_PrimerSeq.rpt
#	-rw-rw-r--  1  4345973 Sep 29 02:19 MRK_Sequence.rpt
#	-r--r--r--  1 13832082 Sep 29 07:20 UniSTS.aliases
#	-r--r--r--  1  4106299 Sep 29 07:21 UniSTS_mouse.sts


    # back to our work area, update the bed file
    #	to do this we need a new UniSTS_mouse.alias file
    # it is created by a combination of information from several
    # of the above files ! AND ! the previous stsInfoMouse.bed file

    cd /cluster/data/mm7/bed/STSmarkers/downloads
    cp -p /cluster/data/mm6/bed/STSmarkers/downloads/*.sh .
    cp -p /cluster/data/mm6/bed/STSmarkers/downloads/*.pl .
    #	There is a line in the fetchAllAliases.sh script that needs to
    #	be updated, it must point to the previous bed file:
    #   BEDFile=/cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed
    #	Next time, this should read:
    #   BEDFile=/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed

    #	This process has been captured in the script:
    #	/cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
    # which uses a couple of perl scripts in that same directory.
    # briefly it is:
    
    # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
    # grep MGI: UniSTS.aliases > MGI.aliases
    # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
    #	stsInfoAliases.txt
    # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
    # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
    #    | sort -n > UniSTS_mouse.alias

    time ./fetchAllAliases.sh
    #	Here is a normal set of errors:
# processing UniSTS_mouse.sts to find aliases
# #       ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
# #       2388
# processing MGI.aliases
# fetching existing aliases from previous stsInfoMouse.bed file
# found 27615 potential errors in /cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed
# to see the errors: grep ERROR stsInfoAliases.txt
# verify those stsInfoMouse.bed aliases with UniSTS.aliases

    # with that, we can create a new stsInfoMouse.bed file:
    #	Update the mm6 directory name here to mm7
    cd /cluster/data/mm7/bed/STSmarkers
    /cluster/store5/mouseMarker/code/updateBed.pl \
	/cluster/data/mm6/bed/STSmarkers/stsInfoMouse.bed \
	downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
	downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
	downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile

    # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
    /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
	
    # copy the stsInfoMouse.bed file from working dir to the marker info storage fold.
    # added 2 new steps by Yontao	
    mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
    cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed

    # comparing to previous
    wc /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
    #	59843   794642  6802825 /cluster/store5/mouseMarker/stsInfoMouse.bed
    #	58980   784786  6690105 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
    #	58493   778055  6524821 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5

    # and from that, create new primer fa, epcr, etc:
    /cluster/store5/mouseMarker/code/luConvertPrimerToFa \
	stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
    # the mouseC.fa file will be empty
    wc mouse?.*
    #      0       0       0 mouseC.fa
    # 300968  300914 6798466 mouseP.fa
    #  33838  169275 2153113 mouseP.info
    # 334806  470189 8951579 total
    #	the equivalent Mm6 files:
    #	     0       0       0 mouseC.fa
    #	293305  293251 6624638 mouseP.fa
    #	 32890  164528 2087271 mouseP.info
    #	326195  457779 8711909 total
    #	the equivalent Mm5 files:
    #	     0       0       0 mouseC.fa
    #	286740  286686 6474893 mouseP.fa
    #	 32232  161234 2044810 mouseP.info
    #	318972  447920 8519703 total

    #	copy the primers over to the san for the kluster run
    ssh pk
    mkdir -p /san/sanvol1/scratch/mm7/fasta
    cd /cluster/data/mm7
    time cp --verbose -p ?/*.fa ??/*.fa /san/sanvol1/scratch/mm7/fasta
    cd /cluster/data/mm7/bed/STSmarkers
    mkdir -p /san/sanvol1/scratch/mm7/STSmarkers
    cp -p mouseP.fa /san/sanvol1/scratch/mm7/STSmarkers
    cp -p mouseP.info /san/sanvol1/scratch/mm7/STSmarkers

    #  CLUSTER RUN FOR THE STS PRIMERS

    mkdir -p /cluster/data/mm7/bed/STSmarkers/primer
    mkdir -p /cluster/data/mm7/bed/STSmarkers/ePCR
    cd /cluster/data/mm7/bed/STSmarkers/primer

    # the mouseP.fa comes from above

    # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 

cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat.2 $(path1) /san/sanvol1/scratch/mm7/STSmarkers/mouseP.fa -ooc=/scratch/hg/h/mouse11.ooc  -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
    # << emacs
    mkdir primers.out

    ls -1S /san/sanvol1/scratch/mm7/fasta/chr*.fa > contig.lst
    gensub2 contig.lst single template jobList
    para create jobList
    para try
    para check
    para push
    #	STARTED - 2005-09-20 14:26 
# Completed: 40 of 40 jobs
# CPU time in finished jobs:     367939s    6132.32m   102.21h    4.26d  0.012 y
# IO & Wait Time:                   739s      12.31m     0.21h    0.01d  0.000 y
# Average job time:                9217s     153.62m     2.56h    0.11d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           42051s     700.85m    11.68h    0.49d
# Submission to last job:         42051s     700.85m    11.68h    0.49d

    # on the file server
    ssh kkstore02
    cd /cluster/data/mm7/bed/STSmarkers/primer
    #	filter alignments for (qEnd-qStart) vs. (tEnd-tStart)    2005-10-20
    #	should not be more than 100 bases different.
    #	This filters out about 1,028,202 alignments, or
    #	17.4% = 100.0 * 1028202 / 5921712
    #	Sort the per-chromosome blat results and keep only the lines
    #	where (field 13 - field 12) and (field 17 - field 16) differ by
    #	less than 100 in either direction.
    pslSort dirs stdout temp primers.out | awk -F"\t" '
{
    spanDiff = ($13 - $12) - ($17 - $16)
    if (spanDiff > -100 && spanDiff < 100)
	print
}
' > primers.psl.100

    rmdir temp

    # a rough comparison with previous results:
    #	(line, word and byte counts from wc; the annotation on the
    #	primers.psl command is a trailing comment so the line stays a
    #	valid command)
    wc primers.psl.100
    #	4893510  102763628  510563575 primers.psl.100
    wc primers.psl  # (unfiltered, Mm7)
    #	5921712 124355891 636898117 primers.psl
    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl
    #	5724127 120206606 615248041
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl
    #	5719969 120119288 590806241
    wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
    #	5745617 120657896 592135728

    # another kluster run for the ePCR
    ssh pk
    cd /cluster/data/mm7/bed/STSmarkers/ePCR
    ls -1S /san/sanvol1/scratch/mm7/fasta/chr*.fa > contig.lst

    #	pick up e-PCR source from
    #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
    #	version 2.3.1 11 Feb 2005
    #	Had to add the following to both re-PCR_main.cpp and
    #	e-PCR_main.cpp to get them to compile on kolossus:
#	// max and min Copied from /usr/include/mysql/my_global.h
#	#define max(a, b)       ((a) >? (b))
#	#define min(a, b)       ((a) <? (b))

    mkdir epcr.out
    cat << '_EOF_' > runPCR.csh
#!/bin/csh -fe
/cluster/bin/x86_64/e-PCR $1 $2 N=1 M=50 W=5 > $3
'_EOF_'
    # << emacs
    chmod +x runPCR.csh

    cat << '_EOF_' > template
#LOOP
./runPCR.csh /san/sanvol1/scratch/mm7//STSmarkers/mouseP.info $(path1) {check out line+ epcr.out/$(num1).epcr}
#ENDLOOP
'_EOF_'
    # << the mouseP.info was created above
    gensub2 contig.lst single template jobList
    para create jobList
    para try
    para check
    para push
    ... etc ...
    # STARTED 2005-09-30 11:45
# Completed: 40 of 40 jobs
# CPU time in finished jobs:      62439s    1040.64m    17.34h    0.72d  0.002 y
# IO & Wait Time:                   180s       3.01m     0.05h    0.00d  0.000 y
# Average job time:                1565s      26.09m     0.43h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            4582s      76.37m     1.27h    0.05d
# Submission to last job:          4608s      76.80m     1.28h    0.05d

    ssh kkstore02
    cd /cluster/data/mm7/bed/STSmarkers/ePCR
    # all those results become all.epcr
    cat epcr.out/*.epcr > all.epcr

    # comparing to previous results:
    wc all.epcr
    #	57709  230836 3185188 all.epcr
    wc /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
    #	55871  223484 3086148 all.epcr
    wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
    #	55677  222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
    wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
    #	74705  298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
    #	Mm4 seems to be out of whack

    cd /cluster/data/mm7/bed/STSmarkers/primer

    /cluster/bin/scripts/filterSTSPrimers \
    -mouse ../stsInfoMouse.bed primers.psl.100 \
        ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat

    #  The output should show an increasing count:
    #	Reading name info
    #	Reading primer info
    #	Processing file
    #	100000
    #	200000
    #	300000
    #	...
    #	5700000
    #	Determining ePCR not found
    #
    wc primers.psl.filter.blat  (after filter applied above to primers.psl)
    #	33986  713706 3637462 primers.psl.filter.blat
    wc primers.psl.filter.blat  (before filter applied above to primers.psl)
    #	34674  728154 3718714 primers.psl.filter.blat
    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
    #	33662  706902 3605847 primers.psl.filter.blat
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
    #	33476  702996 3442402 
    wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl.filter.blat
    #	32729  687309 3331894
    #	It appears Mm4 became sane after the filter

    # create accession_info.rdb
    touch empty_sequence.inf
    /cluster/bin/scripts/compileAccInfo -mouse \
	/cluster/data/mm7 empty_sequence.inf
    # works with errors on missing randoms, etc...:
    # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
    # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
    mv accession_info.rdb accession_info.rdb.tmp
    /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
	accession_info.rdb
    rm accession_info.rdb.tmp
    # comparing results to previous
    #	There is a dramatic decrease in Mm7 - this appears to be due to
    #	the lower numbers of fragments in the agp files.
    #	e.g.:
    [hiram@pk /cluster/data] wc mm6/*/chr*.agp | tail -1
    #	170812 1459570 9240332 total
    [hiram@pk /cluster/data] wc mm7/*/chr*.agp | tail -1
    #	70125  605050 3696903 total

    wc accession_info.rdb
    #	44046  484510 3112816 accession_info.rdb
    wc /cluster/data/mm6/bed/STSmarkers/primer/accession_info.rdb
    #	93052 1023576 6824900 accession_info.rdb
    wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb
    #	131845 1450299 9681940
    wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb
    #	86935  956289 6374930 

    # creates epcr.not.found.nomatch and epcr.not.found.psl
    #	/cluster/bin/scripts/epcrToPsl
    #	Fixed this script (in mm6) to make it not look for contigs in the usual
    #	manner, we don't have those for this assembly	
    sed -e "s/mm6/mm7/g" /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl \
	> ./epcrToPsl
    ./epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm7

    # Comparing results to previous:
    wc epcr*
    #	 474  1896 17400 epcr.not.found
    #	   0     0     0 epcr.not.found.nomatch
    #	 474  9954 47288 epcr.not.found.psl
    #	 158   535  4308 epcrToPsl
    #	1106 12385 68996 total

    # Mm6 wc epcr*
    wc /cluster/data/mm6/bed/STSmarkers/primer/epcr*
    #	 467    1868   17135 epcr.not.found
    #	  63     756    6041 epcr.not.found.nomatch
    #	 404    8484   40254 epcr.not.found.psl
    #	 158     535    4308 epcrToPsl
    #	1092   11643   67738 total

    # Mm5 wc epcr*
    wc /cluster/data/mm5/bed/STSmarkers/primer/epcr*
    #	 463    1852   17080 epcr.not.found
    #	  61     732    5845 epcr.not.found.nomatch
    #	 398    8358   38591 epcr.not.found.psl
    #	 402    8442   39011 epcr.not.found.psl.orig
    #	1324   19384  100527 total

    # Mm4 wc epcr*
    wc /cluster/data/mm4/bed/STSmarkers/primer/epcr*
    #	328    1312   12011 epcr.not.found
    #	 57     684    5474 epcr.not.found.nomatch
    #	266    5586   25711 epcr.not.found.psl
    #	163     552    4370 epcrToPsl
    #	814    8134   47566 total

    cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
    wc primers.psl.filter  (after filter applied above to primers.psl)
    #	34460  723660 3684750 primers.psl.filter
    wc primers.psl.filter  (before filter applied above to primers.psl)
    #	35148  738108 3766002 primers.psl.filter

    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
    #	34066  715386 3646101 primers.psl.filter

    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
    # 33691  707511 3601164 primers.psl.filter.lifted

    # create primers.psl.filter.lifted.initial
    PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \
	primers.psl.filter
    wc primers.psl.filter.initial (after filter applied above to primers.psl)
    #	34443  206658 1833793 primers.psl.filter.initial
    wc primers.psl.filter.initial (before filter applied above to primers.psl)
    #	35131  210786 1870800 primers.psl.filter.initial
    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
    #	34048  204288 1815222 primers.psl.filter.initial
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
    # 33689  202134 1799016 primers.psl.filter.lifted.initial

    # create primers.psl.filter.lifted.initial.acc
    /cluster/bin/scripts/findAccession -agp \
	-mouse primers.psl.filter.initial /cluster/data/mm7
    #	it complains about missing _random items, it is OK
    wc primers.psl.filter.initial.acc (after filter applied above to primers.psl)
    #	34443  241101 2160540 primers.psl.filter.initial.acc
    wc primers.psl.filter.initial.acc (before filter applied above to primers.psl)
    #	35131  245917 2204115 primers.psl.filter.initial.acc
    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial.acc
    #	34048  238336 2154798 primers.psl.filter.initial.acc

    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial.acc
    # 33689  235823 2158029 primers.psl.filter.lifted.initial.acc

    # this needs to be -rat as that specifies how to scan the
    # stsInfoMouse.bed file and it does not work if you use -mouse
    /cluster/bin/scripts/getStsId -rat \
	../stsInfoMouse.bed  primers.psl.filter.initial.acc \
	> primers.initial.acc.trans
    wc primers.initial.acc.trans  (after filter applied above to primers.psl)
    #	34443  241101 1830431 primers.initial.acc.trans
    wc primers.initial.acc.trans  (before filter applied above to primers.psl)
    #	35131  245917 1867180 primers.initial.acc.trans
    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.initial.acc.trans
    wc primers.initial.acc.trans
    #	34041  238287 1829724 primers.initial.acc.trans
    #	I don't see any of these errors in the Mm7 build:
    #	No id for 61645_RH126840
    #	No id for 4187_D10MIT171.2
    #	No id for 63449_RH125771
    #	No id for 67188_PMC99911P4
    #	No id for 8839_D6MIT360.1
    #	No id for 62732_RH126829
    #	No id for 63746_RH127126

    sort -k 4n primers.initial.acc.trans > primers.final
    wc primers.final  (after filter applied above to primers.psl)
    #	34443  241101 1830431 primers.final
    wc primers.final  (before filter applied above to primers.psl)
    #	35131  245917 1867180 primers.final
    wc /cluster/data/mm6/bed/STSmarkers/primer/primers.final 
    #	34041  238287 1829724 primers.final
    wc /cluster/data/mm5/bed/STSmarkers/primer/primers.final 
    #	33689  235823 1834889 /cluster/data/mm5/bed/STSmarkers/primer/primers.final

    rm primers.initial.acc.trans

    cd /cluster/data/mm7/bed/STSmarkers
    # stsMarkers.final is empty for mouse
    touch stsMarkers.final dummy
    PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \
	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
    wc stsMarkers_pos.rdb  (after filter applied to primers.psl above)
    #	32869  230083 1868276 stsMarkers_pos.rdb
    wc stsMarkers_pos.rdb  (before filter applied to primers.psl above)
    #	33421  233947 1903951 stsMarkers_pos.rdb
    wc /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
    #	32350  226450 1909506 stsMarkers_pos.rdb
    wc /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
    # 32085  224595 1862816 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
    wc /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
    # 31270  218890 1869417 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb

    /projects/cc/hg/ytlu/bin/script/perl/createStsBed \
	stsInfoMouse.bed  stsMarkers_pos.rdb 500 > stsMapMouse.bed
    wc stsMapMouse.bed  (after filter applied to primers.psl above)
    #	29842  308123 2135776 stsMapMouse.bed
    wc stsMapMouse.bed  (before filter applied to primers.psl above)
    #	30296  312257 2166725 stsMapMouse.bed
    wc /cluster/data/mm6/bed/STSmarkers/stsMapMouse.bed
    #	29079  301678 2097544 stsMapMouse.bed
    wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
    #	29069  301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed

    #  loading STS markers tables
    ssh hgwdev
    cd /cluster/data/mm7/bed/STSmarkers
    cp -p /cluster/data/mm6/bed/STSmarkers/ucscAlias.pl .
    ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
    #	this does leave messages in ucscStsAlias.warnings but they seem
    #	to be very similar to Mm6 with just a few new ones
     
    wc ucscStsAlias.tab  (after applying filter to primers.psl above)
    #	144570  433667 3366815 ucscStsAlias.tab
    wc ucscStsAlias.tab  (before applying filter to primers.psl above)
    #	144570  433667 3366815 ucscStsAlias.tab
    wc /cluster/data/mm6/bed/STSmarkers/ucscStsAlias.tab
    #	141585  424725 3284106 ucscStsAlias.tab
    wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
    # 126624  379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
     
    hgsql -e "drop table stsAlias;" mm7
    hgsql mm7 < ~/kent/src/hg/lib/stsAlias.sql
    hgsql -e \
	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm7
    hgsql -e "drop table stsMapMouseNew;" mm7
    hgsql mm7 < ~/kent/src/hg/lib/stsMapMouseNew.sql
    hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm7
    hgsql -e "drop table stsInfoMouseNew;" mm7
    hgsql mm7 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
    hgsql -e \
     'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm7

    hgLoadPsl -nobin -table=all_sts_primer mm7 primer/primers.psl.filter
    #	load of all_sts_primer did not go as planned: 34460 record(s),
    #	0 row(s) skipped, 21 warning(s) loading primer/primers.psl.filter

    #	load of all_sts_primer did not go as planned: 35148 record(s), 0
    #	row(s) skipped, 21 warning(s) loading primer/primers.psl.filter

    # load primer sequences	
    mkdir /gbdb/mm7/stsMarker
    ln -s /cluster/data/mm7/bed/STSmarkers/mouseP.fa \
	/gbdb/mm7/stsMarker/mouseP.fa
    # PLEASE NOTE: if you are going to reload this business, use the
    #	-replace option on this hgLoadSeq
    #	hgLoadSeq -replace mm7 /gbdb/mm7/stsMarker/mouseP.fa
    # otherwise there will be a problem that the seq and extFile tables 
    # will be out of sync. 
    hgLoadSeq mm7 /gbdb/mm7/stsMarker/mouseP.fa
    #  Adding /gbdb/mm7/stsMarker/mouseP.fa
    #  33838 sequences

    #	After applying the filter to primers.psl above  (and in mm6 too)
    featureBits mm7 all_sts_primer
    #	3757119 bases of 2583394090 (0.145%) in intersection
    featureBits mm6 all_sts_primer
    #	3706406 bases of 2597150411 (0.143%) in intersection
    featureBits mm7 stsMapMouseNew
    #	4805958 bases of 2583394090 (0.186%) in intersection
    featureBits mm6 stsMapMouseNew
    #	4638338 bases of 2597150411 (0.179%) in intersection

    #	Before applying the filter to primers.psl above
    featureBits mm7 all_sts_primer
    #	3794153 bases of 2583394090 (0.147%) in intersection
    featureBits mm6 all_sts_primer
    #	3735649 bases of 2597150411 (0.144%) in intersection
    featureBits mm5 all_sts_primer
    #	3727268 bases of 2615483787 (0.143%) in intersection
    featureBits mm7 stsMapMouseNew
    #	4917456 bases of 2583394090 (0.190%) in intersection
    featureBits mm6 stsMapMouseNew
    #	4736039 bases of 2597150411 (0.182%) in intersection
    featureBits mm5 stsMapMouseNew
    #	4719679 bases of 2615483787 (0.180%) in intersection

    hgsql -N mm7 -e "select count(*) from stsAlias;"
    #	140649
    hgsql -N mm6 -e "select count(*) from stsAlias;"
    #	137738
    hgsql -N mm5 -e "select count(*) from stsAlias;"
    #	122944
    hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
    #	59843
    hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
    #	58980
    hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
    #	58493

    #	compare old and new name lists:
    awk '{print $4}' /cluster/data/mm6/bed/STSmarkers/stsMapMouse.bed | \
	sort -u > mm6.nameList
    awk '{print $4}' stsMapMouse.bed | sort -u > mm7.nameList
    #	After applying the filter to primers.psl above  (and in mm6 too)
    comm -12 mm?.nameList | wc
    #	27320   27320  265970		<- 27,320 names in common
    comm -23 mm6.nameList mm7.nameList | wc
    #	188     188    1695		<- 188 unique to mm6 list
    comm -13 mm6.nameList mm7.nameList | wc
    #	1107    1107   12626		<- 1,107 unique to mm7 list

    #	Before applying the filter to primers.psl above
    comm -12 mm?.nameList | wc
    #	27658   27658  268827		<- 27,658 names in common
    comm -23 mm6.nameList mm7.nameList | wc
    #	197     197    1781		<- 197 unique to mm6 list
    comm -13 mm6.nameList mm7.nameList | wc
    #	1184    1184   13480		<- 1,184 unique to mm7 list

    #	Mm6 vs Mm5:
    #	27454   27454  266951		<- 27,454 names in common
    #	182     182    1685		<- 182 unique to mm5 list
    #	1625    1625   15090		<- 1,625 unique to mm6 list

#############################################################################
# 17-WAY VAR_MULTIZ - ALIGNMENTS (WORKING - 2005-10-25)
    ssh kkstore02
    mkdir /cluster/data/mm7/bed/multiz17way
    cd /cluster/data/mm7/bed/multiz17way

    #	create tree diagram to guide work below.
    #	This tree was constructed from one that Adam is using for
    #	ENCODE work and a 27-way alignment.  Took that file and
    #	removed some of the entries, adding together the appropriate
    #	distances.

    cat << '_EOF_' > 17way.nh
(((((((((
(human_hg17:0.006690,chimp_panTro1:0.007571):0.024272,
  macaque_rheMac1:0.0592):0.023960,

  ((rat_rn3:0.081728,mouse_mm7:0.077017):0.229273,
      rabbit_oryCun1:0.206767):0.1065):0.023026,

(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,

armadillo_dasNov1:0.149862):0.015994,

(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,

monodelphis_monDom2:0.371073):0.189124,

chicken_galGal2:0.454691):0.123297,

xenopus_xenTro1:0.782453):0.156067,

((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
    zebrafish_danRer3:0.782561):0.156067);
'_EOF_'
    #	happy emacs

    /cluster/bin/phast/draw_tree 17way.nh > 17way.ps
    /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
    grep -y mm7 17way.distances.txt | sort -k3,3n
    #	Print out that file for reference, and use the calculated
    #	distances in the table below to order the organisms and check
    #	the button order on the browser.  Zebrafish ends up before
    #	tetraodon and fugu on the browser despite its distance.
    #	And if you can fill in the table below entirely, you have
    #	succeeded in finishing all the alignments required.
    #
#                         featureBits chainLink measures
#                                           chainMm7Link   chain   linearGap
#    distance                       on Mm7      on other   minScore
#  1  0.1587 - rat rn3            (% 68.744)  (% 69.289)   3000     medium
#  2  0.4677 - human hg17         (% 38.571)  (% 34.706)   3000     medium
#  3  0.4686 - chimp panTro1      (% 35.100)  (% 32.910)   3000     medium
#  4  0.4960 - macaque rheMac1    (% 33.898)  (% 32.023)   3000     medium
#  5  0.5131 - rabbit oryCun1     (% 19.319)  (% 24.164)   3000     medium
#  6  0.6142 - armadillo dasNov1  (% 16.838)  (% 20.60x)   3000     medium
#  7  0.6230 - dog canFam2        (% 32.266)  (% 34.135)   3000     medium
#  8  0.6256 - elephant loxAfr1   (% 18.381)  (% 20.555)   3000     medium
#  9  0.6344 - cow bosTau2        (% 26.805)  (% 24.144)   3000     medium
# 10  0.7805 - tenrec echTel1     (% 11.421)  (% 14.36x)   5000     loose
# 11  1.0698 - opossum monDom2    (%  8.501)  (%  6.319)   5000     loose
# 12  1.3425 - chicken galGal2    (%  3.071)  (%  6.715)   5000     loose
# 13  1.7936 - frog xenTro1       (%  2.418)  (%  4.296)   5000     loose
# 14  2.0157 - tetraodon tetNig1  (%  1.956)  (% 13.661)   5000     loose
# 15  2.0562 - fugu fr1           (%  1.906)  (% 13.484)   5000     loose
# 16  2.1059 - zebrafish danRer3  (%  2.694)  (%  4.372)   5000     loose

    #	One mafLinks/<db> directory per aligned species, filled with symlinks
    #	to that species' net maf files.  Stops at the first species whose
    #	blastz.<db>/mafNet directory does not exist.
    export H=/cluster/data/mm7/bed
    mkdir mafLinks
    speciesList="rn3 hg17 panTro1 rheMac1 oryCun1 dasNov1 canFam2 \
	loxAfr1 bosTau2 echTel1 monDom2 galGal2 xenTro1 tetNig1 fr1 danRer3"
    for G in ${speciesList}
    do
	mkdir "mafLinks/${G}"
	netDir="${H}/blastz.${G}/mafNet"
	if [ ! -d "${netDir}" ]; then
	    echo "missing directory blastz.${G}/mafNet"
	    exit 255
	fi
	ln -s "${netDir}"/*.maf.gz "./mafLinks/${G}"
    done

    #	Copy MAFs to SAN for kluster run
    ssh pk
    mkdir /san/sanvol1/scratch/mm7/multiz17way
    cd /san/sanvol1/scratch/mm7/multiz17way
    rsync -a --copy-links --progress \
	/cluster/data/mm7/bed/multiz17way/mafLinks/ .

    #	We have about 5.9 Gb of data here, takes ~ 10 minutes to copy

    mkdir penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn

#       Progressive alignment up the tree w/o stager, 
#       using multiz.v10.5 (var_multiz)
#       Method: align internal subtrees (using 0 flag to var_multiz)
#               Then, align these to mouse (using 1 flag to var_multiz)
#       NOTE: must use maf_project after each multiz run, in order
#       to order output.  Single-cov guaranteed by use of net MAF's,
#       so it is not necessary to run single_cov2.

    # make output dir and run dir
    ssh pk
    cd /cluster/data/mm7/bed/multiz17way
    mkdir -p maf
    mkdir -p run
    cd run

    # create scripts to run var_multiz on cluster

cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
# oneMultiz.csh - one multiz step of the progressive alignment, for one chrom
#
#   oneMultiz.csh <chrom>                    remove the work directory $multi
#   oneMultiz.csh <chrom> <dir> <outFile>    copy $multi/<dir>/<chrom>.maf
#                                            to <outFile>
#   oneMultiz.csh <chrom> <s1> <s2> <flag>   align <s1> with <s2>; result is
#                                            $multi/<s1><s2>/<chrom>.maf
#
# <s1> and <s2> each name either a directory of gzipped pairwise mafs under
# $pairs, or the output directory of an earlier step under $multi.
# <flag> is handed to multiz unchanged.
    set c = $1
    set multi = /scratch/mm7/multiz17way.$c
    set pairs = /san/sanvol1/scratch/mm7/multiz17way

    # special mode --
    # with 1 arg, cleanup
    if ($#argv == 1) then
        rm -fr $multi
        exit
    endif

    # special mode --
    # with 3 args, saves an alignment file
    if ($#argv == 3) then
        cp $multi/$2/$c.maf $3
        exit
    endif

    set s1 = $2
    set s2 = $3
    set flag = $4

    # locate input files -- in pairwise dir, or multiple dir
    # either way each input is staged as an uncompressed copy in /tmp
    set d1 = $multi
    set d2 = $multi
    set t1 = /tmp/$s1.$c.maf
    set t2 = /tmp/$s2.$c.maf
    if (-d $pairs/$s1) then
        set d1 = $pairs
	set f1 = $d1/$s1/$c.maf.gz
	zcat $f1 > $t1
    else
	set f1 = $d1/$s1/$c.maf
	cp -p $f1 $t1
    endif
    if (-d $pairs/$s2) then
        set d2 = $pairs
	set f2 = $d2/$s2/$c.maf.gz
	zcat $f2 > $t2
    else
	set f2 = $d2/$s2/$c.maf
	cp -p $f2 $t2
    endif
    # write to output dir
    set out = $multi/${s1}${s2}
    mkdir -p $out

    # check for empty input file
    if (-s $t1 && -s $t2) then
        echo "Aligning $f1 $f2 $flag"
        /san/sanvol1/scratch/mm7/multiz17way/penn/multiz $t1 $t2 $flag \
		$out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
        cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                $out/$c.tmp.maf
        echo "Ordering $c.maf"
        /san/sanvol1/scratch/mm7/multiz17way/penn/maf_project \
	$out/$c.tmp.maf mm7.$c > $out/$c.maf
    else if (-s $t1) then
        # only the first input has content: it becomes the result as is
        cp -p $t1 $out/$c.maf
    else if (-s $t2) then
        # only the second input has content: it becomes the result as is
        cp -p $t2 $out/$c.maf
    endif
    # when both inputs are empty no $out/$c.maf is written

    # remove both staged copies on every path; an empty one would
    # otherwise be left behind in /tmp
    rm -f $t1 $t2
'EOF'
# << keep emacs coloring happy
    chmod +x oneMultiz.csh
    cp -p oneMultiz.csh /san/sanvol1/scratch/mm7/multiz17way/penn/oneMultiz.csh

    #	using your tree diagram printed above, arrange these alignments
    #	in order of the tree branches

# allMultiz.csh <chrom>: runs every oneMultiz.csh step for one chrom, in
# the order of the tree branches.  oneMultiz.csh names each result
# directory by concatenating its two input names, so the names handed to
# later steps must be spelled exactly as the concatenation of the earlier
# ones.  The three argument call copies the final alignment to the maf/
# directory, and the last, one argument, call removes oneMultiz.csh's
# work directory.
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
    # multiple alignment steps:
set c = $1
set s = "/san/sanvol1/scratch/mm7/multiz17way/penn/oneMultiz.csh"

$s $c hg17 panTro1 0
$s $c hg17panTro1 rheMac1 1
$s $c rn3 oryCun1 0
$s $c hg17panTro1rheMac1 rn3oryCun1 1
$s $c canFam2 bosTau2 0
$s $c hg17panTro1rheMac1rn3oryCun1 canFam2bosTau2 1
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2 dasNov1 1
$s $c loxAfr1 echTel1 0
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1 loxAfr1echTel1 1
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1 monDom2 1
$s $c hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2 \
	galGal2 1
$s $c \
hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2galGal2 \
	xenTro1 1
$s $c tetNig1 fr1  0
$s $c tetNig1fr1 danRer3 1
$s $c \
hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2galGal2xenTro1 \
	tetNig1fr1danRer3 1
#	 get final alignment file
$s $c \
hg17panTro1rheMac1rn3oryCun1canFam2bosTau2dasNov1loxAfr1echTel1monDom2galGal2xenTro1tetNig1fr1danRer3 \
	/cluster/data/mm7/bed/multiz17way/maf/$c.maf
#cleanup
$s $c
'EOF'
# << keep emacs coloring happy
    chmod +x allMultiz.csh

cat  << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/mm7/bed/multiz17way/maf/$(root1).maf}
#ENDLOOP
'EOF'

    cd /cluster/data/mm7/bed/multiz17way/run
    awk '{print $1}' ../../../chrom.sizes > chrom.lst

    gensub2 chrom.lst single template jobList
    para create jobList
    para try; para check
    para push

# Completed: 40 of 40 jobs
# CPU time in finished jobs:     460579s    7676.32m   127.94h    5.33d  0.015 y
# IO & Wait Time:                 13669s     227.81m     3.80h    0.16d  0.000 y
# Average job time:               11856s     197.60m     3.29h    0.14d
# Longest finished job:           48345s     805.75m    13.43h    0.56d
# Submission to last job:         48345s     805.75m    13.43h    0.56d

    #	combine results into a single file for loading
    #	This step could be skipped and the catDir output sent directly
    #	into the loader.
    ssh kkstore02
    cd /cluster/data/mm7/bed/multiz17way
    #	There used to be a mafFilter here with a minScore of 500, but it
    #	turns out that the scores in these maf files are pretty much
    #	useless.  They range from very large negatives to very large
    #	positives.
    time catDir maf > multiz17way.maf
    #	real    18m12.267s
    #	makes a 15 Gb file:
    #	-rw-rw-r--    1 15433191843 Nov 23 09:25 multiz17way.maf

    #	Create per-chrom individual maf files for downloads
    #		(DONE - 2005-11-30)
    ssh kkstore02
    cd /cluster/data/mm7/bed/multiz17way
    mkdir mafDownloads
    #	copy each per-chrom maf into mafDownloads/ (cp -p keeps its
    #	timestamp) and gzip the copy, leaving maf/ itself untouched
    for M in maf/chr*.maf; do
	B=$(basename "${M}")
	cp -p "${M}" "mafDownloads/${B}"
	gzip "mafDownloads/${B}"
	echo "${B} done"
    done

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/mm7/multiz17way
    cd /usr/local/apache/htdocs/goldenPath/mm7/multiz17way
    ln -s /cluster/data/mm7/bed/multiz17way/mafDownloads/chr*.maf.gz .

    # Load into database
    ssh hgwdev
    cd /cluster/data/mm7/bed/multiz17way
    mkdir /gbdb/mm7/multiz17way
    ln -s /cluster/data/mm7/bed/multiz17way/multiz17way.maf \
	/gbdb/mm7/multiz17way
    time hgLoadMaf mm7 multiz17way
    #	Loaded 11964967 mafs in 1 files from /gbdb/mm7/multiz17way
    #	real    32m40.228s

    time hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 mm7 \
	multiz17waySummary multiz17way.maf
    #	Processed 66508536 components in 11964967 mafs from multiz17way.maf
    #	real    37m56.218s

    # Dropped unused indexes (2006-05-09 kate)
    # NOTE: this is not required in the future, as the loader
    # has been fixed to not generate these indexes
    hgsql mm7 -e "alter table multiz17waySummary drop index chrom_2"
    hgsql mm7 -e "alter table multiz17waySummary drop index chrom_3"

    # create tree image:
    cat << '_EOF_' > species.nh
((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish))
'_EOF_'
    /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
    # photoshop to enhance, reduce the amount of whitespace to make it
    # smaller, then save as jpg
    cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm7_17way.jpg

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
#		(DONE 2005-11-23 - 2005-12-07 - Hiram)

# Estimate phastCons parameters
    ssh kkstore02
    mkdir /cluster/data/mm7/bed/multiz17way/cons
    cd /cluster/data/mm7/bed/multiz17way/cons

    # Create a starting-tree.mod based on chr2 (the largest one)
    /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \
	--refseq ../../../2/chr2.fa --in-format MAF \
        --windows 100000000,1000 --out-format SS \
        --between-blocks 5000 --out-root s1
    #	10 minutes

    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "((((((((((hg17,panTro1),rheMac1),((rn3,mm7),oryCun1)),(bosTau2,canFam2)),dasNov1),(loxAfr1,echTel1)),monDom2),galGal2),xenTro1),((tetNig1,fr1),danRer3))" \
    --out-root starting-tree
    #	real    840m53.157s
    #	That is 14 hours !

    rm s1.*.ss
    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.407
    #	This 0.407 is used in the --gc argument below

    # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
    ssh kkstore02
    mkdir -p  /san/sanvol1/scratch/mm7/cons/ss
    cd  /san/sanvol1/scratch/mm7/cons/ss
    #	Run msa_split on every chrom that has a non-empty maf, writing
    #	the SS files into a directory named after the chrom.
    for C in $(awk '{print $1}' /cluster/data/mm7/chrom.sizes)
    do
      mafFile=/cluster/data/mm7/bed/multiz17way/maf/${C}.maf
      #	nothing to split when the maf is missing or empty
      [ -s "${mafFile}" ] || continue
      mkdir "${C}"
      echo "msa_split ${C}"
      #	the sequence directory is the chrom name without "chr" and
      #	without "_random", e.g. chr1_random -> 1
      chrN=${C/chr/}
      chrN=${chrN/_random/}
      /cluster/bin/phast/$MACHTYPE/msa_split "${mafFile}" \
	  --refseq "/cluster/data/mm7/${chrN}/${C}.fa" \
	  --in-format MAF --windows 1000000,0 --between-blocks 5000 \
	  --out-format SS --out-root "${C}/${C}"
    done
    #	real    143m42.404s

    # Create a random list of 50 1 mb regions  (do not use the _randoms)
    cd /san/sanvol1/scratch/mm7/cons/ss
    ls -1l chr*/chr*.ss | grep -v random | \
	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list

    # Set up parasol directory to calculate trees on these 50 regions
    ssh pk
    mkdir /san/sanvol1/scratch/mm7/cons/treeRun1
    cd /san/sanvol1/scratch/mm7/cons/treeRun1
    mkdir tree log

    #	Tuning this loop should come back to here to recalculate 
    # Create little script that calls phastCons with right arguments
    #	--target-coverage of 0.20 is about right for mouse, will be
    #	tuned exactly below
    #	makeTree.csh <ss path>: the argument is a path relative to ../ss
    #	(phastCons is given ../ss/$1).  $1:h, the csh "head" modifier,
    #	keeps its directory part, which is used to create the log/ and
    #	tree/ subdirectories that the --log and --estimate-trees outputs
    #	are written under.
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
    /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
      /cluster/data/mm7/bed/multiz17way/cons/starting-tree.mod \
      --gc 0.407 --nrates 1,1 --no-post-probs --ignore-missing \
      --expected-lengths 12 --target-coverage 0.17 \
      --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    #	emacs happy
    chmod a+x makeTree.csh

    # Create gensub file
    cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
XXXX - working 2005-11-28 10:45
# Completed: 50 of 50 jobs
# CPU time in finished jobs:     354644s    5910.74m    98.51h    4.10d  0.011 y
# IO & Wait Time:                   352s       5.86m     0.10h    0.00d  0.000 y
# Average job time:                7100s     118.33m     1.97h    0.08d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           29358s     489.30m     8.15h    0.34d
# Submission to last job:         29446s     490.77m     8.18h    0.34d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kkstore02
    cd /san/sanvol1/scratch/mm7/cons/treeRun1
    #	The conserved models are averaged in the background while the
    #	nonconserved models are averaged in the foreground.
    #	NOTE(review): the leading '*' on --read-mods presumably tells
    #	phyloBoot to read the file names from the list file - confirm
    #	against the phyloBoot usage message.
    ls -1 tree/chr*/*.cons.mod > cons.list
    time /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
	--output-average ../ave.cons.mod > cons_summary.txt 2>&1 &
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
	--output-average ../ave.noncons.mod > noncons_summary.txt
    #	the background phyloBoot must be finished before ave.cons.mod is
    #	copied, otherwise a missing or partial file could be picked up
    wait
    cd ..
    cp -p ave.*.mod /cluster/data/mm7/bed/multiz17way/cons

    #	measuring entropy
    #	consEntropy <target coverage> <expected lengths>
    #		 ave.cons.mod ave.noncons.mod --NH 9.78
    #	never stops with the --NH argument
    /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
                        ave.cons.mod ave.noncons.mod
XXXX - does not work:  2005-11-28
[hiram@kkstore02 /san/sanvol1/scratch/mm7/cons] /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 ave.cons.mod ave.noncons.mod
ERROR: with no separate source alignment, ss_from_msas expects sequences of positive length and no SS object.

#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits

# consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1
# Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
# Relative entropy: H=1.454874 bits/site
# Required length: N=6.629337 sites
# Total entropy: NH=9.644850 bits

# consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2
# Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.527815 bits/site
# Required length: N=7.205526 sites
# Total entropy: NH=11.008713 bits

# consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3
# Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250
# Relative entropy: H=1.654878 bits/site
# Required length: N=5.146793 sites
# Total entropy: NH=8.517313 bits

### !!! ***  This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits

    ssh pk
    # Create cluster dir to do main phastCons run
    mkdir /san/sanvol1/scratch/mm7/cons/consRun1
    cd /san/sanvol1/scratch/mm7/cons/consRun1
    mkdir ppRaw bed

    # Create script to run phastCons with right parameters
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
# doPhast.csh <chrom> <ssRoot>
#   <chrom>   chrom name; also the directory name under ../ss, ppRaw and bed
#   <ssRoot>  ss file name without its .ss suffix
# Runs phastCons inside /scratch/tmp/<ssRoot>, then moves the resulting
# .pp and .bed files into ppRaw/<chrom> and bed/<chrom> and removes the
# scratch directory.
#
# -p: a directory left behind by an earlier failed attempt must not make a
# retry fail right here (csh -e exits on the first failing command)
mkdir -p /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
   --rho 0.3 --expected-length 11 --target-coverage 0.16 --quiet \
	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/elliotsEncode.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # emacs happy
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list

    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
    #	These jobs are very fast and very I/O intensive, even on the san
    #	they will hang it up as they work at full tilt.
# Completed: 2867 of 2867 jobs
# CPU time in finished jobs:      15359s     255.98m     4.27h    0.18d  0.000 y
# IO & Wait Time:                 34015s     566.92m     9.45h    0.39d  0.001 y
# Average job time:                  17s       0.29m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             259s       4.32m     0.07h    0.00d
# Submission to last job:          3236s      53.93m     0.90h    0.04d

    # combine predictions and transform scores to be in 0-1000 interval
    #	it uses a lot of memory, so on kolossus:
    ssh kolossus
    cd /san/sanvol1/scratch/mm7/cons/consRun1
    #	The sed's and the sort get the file names in chrom,start order
    find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/mm7/bed/multiz17way

    # Figure out how much is actually covered by the bed files as so:
    #	The 2583393846 comes from the non-n genome size,
    #	from faSize on all chroms:
    ssh kkstore02
    cd /cluster/data/mm7
    faSize ?/chr*.fa ??/chr*.fa
    #	2847717329 bases (264323483 N's 2583393846 real 1489414119
    #	upper 1093979727

    cd /san/sanvol1/scratch/mm7/cons/consRun1
    awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/2583393846\n",100.0*sum/2583393846,sum}' \
	mostConserved.bed
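    #	NOTE: the sum recorded on the 0.16 line below is identical to the
    #	0.17 line's sum; 187599245 is 7.26% of 2583393846, not 7.00%, so
    #	the 0.16 count appears to be a copy/paste error (percentage is
    #	presumably the correct figure - it agrees with featureBits below).
    -target-coverage 0.16: % 7.00 = 100.0*187599245/2583393846 length 11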
    -target-coverage 0.17: % 7.26 = 100.0*187599245/2583393846 length 12
    -target-coverage 0.13: % 6.86 = 100.0*177139744/2583393846 length 12
    -target-coverage 0.08: % 6.39 = 100.0*165132383/2583393846 length 12
    -target-coverage 0.03: % 5.88 = 100.0*151876064/2583393846 length 12
    -target-coverage 0.013: % 5.62 = 100.0*145240686/2583393846 length 12
    -target-coverage 0.08: % 6.10 = 100.0*157463396/2583393846 length 10
    -target-coverage 0.08: % 5.50 = 100.0*142084011/2583393846 length 7
    -target-coverage 0.03: % 5.02 = 100.0*129774852/2583393846 length 7
    -target-coverage 0.03: % 4.74 = 100.0*122386185/2583393846 length 6

    #	We used to aim for %4 in the above measurement, but as you can
    #	see it won't go down that low.  Instead, aim for %70 coverage in
    #	the following featureBits measurement on CDS:
    # Beware of negative scores when the target coverage is too high.
    # The lodToBedScore script will output an error on any negative scores.

    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	-enrichment refGene:cds mostConserved.bed
    #	~ 1.5 minutes
# -target-coverage 0.16 and expected lengths 11:
# refGene:cds 1.022%, mostConserved.bed 6.999%, both 0.721%, cover 70.50%,
#	enrich 10.07x
# -target-coverage 0.17 and expected lengths 12:
# refGene:cds 1.022%, mostConserved.bed 7.262%, both 0.724%, cover 70.79%,
#	enrich 9.75x
# -target-coverage 0.13 and expected lengths 12:
# refGene:cds 1.022%, mostConserved.bed 6.857%, both 0.723%, cover 70.75%,
#	enrich 10.32x

    # Load most conserved track into database
    ssh hgwdev
    cd /cluster/data/mm7/bed/multiz17way
    hgLoadBed -strict mm7 phastConsElements mostConserved.bed

    #	Loaded 4767525 elements of size 5
    #	5 minute load time
    #	should measure the same as above
    featureBits mm7 -enrichment refGene:cds phastConsElements
# -target-coverage 0.16 and expected lengths 11:
# refGene:cds 1.022%, phastConsElements 6.999%, both 0.721%, cover 70.50%,
#	enrich 10.07x
# -target-coverage 0.17 and expected lengths 12:
# refGene:cds 1.022%, phastConsElements 7.262%, both 0.724%, cover 70.78%,
#	enrich 9.75x

    # Create merged posterior probability file and wiggle track data files
    #	pk is currently closer to the san than any other machine
    ssh pk
    cd /san/sanvol1/scratch/mm7/cons/consRun1
    # the sed business gets the names sorted by chromName, chromStart
    #	so that everything goes in numerical order into wigEncode
    find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
	    | wigEncode stdin phastCons17.wig phastCons17.wib
    # about 23 minutes for above

    time rsync -a --progress phastCons17.wi? \
	kkstore02:/cluster/data/mm7/bed/multiz17way/
    # 3m 05s copy from pk to on kkstore02

    #	prepare compressed copy of ascii data values for downloads
    ssh pk
    cd /san/sanvol1/scratch/mm7/cons/consRun1
    # gzipAscii.sh: for each ppRaw/chr* directory, concatenate its files in
    #	chrom,start order and gzip the result into
    #	phastCons17Scores/<chrom>.data.gz
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons17Scores

for D in ppRaw/chr*
do
    # strip the leading "ppRaw/" to get the chrom name; the POSIX prefix
    # removal is used since ${D/ppRaw\/} is a bash-only substitution and
    # this script runs under /bin/sh
    C=${D#ppRaw/}
    out=phastCons17Scores/${C}.data.gz
    echo "========================== ${C} ${D}"
    # the sed's and the sort get the file names in chrom,start order
    find "./${D}" -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
	| sort -k7,7 -k9,9n \
	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
	    gzip > "${out}"
done
'_EOF_'
    #	happy emacs
    chmod +x gzipAscii.sh
    time ./gzipAscii.sh
    #	takes about 40 minutes, makes 2.9 Gb of data
    #	copy them for downloads
    ssh hgwdev
    mkdir /cluster/data/mm7/bed/multiz17way/phastCons17Scores
    cd /cluster/data/mm7/bed/multiz17way/phastCons17Scores
    cp -p  /san/sanvol1/scratch/mm7/cons/consRun1/phastCons17Scores/* .
    #	~12 minute copy

    mkdir /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores
    cd /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores
    ln -s /cluster/data/mm7/bed/multiz17way/phastCons17Scores/*.gz .

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/mm7/bed/multiz17way
    ln -s `pwd`/phastCons17.wib /gbdb/mm7/wib/phastCons17.wib
    hgLoadWiggle mm7 phastCons17 phastCons17.wig
    #  ~ 3 minute load

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /cluster/data/mm7/bed/multiz17way/cons
    time hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm7 phastCons17 > histogram.data 2>&1
    #	about 27 minutes to scan all data

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm7 Histogram phastCons17 track"
set xlabel " phastCons17 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	happy emacs

    display histo.png &

############################################################################
# BUILD KNOWN GENES TABLES (Started 3/19/05, done 4/13/05. Fan)

# First build protein databases, sp050315 and proteins050315
# See makeProteins050315.doc for details.

# Create working subdirectories and temporary databases

  ssh hgwdev
  cd /cluster/store10/kg
  mkdir kgMm6A  
  ln -s /cluster/store10/kg/kgMm6A /cluster/store6/kgDB/bed/kgMm6A
  ln -s /cluster/store10/kg/kgMm6A /cluster/data/mm7/bed/kgMm6A
   
  hgsql mm7 -e "create database kgMm6ATemp"

  mkdir /cluster/bluearc/kgDB/kgMm6A
  mkdir /cluster/bluearc/kgDB/kgMm6A/protBlat
  ln -s /cluster/bluearc/kgDB/kgMm6A/protBlat /cluster/store10/kg/kgMm6A/protBlat
  cd /cluster/store10/kg/kgMm6A/protBlat

# Get all mouse protein sequences

  hgsql -N sp050315 -e \
  'select proteins050315.spXref3.accession,protein.val from proteins050315.spXref3,protein where division="10090" and acc=accession' \
  |awk '{print ">" $1;print $2}' >mouseProt.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/mm7/bed/kgMm6A/protBlat
  mkdir prot
  faSplit sequence mouseProt.fa 1000 prot/prot

  ls /cluster/bluearc/kgDB/kgMm6A/protBlat/prot/* > prot.lis
  hgsql mm7 -N -e 'select chrom from chromInfo'   > chrom.lis

  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /panasas/store/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para check
  para push
  para check ...

# This cluster run takes about two days.  Crashed jobs are due to empty BLAT result.  It is OK.
Completed: 31081 of 39600 jobs
Crashed: 8519 jobs
CPU time in finished jobs:   28671747s  477862.45m  7964.37h  331.85d  0.909 y
IO & Wait Time:               1469964s   24499.40m   408.32h   17.01d  0.047 y
Average job time:                 970s      16.16m     0.27h    0.01d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:           39632s     660.53m    11.01h    0.46d
Submission to last job:        124276s    2071.27m    34.52h    1.44d

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/mm7/bed/kgMm6A/protBlat

   mkdir result2
   mkdir result3

   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   # do1.1 <chrom>: concatenate all per-protein-chunk BLAT results for one
   # chromosome into result2/<chrom>.psl
   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   # do1.2 <chrom>: filter result2/<chrom>.psl with pslReps into
   # result3/<chrom>.psl (result3 is what gets concatenated and loaded below)
   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   # doall calls "do1 <chrom>" for every chrom; run it once per stage
   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlat.psl
   hgLoadPsl mm7 protBlat.psl

# Remember to remove result2 and result3 when KG is built and validated.

   cd /cluster/data/mm7/bed/kgMm6A

# create all_mrna.psl and tight_mrna.psl
   hgsql mm7 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
           all_mrna.psl tight_mrna.psl /dev/null

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.stat

   overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgMm6ATemp < ~/src/hg/lib/spMrna.sql
   hgsql kgMm6ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgMm6ATemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm6ATemp DB.

   hgFaToTab mrna.fa mrnaSeq.tab

   hgsql kgMm6ATemp <~/src/hg/lib/mrnaSeq.sql
   # outer quotes must be single quotes so the double quotes around the
   # file name reach MySQL intact
   hgsql kgMm6ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'

# Prepare files for cluster run
   ~/src/hg/protein/KG2.sh kgMm6A mm7 050315

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG3.sh kgMm6A mm7 050315

# Collect cluster run results
   cd kgBestMrna

   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis

# Load BLAT results into temp DB.
   hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgMm6ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgMm6ATemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm6ATemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protMrna.out > j1.tmp
   cut -f 32-42 ../protMrna.out > j2.tmp
   cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

# Prepare refGene and all_mrna gp files.

   cd ..
   hgsql mm7 -N -e 'select * from refGene' >ref.gp

   hgsql mm7 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.90 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgMm6ATemp <~/src/hg/lib/spRef.sql
   hgsql kgMm6ATemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgMm6A mm7 050315
   ~/src/hg/protein/KGRef3.sh kgMm6A mm7 050315

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis

   hgsql kgMm6ATemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgMm6ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgMm6ATemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries

   cat ref.gp protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/store10/mm7/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate0.sql
   hgsql kgMm6ATemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm6ATemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm6ATemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to filter out invalid gene candidates

   kgCheck kgMm6ATemp mm7 kgCandidate.tab
   hgsql kgMm6ATemp -e  'drop table kgCandidate'
   hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm6ATemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'

# Update and clean up kgResultBestMrna2.c and then check it in.

# Score protein/mRna and protein/RefSeq alignments

   kgResultBestMrna2 050201 kgMm6ATemp mm7|sort -u >protMrnaBlatScore.tab
   kgResultBestRef2  050315 kgMm6ATemp mm7|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm6ATemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm6ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm6ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgSelect to select highest scoring mRNA or RefSeq for each protein.
 
   kgSelect kgMm6ATemp kgCandidate2.gp
   hgsql kgMm6ATemp -e  'drop table kgCandidate2'
   hgsql kgMm6ATemp < ~/src/hg/lib/kgCandidate2.sql
   hgsql kgMm6ATemp -e 'load data local infile "kgCandidate2.gp" into table kgCandidate2'

# Create sorted file to get entries with identical CDS regions group together. 
   hgsql kgMm6ATemp -N -e \
   'select name,chrom,cdsStart,cdsEnd,score,proteinID from kgCandidate2,protMrnaScore where proteinID=protAcc and name=mrnaAcc order by name,cdsStart,cdsEnd,score desc,proteinID' \
   >kgSorted.tab

# Run kgUniq to pick the top mRNA/RefSeq with the highest score for each CDS structure.

   kgUniq kgMm6ATemp sp050315 kgSorted.tab knownGene.gp dupSpMrna.tab

   hgsql mm7 -e  'drop table dupSpMrna'
   hgsql mm7 <~/src/hg/lib/dupSpMrna.sql
   hgsql mm7 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Build mrnaRefseq table first before loading knownGene table

   cd /cluster/store10/entrez
   mkdir 050401
   ln -s /cluster/store10/entrez/050401 /cluster/data/entrez/050401
   cd /cluster/data/entrez/050401

   # --timestamping: only re-download when the remote file is newer than
   # the local copy
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Sort knownGene table
#	The mrnaRefseq step above was run in /cluster/data/entrez/050401;
#	go back to the KG build directory first so that sortedKnownGene.gp
#	is written where the load below expects to find it.
   cd /cluster/data/kgDB/bed/kgMm6A
   ~/kent/src/hg/protein/sortKg.pl knownGene.gp > sortedKnownGene.gp

# Load knownGene table
   hgsql mm7 -e  'drop table knownGene'
   hgsql mm7 <~/src/hg/lib/knownGene.sql
   hgsql mm7 -e 'load data local infile "sortedKnownGene.gp" into table knownGene'

# Build kgXref table

   kgXref2 mm7 proteins050315 mm7

   hgsql mm7 -e  'drop table kgXref'
   hgsql mm7 <~/src/hg/lib/kgXref.sql
   hgsql mm7 -e 'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql mm7 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab

   hgsql mm7 -e  'drop table spMrna'
   hgsql mm7 <~/src/hg/lib/spMrna.sql
   hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build knownGenePep table

   hgsql mm7 -N -e \
   'select name, protein.val from knownGene, sp050315.displayId, sp050315.protein where proteinID=displayId.val and displayId.acc=protein.acc' \
   >knownGenePep.tab

   hgsql mm7 -e  'drop table knownGenePep'
   hgsql mm7 <~/src/hg/lib/knownGenePep.sql
   hgsql mm7 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build knownGeneMrna table

   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \
   -gbRoot=/cluster/data/genbank refseq mrna stdout \
   | faToTab stdin refseqSeq.tab

   hgsql kgMm6ATemp -e "drop table refseqSeq"
   hgsql kgMm6ATemp <~/src/hg/lib/refseqSeq.sql
   hgsql kgMm6ATemp -e 'load data local infile "refseqSeq.tab" into table refseqSeq'

   hgsql kgMm6ATemp -N -e \
   'select knownGene.name, seq from refseqSeq, mm7.knownGene where knownGene.name=refseqSeq.name'\
   >j1.tmp

   hgsql kgMm6ATemp -N -e \
   'select knownGene.name, seq from mrnaSeq, mm7.knownGene where knownGene.name=mrnaSeq.name' \
   >j2.tmp
   cat j1.tmp j2.tmp >knownGeneMrna.tab
   rm j1.tmp j2.tmp

   hgsql mm7 -e "drop table mm7.knownGeneMrna"
   hgsql mm7 <~/src/hg/lib/knownGeneMrna.sql
   hgsql mm7 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'

# update mrnaRefseq table
#	Add a self-mapping row (name, name) for every RefSeq entry in
#	knownGene to the existing mrnaRefseq rows, then reload the table.
  
  hgsql mm7 -N -e 'select * from mrnaRefseq' >j.1
  #	In a LIKE pattern "_" matches any single character, so "NM_" would
  #	only match three-character names; escape the underscore and add %
  #	to match all names beginning with "NM_"
  hgsql mm7 -N -e 'select name, name from knownGene where name like "NM\_%"' >j.2
  cat j.1 j.2 |sort -u >mrnaRefseq2.tab
  hgsql mm7 -e 'delete from mrnaRefseq'
  hgsql mm7 -e 'load data local infile "mrnaRefseq2.tab" into table mrnaRefseq'
  
# Create empty knownGeneLink table to make some old code happy.

   hgsql mm7 <~/src/hg/lib/knownGeneLink.sql

# Build KEGG pathway tables

   ~/src/hg/protein/KGpath.sh kgMm6A mm7 050315

   hgsql kgMm6ATemp -e "drop table keggList"
   hgsql kgMm6ATemp <~/src/hg/lib/keggList.sql
   hgsql kgMm6ATemp -e 'load data local infile "keggList.tab" into table keggList'

   hgsql mm7 -e "drop table keggMapDesc"
   hgsql mm7 -e "drop table keggPathway"
   hgsql mm7 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm7 <~/src/hg/lib/keggPathway.sql
   hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables

   ~/src/hg/protein/KGcgap.sh kgMm6A mm7 050315

   hgsql sp050315 -N -e \
   'select name, gene.val from mm7.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \
   | sort -u >kgAliasP.tab

# Build alias tables

#	kgAliasM reads from proteins050315.hugo.symbol, proteins050315.hugo.aliases
#	proteins050315.hugo.withdraws, mm7.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   kgAliasM mm7 proteins050315

#	kgAliasKgXref reads from mm7.knownGene.proteinID,
#	mm7.knownGene.name, mm7.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm7


#	kgAliasRefseq reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm7

   hgsql sp050315 -N -e \
   'select name, gene.val from mm7.knownGene, gene, displayId where proteinID=displayId.val and gene.acc=displayId.acc' \
   | sort -u >kgAliasP.tab

   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab | \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm7 
   hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.knownGene.alignID,
#	proteins050315.spXref3.accession, proteins050315.spSecondaryID, proteins050315.pdbSP.pdb
#	to create kgProtAlias.tab
#

   kgProtAlias mm7 050315

   hgsql mm7 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

    cat kgProtAliasNCBI.tab kgProtAlias.tab | sort | uniq > kgProtAliasBoth.tab
    rm kgProtAliasNCBI.tab kgProtAlias.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm7 -e "drop table kgProtAlias;"
    hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasBoth.tab" into table kgProtAlias;'  


# MAKING FOLDUTR TABLES (TBD)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    mkdir -p /cluster/data/mm7/bed/rnaStruct
    cd /cluster/data/mm7/bed/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm7 knownGene utr3 utr3/utr.fa
    utrFa mm7 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm7/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push

Completed: 19325 of 19325 jobs
CPU time in finished jobs:     421619s    7026.99m   117.12h    4.88d  0.013 y
IO & Wait Time:                 87355s    1455.91m    24.27h    1.01d  0.003 y
Average job time:                  26s       0.44m     0.01h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            7988s     133.13m     2.22h    0.09d
Submission to last job:          8644s     144.07m     2.40h    0.10d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push

Completed: 18053 of 18053 jobs
CPU time in finished jobs:      25309s     421.82m     7.03h    0.29d  0.001 y
IO & Wait Time:                 53547s     892.45m    14.87h    0.62d  0.002 y
Average job time:                   4s       0.07m     0.00h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             314s       5.23m     0.09h    0.00d
Submission to last job:           565s       9.42m     0.16h    0.01d

# Load database
    ssh hgwdev
    cd /cluster/data/mm7/bed/rnaStruct/utr5
    hgLoadRnaFold mm7 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm7 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak


# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB:

   update dbDb set defaultPos="chr6:29107216-29120872" where name="mm7";

# Create QA Push Queue entry with the following tables:
# cgapAlias
# cgapBiocDesc
# cgapBiocPathway
# dupSpMrna
# keggMapDesc
# keggPathway
# kgAlias
# kgProtAlias
# kgXref
# knownGene
# knownGeneMrna
# knownGenePep
# mrnaRefseq
# spMrna
# foldUtr3
# foldUtr5

############################################################################
# CYTOBAND TRACK (DONE - 2005-10-11 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm7/cytoBand
    cd /cluster/data/mm7/cytoBand
    # Get file from NCBI
    WGETRC=/cluster/data/mm7/ncbi/.wgetrc
    export WGETRC
    wget --timestamping ftp://ftp-private.ncbi.nih.gov/mouse_35/ideogram.gz

    gunzip ideogram
    # Create bed file
    /cluster/bin/scripts/createNcbiCytoBand ideogram
    # Load the bed file
    hgLoadBed -strict -noBin \
	-sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm7 \
	cytoBand cytoBand.bed
    # Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
    hgsql mm7 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"

############################################################################
#  BLATSERVERS ENTRY (DONE - 2005-08-31 - Hiram)
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm7", "blat16", "17778", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm7", "blat16", "17779", "0", "1");' \
	    hgcentraltest

############################################################################

### MAKE THE affyU74 TRACK - needed for the Gene Sorter 
#                              (TBD)
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
# target sequences. Recalculate alignments and load data
----------------------------------
# Load up semi-local disk with target sequences for Affy mouse U74 chips.
# ssh kkr1u00
# mkdir -p /iscratch/i/affy
#	This /projects filesystem is not available on kkr1u00
#	but it is on kk
# ssh kk
# cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy

ssh kkr1u00
iSync

# Run cluster job to do alignments
ssh kk
mkdir /cluster/data/mm7/bed/affyU74.2005-04-14
cd /cluster/data/mm7/bed/affyU74.2005-04-14
mkdir run
cd run
mkdir psl
#echo /scratch/mus/mm7/maskedContigs/*.fa | wordLine stdin > genome.lst
echo /panasas/store/mm7/nib/*.nib | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy

gensub2 genome.lst affy.lst gsub jobList
para create jobList
para try
# do usual para check/para push etc. until the job is done. 
# Completed: 120 of 120 jobs
# CPU time in finished jobs:       7197s     119.94m     2.00h    0.08d  0.000 y
# IO & Wait Time:                  1047s      17.46m     0.29h    0.01d  0.000 y
# Average job time:                  69s       1.15m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             208s       3.47m     0.06h    0.00d
# Submission to last job:           751s      12.52m     0.21h    0.01d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
ssh kk
cd /cluster/data/mm7/bed/affyU74.2005-04-14/run
pslSort dirs raw.psl tmp psl

# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null

# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm7/bed/affyU74.2005-04-14
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl
# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
hgLoadPsl mm7 affyU74.psl
# rm -fr chrom temp run

##   MAKE THE affyGnfU74 TRACKs (TBD)
# Make bed files and load consensus sequences for Affy U74 chip set.
# Fix broken symlinks to microarray data after directory structure changed
# (TBD)
----------------------------------
#This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm7/bed/affyGnf.2005-04-14
cd /cluster/data/mm7/bed/affyGnf.2005-04-14
#	may need to build this command in src/hg/affyGnf
affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
affyPslAndAtlasToBed ../affyU74.2005-04-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2

# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
mkdir sav
cp *.bed sav -p
cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed
cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed
cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed

# and reload data into table
hgLoadBed -strict mm7 affyGnfU74A affyGnfU74A.bed
hgLoadBed -strict mm7 affyGnfU74B affyGnfU74B.bed
hgLoadBed -strict mm7 affyGnfU74C affyGnfU74C.bed

# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [THE SYM LINKS WERE ALREADY DONE.]
#    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    # fix broken symlinks after directory structure changed
    # /projects/compbiodata ----> /projects/compbio/data
    rm U74*
    # make correct symlinks (hartera, 2005-05-03)
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .

    # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
    # reload sequences with prefix removed so acc matches name used in
    # other dependent tables
                                                    
    hgLoadSeq -abbr=U74Av2: mm7 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
    hgLoadSeq -abbr=U74Bv2: mm7 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
    hgLoadSeq -abbr=U74Cv2: mm7 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa

### GNF ATLAS 2  (TBD)
    # Align probes from GNF1M chip.
    ssh kk
    cd /cluster/data/mm7/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run
    #mkdir -p /cluster/bluearc/geneAtlas2
    #cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2
    #ls -1 /scratch/mus/mm7/maskedContigs/ > genome.lst
    echo /panasas/store/mm7/nib/*.nib | wordLine stdin > genome.lst

    ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
    echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    para time

# Completed: 40 of 40 jobs
# CPU time in finished jobs:      56570s     942.84m    15.71h    0.65d  0.002 y
# IO & Wait Time:                   392s       6.53m     0.11h    0.00d  0.000 y
# Average job time:                1424s      23.73m     0.40h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3979s      66.32m     1.11h    0.05d
# Submission to last job:          3993s      66.55m     1.11h    0.05d


    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1m.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null

    #rm -r contig.psl raw.psl psl

    # Load probes and alignments from GNF1M into database.
    ssh hgwdev
    cd /cluster/data/mm7/bed/geneAtlas2
#    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
    hgLoadPsl mm7 affyGnf1m.psl
    hgLoadSeq mm7 /gbdb/hgFixed/affyProbes/gnf1m.fa

    # Load up track
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
    	affyGnf1m.psl
    # Note that the unmapped 5000 records are from all-N sequences.
    hgLoadBed -strict mm7 gnfAtlas2 gnfAtlas2.bed

# MOUSE AFFYMETRIX MOE430 TRACK (TBD)
#    mkdir -p /projects/compbio/data/microarray/affyMouse
    # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
#    unzip MOE430*_consensus.zip

    # check for duplicate probes: there are none, all have unique names
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database

#    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
#    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
 
#    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
#       /cluster/bluearc/affy/

    # THE ABOVE WAS ALREADY TBD)

    # Set up cluster job to align MOE430 consensus sequences to mm7
    ssh kkr1u00
    cd /cluster/data/mm7/bed
    mkdir -p affyMOE430
    cd affyMOE430
#    mkdir -p /iscratch/i/affy
#    cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
#    iSync

    ssh kk
    cd /cluster/data/mm7/bed/affyMOE430
    ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
    echo /panasas/store/mm7/nib/*.nib | wordLine stdin > genome.lst

    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    # Actually do the job with usual para try/check/push/time etc.
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       9414s     156.90m     2.61h    0.11d  0.000 y
# IO & Wait Time:                   281s       4.69m     0.08h    0.00d  0.000 y
# Average job time:                 242s       4.04m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             597s       9.95m     0.17h    0.01d
# Submission to last job:           657s      10.95m     0.18h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyMOE430.psl
    pslSort dirs raw.psl tmp psl

    # only use alignments that cover 30% of sequence and have at least
    # 95% identity in aligned region. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null

    # Load alignments and sequences into database
    ssh hgwdev
    cd /cluster/data/mm7/bed/affyMOE430
    # shorten names in psl file
    sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
    mv affyMOE430.psl.bak affyMOE430.psl

    # load track into database

    hgLoadPsl mm7 affyMOE430.psl
 
    # Add consensus sequences for MOE430
    # Copy sequences to gbdb if they are not there already
#    mkdir -p /gbdb/hgFixed/affyProbes
#    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
#       /gbdb/hgFixed/affyProbes

    hgLoadSeq -abbr=MOE430 mm7 /gbdb/hgFixed/affyProbes/MOE430_all.fa
    
    # Clean up
#    rm batch.bak contig.psl raw.psl 
    
    # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
    # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
    # add affyMOE430.html file and then do make alpha to add to trackDb table

######## MAKING GENE SORTER TABLES #######  (TBD)
# These are instructions for building the
# Gene Sorter.  Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm7 knownGene knownIsoforms knownCanonical
#	You may need to build this binary in src/hg/near/hgClusterGenes
#	Got 24603 clusters, from 41208 genes in 43 chromosomes
#	featureBits mm7 knownCanonical
# 	686054706 bases of 2597150411 (26.416%) in intersection
#	featureBits mm5 knownCanonical
#	853516995 bases of 2615483787 (32.633%) in intersection
#	featureBits mm4 knownCanonical
#	840021165 bases of 2627444668 (31.971%) in intersection
#	featureBits mm3 knownCanonical
#	825943052 bases of 2505900260 (32.960%) in intersection
#	! ! ! Can not do featureBits on knownIsoforms

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev
mkdir -p  /cluster/data/mm7/bed/geneSorter/blastp
cd /cluster/data/mm7/bed/geneSorter/blastp
pepPredToFa mm7 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known

# Copy over database to the panasas store (read by the cluster blast jobs)
mkdir /cluster/panasas/home/store/mm7/blastp
cp -p /cluster/data/mm7/bed/geneSorter/blastp/known.* \
/cluster/panasas/home/store/mm7/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm7/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm7/bed/geneSorter/blastp/self
cd /cluster/data/mm7/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
# blastSome - run one blastp job against the mm7 known-gene protein database.
#   $1 - query peptide fasta file
#   $2 - output file (blastall -m 8 output, later loaded by hgLoadBlastTab)
# Arguments are quoted so a path containing whitespace or glob characters
# is passed to blastall as a single word.
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
	-p blastp -d /cluster/panasas/home/store/mm7/blastp/known \
	-i "$1" -o "$2" -e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...

# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      31525s     525.42m     8.76h    0.36d  0.001 y
# IO & Wait Time:                 34031s     567.18m     9.45h    0.39d  0.001 y
# Average job time:                   8s       0.14m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              61s       1.02m     0.02h    0.00d
# Submission to last job:           142s       2.37m     0.04h    0.00d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/mm7/bed/geneSorter/blastp/self/run/out
hgLoadBlastTab mm7 knownBlastTab *.tab
# Scanning through 7715 files
# Loading database with 1972005 rows

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes an hour.)
# (TBD)

hgMapToGene mm7 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm7 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m

# Create table that maps between known genes and RefSeq
hgMapToGene mm7 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm7 affyU74  knownGene knownToU74
hgMapToGene mm7 affyMOE430 knownGene knownToMOE430
hgMapToGene mm7 affyMOE430 -prefix=A: knownGene knownToMOE430A

# Format and load Rinn et al sex expression data.
# Maps the hgFixed.mouseRinnSexMedianRatio table onto the affyMOE430
# alignments and loads the resulting bed file as the rinnSex table.
# The directory is spelled out rather than using "cd !$": history
# expansion only works when the lines are typed at an interactive prompt.
mkdir /cluster/data/mm7/bed/rinnSex
cd /cluster/data/mm7/bed/rinnSex
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
    ../affyMOE430/affyMOE430.psl
hgLoadBed -strict mm7 rinnSex rinnSex.bed

# Format and load the GNF data
mkdir /cluster/data/mm7/bed/affyGnf95
cd /cluster/data/mm7/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
	/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
	affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm7 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  These commands take about 15 minutes each
#	The affyGnfU74?Exps arguments appear to be unused in 
# hgExpDistance
hgExpDistance mm7 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
# Got 7720 unique elements in affyGnfU74A
hgExpDistance mm7 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
# Got 4619 unique elements in affyGnfU74B
hgExpDistance mm7 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
# Got 1406 unique elements in affyGnfU74C

# C.ELEGANS BLASTP FOR GENE SORTER (TBD)
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh eieio
    mkdir /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version.  Then use that in place of 142 below.
    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
    ssh kkr1u00
    if (-e /iscratch/i/ce2/blastp) then
      rm -r /iscratch/i/ce2/blastp
    endif
    mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/ce2/run/out
    cd /cluster/data/mm7/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      29337s     488.96m     8.15h    0.34d  0.001 y
# IO & Wait Time:                 24651s     410.84m     6.85h    0.29d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              40s       0.67m     0.01h    0.00d
# Submission to last job:           206s       3.43m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastp/ce2/run/out
    hgLoadBlastTab mm7 ceBlastTab -maxPer=1 *.tab

# HUMAN BLASTP FOR GENE SORTER (TBD)
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/hg17/bed/blastp
    cd /cluster/data/hg17/bed/blastp
    pepPredToFa hg17 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
# PLEASE NOTE, hg17B IS USED INSTEAD OF hg17 for /iscratch/i,
# TO WORK AROUND A SUBDIRECTORY ACCESS RIGHTS PROBLEM.

    ssh kkr1u00
    if (-e /iscratch/i/hg17B/blastp) then
      rm -r /iscratch/i/hg17B/blastp
    endif
    mkdir -p /iscratch/i/hg17B/blastp
    cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17B/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/hg17/run/out
    cd /cluster/data/mm7/bed/blastp/hg17/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17B/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      67090s    1118.17m    18.64h    0.78d  0.002 y
# IO & Wait Time:                 22543s     375.72m     6.26h    0.26d  0.001 y
# Average job time:                  12s       0.19m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              83s       1.38m     0.02h    0.00d
# Submission to last job:           213s       3.55m     0.06h    0.00d
 
    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastp/hg17/run/out
    hgLoadBlastTab mm7 hgBlastTab -maxPer=1 *.tab

# ZEBRAFISH BLASTP FOR GENE SORTER (TBD)
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh kkstore
    mkdir /cluster/data/danRer1/bed/blastp
    cd /cluster/data/danRer1/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz
    
    zcat Dan*.pep.fa.gz > ensembl.faa
    formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/danRer1/blastp) then
      rm -r /iscratch/i/danRer1/blastp
    endif
    mkdir -p /iscratch/i/danRer1/blastp
    cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/danRer1/run/out
    cd /cluster/data/mm7/bed/blastp/danRer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      53430s     890.51m    14.84h    0.62d  0.002 y
# IO & Wait Time:                 24688s     411.46m     6.86h    0.29d  0.001 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              76s       1.27m     0.02h    0.00d
# Submission to last job:           202s       3.37m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastp/danRer1/run/out
    hgLoadBlastTab mm7 drBlastTab -maxPer=1 *.tab

# YEAST BLASTP FOR GENE SORTER (TBD)
    # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on 
    # RefSeq.  First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
    zcat orf_trans.fasta.gz > sgdPep.faa
    formatdb -i sgdPep.faa -t sgdPep -n sgdPep

    ssh kkr1u00
    # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
    # fortunately we won't be looking for homologs there.  :)
    if (-e /iscratch/i/sacCer1/blastp) then
      rm -r /iscratch/i/sacCer1/blastp
    endif
    mkdir -p /iscratch/i/sacCer1/blastp
    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/sacCer1/run/out
    cd /cluster/data/mm7/bed/blastp/sacCer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:       8741s     145.68m     2.43h    0.10d  0.000 y
# IO & Wait Time:                 20376s     339.60m     5.66h    0.24d  0.001 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              17s       0.28m     0.00h    0.00d
# Submission to last job:           199s       3.32m     0.06h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastp/sacCer1/run/out
    hgLoadBlastTab mm7 scBlastTab -maxPer=1 *.tab

# DM1 BLASTP FOR GENE SORTER (TBD)
    # Make Drosophila melanogaster ortholog column using blastp on FlyBase.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    # This is already done, see makeMm3.doc for procedure
    # the directory: /cluster/bluearc/dm1/blastp should have data

    # ssh kkr1u00
    # if (-e /iscratch/i/dm1/blastp) then
    #   rm -r /iscratch/i/dm1/blastp
    # endif
    # mkdir -p /iscratch/i/dm1/blastp
    # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
    # iSync
    # THE ABOVE IS ALREADY DONE BY ANGIE

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/dm1/run/out
    cd /cluster/data/mm7/bed/blastp/dm1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...

# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      33260s     554.33m     9.24h    0.38d  0.001 y
# IO & Wait Time:                 24452s     407.54m     6.79h    0.28d  0.001 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              45s       0.75m     0.01h    0.00d
# Submission to last job:           121s       2.02m     0.03h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastp/dm1/run/out
    hgLoadBlastTab mm7 dmBlastTab -maxPer=1 *.tab

# Create table that maps between known genes and LocusLink (TBD)
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm7 \
        > refToLl.txt
hgMapToGene mm7 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#       row count is 17480 

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam
# row count is 17132

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene mm7 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create table that maps between known genes and visiGene database 
    knownToVisiGene mm7

# ENABLE GENE SORTER FOR mm7 IN HGCENTRALTEST (TBD)
    echo "update dbDb set hgNearOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

# RAT BLASTP FOR GENE SORTER (TBD)
    # Make RAT ortholog column using blastp on RAT known genes.
    # First make RAT protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/rn3/bed/blastp
    cd /cluster/data/rn3/bed/blastp
    pepPredToFa rn3 knownGenePep known.faa
    formatdb -i known.faa -t known -n known

    ssh kkr1u00
    if (-e /iscratch/i/rn3/blastp) then
      rm -r /iscratch/i/rn3/blastp
    endif
    mkdir -p /iscratch/i/rn3/blastp
    cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/rn3/run/out
    cd /cluster/data/mm7/bed/blastp/rn3/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split >split.lst
    #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
# Completed: 7715 of 7715 jobs
# CPU time in finished jobs:      12896s     214.93m     3.58h    0.15d  0.000 y
# IO & Wait Time:                 21725s     362.08m     6.03h    0.25d  0.001 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              22s       0.37m     0.01h    0.00d
# Submission to last job:           246s       4.10m     0.07h    0.00d
 
    # Load into database.  
    ssh hgwdev
    cd /cluster/data/mm7/bed/blastp/rn3/run/out
    hgLoadBlastTab mm7 rnBlastTab -maxPer=1 *.tab

# END OF GENE SORTER STUFF
#############################################################################

### MM7 PROTEOME BROWSER TABLES BUILD ####  (TBD)
# These are instructions for building tables 
# needed for the Proteome Browser to be used with mm7.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 050315.

# Create the working directory

   ssh hgwdev
   mkdir /cluster/data/mm7/bed/pb.2005-04-20
   cd /cluster/data/mm7/bed
   ln -s /cluster/data/mm7/bed/pb.2005-04-20 pb
   cd pb

# Define pep* tables in mm7 DB

   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# First edit out pepPred table definition, then

   hgsql mm7 < pepAll.sql

# Build the pepMwAa table

  hgsql proteins050315 -e "select info.acc, molWeight, aaSize from sp050315.info, sp050315.accToTaxon where 
accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins050315 -e "select info.acc from sp050315.info, sp050315.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp050315 pepPi.tab

hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;'


# Calculate and load pep distributions

  pbCalDist sp050315 proteins050315 10090 mm7 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm7

    load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm7.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm7.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd 050315 10090 mm7

# Create pbAnomLimit and pbResAvgStd tables

   hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql
   hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;'
   hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB (Done 4/20/05)

    cd /cluster/data/mm7/bed/pb
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab
    rm j.tmp

    hgsql mm7 -e 'drop table kgSpAlias';
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'
    gzip mm7.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm7 < ~/src/hg/lib/pbStamp.sql
  hgsql mm5 -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm7 -e 'delete from pbStamp'
  hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp ignore 1 lines;'

# ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST 
    echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

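# Connect to genome-testdb and use hgcentraltest DB.
# NOTE(review): the insert below names 'proteins050415', while the rest of
# this section uses proteins050315 -- confirm which database is intended.
# Update the entry in gdbPdb table from mySql prompt: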

    delete from gdbPdb where genomeDb='mm7';
    insert into gdbPdb values('mm7', 'proteins050415');
    
# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for mm7, then
# notify QA for formal review.

#####################################################################
# MAP CONTIGS TRACK (DONE - 2005-10-04 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm7/bed/ctgPos
    cd /cluster/data/mm7/bed/ctgPos
    # hgCtgPos uses the lift files... but mouse lift files are for the
    # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
    # from the assembly.  (In the future, we should go with the NT's!)
    # So... just for this release, go straight from the seq_contig.md
    # to the table def'n: contig, size, chrom, chromStart, chromEnd
    #	This script is an improvement from before, this is now doing the
    #	randoms properly.
    cat << '_EOF_' > seqContigToCtgPos.pl
#!/usr/bin/env perl
#
# seqContigToCtgPos.pl - turn seq_contig.md rows (read from stdin or from
# the files named on the command line) into ctgPos rows:
#	contig, size, chrom, chromStart, chromEnd
#
# Whitespace separated input columns used (0-based):
#	1 - chromosome; a value containing "|" marks a contig that goes on
#	    a _random chromosome, named by the text before the "|"
#	2 - start on the chromosome (treated as 1-based, inclusive)
#	3 - end on the chromosome
#	5 - contig name
#
# Contigs for a _random chromosome are laid end to end starting at 0,
# separated by a 50000 base spacer (1000 bases for chrUn_random).
# Other rows are output only when the contig name looks like NT_nnn/NC_nnn.

use warnings;
use strict;

my $spacerRandom = 50000;	# spacer between contigs on chrN_random
my $spacerUn = 1000;		# spacer between contigs on chrUn_random

my $curRandomChrom = "";	# _random chromosome currently being filled
my $nextRandomStart = 0;	# where the next contig goes on that chromosome

while (my $row = <>)
{
chomp($row);
my @field = split('\s+', $row);
# Blank or truncated rows cannot supply a contig name and coordinates;
# skip them rather than emit uninitialized-value warnings or a ctgPos
# row with an empty contig name.
next if (scalar(@field) < 6);
my ($chrom, $first, $last, $contig) = @field[1, 2, 3, 5];
if ($chrom =~ m/\|/)
    {
    my @part = split('\|', $chrom);
    my $owner = $part[0];
    # a change of owning chromosome restarts the layout at position 0
    if ($owner ne $curRandomChrom)
	{
	$curRandomChrom = $owner;
	$nextRandomStart = 0;
	}
    my $size = $last - $first + 1;
    printf "%s\t%d\tchr%s_random\t%d\t%d\n",
	$contig, $size, $owner, $nextRandomStart, $nextRandomStart + $size;
    $nextRandomStart += $size + (($owner eq "Un") ? $spacerUn : $spacerRandom);
    }
elsif ($contig =~ m/^N[TC]_\d+$/)
    {
    # placed contig: convert the 1-based start to a 0-based chromStart
    my $start = $first - 1;
    printf "%s\t%d\tchr%s\t%d\t%d\n",
	$contig, $last - $start, $chrom, $start, $last;
    }
}
'_EOF_'
    #	emacs happy
    chmod +x seqContigToCtgPos.pl

    # /cluster/data/mm7/ncbi/seq_contig.md contains more than just C57BL/6J.
    # Filter those out with the grep.
    cat ../../seq_contig.md | grep C57BL | \
	./seqContigToCtgPos.pl > ctgPos.tab

    hgsql mm7 < ~/kent/src/hg/lib/ctgPos.sql
    hgsql mm7 -e 'load data local infile "ctgPos.tab" into table ctgPos;'

    featureBits -countGaps mm7 ctgPos
    #	2608810329 bases of 2847717329 (91.611%) in intersection
    featureBits -countGaps mm6 ctgPos
    #	2638893452 bases of 3079633452 (85.689%) in intersection
    featureBits -countGaps mm5 ctgPos
    #	2557081173 bases of 3164952073 (80.794%) in intersection

#########################################################################
####  Blat knownGene proteins to determine exons (TBD)
    ssh hgwdev
    cd /cluster/data/mm7/bed
    mkdir blat.mm7KG.2005-05-02
    rm blat.mm7KG
    ln -s  blat.mm7KG.2005-05-02 blat.mm7KG
    cd blat.mm7KG
    pepPredToFa mm7 knownGenePep known.fa
    hgPepPred mm7 generic blastKGPep03 known.fa
    grep ">" known.fa | sed "s/>//" > kgName.lst
    ssh kk
    cd /cluster/data/mm7/bed/blat.mm7KG
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /panasas/store/mm7/nib/*.nib > mouse.lst
    mkdir kgfa
    cd kgfa
    faSplit sequence ../known.fa 3000 kg
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 mouse.lst kg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../mouse.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

# Completed: 115720 of 115720 jobs
# CPU time in finished jobs:   14938417s  248973.62m  4149.56h  172.90d  0.474 y
# IO & Wait Time:               2116275s   35271.25m   587.85h   24.49d  0.067 y
# Average job time:                 147s       2.46m     0.04h    0.00d
# Longest finished job:            9235s     153.92m     2.57h    0.11d
# Submission to last job:         25264s     421.07m     7.02h    0.29d

    ssh eieio
    cd /cluster/data/mm7/bed/blat.mm7KG
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    pslUniq cooked.psl mm7KG.psl
    pslxToFa mm7KG.psl mm7KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft

    ssh hgwdev
    kgName mm7 mm7KG.psl blastKGRef03
    hgsql mm7 < ~/kent/src/hg/lib/blastRef.sql
    echo "rename table blastRef to blastKGRef03" | hgsql mm7
    echo "load data local infile 'blastKGRef03' into table blastKGRef03" | hgsql mm7

##############################################################################
# LOAD GENEID GENES (DONE - 2005-12-20 - Hiram)
    ssh hgwdev
    mkdir -p /cluster/data/mm7/bed/geneid/download
    cd /cluster/data/mm7/bed/geneid/download
    # Fetch the per-chromosome geneid predictions (.gtf) and proteins (.prot);
    # chromosome names are the first column of chrom.sizes.
    awk '{print $1}' ../../../chrom.sizes | while read -r C
    do
      echo "$C"
      wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmDec2005/geneid_v1.2/$C.gtf"
      wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmDec2005/geneid_v1.2/$C.prot"
    done
    # Add missing .1 to protein id's
    #	(bash syntax, matching the download loop above, so the whole step
    #	runs in a single shell)
    for f in *.prot
    do
      perl -wpe 's/^(>chr\w+)$/$1.1/' "$f" > "${f%.prot}-fixed.prot"
    done
    cd ..
    ldHgGene -genePredExt -gtf mm7 geneid download/*.gtf
    #	Read 34992 transcripts in 295837 lines in 40 files
    #	34992 groups 40 seqs 1 sources 3 feature types
    #	34992 gene predictions
    hgPepPred mm7 generic geneidPep download/*-fixed.prot
    featureBits mm7 -enrichment refGene geneid
# refGene 1.707%, geneid 1.579%, both 0.831%, cover 48.68%, enrich 30.83x
    featureBits mm6 -enrichment refGene geneid
# refGene 1.623%, geneid 1.561%, both 0.794%, cover 48.91%, enrich 31.34x

##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2005-09-29 - Hiram)

    ssh kkstore02
    cd /cluster/data/mm7
    # check disk space: 73Gb free
    df -h .
# Filesystem            Size  Used Avail Use% Mounted on
# /export/cluster/store5
#                       1.5T  1.3T   73G  95% /cluster/store5
    mkdir -p bed/cloneend/ncbi
    cd bed/cloneend/ncbi

    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*

    cd /cluster/data/mm7/bed/cloneend
    # seems like the *.mfa files were split just for convenience
    # concatenate
    for F in ncbi/*.mfa.gz
    do
	zcat ${F}
    done | gzip > all.mfa.gz

    # Convert the title line of the all.mfa file
    cat << '_EOF_' > convert.pl
#!/usr/bin/env perl
#
# Filter a multi-fasta stream: lines that do not start with ">" pass through
# untouched; each ">" title line is replaced by ">name", where name is the
# "|"-separated field that follows the first "gb" or "dbj" field, cut at the
# first ".".  Dies on a title line that has neither field.

use strict;
use warnings;

while (my $record = <>) {
    # not a title line: copy it through unchanged
    unless ($record =~ m/^>/) {
	print $record;
	next;
    }
    my @part = split(/\|/, $record);
    my $renamed = 0;
    foreach my $idx (0 .. $#part) {
	next unless ($part[$idx] eq "gb" || $part[$idx] eq "dbj");
	# the field after the gb/dbj tag is accession.version; keep the accession
	my ($accession, $version) = split(/\./, $part[$idx + 1]);
	print ">$accession\n";
	$renamed = 1;
	last;
    }
    die("Failed for $record\n") unless $renamed;
}
'_EOF_'
    # < happy emacs
    chmod +x convert.pl
    zcat all.mfa | ./convert.pl | gzip > cloneEnds.fa.gz

    #	make sure nothing got broken:
    faSize all.mfa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 lower) in 789466 sequences in 1 files
    faSize cloneEnds.fa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 lower) in 789466 sequences in 1 files
    #	identical numbers

    # concatenate the text files, too
    for F in ncbi/*.txt.gz
    do
	zcat ${F}
    done | gzip > all.txt.gz

    # generate cloneEndPairs.txt and cloneEndSingles.txt
    cp -p /cluster/data/mm6/bed/cloneend/ncbi/convertTxt.pl .
    zcat all.txt.gz | ./convertTxt.pl stdin

    # Reading in end info
    # Writing out pair info
    # Writing out singleton info
    # 354485 pairs and 78423 singles

    #	faSplit does not function correctly if given a .gz source file
    #	AND, we need the unzipped file for sequence loading below
    gunzip cloneEnds.fa.gz
    # split
    #	(directory name is splitDir throughout; the copy/cleanup below
    #	refers to it by the same name)
    mkdir splitDir
    cd splitDir
    faSplit sequence ../cloneEnds.fa 100 cloneEnds
    #	Check to ensure no breakage:
    cat *.fa | faSize stdin
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214 lower) in 789466 sequences in 1 files
    #	same numbers as before

    #	Copy to san for cluster runs
    #	Absolute paths are used for the copy and the cleanup so that a failed
    #	cd can never turn the rm loose in some other directory.
    ssh pk
    mkdir /san/sanvol1/scratch/mm7/cloneEnds
    cp -p /cluster/data/mm7/bed/cloneend/splitDir/*.fa \
	/san/sanvol1/scratch/mm7/cloneEnds
    rm /cluster/data/mm7/bed/cloneend/splitDir/*.fa
    cd /cluster/data/mm7/bed/cloneend
    rmdir /cluster/data/mm7/bed/cloneend/splitDir

    # load sequences
    ssh hgwdev
    mkdir /gbdb/mm7/cloneend
    cd /gbdb/mm7/cloneend
      ln -s /cluster/data/mm7/bed/cloneend/cloneEnds.fa .
    cd /tmp
    hgLoadSeq mm7 /gbdb/mm7/cloneend/cloneEnds.fa
    #  Advisory lock created
    # Creating .tab file
    # Adding /gbdb/mm7/cloneend/cloneEnds.fa
    # 789466 sequences
    # Updating seq table
    # Advisory lock has been released
    # All done

############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2005-09-29 - 2005-10-03 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/noMask
    cd /cluster/data/mm7/
    #	Need an unmasked sequence for this work
    for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	#	a glob that matches nothing is left as the literal pattern; skip it
	[ -f "${CHR}" ] || continue
	C=$(basename "${CHR}")
	echo -n "working ${C} ... "
	#	the fasta header line is copied as is, only the sequence lines
	#	are upper-cased
	head -n 1 "${CHR}" > "noMask/${C}"
	#	"tail -n +2" rather than the obsolete "tail +2"; the character
	#	classes are quoted so the shell cannot glob them
	tail -n +2 "${CHR}" | tr '[:lower:]' '[:upper:]' >> "noMask/${C}"
	echo "done"
    done
    mkdir ooc
    ls noMask/chr*.fa > fa.list
    #	ooc file was created earlier into /cluster/bluearc/mm7/11.ooc
    #	and /san/sanvol1/scratch/mm7/11.ooc

    ssh pk
    mkdir /san/sanvol1/scratch/mm7/noMask
    cd /san/sanvol1/scratch/mm7/noMask
    time cp --verbose -p /cluster/data/mm7/noMask/chr*.fa .
    
    # allow blat to run politely in /tmp while it writes output, then
    # copy results to results file:

    mkdir /cluster/data/mm7/bed/bacends
    cd /cluster/data/mm7/bed/bacends

    cat << '_EOF_' > runBlat.sh
#!/bin/sh
# usage: runBlat.sh <path1> <path2> <root1> <root2> <result>
# Runs blat of <path1> against <path2> inside a private /tmp directory and
# then moves the resulting psl to <result>.
path1=$1
path2=$2
root1=$3
root2=$4
result=$5
work=/tmp/${root1}_${root2}
rm -fr "${work}"
mkdir "${work}"
# A subshell with cd replaces pushd/popd, which plain /bin/sh does not
# provide; if the cd fails blat is not run at all instead of writing its
# output into the wrong directory.
(
cd "${work}" || exit 1
/cluster/bin/x86_64/blat "${path1}" "${path2}" \
	-ooc=/san/sanvol1/scratch/mm7/11.ooc "${root1}.${root2}.psl"
)
rm -f "${result}"
mv "${work}/${root1}.${root2}.psl" "${result}"
rm -fr "${work}"
'_EOF_'
    # << emacs happy
    chmod +x runBlat.sh

    cat << '_EOF_' > template
#LOOP
./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << emacs happy

    ls -1S /san/sanvol1/scratch/mm7/cloneEnds/cloneEnds???.fa > bacEnds.lst
    mkdir bacEnds.out
    #	create a results directory for each query file, to avoid having all
    #	of the result files in a single directory
    foreach f (`cat bacEnds.lst`)
	set b = $f:t:r
	echo $b
	mkdir bacEnds.out/$b
    end

    ls -1S /san/sanvol1/scratch/mm7/noMask/chr*.fa > contig.lst
    gensub2 contig.lst bacEnds.lst template jobList
    para create jobList
    # 3920 jobs written to batch
    #	para try, check, push, etc ...
    #	XXX - STARTED - 2005-10-02 - Hiram

# Completed: 3920 of 3920 jobs
# CPU time in finished jobs:     683325s   11388.75m   189.81h    7.91d  0.022 y
# IO & Wait Time:                 51220s     853.67m    14.23h    0.59d  0.002 y
# Average job time:                 187s       3.12m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1388s      23.13m     0.39h    0.02d
# Submission to last job:         65899s    1098.32m    18.31h    0.76d

    ssh kkstore02
    cd /cluster/data/mm7/bed/bacends
    screen

    mkdir temp
    time pslSort dirs raw.psl temp bacEnds.out/* > pslSort.out 2>&1 &
    #	real    27m20.352s
    #	user    20m10.329s
    #	sys     1m55.287s

    time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
	raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
    #	real    8m15.671s
    #	user    7m18.229s
    #	sys     0m20.554s

    cp -p ~booch/clusterJobs/bacends/split.pl .
    cp -p ~booch/clusterJobs/bacends/header .
    time ./split.pl header < bacEnds.psl

    cp -p bacEnds.psl bacEnds.psl.save
    time pslSort dirs bacEnds.psl temp split
    #	~ 3 minutes

    # Copy files to final destination and remove
    mkdir /cluster/data/mm7/bacends
    cp -p bacEnds.psl /cluster/data/mm7/bacends

############################################################################
# BACEND PAIRS TRACK (DONE - 2005-10-03 - 2005-10-04 - Hiram)

    ssh kolossus
    cd /cluster/data/mm7/bacends

time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
	../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds

    # create header required by "rdb" tools
    echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
    echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header

    cat header bacEnds.pairs | \
	/cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairs.bed

    cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
	bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed

    /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
	bacEndPairsBad.bed >j1.out
    cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out
    cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl

    rm j1.out j2.out

    #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
    awk '{print $5}' bacEndPairs.bed | sort -u
    #	result should be the scores, no extraneous strings:
#	1000
#	300
#	375
#	500
#	750
    #	edit the file and fix it if it has a bad name.

    # load into database
    ssh hgwdev
    cd /cluster/data/mm7/bacends
    hgLoadBed -strict -notItemRgb mm7 bacEndPairs bacEndPairs.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    #	Loaded 231570 elements of size 11

    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed -strict -notItemRgb mm7 bacEndPairsBad bacEndPairsBad.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    #	Loaded 95099 elements of size 11

    # NOTE: truncates file to 0 if -nobin is used
    hgLoadPsl mm7 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 8397416 record(s), 0 row(s)
#	skipped, 1 warning(s) loading psl.tab
#	This takes about 40 minutes

    featureBits mm7 all_bacends
# 334161740 bases of 2583394090 (12.935%) in intersection
    featureBits mm6 all_bacends
# 336981828 bases of 2597150411 (12.975%) in intersection
    featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
    featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection

    featureBits mm7 bacEndPairs
# 2578837424 bases of 2583394090 (99.824%) in intersection
    featureBits mm6 bacEndPairs
# 2570768812 bases of 2597150411 (98.984%) in intersection
    featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
    featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection

    featureBits mm7 bacEndPairsBad
# 954662115 bases of 2583394090 (36.954%) in intersection
    featureBits mm6 bacEndPairsBad
# 1006314997 bases of 2597150411 (38.747%) in intersection
    featureBits mm5 bacEndPairsBad
# 541027882 bases of 2615483787 (20.686%) in intersection
    featureBits mm4 bacEndPairsBad
# 1074505863 bases of 2627444668 (40.895%) in intersection

############################################################################
# SGP GENES (DONE - 2006-01-18 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/bed/sgp
    cd /cluster/data/mm7/bed/sgp
    #	They don't do chrM
    for CHR in $(awk '{print $1}' ../../chrom.sizes | grep -v chrM)
    do
	#	No -O here: wget does not honor --timestamping when -O is given,
	#	and the default local file name is already ${CHR}.gtf / ${CHR}.prot
	wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmDec2005/SGP/humangp200405/${CHR}.gtf"
	wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmDec2005/SGP/humangp200405/${CHR}.prot"
    done

    # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
    for P in chr*.prot
    do
	sed -e "s/^>\(.*\)/>\1.1/" ${P}
    done > sgpPep.fa

    ssh hgwdev
    cd /cluster/data/mm7/bed/sgp
    ldHgGene -gtf -genePredExt mm7 sgpGene chr*.gtf
    hgPepPred mm7 generic sgpPep sgpPep.fa
    featureBits mm7 -enrichment refGene:CDS sgpGene
# refGene:CDS 1.033%, sgpGene 1.441%, both 0.888%, cover 85.94%, enrich 59.63

############################################################################
# RE-BUILD KNOWN GENES RELATED TABLES for mm7 (TBD)

# First build protein databases, sp050415 and proteins050415
# See makeProteins050415.doc for details.
# Please note that the protein and displayId tables in sp050415 have data of variant splice proteins.

# Create working subdirectories and temporary databases

  ssh hgwdev
  cd /cluster/store10/kg
  mkdir kgMm6B  
  ln -s /cluster/store10/kg/kgMm6B /cluster/store6/kgDB/bed/kgMm6B
  ln -s /cluster/store10/kg/kgMm6B /cluster/data/mm7/bed/kgMm6B

  hgsql mm7 -e "create database kgMm6B"   
  hgsql mm7 -e "create database kgMm6BTemp"

  mkdir /cluster/bluearc/kgDB/kgMm6B
  mkdir /cluster/bluearc/kgDB/kgMm6B/protBlat
  ln -s /cluster/bluearc/kgDB/kgMm6B/protBlat /cluster/store10/kg/kgMm6B/protBlat
  cd /cluster/store10/kg/kgMm6B/protBlat

# Get all mouse protein sequences (taxon 10090)

  hgsql -N sp050415 -e \
  'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="10090" and acc=accession' \
  |awk '{print ">" $1;print $2}' >mm7Prot.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/mm7/bed/kgMm6B/protBlat
  mkdir prot
  faSplit sequence mm7Prot.fa 1000 prot/prot
  ls /cluster/bluearc/kgDB/kgMm6B/protBlat/prot/* > prot.lis

  ssh hgwdev
  cd /cluster/data/mm7/bed/kgMm6B/protBlat
  hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis
  exit
  

  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm6B/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para try
  para check
  para push
  para check ...
# Completed: 31386 of 39600 jobs
# Crashed: 8214 jobs
# CPU time in finished jobs:   32377544s  539625.74m  8993.76h  374.74d  1.027 y
# IO & Wait Time:                727341s   12122.34m   202.04h    8.42d  0.023 y
# Average job time:                1055s      17.58m     0.29h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           34182s     569.70m     9.49h    0.40d
# Submission to last job:         57659s     960.98m    16.02h    0.67d

# Many output .psl files are empty, these warnings are OK.  
# Check to see if there is any other error type.

  para problems |grep empty|wc
#   8214   24642  642357

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/mm7/bed/kgMm6B/protBlat

   mkdir result2
   mkdir result3

   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlat.psl
   hgLoadPsl mm7 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 82296 record(s), 0 row(s) skipped, 750 warning(s) loading psl.tab
# Looked into the cause of the warnings before and found that they are due to
# qBaseInsert and tBaseInsert having negative values, probably because this
# is a protein alignment.

# Remember to remove result2 and result3 when KG is built and validated.

   cd /cluster/data/mm7/bed/kgMm6B

# create all_mrna.psl and tight_mrna.psl
   hgsql mm7 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null
# Processed 194640 alignments

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.stat

   overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl  protBlat/protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgMm6BTemp < ~/src/hg/lib/spMrna.sql
   hgsql kgMm6BTemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgMm6BTemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   cd /cluster/data/mm7/bed/kgMm6B
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm6BTemp DB.

   hgFaToTab mrna.fa mrnaSeq.tab

   hgsql kgMm6BTemp -e 'drop table mrnaSeq'
   hgsql kgMm6BTemp <~/src/hg/lib/mrnaSeq.sql
   hgsql kgMm6BTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
   rm mrnaSeq.tab

# Prepare files for cluster run
   ~/src/hg/protein/KG2.sh kgMm6B mm7 050415

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG4.sh kgMm6B mm7 050415

# Collect cluster run results
   cd kgBestMrna
   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis

# Load BLAT results into temp DB.
   hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgMm6BTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgMm6BTemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm6BTemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protMrna.out > j1.tmp
   cut -f 32-42 ../protMrna.out > j2.tmp
   cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

# Prepare refGene and all_mrna gp files.

   cd ..
   hgsql mm7 -N -e 'select * from refGene' >ref.gp

   hgsql mm7 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgMm6BTemp -e 'drop table spRef'
   hgsql kgMm6BTemp <~/src/hg/lib/spRef.sql
   hgsql kgMm6BTemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgMm6B mm7 050415
   ~/src/hg/protein/KGRef3.sh kgMm6B mm7 050415

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis

   hgsql kgMm6BTemp -e 'drop table protRefBlat'
   hgsql kgMm6BTemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgMm6BTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgMm6BTemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries
   cd /cluster/data/mm7/bed/kgMm6B
   cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir \
   /cluster/data/mm7/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate0.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm6BTemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgMm6BTemp mm7 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgMm6BTemp -e  'drop table kgCandidate'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgMm6BTemp -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab

   hgsql kgMm6BTemp -e  'drop table kgCandidateX'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

   kgResultBestMrna2 050415 kgMm6BTemp mm7|sort -u >protMrnaBlatScore.tab
   kgResultBestRef2  050415 kgMm6BTemp mm7|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm6BTemp -e 'drop table protMrnaScore'
   hgsql kgMm6BTemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm6BTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm6BTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgMm6BTemp kgCandidateX jY.tmp
   cat jY.tmp |sort -u >kgCandidateY.tab
   rm jY.tmp
   hgsql kgMm6BTemp -e  'drop table kgCandidateY'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgMm6BTemp kgCandidateZ.tab
   hgsql kgMm6BTemp -e  'drop table kgCandidateZ'
   hgsql kgMm6BTemp < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgMm6BTemp -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgMm6BTemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgMm6BTemp mm7 proteins050415 kg4.tmp dupSpMrna.tmp
   sort -u dupSpMrna.tmp >dupSpMrna.tab
   hgsql mm7 -e  'drop table dupSpMrna'
   hgsql mm7 < ~/src/hg/lib/dupSpMrna.sql
   hgsql mm7 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Sort KG genes to make the kg4.gp table file.
   ~/kent/src/hg/protein/sortKg.pl kg4.tmp >kg4.gp

   hgsql kgMm6BTemp -e  'drop table knownGene'
   hgsql kgMm6BTemp < ~/src/hg/lib/knownGene.sql
   hgsql kgMm6BTemp -e  'load data local infile "kg4.gp" into table knownGene'

   hgsql mm7 -e  'drop table kg4'
   hgsql mm7 < ~/src/hg/lib/kg4.sql
   hgsql mm7 -e  'load data local infile "kg4.gp" into table kg4'

# Perform analysis before loading kg4 table data to mm7.knownGene table.

# Load data into mm7 knownGene table.
   hgsql mm7 -e  'drop table knownGene'
   hgsql mm7 < ~/src/hg/lib/knownGene.sql
   hgsql mm7 -e  'load data local infile "kg4.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.

   kgPepMrna kgMm6BTemp mm7 050415
   hgsql mm7 -e  'drop table knownGeneMrna'
   hgsql mm7 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql mm7 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql mm7 -e  'drop table knownGenePep'
   hgsql mm7 < ~/src/hg/lib/knownGenePep.sql
   hgsql mm7 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'


# Build kgXref table

   kgXref2 kgMm6BTemp 050415 mm7

   hgsql mm7 -e  'drop table kgXref'
   hgsql mm7 < ~/src/hg/lib/kgXref.sql
   hgsql mm7 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql mm7 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab

   hgsql mm7 -e  'drop table spMrna'
   hgsql mm7 <~/src/hg/lib/spMrna.sql
   hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build mrnaRefseq table

   # Start a fresh dated entrez directory and repoint /cluster/data/entrez at it
   cd /cluster/store10/entrez
   mkdir 050601
   rm /cluster/data/entrez
   ln -s /cluster/store10/entrez/050601 /cluster/data/entrez
   cd /cluster/data/entrez

   # --timestamping is a single option; written as "-- timestamp" wget stops
   # option parsing and tries to fetch "timestamp" as a URL
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm7 -e 'drop table mrnaRefseq'
   hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Build kgProtMap table

    ~/src/hg/protein/kgProtMap2.sh kgMm6B mm7 050415

# Update and clean up kgResultBestMrna2.c and then check it in.

# Build alias tables.		
#	kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
#	proteins050415.hugo.withdraws, mm7.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   cd /cluster/store10/kg/kgMm6B
   mkdir alias
   cd alias
   kgAliasM mm7 proteins050415

#	kgAliasKgXref reads from mm7.knownGene.proteinID,
#	mm7.knownGene.name, mm7.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm7

#	kgAliasRefseq reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm7

   hgsql sp050415 -N -e 'select name,gene.val from mm7.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql mm7 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql mm7 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm7 
   hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.knownGene.alignID,
#	proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
#	to create kgProtAlias.tab

   kgProtAlias mm7 050415

   hgsql mm7 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql mm7 -N -e \
   'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql mm7 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql mm7 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm7 -e "drop table kgProtAlias;"
    hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table

    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab
    rm j.tmp

    hgsql mm7 -e 'drop table kgSpAlias';
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES (TBD)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/mm7/bed
    mkdir rnaStruct.2005-05-31
    rm rnaStruct
    ln -s rnaStruct.2005-05-31 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm7 knownGene utr3 utr3/utr.fa
    utrFa mm7 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm7/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 25085 of 25085 jobs
# CPU time in finished jobs:     553473s    9224.55m   153.74h    6.41d  0.018 y
# IO & Wait Time:                 66725s    1112.08m    18.53h    0.77d  0.002 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6003s     100.05m     1.67h    0.07d
# Submission to last job:          6524s     108.73m     1.81h    0.08d
# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 23507 of 23507 jobs
# CPU time in finished jobs:      98380s    1639.66m    27.33h    1.14d  0.003 y
# IO & Wait Time:                 60713s    1011.89m    16.86h    0.70d  0.002 y
# Average job time:                   7s       0.11m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            7754s     129.23m     2.15h    0.09d
# Submission to last job:          7840s     130.67m     2.18h    0.09d

# Load database
    ssh hgwdev
    cd /cluster/data/mm7/bed/rnaStruct/utr5
    hgLoadRnaFold mm7 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm7 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  (TBD)
#	KGpath.sh is run from a kegg/ subdirectory of the KG build directory.
#	The keggMapDesc.tab and keggPathway.tab files loaded below are read
#	from that same directory (presumably written there by KGpath.sh).
#	The existing tables are dropped and re-created from the .sql
#	definitions before loading.

   ssh hgwdev
   cd /cluster/store10/kg/kgMm6B
   mkdir kegg
   cd kegg

   ~/src/hg/protein/KGpath.sh kgMm6B mm7 050415

   hgsql mm7 -e "drop table keggMapDesc"
   hgsql mm7 -e "drop table keggPathway"
   hgsql mm7 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm7 <~/src/hg/lib/keggPathway.sql
   hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables

   cd ..
   ~/src/hg/protein/KGcgap.sh kgMm6B mm7 050415
   hgsql mm7 -e "drop table cgapAlias"
   hgsql mm7 -e "drop table cgapBiocDesc"
   hgsql mm7 -e "drop table cgapBiocPathway"
   hgsql mm7 <~/src/hg/lib/cgapAlias.sql
   hgsql mm7 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql mm7 <~/src/hg/lib/cgapBiocPathway.sql
   hgsql mm7 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
   hgsql mm7 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
   hgsql mm7 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'


######## RE-BUILD GENE SORTER TABLES #######  (TBD)
# These are instructions for building the
# Gene Sorter.  Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm7 knownGene knownIsoforms knownCanonical
# Got 17843 clusters, from 27131 genes in 40 chromosomes
#	featureBits mm7 knownCanonical
# 	764263619 bases of 2597150411 (29.427%) in intersection
#	featureBits mm5 knownCanonical
#	853516995 bases of 2615483787 (32.633%) in intersection
#	featureBits mm4 knownCanonical
#	840021165 bases of 2627444668 (31.971%) in intersection
#	featureBits mm3 knownCanonical
#	825943052 bases of 2505900260 (32.960%) in intersection

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev
mkdir -p  /cluster/data/mm7/bed/geneSorter/blastp
cd /cluster/data/mm7/bed/geneSorter/blastp
pepPredToFa mm7 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known

# Copy over database to bluearc scratch
mkdir /cluster/panasas/home/store/mm7/blastp
cp -p /cluster/data/mm7/bed/geneSorter/blastp/known.* /cluster/panasas/home/store/mm7/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm7/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm7/bed/geneSorter/blastp/self
cd /cluster/data/mm7/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm7/blastp/known \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
    # << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      58630s     977.16m    16.29h    0.68d  0.002 y
# IO & Wait Time:                 39839s     663.99m    11.07h    0.46d  0.001 y
# Average job time:                  13s       0.21m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             116s       1.93m     0.03h    0.00d
# Submission to last job:           188s       3.13m     0.05h    0.00d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/mm7/bed/geneSorter/blastp/self/run/out
hgLoadBlastTab mm7 knownBlastTab *.tab
# Scanning through 7729 files
# Loading database with 3391069 rows

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes an hour.)
# (TBD)

hgMapToGene mm7 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm7 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
# Got 20114 unique elements in hgFixed.gnfMouseAtlas2MedianRatio

# Create table that maps between known genes and RefSeq
hgMapToGene mm7 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm7 affyU74  knownGene knownToU74
hgMapToGene mm7 affyMOE430 knownGene knownToMOE430
hgMapToGene mm7 affyMOE430 -prefix=A: knownGene knownToMOE430A

# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm7/bed/rinnSex
cd !$
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
../affyMOE430/affyMOE430.psl
hgLoadBed -strict mm7 rinnSex rinnSex.bed

# Format and load the GNF data
mkdir /cluster/data/mm7/bed/affyGnf95
cd /cluster/data/mm7/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm7 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  These commands take about 15 minutes each
#	The affyGnfU74?Exps arguments appear to be unused in 
# hgExpDistance
cd /cluster/data/mm7/bed/geneSorter
hgExpDistance mm7 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
# Got 10157 unique elements in affyGnfU74A
hgExpDistance mm7 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
# Got 6076 unique elements in affyGnfU74B
hgExpDistance mm7 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
# Got 1793 unique elements in affyGnfU74C

# C.ELEGANS BLASTP FOR GENE SORTER 
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh eieio
    mkdir /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version.  Then use that in place of 142 below.
    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
    ssh kkr1u00
    if (-e /iscratch/i/ce2/blastp) then
      rm -r /iscratch/i/ce2/blastp
    endif
    mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/ce2/run/out
    cd /cluster/data/mm7/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ... 
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      40061s     667.69m    11.13h    0.46d  0.001 y
# IO & Wait Time:                 21049s     350.81m     5.85h    0.24d  0.001 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              33s       0.55m     0.01h    0.00d
# Submission to last job:           134s       2.23m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/ce2/run/out
hgLoadBlastTab mm7 ceBlastTab -maxPer=1 *.tab

# HUMAN BLASTP FOR GENE SORTER (TBD)
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/hg17/bed/blastp
    cd /cluster/data/hg17/bed/blastp
    pepPredToFa hg17 knownGenePep known.faa
    formatdb -i known.faa -t known -n known

    ssh kkr1u00
    if (-e /iscratch/i/hg17/blastp) then
      rm -r /iscratch/i/hg17/blastp
    endif
    mkdir -p /iscratch/i/hg17/blastp
    cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/hg17/run/out
    cd /cluster/data/mm7/bed/blastp/hg17/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      81526s    1358.76m    22.65h    0.94d  0.003 y
# IO & Wait Time:                 23670s     394.51m     6.58h    0.27d  0.001 y
# Average job time:                  14s       0.23m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              84s       1.40m     0.02h    0.00d
# Submission to last job:           185s       3.08m     0.05h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/hg17/run/out
hgLoadBlastTab mm7 hgBlastTab -maxPer=1 *.tab

# ZEBRAFISH BLASTP FOR GENE SORTER 
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # The below is done by hg17, that section from makeHg17.doc is copied here.
    ssh kkstore
    mkdir /cluster/data/danRer2/bed/blastp
    cd /cluster/data/danRer2/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz
    
    zcat Dan*.pep.fa.gz > ensembl.faa
    formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/danRer2/blastp) then
      rm -r /iscratch/i/danRer2/blastp
    endif
    mkdir -p /iscratch/i/danRer2/blastp
    cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
    iSync

# The above is copied from makeHg17.doc.

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/danRer2/run/out
    cd /cluster/data/mm7/bed/blastp/danRer2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...

# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      72894s    1214.89m    20.25h    0.84d  0.002 y
# IO & Wait Time:                 21284s     354.74m     5.91h    0.25d  0.001 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              73s       1.22m     0.02h    0.00d
# Submission to last job:           176s       2.93m     0.05h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/danRer2/run/out
hgLoadBlastTab mm7 drBlastTab -maxPer=1 *.tab

# YEAST BLASTP FOR GENE SORTER 
    # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on 
    # RefSeq.  First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
    zcat orf_trans.fasta.gz > sgdPep.faa
    formatdb -i sgdPep.faa -t sgdPep -n sgdPep

    ssh kkr1u00
    # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
    # fortunately we won't be looking for homologs there.  :)
    if (-e /iscratch/i/sacCer1/blastp) then
      rm -r /iscratch/i/sacCer1/blastp
    endif
    mkdir -p /iscratch/i/sacCer1/blastp
    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/sacCer1/run/out
    cd /cluster/data/mm7/bed/blastp/sacCer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      11663s     194.38m     3.24h    0.13d  0.000 y
# IO & Wait Time:                 20479s     341.32m     5.69h    0.24d  0.001 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              11s       0.18m     0.00h    0.00d
# Submission to last job:           143s       2.38m     0.04h    0.00d
# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/sacCer1/run/out
hgLoadBlastTab mm7 scBlastTab -maxPer=1 *.tab

# DM1 BLASTP FOR GENE SORTER (TBD)
    # Make Drosophila melanogaster ortholog column using blastp on FlyBase.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    # This is already done, see makeMm3.doc for procedure
    # the directory: /cluster/bluearc/dm1/blastp should have data

    # ssh kkr1u00
    # if (-e /iscratch/i/dm1/blastp) then
    #   rm -r /iscratch/i/dm1/blastp
    # endif
    # mkdir -p /iscratch/i/dm1/blastp
    # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
    # iSync
    # THE ABOVE IS ALREADY DONE BY ANGIE

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/dm1/run/out
    cd /cluster/data/mm7/bed/blastp/dm1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      45146s     752.44m    12.54h    0.52d  0.001 y
# IO & Wait Time:                 21289s     354.81m     5.91h    0.25d  0.001 y
# Average job time:                   9s       0.14m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              43s       0.72m     0.01h    0.00d
# Submission to last job:           139s       2.32m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/dm1/run/out
hgLoadBlastTab mm7 dmBlastTab -maxPer=1 *.tab

# Create table that maps between known genes and LocusLink 
cd /cluster/data/mm7/bed/geneSorter
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm7 > refToLl.txt
hgMapToGene mm7 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#       row count is 23074  

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam
# row count is 22525 

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene mm7 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create table that maps between known genes and visiGene database 
    knownToVisiGene mm7

# ENABLE GENE SORTER FOR mm7 IN HGCENTRALTEST (already done during first mm7 KG build)
    echo "update dbDb set hgNearOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

# RAT BLASTP FOR GENE SORTER 
    # Make RAT ortholog column using blastp on RAT known genes.
    # First make RAT protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/rn3/bed/blastp
    cd /cluster/data/rn3/bed/blastp
    pepPredToFa rn3 knownGenePep known.faa
    formatdb -i known.faa -t known -n known

    ssh kkr1u00
    if (-e /iscratch/i/rn3/blastp) then
      rm -r /iscratch/i/rn3/blastp
    endif
    mkdir -p /iscratch/i/rn3/blastp
    cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
    iSync
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/rn3/run/out
    cd /cluster/data/mm7/bed/blastp/rn3/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7729 of 7729 jobs
# CPU time in finished jobs:      17126s     285.44m     4.76h    0.20d  0.001 y
# IO & Wait Time:                 20493s     341.54m     5.69h    0.24d  0.001 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              24s       0.40m     0.01h    0.00d
# Submission to last job:           131s       2.18m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/rn3/run/out
hgLoadBlastTab mm7 rnBlastTab -maxPer=1 *.tab

# END OF GENE SORTER STUFF
#############################################################################

### MM7 PROTEOME BROWSER TABLES RE-BUILD ####  (TBD)
# These are instructions for re-building tables 
# needed for the Proteome Browser to be used with mm7.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 050415.

# Create the working directory
   ssh hgwdev
   mkdir /cluster/data/mm7/bed/pb.2005-06-01
   cd /cluster/data/mm7/bed
   rm pb
   ln -s /cluster/data/mm7/bed/pb.2005-06-01 pb
   cd pb

# Define pep* tables in mm7 DB

   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# delete from the following tables (previously built):

  hgsql mm7
  delete from  pepCCntDist ;
  delete from  pepExonCntDist ;
  delete from  pepHydroDist ;
  delete from  pepIPCntDist ;
  delete from  pepMolWtDist ;
  delete from  pepMwAa ;
  delete from  pepPi ;
  delete from  pepPiDist ;
  delete from  pepPred ;
  delete from  pepResDist ;
  delete from pbAnomLimit;
  delete from pbResAvgStd;
  delete from pbStamp; 
  quit; 

# Build the pepMwAa table

  hgsql proteins050415 -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp050415 pepPi.tab

  hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;'


# Calculate and load pep distributions

  pbCalDist sp050415 proteins050415 10090 mm7 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm7

    load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm7.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm7.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp050415 10090 mm7

# Create pbAnomLimit and pbResAvgStd tables

#  hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql
#  hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;'
   hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB 

    cd /cluster/data/mm7/bed/pb
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab
    rm j.tmp

    hgsql mm7 -e 'drop table kgSpAlias';
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'
    gzip mm7.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm7 < ~/src/hg/lib/pbStamp.sql
  hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm7 -e 'delete from pbStamp'
  hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp'

# ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST (already done previously)
    echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for mm7, then
# notify QA for formal review.

# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB (done previously):

   update dbDb set defaultPos="chrX:87947304-87959012" where name="mm7";

# Create QA Push Queue entry with the following tables:

 ceBlastTab 
 cgapAlias 
 cgapBiocDesc 
 cgapBiocPathway 
 dmBlastTab 
 drBlastTab 
 dupSpMrna 
 foldUtr3 
 foldUtr5 
 gnfAtlas2Distance 
 hgBlastTab 
 keggMapDesc 
 keggPathway 
 kgAlias 
 kgProtAlias 
 kgProtMap 
 kgXref 
 knownBlastTab 
 knownCanonical 
 knownGene 
 knownGeneMrna 
 knownGenePep 
 knownIsoforms 
 knownToGenePix 
 knownToGnf1m 
 knownToGnfAtlas2 
 knownToLocusLink 
 knownToMOE430 
 knownToMOE430A 
 knownToPfam 
 knownToRefSeq 
 knownToU74 
 knownToXmBest 
 rinnSex 
 rnBlastTab 
 scBlastTab 
 spMrna

# END OF mm7 KG/GS/PB RE-BUILD. 6/1/05 Fan.
#####################################################################

####################################################################################
# RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (TBD)
   ssh hgwdev 
   cd /cluster/store10/kg/kgMm6B
   mkdir try2
   mv * try2

   hgsql mm7 -e 'create database kgMm6BTempTry2'

   hgsql kgMm6BTempTry2 -e 'drop table kgCandidate0'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate0.sql 
   hgsql kgMm6BTempTry2 -e  'load data local infile "try2/kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm6BTempTry2 -e 'drop table geneCheck'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "try2/kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck kgMm6BTempTry2 mm7 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidate'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgMm6BTempTry2 -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field.
#	Column 11 of kgCandidate.tab is moved to the front in place of
#	column 1, and columns 2-10 follow unchanged (presumably column 11 is
#	alignID and column 1 is name -- confirm against kgCandidate.sql).
#	j1.tmp and j2.tmp are scratch files and are removed afterwards.
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab
   rm j1.tmp j2.tmp
   
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidateX'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

#   kgResultBestMrna2 050415 kgMm6BTempTry2 mm7|sort -u >protMrnaBlatScore.tab
#   kgResultBestRef2  050415 kgMm6BTempTry2 mm7|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
#   cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm6BTempTry2 -e 'drop table protMrnaScore'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm6BTempTry2 -e 'load data local infile "try2/protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm6BTempTry2 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'


# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgMm6BTempTry2 kgCandidateX jY.tmp
   cat jY.tmp |sort -u >kgCandidateY.tab
#   rm jY.tmp
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidateY'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgMm6BTempTry2 kgCandidateZ.tab
   hgsql kgMm6BTempTry2 -e  'drop table kgCandidateZ'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgMm6BTempTry2 -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgMm6BTempTry2 mm7 proteins050415 kgTry2.tmp dupSpMrna.tmp

# Clean up the first column of kgTry2.tmp, producing kgTry2B.tmp:
#   - lines containing "NM_" are set aside in jNM and appended unchanged;
#   - for all other lines, sed breaks field 1 right after its first "_"
#     (relies on sed turning \n in the replacement into a newline, as GNU
#     sed does) and "grep -v _" discards the piece that still carries the
#     underscore, so only the text after the first "_" is kept.
#     NOTE(review): this assumes the remainder contains no further "_";
#     otherwise both pieces are dropped and paste misaligns the rows.
#   - the trimmed names are pasted back in front of fields 2-12.
   cat kgTry2.tmp | grep NM_ > jNM
   cat kgTry2.tmp | grep -v NM_ >jnoNM
   cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1
   cut -f 2-12  jnoNM >jnoNM2
   paste jnoNM1 jnoNM2 > kgTry2B.tmp
   cat jNM >> kgTry2B.tmp

   sort -u dupSpMrna.tmp >dupSpMrna.tab
   hgsql mm7 -e  'drop table dupSpMrna'
   hgsql mm7 < ~/src/hg/lib/dupSpMrna.sql
   hgsql mm7 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Add entries in the put back list
# Obtain the mouse put back list from Mark and save it as kgPutBack.tab

   hgsql kgMm6BTempTry2 -e  'drop table kgPutBack'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/kgPutBack.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgPutBack.tab" into table kgPutBack'

   kgPutBack kgMm6BTempTry2 mm7 proteins050415 kgPutBack kgPutBack.gp

# Sort KG genes to make the kgTry2.gp table file.

   cat kgTry2B.tmp kgPutBack.gp >kgTry2C.tmp
  ~/kent/src/hg/protein/sortKg.pl kgTry2C.tmp >kgTry2.gp

# Manually edit to correct one line problem of O75438_BC009691

   hgsql kgMm6BTempTry2 -e  'drop table knownGene'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/knownGene.sql
   hgsql kgMm6BTempTry2 -e  'load data local infile "kgTry2.gp" into table knownGene'

# Load data into mm7 knownGene table.
   hgsql mm7 -e  'drop table knownGene'
   hgsql mm7 < ~/src/hg/lib/knownGene.sql
   hgsql mm7 -e  'load data local infile "kgTry2.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.
   hgsql kgMm6BTempTry2 -e  'drop table mrnaSeq'
   hgsql kgMm6BTempTry2 < ~/src/hg/lib/mrnaSeq.sql
#  hgsql kgMm6BTempTry2 -e  'load data local infile "try2/mrnaSeq.tab" into table mrnaSeq'
   hgsql kgMm6BTempTry2 -e  'load data local infile "/cluster/store10/kg/kgMm6A/mrnaSeq.tab" into table mrnaSeq'
   kgPepMrna kgMm6BTempTry2 mm7 050415

   hgsql mm7 -e  'drop table knownGeneMrna'
   hgsql mm7 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql mm7 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql mm7 -e  'drop table knownGenePep'
   hgsql mm7 < ~/src/hg/lib/knownGenePep.sql
   hgsql mm7 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'


# Build kgXref table

   kgXref2 kgMm6BTempTry2 050415 mm7

   hgsql mm7 -e  'drop table kgXref'
   hgsql mm7 < ~/src/hg/lib/kgXref.sql
   hgsql mm7 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build kgProtMap table

    ~/src/hg/protein/kgProtMap2.sh kgMm6B mm7 050415

# Update and clean up kgResultBestMrna2.c and then check it in.

# Build spMrna table
#	kgSpMrna.tab holds the unique (name, proteinID) pairs from knownGene;
#	-N suppresses the column-name header so only data rows are written.
#	The sort output is redirected straight into the file.  A "|" placed
#	before the ">" would pipe it into an empty command instead, leaving
#	kgSpMrna.tab empty under sh/bash (csh rejects it as a null command).

   hgsql mm7 -N -e 'select name, proteinID from knownGene' |sort -u >kgSpMrna.tab

#	Re-create the table from its .sql definition and load the new rows.
   hgsql mm7 -e  'drop table spMrna'
   hgsql mm7 <~/src/hg/lib/spMrna.sql
   hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build mrnaRefseq table

   cd /cluster/store10/entrez
   mkdir 050601
   rm /cluster/data/entrez
   ln -s /cluster/store10/entrez/050601 /cluster/data/entrez
   cd /cluster/data/entrez

   # Fetch the three Entrez Gene dump files and uncompress them in place.
   # The option has to be written as the single word "--timestamping":
   # a bare "--" ends option parsing, which would make wget treat the
   # following word "timestamp" as a URL to download.
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm7 -e 'drop table mrnaRefseq'
   hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Build alias tables.		
#	kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
#	proteins050415.hugo.withdraws, mm7.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   cd /cluster/store10/kg/kgMm6B
   mkdir alias
   cd alias
   kgAliasM mm7 proteins050415

#	kgAliasKgXref reads from mm7.knownGene.proteinID,
#	mm7.knownGene.name, mm7.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm7

#	kgAliasRefseq reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm7

   hgsql sp050415 -N -e 'select name,gene.val from mm7.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql mm7 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql mm7 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm7 
   hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.knownGene.alignID,
#	proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
#	to create kgProtAlias.tab

   kgProtAlias mm7 050415

   hgsql mm7 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql mm7 -N -e \
   'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql mm7 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql mm7 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm7 -e "drop table kgProtAlias;"
    hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table
# Join kgXref against both alias tables (kgAlias and kgProtAlias), merge the
# two result sets without duplicates, and load them as kgSpAlias.

    # -N suppresses the column-name header row, so the rows do not need to be
    # filtered afterwards with grep -v 'kgID' (which would also discard any
    # data row that happened to contain the text "kgID").
    hgsql mm7 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp >mm7.kgSpAlias.tab
    rm j.tmp

    # Recreate the table from its schema file and load the merged rows.
    hgsql mm7 -e 'drop table kgSpAlias'
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES 
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/mm7/bed
    mkdir rnaStruct.2005-06-05
    rm rnaStruct
    ln -s rnaStruct.2005-06-05 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm7 knownGene utr3 utr3/utr.fa
    utrFa mm7 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm7/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 25133 of 25133 jobs
# CPU time in finished jobs:     554915s    9248.58m   154.14h    6.42d  0.018 y
# IO & Wait Time:                 67099s    1118.32m    18.64h    0.78d  0.002 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6713s     111.88m     1.86h    0.08d
# Submission to last job:          7435s     123.92m     2.07h    0.09d

    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 23548 of 23548 jobs
# CPU time in finished jobs:     102308s    1705.14m    28.42h    1.18d  0.003 y
# IO & Wait Time:                 64370s    1072.83m    17.88h    0.75d  0.002 y
# Average job time:                   7s       0.12m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            8211s     136.85m     2.28h    0.10d
# Submission to last job:          8311s     138.52m     2.31h    0.10d

# Load database
    ssh hgwdev
    cd /cluster/data/mm7/bed/rnaStruct/utr5
    hgLoadRnaFold mm7 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm7 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  
# Work in a kegg/ subdirectory of the known genes build directory.
   ssh hgwdev
   cd /cluster/store10/kg/kgMm6B
   # "md" only works where a personal alias for it exists; spell out mkdir.
   mkdir kegg
   cd kegg

   ~/src/hg/protein/KGpath.sh kgMm6B mm7 050415

   hgsql mm7 -e "drop table keggMapDesc"
   hgsql mm7 -e "drop table keggPathway"
   hgsql mm7 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm7 <~/src/hg/lib/keggPathway.sql
   hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables

   cd ..
   ~/src/hg/protein/KGcgap.sh kgMm6B mm7 050415
   hgsql mm7 -e "drop table cgapAlias"
   hgsql mm7 -e "drop table cgapBiocDesc"
   hgsql mm7 -e "drop table cgapBiocPathway"
   hgsql mm7 <~/src/hg/lib/cgapAlias.sql
   hgsql mm7 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql mm7 <~/src/hg/lib/cgapBiocPathway.sql
   hgsql mm7 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
   hgsql mm7 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
   hgsql mm7 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'

 
### MM7 PROTEOME BROWSER TABLES RE-BUILD ####  (TBD)
# These are instructions for re-building tables 
# needed for the Proteome Browser to be used with mm7.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 050415.

# Create the working directory
   ssh hgwdev
   mkdir /cluster/data/mm7/bed/pb.2005-06-06
   cd /cluster/data/mm7/bed
   rm pb
   ln -s /cluster/data/mm7/bed/pb.2005-06-06 pb
   cd pb

# Define pep* tables in mm7 DB

#   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# delete from the following tables (previously built):

  hgsql mm7
  delete from  pepCCntDist ;
  delete from  pepExonCntDist ;
  delete from  pepHydroDist ;
  delete from  pepIPCntDist ;
  delete from  pepMolWtDist ;
  delete from  pepMwAa ;
  delete from  pepPi ;
  delete from  pepPiDist ;
  delete from  pepPred ;
  delete from  pepResDist ;
  delete from pbAnomLimit;
  delete from pbResAvgStd;
  delete from pbStamp; 
  quit; 

# Build the pepMwAa table

  hgsql proteins050415 -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins050415 -e "select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp050415 pepPi.tab

  hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;'


# Calculate and load pep distributions

  pbCalDist sp050415 proteins050415 10090 mm7 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm7

    load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm7.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm7.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp050415 10090 mm7

# Create pbAnomLimit and pbResAvgStd tables

#  hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql
#  hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;'
   hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB 
# Join kgXref against both alias tables (kgAlias and kgProtAlias), merge the
# two result sets without duplicates, reload kgSpAlias from them, and keep a
# compressed copy of the tab file in the pb directory.

    cd /cluster/data/mm7/bed/pb
    # -N suppresses the column-name header row, so the rows do not need to be
    # filtered afterwards with grep -v 'kgID' (which would also discard any
    # data row that happened to contain the text "kgID").
    hgsql mm7 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp >mm7.kgSpAlias.tab
    rm j.tmp

    # Recreate the table from its schema file and load the merged rows.
    hgsql mm7 -e 'drop table kgSpAlias'
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'
    gzip mm7.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm7 < ~/src/hg/lib/pbStamp.sql
  hgsql mm5 -N -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm7 -e 'delete from pbStamp'
  hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp'

# ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST (already done previously)
    echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke Proteome Browser and adjust various drawing parameters
#  (mostly the ymax of each stamp) if necessary, by updating the
#  pbStamp.tab file and then deleting and reloading the pbStamp table.

# Perform preliminary review of Proteome Browser for mm7, then
# notify QA for formal review.

# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB (done previously):

   update dbDb set defaultPos="chrX:87947304-87959012" where name="mm7";

# Create QA Push Queue entry with the following tables:

 ceBlastTab 
 cgapAlias 
 cgapBiocDesc 
 cgapBiocPathway 
 dmBlastTab 
 drBlastTab 
 dupSpMrna 
 foldUtr3 
 foldUtr5 
 gnfAtlas2Distance 
 hgBlastTab 
 keggMapDesc 
 keggPathway 
 kgAlias 
 kgProtAlias 
 kgProtMap 
 kgXref 
 knownBlastTab 
 knownCanonical 
 knownGene 
 knownGeneMrna 
 knownGenePep 
 knownIsoforms 
 knownToGenePix 
 knownToGnf1m 
 knownToGnfAtlas2 
 knownToLocusLink 
 knownToMOE430 
 knownToMOE430A 
 knownToPfam 
 knownToRefSeq 
 knownToU74 
 knownToXmBest 
 rinnSex 
 rnBlastTab 
 scBlastTab 
 spMrna

# END OF mm7 KG/GS/PB RE-BUILD. 6/6/05 Fan.
#####################################################################

## NIA Mouse Gene Index - (TBD)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    # Download the NIA Mouse Gene Index alignments and sequences, load the
    # alignments as the NIAGene psl table and register the sequences via /gbdb.
    ssh hgwdev 
    mkdir -p /cluster/data/mm7/bed/NIAGene
    cd /cluster/data/mm7/bed/NIAGene
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz
    # The download is gzipped; uncompress it before cut reads T-psl.txt.
    gzip -d T-psl.txt.gz

    # Keep only the first 21 columns for hgLoadPsl.
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm7 NIAGene.tab

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz
    gzip -d T-fasta.fa.gz
    
    # -p: do not fail if the /gbdb directory is already there.
    mkdir -p /gbdb/mm7/NIAGene
    ln -s /cluster/data/mm7/bed/NIAGene/T-fasta.fa /gbdb/mm7/NIAGene/T-fasta.fa
    
    hgLoadSeq mm7 /gbdb/mm7/NIAGene/T-fasta.fa

#   Create/edit/check in NIAGene.html and trackDb.ra under
#
#       kent/src/hg/makeDb/trackDb/mouse/mm7

# Update mrnaRefseq table (TBD)
# The old table contains non-mouse mrna/RefSeqs.
# The new table contains only mouse mrna/RefSeq and RefSeq/RefSeq.

# First build entrez DB tables, see the section on mrnaRefseq earlier 
# for details.

    ssh hgwdev
    cd /cluster/store10/kg/kgMm6B
    hgsql entrez -N -e \
    'select mrna, refseq from entrezRefseq, entrezMrna, mm7.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \
    >mrnaRefseq1.tab

# Include RefSeq as valid mRNA too.
    hgsql mm7 -N -e 'select name, name from refGene' >mrnaRefseq2.tab

    cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab

    hgsql mm7 -e 'drop table mrnaRefseq'
    hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql
    hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# BUILD KNOWN GENE LIST FOR GOOGLE.  (TBD)

    cd /cluster/data/mm7/bed
    rm -rf knownGeneList/mm7

# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm7

    hgKnownGeneList mm7

# copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/mm7
    mkdir -p /usr/local/apache/htdocs/knownGeneList/mm7
    cp -Rfp knownGeneList/mm7/* /usr/local/apache/htdocs/knownGeneList/mm7

# Build kgReactome table for KG to Reactome xref.  Done 6/28/05 Fan.

# First, make sure the reactome DB is built.  See makeHg17.doc for details.

    ssh hgwdev
    mkdir -p /cluster/data/mm7/bed/reactome
    cd /cluster/data/mm7/bed/reactome

    hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, mm7.kgXref where identifier=spID' >kgReactome.tab;

    hgsql mm7 -e 'drop table kgReactome'
    hgsql mm7 < ~/src/hg/lib/kgReactome.sql
    hgsql mm7 -e 'load data local infile "kgReactome.tab" into table kgReactome'

#############################################################################
#  miRNA track (DONE - 2005-10-27 - Hiram)
    #   data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    cd /cluster/data/mm7/bed
    mkdir miRNA
    cd miRNA
    # save the miRNA_track_mm7.txt file from the email into this directory
    cp miRNA_track_mm7.txt miRNA.bed
    
# edit miRNA.bed to get rid of the top field description lines

    hgLoadBed -strict mm7 miRNA miRNA.bed
    
# check previous release track before update
    featureBits mm7 miRNA
    #	20620 bases of 2583394090 (0.001%) in intersection
    featureBits mm6 miRNA
    #	21167 bases of 2597150411 (0.001%) in intersection
    featureBits mm5 miRNA
    #	17957 bases of 2615483787 (0.001%) in intersection

#############################################################################
# ADDED THE EXONPRIMER TO QUICK LINKS SECTION OF KG DETAILS PAGE (05/07/11, Fan) 

# Added the following lines to links.ra under src/hg/hgGene/hgGeneData/Mouse/mm7

name exonPrimer
shortLabel ExonPrimer
tables kgXref
idSql select kgID from kgXref where kgID = '%s'
url http://ihg.gsf.de/cgi-bin/primer/ExonPrimerUCSC.pl?db=mm7&acc=%s
priority 95


# REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (TBD)
# hgMapViaSwissProt.c was updated to support this.
# Create table that maps between known genes and Pfam domains
~/bin/i386/hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam
# row count is  24650

# SCDb CLONES (7/12/2005 Andy)
    cd /cluster/data/mm7/bed
    mkdir blat.SCDb-07-05-2005
    cd blat.SCDb-07-05-2005/
    ln -s `pwd` ~/scdb
    pushd /santest/scratch/andy
    wget http://stemcell.princeton.edu/download/scdb.fa.gz
    mkdir scdb
    faSplit sequence scdb.fa.gz 80 scdb/scdb_
    popd
    find /santest/scratch/andy/scdb -type f > scdb.lst
    find /panasas/store/mm7/nib -type f > mm7.lst
    cat << "_EOF_" > blat.sh
#!/bin/bash
cdir=${3%/*}
mkdir -p $cdir
blat -q=dna -t=dna -noHead -ooc=/iscratch/i/mm7/ooc/11.ooc $1 $2 $3
_EOF_
    cat << "_EOF_" > gsub
#LOOP
./blat.sh {check in exists $(path2)} {check in line+ $(path1)} {check out line /cluster/bluearc/andy/scdb.psl/$(root2)/$(root2)_$(root1).psl}
#ENDLOOP
_EOF_
    chmod +x blat.sh
    ssh kk
    cd /cluster/data/mm7/bed/blat.SCDb-07-05-2005
    gensub2 scdb.lst mm7.lst gsub spec
    para create spec
    para try
    para push
    para time
#Completed: 3200 of 3200 jobs
#CPU time in finished jobs:      24158s     402.64m     6.71h    0.28d  0.001 y
#IO & Wait Time:                 14437s     240.61m     4.01h    0.17d  0.000 y
#Average job time:                  12s       0.20m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             244s       4.07m     0.07h    0.00d
#Submission to last job:           727s      12.12m     0.20h    0.01d
    ssh hgwdev
    cd /cluster/data/mm7/bed/blat.SCDb-07-05-2005
    # See if things check out.
    find /cluster/bluearc/andy/scdb.psl -type f -exec cat '{}' ';' > scdb.all.psl
    pslReps -singleHit scdb.all.psl scdb.best.psl info.psr
    # All the original names
    grep '>' scdb.fa | sed 's/^>//' | cut -f1 -d' ' | sort | uniq > names.scdb
    # All the names from ones that hit.
    cut -f10 scdb.all.psl | sort | uniq > all.names.scdb
    # All the ones with a "best" hit.
    cut -f10 scdb.best.psl | sort | uniq > best.names.scdb
    # Yeah a bunch of them (4,443/37,386) are missing.  It seems many of the 
    # clones aren't from mouse anyways.
    mkdir ../scdb
    cp scdb.best.psl ../scdb/scdb.psl
    cp scdb.fa ../scdb/
    cp best.names.scdb ../scdb/
    cd ../scdb/
    faSomeRecords scdb.fa best.names.scdb scdb.best.fa
    rm scdb.fa
    mkdir /gbdb/mm7/scdb
    ln -s /cluster/data/mm7/bed/scdb/scdb.best.fa /gbdb/mm7/scdb/scdb.fa    
    hgLoadSeq mm7 /gbdb/mm7/scdb/scdb.fa
    # clean up the names... basically take the middle part out.   
    sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.best.fa > new.scdb.best.fa
    sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' scdb.psl > new.scdb.psl
    mv scdb.psl old.scdb.psl
    mv new.scdb.psl scdb.psl
    mv scdb.best.fa old.scdb.best.fa
    mv new.scdb.best.fa scdb.best.fa
    hgLoadPsl -table=scdb mm7 scdb.psl
    hgLoadSeq mm7 /gbdb/mm7/scdb/scdb.fa
#Warning: load of seq did not go as planned: 37381 record(s), 1 row(s) skipped, 0 warning(s) loading ./seq.tab
    # Oh well. 
    # Update 7/26/2005: I'm going more restrictive on the pslReps.
    ssh hgwdev
    cd /cluster/data/mm7/bed/blat.SCDb-07-05-2005
    pslReps -minCover=0.8 -singleHit scdb.all.psl tmp scdb.psr
    sed 's/SC|\([^|]\+\)|[0-9]\+/\1/' tmp > scdb.psl
    rm tmp
    hgLoadPsl mm7 scdb.psl
    
## REBUILD NIA Mouse Gene Index - (TBD)
#       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
    # Set the previous NIAGene build aside under a dated name, then download
    # and load a fresh copy of the alignments and sequences.
    ssh hgwdev 
    cd /cluster/data/mm7/bed
    mv NIAGene NIAGene_050621
    mkdir NIAGene
    # Work inside the new directory: the /gbdb symlink made below points at
    # /cluster/data/mm7/bed/NIAGene/T-fasta.fa, so the downloads must land here.
    cd NIAGene

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-psl.txt.gz
    # The download is gzipped; uncompress it before cut reads T-psl.txt.
    gzip -d T-psl.txt.gz

    # Keep only the first 21 columns for hgLoadPsl.
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm7 NIAGene.tab

    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex5/download/T-fasta.fa.gz
    gzip -d T-fasta.fa.gz
 
    # Re-point the /gbdb link at the new sequence file.
    rm /gbdb/mm7/NIAGene/T-fasta.fa
    ln -s /cluster/data/mm7/bed/NIAGene/T-fasta.fa /gbdb/mm7/NIAGene/T-fasta.fa
   
# Load the sequences.  PLEASE NOTE THE "-replace" OPTION SHOULD BE USED!!!   
    hgLoadSeq -replace mm7 /gbdb/mm7/NIAGene/T-fasta.fa

############################################################################
# BLASTZ ZEBRAFISH (danRer3) (DONE - 2005-09-19 - 2005-11-21 - Hiram)
    #	Create a single 2bit file that has the full chroms and the
    #	random contigs from chrUn and chrNA
    ssh kkstore02
    cd /cluster/data/danRer3/
    faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
	Un/scaffoldUn.fa NA/scaffoldNA.fa \
	chrUnNA.2bit
    #	Verify the sequence hasn't been damaged:
    twoBitToFa danRer3.2bit stdout | faSize stdin
# 1644032962 bases (48201758 N's 1595831204 real 816464533 upper
# 779366671 lower) in 28 sequences in 1 files
    twoBitToFa chrUnNA.2bit stdout | faSize stdin
# 1636563462 bases (40732258 N's 1595831204 real 816464533 upper
# 779366671 lower) in 14967 sequences in 1 files
    #	Note, only the N's are different
    #	48201758 - 40732258 = 7469500
    #	1644032962 - 1636563462 = 7469500
    #	Copy to bluearc:
    cp -p chrUnNA.2bit /cluster/bluearc/danRer3
    twoBitInfo chrUnNA.2bit chrUnNA.sizes
    cp -p chrUnNA.sizes /cluster/bluearc/danRer3
    cp -p liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
	/cluster/bluearc/danRer3

    ssh pk
    mkdir /cluster/data/mm7/bed/blastzDanRer3.2005-09-19
    cd /cluster/data/mm7/bed
    ln -s blastzDanRer3.2005-09-19 blastz.danRer3
    cd blastzDanRer3.2005-09-19
    # use parameters as for mm5 - see makeMm5.doc
    cat << '_EOF_' > DEF
# mouse (mm7) vs zebrafish (danRer3)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.x86_64

# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse (mm7)
#	small enough chunk to get reasonable running time
SEQ1_DIR=/scratch/hg/mm7/mm7.2bit
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CTGDIR=/scratch/hg/mm7/mm7Chroms_RandomContigs.2bit
SEQ1_CTGLEN=/scratch/hg/mm7/mm7Chroms_RandomContigs.sizes
SEQ1_LIFT=/scratch/hg/mm7/Chroms_RandomContigs.lft
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=1000000
SEQ1_LAP=5000

# QUERY: Zebrafish (danRer3)
#  large enough chunk to do all genome in one piece
SEQ2_DIR=/cluster/bluearc/danRer3/danRer3.2bit
SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/danRer3/chrUnNA.2bit
SEQ2_CTGLEN=/cluster/bluearc/danRer3/chrUnNA.sizes
SEQ2_LIFT=/cluster/bluearc/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1700000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzDanRer3.2005-09-19
TMPDIR=/scratch/tmp
'_EOF_'
    # < happy emacs

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=5000 -bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
# XXX - Started - 2005-10-07 12:13
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=chainMerge -chainMinScore=5000 -bigClusterHub=pk \
	-fileServer=kolossus \
	`pwd`/DEF > chainMerge.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -chainMinScore=5000 -bigClusterHub=pk \
	-fileServer=kolossus \
	`pwd`/DEF > swap.out 2>&1 &

# featureBits -chrom=chr1 mm7 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 0.810%, chainDanRer3Link 2.365%, both 0.536%, cover 66.19%,
# enrich 27.99x
# featureBits -chrom=chr1 mm6 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 0.808%, chainDanRer3Link 5.196%, both 0.522%, cover 64.64%, 
# enrich 12.44x
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 0.818%, chainDanRer2Link 2.058%, both 0.546%, cover 66.75%, 
# enrich 32.43x
# featureBits -chrom=chr1 mm6 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 0.808%, chainDanRer2Link 6.412%, both 0.542%, cover 67.04%, 
# enrich 10.46x

    time featureBits mm7 chainDanRer3Link
    #	69591785 bases of 2583394090 (2.694%) in intersection
    time featureBits mm6 chainDanRer3Link
    #	134615477 bases of 2597150411 (5.183%) in intersection
    time featureBits danRer3 chainMm7Link
    #	71277532 bases of 1630323462 (4.372%) in intersection
    time featureBits danRer2 chainMm6Link
    #	176391894 bases of 1560497282 (11.304%) in intersection

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzDanRer3.2005-09-19
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzDanRer3.2005-09-19

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    #	real    186m35.984s
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzDanRer3.2005-09-19
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainDanRer3Link > fb.mm7.chainDanRer3Link.rescore 2>&1
    cat fb.mm7.chainDanRer3Link.rescore
    #	69591785 bases of 2583394090 (2.694%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits danRer3 \
	chainMm7Link > fb.danRer3.chainMm7Link.rescore 2>&1
    cat fb.danRer3.chainMm7Link.rescore
    #	71277532 bases of 1630323462 (4.372%) in intersection

###########################################################################
#### BUILD SUPERFAMILY RELATED TABLES (TBD)

# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk

    mkdir -p /cluster/store11/superfamily/050817
    ln -s /cluster/store11/superfamily/050817 /cluster/data/superfamily/050817
    cd /cluster/data/superfamily/050817

# add the following line to ~/.netrc

    machine supfam.mrc-lmb.cam.ac.uk login license password XXXXX

# ftp over to supfam.mrc-lmb.cam.ac.uk and get the following two files:

    supfam_14-Aug-2005.sql.gz
    ass_14-Aug-2005.tab.gz
    
    # Uncompress both downloaded files.  After this step only
    # supfam_14-Aug-2005.sql and ass_14-Aug-2005.tab exist, so the commands
    # below read the uncompressed names and no second gzip -d is needed.
    gzip -d *.gz
# Load the Superfamily database
    hgsql mm7 -e "create database superfam050817"
    hgsql superfam050817 < supfam_14-Aug-2005.sql

# This may take about an hour.

# Make sure to add an index on id of the des table of superfam050817.
    hgsql superfam050817 -e "create index id on des(id);"
# Create the sfAssign table from its schema file and load the assignments.
    hgsql superfam050817 < ~/src/hg/lib/sfAssign.sql
    hgsql superfam050817 -e \
    'load data local infile "ass_14-Aug-2005.tab" into table superfam050817.sfAssign;'

# Build or rebuild Superfamily track and create sf tables needed for PB

   hgsql mm7 < ~/src/hg/lib/sfAssign.sql

   cd /cluster/data/superfamily/050817  
   hgsql mm7 -e 'load data local infile "ass_14-Aug-2005.tab" into table mm7.sfAssign;'

# If mm7.sfDes already exists, drop it.

   hgsql superfam050817 -e "select * from des" >sfDes.tab
   hgsql mm7 < ~/src/hg/lib/sfDes.sql
   hgsql mm7 -e 'load data local infile "sfDes.tab" into table mm7.sfDes ignore 1 lines;'

# If mm7.superfamily already exists, drop it.
   # Create the dated working directory under bed/, which is where the
   # relative "sf" symlink made below looks for it; creating it directly
   # under /cluster/data/mm7 would leave that symlink dangling.
   cd /cluster/data/mm7/bed
   mkdir /cluster/data/mm7/bed/sf.2005-0817
   ln -s sf.2005-0817 sf
   # Run from inside the working directory so the log stays with this build.
   # NOTE(review): presumably hgSuperfam also writes the sfDescription.tab and
   # superfamily.tab files loaded in the following steps into the current
   # directory - confirm.
   cd sf
   hgSuperfam mm7 superfam050817 > sf.log

# It is normal that many proteins do not have corresponding Superfamily entries.

# If mm7.sfDescription exists, drop it.

   hgsql mm7 < ~/src/hg/lib/sfDescription.sql
   hgsql mm7 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm7.sfDescription;'

# Finally, load the superfamily table.

   hgLoadBed -strict mm7 superfamily superfamily.tab -tab

# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
   
   cat /cluster/data/superfamily/050817/ass_14-Aug-2005.tab | hgKnownToSuper mm7 mm stdin
# 21185 records output 

###########################################################################
# BLASTZ canFam2 second time (WORKING - 2005-11-14 Hiram)
    #	After fixing a bug in the lineage specific repeat snip business
    #	in blastz-run-ucsc script
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzCanFam2.2005-11-14
    cd /cluster/data/mm7/bed
    rm blastz.canFam2
    ln -s blastzCanFam2.2005-11-14 blastz.canFam2
    cd blastzCanFam2.2005-11-14

    # Write the blastz parameter file (DEF) that is passed to
    # doBlastzChainNet.pl below.  Each setting and each comment must stay on
    # one line: a wrapped "export" / "PATH=..." pair or a wrapped comment tail
    # becomes a separate, unintended line in DEF.
    cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzCanFam2.2005-11-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-stop=net \
	`pwd`/DEF > blastz-to-net.out 2>&1 &
# XXXX - Started 2005-11-14 14:50
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-continue=load -stop=load \
	`pwd`/DEF > load.out 2>&1 &

    featureBits mm7 chainCanFam2Link
    #	842454846 bases of 2583394090 (32.610%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -swap -stop=load `pwd`/DEF > swap-to-load.out 2>&1 &

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzCanFam2.2005-11-14
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzCanFam2.2005-11-14

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load -stop=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainDownloadSwap.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzCanFam2.2005-11-14
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainCanFam2Link > fb.mm7.chainCanFam2Link.rescore 2>&1
    cat fb.mm7.chainCanFam2Link.rescore
    #	833554390 bases of 2583394090 (32.266%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits canFam2 \
	chainMm7Link > fb.canFam2.chainMm7Link.rescore 2>&1
    cat fb.canFam2.chainMm7Link.rescore
    #	814108570 bases of 2384996543 (34.135%) in intersection

###########################################################################
# BLASTZ/CHAIN/NET CANFAM2 (DONE - 2005-09-16 - 2005-10-12- Hiram)
    ssh kkstore02
    mkdir /cluster/data/mm7/bed/blastz.canFam2.2005-09-16
    cd /cluster/data/mm7/bed
    ln -s blastz.canFam2.2005-09-16 blastz.canFam2
    cd blastz.canFam2.2005-09-16

    cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/cluster/bluearc/mm7/nib
SEQ1_SMSK=/cluster/bluearc/mm7/linSpecRep/notInHumanDogCow
SEQ1_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastz.canFam2.2005-09-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    ssh pk
    #	establish a screen to control this job
    screen
    cd /cluster/data/mm7/bed/blastz.canFam2.2005-09-16
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -stop=load \
	`pwd`/DEF > thruLoad.out 2>&1 &
    #	XXX - STARTED - 2005-09-16 15:50
    #	real    749m31.156s
    #	user    0m0.116s
    #	sys     0m0.103s
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -continue=chainMerge -stop=load \
	`pwd`/DEF > chainMergeThruLoad.out 2>&1 &

    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh pk
    screen -d -r

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -continue=download \
	`pwd`/DEF > download-cleanup.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -swap -fileServer=kolossus \
	`pwd`/DEF > swap.out 2>&1 &

    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainCanFam2Link
    #	791921556 bases of 2583394090 (30.654%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits canFam2 chainMm7Link
    #	773143256 bases of 2384996543 (32.417%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits canFam2 chainMm6Link
    #	780509502 bases of 2384996543 (32.726%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm6 chainCanFam1Link
    #	798637320 bases of 2597150411 (30.751%) in intersection


############################################################################
#  UPDATE miRNA track (TBD)
#   data from: Michel.Weber@ibcg.biotoul.fr
#   notify them when done.

   cd /cluster/data/mm7/bed
   cd miRNA

   mkdir old
   cp -p * old
   rm *
   
#  save miRNA_track_mm7_aug2005.txt file from email
   
   cp miRNA_track_mm7_aug2005.txt miRNA.tab
   vi miRNA.tab

# edit miRNA.tab to get rid of the top description lines
# and a few blank lines

   hgLoadBed -strict mm7 miRNA miRNA.tab
    
# check previous release track before update

   nice featureBits mm5 miRNA
#  17957 bases of 2615483787 (0.001%) in intersection

   nice featureBits mm7 miRNA
#  20898 bases of 2597150411 (0.001%) in intersection

#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2005-09-08 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm7
    #	Size of mouse non-gap genome: 2583394090
    #	Size of  Hg17 non-gap genome: 2866216770
    #	Adjusting the 1024 number from typical human ooc generation:
    #	1024 * (2583394090 / 2866216770) = 923

    time blat mm7.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=923
    #	Wrote 29368 overused 11-mers to 11.ooc
    #	real    2m41.929
    # Copy over to the bluearc
    cp -p 11.ooc /cluster/bluearc/mm7

#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2005-09-15 - Hiram)

    ssh kkstore02
    mkdir /cluster/data/mm7/rmsk
    cd /cluster/data/mm7
    cp -p */chr*.fa.out rmsk
    cd rmsk

    #	Run DateRepeats on each per-chromosome RepeatMasker .out file copied
    #	into rmsk/.  The query species is mouse; the comparison species are
    #	given in the fixed order human, rat, dog, cow.
    for FN in chr*.fa.out
    do
	echo "${FN}"
	/cluster/bluearc/RepeatMasker050305/DateRepeats \
	    "${FN}" -query mouse -comp human -comp rat -comp dog -comp cow
    done
    #	takes about 30 minutes

    cd /cluster/data/mm7
    mkdir linSpecRep
    cd linSpecRep
    mkdir notInHuman
    mkdir notInRat
    mkdir notInDog
    mkdir notInCow
    cd /cluster/data/mm7
    #	For every DateRepeats result in rmsk/, derive the chromosome name
    #	(strip the leading "rmsk/" and everything from ".fa.out" onward) and
    #	run extractRepeats once per comparison species, writing
    #	linSpecRep/<dir>/<chrom>.out.spec.  The numeric argument pairs with
    #	the output directory: 1=notInHuman 2=notInRat 3=notInDog 4=notInCow
    #	(presumably the position of the species in the DateRepeats -comp
    #	list -- confirm against extractRepeats usage).
    for F in rmsk/chr*.out_homo-sapiens*
    do
	B=${F#rmsk/}
	B=${B%%.fa.out*}
	echo "$B"
	N=0
	for D in notInHuman notInRat notInDog notInCow
	do
	    N=$((N + 1))
	    /cluster/bin/scripts/extractRepeats $N "${F}" > \
		"linSpecRep/${D}/${B}.out.spec"
	done
    done
    #	The notInHuman, notInDog, and notInCow results ended up being
    #	identical.  To check that they are identical, checksum every file
    #	and collapse duplicate checksums:
    cd linSpecRep
    #	Print "<checksum> <blocks> <path>" for every file under the current
    #	directory, then keep one line per distinct checksum so that files
    #	with identical content collapse to a single entry.
    find . -type f | \
	while IFS= read -r FN; do echo $(sum -r < "${FN}") "${FN}"; done \
	| sort -u -k1,1n
    #	Copy to bluearc for use in kluster runs
    mkdir /cluster/bluearc/mm7/linSpecRep
    cd /cluster/bluearc/mm7/linSpecRep
    mkdir notInOthers notInRat
    cp -p /cluster/data/mm7/linSpecRep/notInHuman/* ./notInOthers
    cp -p /cluster/data/mm7/linSpecRep/notInRat/* ./notInRat

#############################################################################
# macaca mulatta vs. Mm7 (WORKING - 2005-10-19 - Hiram)
    ssh kk

    mkdir /cluster/data/mm7/bed/blastzRheMac1.2005-10-19
    cd /cluster/data/mm7/bed
    ln -s blastzRheMac1.2005-10-19 blastz.rheMac1
    cd blastzRheMac1.2005-10-19

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7

# TARGET - Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Macaca mulatta rheMac1
SEQ2_DIR=/scratch/hg/rheMac1/rheMac1.2bit
SEQ2_LEN=/scratch/hg/rheMac1/chrom.sizes
SEQ2_LIMIT=300
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzRheMac1.2005-10-19
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk \
	`pwd`/DEF > blastz.out 2>&1 &

    ssh kolossus
    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainRheMac1Link
    #	886364754 bases of 2583394090 (34.310%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=kk \
	`pwd`/DEF > swap.out 2>&1 &

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzRheMac1.2005-10-19
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzRheMac1.2005-10-19

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load -stop=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzRheMac1.2005-10-19
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainRheMac1Link > fb.mm7.chainRheMac1Link.rescore 2>&1
    cat fb.mm7.chainRheMac1Link.rescore
    #	875709370 bases of 2583394090 (33.898%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits rheMac1 \
	chainMm7Link > fb.rheMac1.chainMm7Link.rescore 2>&1
    cat fb.rheMac1.chainMm7Link.rescore
    #	862032750 bases of 2691926638 (32.023%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=download \
	`pwd`/DEF > rescoreChainSwapDownload.out 2>&1 &

#############################################################################
# Elephant vs. Mm7 (WORKING - 2005-10-20 - Hiram)
    ssh pk

    mkdir /cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20
    cd /cluster/data/mm7/bed
    ln -s blastzLoxAfr1.2005-10-20 blastz.loxAfr1
    cd blastzLoxAfr1.2005-10-20

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET - Mouse mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY - Elephant loxAfr1
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
SEQ2_LIMIT=300
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=cat -bigClusterHub=kk \
	`pwd`/DEF > continue.cat.out 2>&1 &

    ssh kolossus
    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainLoxAfr1Link
    #	480074381 bases of 2583394090 (18.583%) in intersection


    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=kk \
	`pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=load -swap -bigClusterHub=kk \
	`pwd`/DEF > load.swap.out 2>&1 &

    ssh hgwdev
    time featureBits loxAfr1 chainMm7Link
    #	477557610 bases of 2295548473 (20.804%) in intersection
    #	real    4536m4.614s
    #	user    22m50.620s
    #	sys     9m38.420s
    #	*!*! NOTE: That is 75.6 hours running time !!!

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzLoxAfr1.2005-10-20

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=net -stop=load \
	`pwd`/DEF > rescoreChainSwap.net-to-load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    ssh kolossus
    cd /cluster/data/mm7/bed/blastz.loxAfr1
    time HGDB_CONF=~/.hg.conf.read-only featureBits loxAfr1 \
	chainMm7Link > fb.loxAfr1.chainMm7Link.rescore 2>&1
    #	real    4067m1.699s
    #	user    5m42.134s
    #	sys     2m9.896s
    #	This is over 67 hours !

    cat fb.loxAfr1.chainMm7Link.rescore
    #	471844339 bases of 2295548473 (20.555%) in intersection

#############################################################################
# Tenrec vs. Mm7 (WORKING - 2005-10-20 - Hiram)
    ssh pk

    mkdir /cluster/data/mm7/bed/blastzEchTel1.2005-10-20
    cd /cluster/data/mm7/bed
    ln -s blastzEchTel1.2005-10-20 blastz.echTel1
    cd blastzEchTel1.2005-10-20

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET - Mouse mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY - Tenrec echTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzEchTel1.2005-10-20
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=cat -bigClusterHub=pk \
	`pwd`/DEF > continue.cat.out 2>&1 &

    ssh kolossus
    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainEchTel1Link
    #	296238377 bases of 2583394090 (11.467%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk \
	`pwd`/DEF > swap.out 2>&1 &

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits echTel1 \
	chainMm7Link > fb.echTel1.chainMm7Link 2>&1
    #	real    3973m1.497s
    #	user    3m19.311s
    #	sys     1m24.524s
    #	That is ~ 66 hours

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzEchTel1.2005-10-20
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzEchTel1.2005-10-20

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap -continue=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    #	real    5472m11.402s


#############################################################################
# Rabbit vs. Mm7 (WORKING - 2005-10-20 - Hiram)
    ssh pk

    mkdir /cluster/data/mm7/bed/blastzOryCun1.2005-10-20
    cd /cluster/data/mm7/bed
    ln -s blastzOryCun1.2005-10-20 blastz.oryCun1
    cd blastzOryCun1.2005-10-20

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET - Mouse mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY - Rabbit oryCun1
SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/hg/oryCun1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzOryCun1.2005-10-20
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=cat -bigClusterHub=pk \
	`pwd`/DEF > continue.cat.out 2>&1 &

    ssh kolossus
    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainOryCun1Link
    #	503550390 bases of 2583394090 (19.492%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk \
	`pwd`/DEF > swap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=load -swap -bigClusterHub=kk \
	`pwd`/DEF > load.swap.out 2>&1 &

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits oryCun1 \
	chainMm7Link >fb.oryCun1.chainMm7Link 2>&1
    #	real    7258m58.416s
    #	user    3m38.961s
    #	sys     1m41.555s
    #	That is ~ 121 Hours !

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzOryCun1.2005-10-20
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzOryCun1.2005-10-20

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &
    #	real    101m1.441s
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    ssh kolossus
    cd /cluster/data/mm7/bed/blastz.oryCun1
    time HGDB_CONF=~/.hg.conf.read-only featureBits \
	oryCun1 chainMm7Link > fb.oryCun1.chainMm7Link.rescore 2>&1  
    #	real    3560m14.127s
    #	user    3m23.400s
    #	sys     1m23.287s
    #	That is over 59 hours !
    cat fb.oryCun1.chainMm7Link.rescore
    #	501663640 bases of 2076044328 (24.164%) in intersection

#############################################################################
# Armadillo vs. Mm7 (WORKING - 2005-10-20 - Hiram)
    ssh pk

    mkdir /cluster/data/mm7/bed/blastzDasNov1.2005-10-20
    cd /cluster/data/mm7/bed
    ln -s blastzDasNov1.2005-10-20 blastz.dasNov1
    cd blastzDasNov1.2005-10-20

    cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin

BLASTZ=blastz.v7.x86_64

# TARGET - Mouse mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY - Armadillo dasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzDasNov1.2005-10-20
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	`pwd`/DEF > blastz.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=cat -bigClusterHub=pk \
	`pwd`/DEF > continue.cat.out 2>&1 &

    ssh kolossus
    HGDB_CONF=~/.hg.conf.read-only featureBits mm7 chainDasNov1Link
    #	438537191 bases of 2583394090 (16.975%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=pk -fileServer=kolossus \
	`pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=net -swap -bigClusterHub=pk \
	`pwd`/DEF > swap.net.out 2>&1 &

    ssh kolossus
    time HGDB_CONF=~/.hg.conf.read-only featureBits dasNov1 \
	chainMm7Link > fb.dasNov1.chainMm7Link 2>&1
    #	real    8229m16.489s
    #	user    2m56.146s
    #	sys     1m20.550s
    #	That is ~ 137 hours

    #	re-scoring the chains
    ssh kkstore02
    cd /cluster/data/mm7/bed/blastzDasNov1.2005-10-20
    rm -fr axtNet mafNet axtChain

    ssh pk
    cd /cluster/data/mm7/bed/blastzDasNov1.2005-10-20

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainRun -stop=load \
	`pwd`/DEF > rescoreChain.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > rescoreChainSwap.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=net -stop=load \
	`pwd`/DEF > rescoreChainSwap.net-to-load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > rescoreChainDownload.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=load \
	`pwd`/DEF > rescoreChainSwapLoad.out 2>&1 &

    #	real    5472m48.563s

#############################################################################
# Create Allen Brain Atlas mapping. (DONE Nov 2005 JK)

# Set up directory
    ssh kk
    cd /cluster/data/mm7/bed
    mkdir allenBrain
    cd allenBrain

# Copy in allen20051021.tab file that was converted from
# spreadsheet mailed by Susan Sunkin <SusanS@alleninstitute.org>
# Also copy in probeSeq.20051027.fasta, also from Susan.

# Create a list of probe sequences, filling in the ones missing from
# probeSeq.20051027.fasta with sequences from some NCBI and TIGR files,
# and some downloaded one at a time.
     allenCollectSeq allen20051021.tab probeSeq.20051027.fasta /cluster/data/mm6/bed/ncbiXm/ncbiNm.fa /cluster/data/mm6/bed/ncbiXm/ncbiXm.fa /cluster/data/mm6/bed/tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab
    
# Set up a blat run to align the probes.
    mkdir split
    faSplit sequence allProbes.fa 200 split/rp
    mkdir run
    cd run
    ls -1 ../split/*.fa > mrna.lst
    ls -1 /scratch/hg/mm7/nib/*.nib > genome.lst
    mkdir psl
    cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
# Then do the usual para try/push/time/check until the run is finished

# Then do sorting and near-best-in-genome step on file server
    ssh kkstore02
    cd /cluster/data/mm7/bed/allenBrain/run
    pslSort dirs raw.psl tmp psl
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
    sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
   rm raw.psl
   rm -r psl
   rm -r ../split

# Load up database
   ssh hgwdev
   cd /cluster/data/mm7/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
   hgsql mm7 < ~/kent/src/hg/lib/allenBrainUrl.sql
   hgsql mm7 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'

# Make probe alignment table, and load sequence.
   hgLoadPsl mm7 allenBrainAli.psl
   mkdir /gbdb/mm7/allenBrain
   ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/mm7/allenBrain/allProbes.fa
   hgLoadSeq mm7 /gbdb/mm7/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain	
   hgMapToGene mm7 allenBrainAli -type=psl knownGene knownToAllenBrain 

############################################################################
# BUILD KNOWN GENES RELATED TABLES for mm7 (STARTED 10/23/05, DONE 11/17/05. Fan)

# First build protein databases, sp051015 and proteins051015
# See makeProteins051015.doc for details.
# Please note that the protein and displayId tables in sp051015 have data of variant splice proteins.

# Create working subdirectories and temporary databases

  ssh hgwdev
  cd /cluster/store11/kg
  mkdir kgMm7A  
  ln -s /cluster/store11/kg/kgMm7A /cluster/store6/kgDB/bed/kgMm7A
  ln -s /cluster/store11/kg/kgMm7A /cluster/data/mm7/bed/kgMm7A

  hgsql mm7 -e "create database kgMm7A"   
  hgsql mm7 -e "create database kgMm7ATemp"

  mkdir /cluster/bluearc/kgDB/kgMm7A
  mkdir /cluster/bluearc/kgDB/kgMm7A/protBlat
  ln -s /cluster/bluearc/kgDB/kgMm7A/protBlat /cluster/store11/kg/kgMm7A/protBlat
  cd /cluster/store11/kg/kgMm7A/protBlat

# Get all protein sequences for mouse

  hgsql -N sp051015 -e \
  'select proteins051015.spXref3.accession,protein.val from proteins051015.spXref3,protein where division="10090" and acc=accession' \
  |awk '{print ">" $1;print $2}' >mm7Prot.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/mm7/bed/kgMm7A/protBlat
  mkdir prot
  faSplit sequence mm7Prot.fa 1000 prot/prot
  ls /cluster/bluearc/kgDB/kgMm7A/protBlat/prot/* > prot.lis

  ssh hgwdev
  cd /cluster/data/mm7/bed/kgMm7A/protBlat
  hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis
  exit
  
  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm7A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis prot.lis gsub jobList

  para create jobList
  para try
  para check
  para push
  para check ...
#Completed: 32449 of 39600 jobs
#Crashed: 7151 jobs
#CPU time in finished jobs:   36332462s  605541.04m 10092.35h  420.51d  1.152 y
#IO & Wait Time:                802269s   13371.14m   222.85h    9.29d  0.025 y
#Average job time:                1144s      19.07m     0.32h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:           34735s     578.92m     9.65h    0.40d
#Submission to last job:        113157s    1885.95m    31.43h    1.31d
#Estimated complete:                 0s       0.00m     0.00h    0.00d
#39600 jobs in batch

# Many output .psl files are empty, these warnings are OK.  
# Check to see if there is any other error type.

  para problems |grep empty|wc
#   7151   21453  559219

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/mm7/bed/kgMm7A/protBlat

   mkdir result2
   mkdir result3

   # Collect the per-chromosome BLAT results in two passes.
   #   doall  - generated driver: one "do1 <chrom>" line per chrom.lis entry.
   #   do1.1  - pass 1: concatenate result/<chrom>_prot*.psl into
   #            result2/<chrom>.psl.
   #   do1.2  - pass 2: run pslReps (with the thresholds shown) on
   #            result2/<chrom>.psl, writing result3/<chrom>.psl and
   #            appending pslReps' stdout to j.out.
   # Each pass is selected by copying do1.1 or do1.2 over do1 and then
   # running doall.
   # NOTE(review): doall and do1 are invoked by bare name, which presumably
   # relies on "." being in PATH -- confirm, or run them as ./doall.
   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlat.psl
   hgLoadPsl mm7 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 101748 record(s), 0 row(s) skipped, 880 warning(s) loading psl.tab
# Looked into the cause of these warnings previously and found that they are
# caused by negative qBaseInsert and tBaseInsert values, probably because
# these are protein alignments.

# Remember to remove result2 and result3 when KG is built and validated.

# VVVVVVVVVVVVVVVVVVVVVVVVVVVVV BELOW IS AN EXCURSION TO BLAT VAR SPLICE PROTEINS VVVVVVVVVVVVV
# (Fan, STARTED 10/28/05, DONE 10/29/05)  
# FORGOT TO FINISH VAR-SPLICE PROTEINS PROCESSING DURING MAKE PROTEINS BUILD.
# NEXT TIME, SHOULD NOT DO THIS.

  mkdir /cluster/bluearc/kgDB/kgMm7A/protBlatVar
  ln -s /cluster/bluearc/kgDB/kgMm7A/protBlatVar /cluster/store11/kg/kgMm7A/protBlatVar
  cd /cluster/store11/kg/kgMm7A/protBlatVar

# Get all protein sequences for mouse

  hgsql -N sp051015 -e \
  'select proteins051015.spXref3.accession,protein.val from proteins051015.spXref3,protein where division="10090" and acc=accession and acc like "%-%"' \
  |awk '{print ">" $1;print $2}' >mm7ProtVar.fa

# Prepare and perform cluster run for protein/genome alignment

  ssh kk
  cd /cluster/data/mm7/bed/kgMm7A/protBlatVar
  mkdir protVar
  faSplit sequence mm7ProtVar.fa 100 protVar/protVar
  ls /cluster/bluearc/kgDB/kgMm7A/protBlatVar/protVar/* > protVar.lis

  ssh hgwdev
  cd /cluster/data/mm7/bed/kgMm7A/protBlatVar
  hgsql mm7 -N -e 'select chrom from chromInfo' > chrom.lis
  exit
  

  cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/mm7/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm7A/protBlatVar/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'

  mkdir result
  gensub2 chrom.lis protVar.lis gsub jobList

  para create jobList
  para try
  para check
  para push
  para check ...
# Completed: 2498 of 4000 jobs
# Crashed: 1502 jobs
# CPU time in finished jobs:    3234506s   53908.44m   898.47h   37.44d  0.103 y
# IO & Wait Time:                 47651s     794.18m    13.24h    0.55d  0.002 y
# Average job time:                1314s      21.90m     0.36h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           16756s     279.27m     4.65h    0.19d
# Submission to last job:         18976s     316.27m     5.27h    0.22d

# Many output .psl files are empty, these warnings are OK.  
# Check to see if there is any other error type.

  para problems |grep empty|wc
#   1502    4506  125061

# collect BLAT results

   ssh hgwdev
   cd /cluster/data/mm7/bed/kgMm7A/protBlatVar

   mkdir result2
   mkdir result3

   cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall

   cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'

   cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'

   chmod +x do*

   cp do1.1 do1
   doall
   cp do1.2 do1
   doall

   cat result3/*.psl >protBlatVar.psl

   cat ../protBlat/protBlat.psl protBlatVar.psl >protBlatAll.psl
   hgLoadPsl -table=protBlat mm7 protBlatAll.psl

# Processing protBlatAll.psl
# load of protBlat did not go as planned: 104970 record(s), 0 row(s) skipped, 887 warning(s) loading psl.tab

   mv ../protBlat/protBlat.psl ../protBlat/protBlat.psl.orig
   cp -p protBlatAll.psl ../protBlat/protBlat.psl

# Looked into the cause of the warnings before and found that they were due to
# qBaseInsert and tBaseInsert having negative values, probably because this is a protein alignment.

# Remember to remove result2 and result3 when KG is built and validated.

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ABOVE IS AN EXCURSION TO BLAT VAR SPLICE PROTEINS ^^^^^^^^^^^^^

   cd /cluster/data/mm7/bed/kgMm7A

# create all_mrna.psl and tight_mrna.psl
   hgsql mm7 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl

   pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl tight_mrna.psl /dev/null
# Processed 214613 alignments

# Use overlapSelect to get protein and mRNA alignment overlaps   
   overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
   -selectFmt=psl -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.stat

   overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
   -inFmt=psl tight_mrna.psl protBlat/protBlat.psl protMrna.out

# Create protein/mRNA pair and protein lists
   cut -f 10,31 protMrna.out|sort -u >spMrna.tab
   cut -f 10    protMrna.out|sort -u >protein.lis

# Load spMrna.tab into spMrna table in temp DB.
   hgsql kgMm7ATemp -e 'drop table spMrna'
   hgsql kgMm7ATemp < ~/src/hg/lib/spMrna.sql
   hgsql kgMm7ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
   hgsql kgMm7ATemp -e 'create index mrnaID on spMrna(mrnaID)'

# Prepare and perform cluster run of protein/mRNA alignment

# Get mRNA fa file.
   cd /cluster/data/mm7/bed/kgMm7A
   /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm7 \
   -gbRoot=/cluster/data/genbank genbank mrna mrna.fa

# Create mrnaSeq table in kgMm7ATemp DB.

   faToTab mrna.fa mrnaSeq.tab

   hgsql kgMm7ATemp -e 'drop table mrnaSeq'
   hgsql kgMm7ATemp <~/src/hg/lib/mrnaSeq.sql
   hgsql kgMm7ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
   rm mrnaSeq.tab

# Prepare files for cluster run ??
   ~/src/hg/protein/KG2.sh kgMm7A mm7 051015

# Perform cluster run of protein/mRNA alignment
   ~/src/hg/protein/KG3.sh kgMm7A mm7 051015

# Collect cluster run results
   cd kgBestMrna
   ls out | sed -e 's/prot/do1 prot/g' >doall

# create do1 with the following 2 lines:
   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments
   pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
   cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
   wc protMrna.lis
# 225764  451528 3587320 protMrna.lis

# Load BLAT results into temp DB.
   hgsql kgMm7ATemp < ~/src/hg/lib/protMrnaBlat.sql
   hgsql kgMm7ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
   hgsql kgMm7ATemp -e 'create index tName on protMrnaBlat(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm7ATemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cut -f 22-30 ../protMrna.out > j1.tmp
   cut -f 32-42 ../protMrna.out > j2.tmp
   cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

############################################

# Redo prot-mRNA alignment using BLAST instead of BLAT, per Mark's recommendation.

   ~/src/hg/protein/kgProtMrna.sh kgMm7A mm7 051015

# During the following job:
#
#   ./kgProtMrnaBlast.csh /iscratch/i/kgDB/kgMm7A/kgProtMrna/kgProtPep/kgProtPep4993.fa
#
# the blastall program had a segmentation fault on the following protein:
#
#   Q8C9X7
# Removed this protein from /iscratch/i/kgDB/kgMm7A/kgProtMrna/kgProtPep/kgProtPep4993.fa
# on kk manually and re-ran:

   ./kgProtMrnaBlast.csh /iscratch/i/kgDB/kgMm7A/kgProtMrna/kgProtPep/kgProtPep4993.fa

# Collect alignment results and filter out low quality alignments.
# The filtered output is written to protMrnaBlast.psl, the same file that is
# summarized into protMrna.lis and loaded into the protMrnaBlast table below.

   find ./psl.tmp -name '*.psl.gz' | xargs zcat | \
   pslReps -nohead -singleHit -minAli=0.9 stdin protMrnaBlast.psl /dev/null

# List the unique (column 10, column 14) pairs of the filtered psl and count them.
   cut -f 10,14 protMrnaBlast.psl |sort -u >protMrna.lis
   wc protMrna.lis

# 210958  421916 3365702 protMrna.lis

# Load BLAST results into temp DB.
   hgsql kgMm7ATemp < ~/src/hg/lib/protMrnaBlast.sql
   hgsql kgMm7ATemp -e 'load data local infile "protMrnaBlast.psl" into table protMrnaBlast'
   hgsql kgMm7ATemp -e 'create index tName on protMrnaBlast(tName)'

# Create CDS files from protein/mRNA alignment results.
   hgsql kgMm7ATemp -N -e \
   'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlast order by qName,tName,tEnd-tStart desc'\
   |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds

# Create protMrna.psl with proteinID_mrnaID as query ID.
   cd /cluster/data/mm7/bed/kgMm7A
   cut -f 22-30 protMrna.out > j1.tmp
   cut -f 32-42 protMrna.out > j2.tmp
   cut -f 10,31 protMrna.out|sed -e 's/\t/_/g' >j3.tmp
   paste j1.tmp j3.tmp j2.tmp >protMrna.psl
   rm j1.tmp j2.tmp j3.tmp

# Run mrnaToGene to create protMrna.gp
   bash
   mrnaToGene -cdsFile=kgProtMrna/protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
   exit

############################################

# Prepare refGene and all_mrna gp files.

   cd /cluster/data/mm7/bed/kgMm7A 
   hgsql mm7 -N -e 'select * from refGene' >ref.gp

   hgsql mm7 -N -e \
   'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
   |sort -u > all_mrna.cds

   bash
   mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
   exit

# Align proteins to RefSeq.

   overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp ref.stat
   overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
   protBlat/protBlat.psl ref.gp protRef.gp

   overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
   -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out

   cut -f 10,22 protRef.out | sort -u >spRef.tab
   cut -f 10 protRef.out    | sort -u >protRef.lis

   hgsql kgMm7ATemp -e 'drop table spRef'
   hgsql kgMm7ATemp <~/src/hg/lib/spRef.sql
   hgsql kgMm7ATemp -e 'load data local infile "spRef.tab" into table spRef'

# Prepare and perform cluster runs for protein/RefSeq alignments

   ~/src/hg/protein/KGRef2.sh kgMm7A mm7 051015
   ~/src/hg/protein/KGRef3.sh kgMm7A mm7 051015

   cd kgBestRef
   ls out | sed -e 's/prot/do1 prot/g' >doall

   cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'

   chmod +x do*
   doall

# Filter out low quality alignments.
   pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
   cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
   wc protRef.lis
# 54489  108978  940797 protRef.lis

   hgsql kgMm7ATemp -e 'drop table protRefBlat'
   hgsql kgMm7ATemp < ~/src/hg/lib/protRefBlat.sql
   hgsql kgMm7ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
   hgsql kgMm7ATemp -e 'create index tName on protRefBlat(tName)'

# Run gene-check to filter out invalid gp entries
   cd /cluster/data/mm7/bed/kgMm7A
   cat ref.gp protMrna.gp all_mrna.gp >kgCandidate0.gp
   gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -details-out kgCandidate0.check.detail \
   -nib-dir /cluster/data/mm7/nib kgCandidate0.gp kgCandidate0.check

   hgsql kgMm7ATemp -e 'drop table kgCandidate0'
   hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidate0.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'

   hgsql kgMm7ATemp -e 'drop table geneCheck'
   hgsql kgMm7ATemp < ~/src/hg/lib/geneCheck.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'

# Run kgCheck to get all KG candidates that pass the KG gene check criteria

   kgCheck2 kgMm7ATemp mm7 kgCandidate0 geneCheck kgCandidate.tab
   hgsql kgMm7ATemp -e  'drop table kgCandidate'
   hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidate.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
   hgsql kgMm7ATemp -e 'create index alignID on kgCandidate(alignID)'

# Construct the kgCandidateX table that has alignID in the name field. 
   cut -f 2-10 kgCandidate.tab >j2.tmp
   cut -f 11 kgCandidate.tab >j1.tmp
   paste j1.tmp j2.tmp >kgCandidateX.tab

   hgsql kgMm7ATemp -e  'drop table kgCandidateX'
   hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidateX.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'

# Score protein/mRna and protein/RefSeq alignments

   cd /cluster/data/mm7/bed/kgMm7A
   kgResultBestMrna2 051015 kgMm7ATemp mm7 protMrnaBlast|sort -u >protMrnaBlastScore.tab
   kgResultBestRef2  051015 kgMm7ATemp mm7 protRefBlat|sort -u >protRefScore.tab

# Combine scoring results and load them into temp DB.
   cat protMrnaBlastScore.tab protRefScore.tab >protMrnaScore.tab
   hgsql kgMm7ATemp -e 'drop table protMrnaScore'
   hgsql kgMm7ATemp < ~/src/hg/lib/protMrnaScore.sql
   hgsql kgMm7ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
   hgsql kgMm7ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'

# Run kgGetCds to get CDS structure of each gene

   kgGetCds kgMm7ATemp 051015 kgCandidateX jY.tmp
   cat jY.tmp |sort -u >kgCandidateY.tab
   rm jY.tmp
   hgsql kgMm7ATemp -e  'drop table kgCandidateY'
   hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidateY.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'

# Run kgPickPrep to replace long cds structure string with cdsId.
   kgPickPrep kgMm7ATemp kgCandidateZ.tab
   hgsql kgMm7ATemp -e  'drop table kgCandidateZ'
   hgsql kgMm7ATemp < ~/src/hg/lib/kgCandidateZ.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
   hgsql kgMm7ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'

# Run kgPick to pick a representative mrna/protein pair for each unique CDS structure.

   kgPick kgMm7ATemp mm7 proteins051015 kg4.tmp dupSpMrna.tmp
   sort -u dupSpMrna.tmp >dupSpMrna.tab

# Create put back list

# gbGetSeqsX, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
    gbGetSeqsX -gbRoot=/cluster/data/genbank -db=mm7 -get=ra RefSeq mrna ref.ra

# NOTE(review): the queries below read mm7.refRa; the step that loads ref.ra
# into that table is not recorded in this section - confirm it was done.

# Each query selects refRa rows of one attribute type, restricted to accessions
# whose "rss" attribute is "rev" and whose "org" attribute is "Mus musculus".
# The first query creates kgPutBack2.tab; the remaining queries append to it.

# attr = "selenocysteine"
    hgsql mm7 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >kgPutBack2.tab

# attr = "cno" with a value mentioning "ribosomal frameshift"
    hgsql mm7 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

# attr = "cno" with a value mentioning "non-AUG"
    hgsql mm7 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

# attr = "translExcept"
    hgsql mm7 -N -e \
    'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

# attr = "exception"
    hgsql mm7 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
    >>kgPutBack2.tab

# Add entries in the put back list
# Obtain from Mark the put back list, kgPutBack.lis, for human RefSeq.

   hgsql kgMm7ATemp -e 'drop table kgPutBack2'
   hgsql kgMm7ATemp < ~/src/hg/lib/kgPutBack2.sql
   hgsql kgMm7ATemp -e  'load data local infile "kgPutBack2.tab" into table kgPutBack2'

   kgPutBack kgMm7ATemp mm7 proteins051015 kgPutBack2 kgPutBack2.gp

# Sort KG genes to make the kg4.gp table file.
   cat kgPutBack2.gp kg4.tmp > kg4B.tmp
   ~/kent/src/hg/protein/sortKg.pl kg4B.tmp >kg4.gp

   hgsql kgMm7ATemp -e  'drop table knownGene'
   hgsql kgMm7ATemp < ~/src/hg/lib/knownGene.sql
   hgsql kgMm7ATemp -e  'load data local infile "kg4.gp" into table knownGene'
   
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.

   hgsql mm7 -e  'drop table dupSpMrna'
   hgsql mm7 < ~/src/hg/lib/dupSpMrna.sql
   hgsql mm7 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'

# Perform analysis before loading kg4 table data to mm7.knownGene table.

# Load data into mm7 knownGene table.
   hgsql mm7 -e  'drop table knownGene'
   hgsql mm7 < ~/src/hg/lib/knownGene.sql
   hgsql mm7 -e  'load data local infile "kg4.gp" into table knownGene'

# Build knownGeneMrna and knownGenePep tables.

   kgPepMrna kgMm7ATemp mm7 051015
   hgsql mm7 -e  'drop table knownGeneMrna'
   hgsql mm7 < ~/src/hg/lib/knownGeneMrna.sql
   hgsql mm7 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
   hgsql mm7 -e  'drop table knownGenePep'
   hgsql mm7 < ~/src/hg/lib/knownGenePep.sql
   hgsql mm7 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'

# Build kgXref table

   kgXref2 kgMm7ATemp 051015 mm7

   hgsql mm7 -e  'drop table kgXref'
   hgsql mm7 < ~/src/hg/lib/kgXref.sql
   hgsql mm7 -e  'load data local infile "kgXref.tab" into table kgXref'

# Build spMrna table

   hgsql mm7 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab

   hgsql mm7 -e  'drop table spMrna'
   hgsql mm7 <~/src/hg/lib/spMrna.sql
   hgsql mm7 -e 'load data local infile "kgSpMrna.tab" into table spMrna'

# Build mrnaRefseq table

# Create a dated download directory and repoint the /cluster/data/entrez
# symlink at it, so the following steps work on the 051113 snapshot.
   mkdir /cluster/store11/entrez
   cd /cluster/store11/entrez
   mkdir 051113
   rm /cluster/data/entrez
   ln -s /cluster/store11/entrez/051113 /cluster/data/entrez
   cd /cluster/data/entrez

# Fetch the NCBI Entrez Gene tables.  The option must be written as one word
# (--timestamping); "-- timestamp" ends option parsing and makes wget treat
# "timestamp" as a URL to download.
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
   gzip -d *.gz

   cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
   cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
   cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
   
   hgsql entrez -e 'drop table entrezRefseq'
   hgsql entrez -e 'drop table entrezMrna'
   hgsql entrez -e 'drop table entrezRefProt'

   hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
   hgsql entrez < ~/src/hg/lib/entrezMrna.sql
   hgsql entrez < ~/src/hg/lib/entrezRefProt.sql

   hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
   hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
   hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'

   hgsql entrez -N -e \
   'select mrna, refseq from entrezRefseq, entrezMrna where entrezRefseq.geneID=entrezMrna.geneID' \
   >mrnaRefseq.tab

   hgsql mm7 -e 'drop table mrnaRefseq'
   hgsql mm7 < ~/src/hg/lib/mrnaRefseq.sql
   hgsql mm7 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'

# Build kgProtMap table
    cd /cluster/data/mm7/bed/kgMm7A
    ~/src/hg/protein/kgProtMap2.sh kgMm7A mm7 051015

# Update and clean up kgResultBestMrna2.c and then check it in.

# Build alias tables.		
#	kgAliasM reads from proteins051015.hugo.symbol, proteins051015.hugo.aliases
#	proteins051015.hugo.withdraws, mm7.kgXref.kgID
#	to create kgAliasM.tab and geneAlias.tab
#	by picking out those kgID items from kgXref where
#	kgXref.geneSymbol == hugo.symbol

   cd /cluster/store11/kg/kgMm7A
   mkdir alias
   cd alias
   kgAliasM mm7 proteins051015

#	kgAliasKgXref reads from mm7.knownGene.proteinID,
#	mm7.knownGene.name, mm7.kgXref.geneSymbol
#	to create kgAliasKgXref.tab

   kgAliasKgXref mm7

#	kgAliasRefseq reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.kgXref.refseq
#	to create kgAliasRefseq.tab

   kgAliasRefseq mm7

   hgsql sp051015 -N -e 'select name,gene.val from mm7.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
   | sort -u  > kgAliasP.tab

   hgsql mm7 -N -e 'select name, name from knownGene' >kgAliasDup.tab
   hgsql mm7 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
   
   cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
   sort |uniq > kgAlias.tab

   hgsql -e "drop table kgAlias;" mm7 
   hgsql mm7 < ~/kent/src/hg/lib/kgAlias.sql
   hgsql mm7 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 

#	kgProtAlias reads from mm7.knownGene.name,
#	mm7.knownGene.proteinID, mm7.knownGene.alignID,
#	proteins051015.spXref3.accession, proteins051015.spSecondaryID, proteins051015.pdbSP.pdb
#	to create kgProtAlias.tab

   kgProtAlias mm7 051015

   hgsql mm7 -N -e \
   'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
   | sort -u >kgProtAliasNCBI.tab

# include variant splice protein IDs
   
   hgsql mm7 -N -e \
   'select name, proteinID, parAcc from knownGene,sp051015.varAcc where varAcc=proteinID'\
   |sort -u >kgProtAliasDup.tab

# include duplicate protein IDs from dupSpMrna table
   hgsql mm7 -N -e \
   'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
   |sort -u >>kgProtAliasDup.tab

# catch parent acc from dupProteinID too
   hgsql mm7 -N -e\
   'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp051015.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
   |sort -u >>kgProtAliasDup.tab
    cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab

    echo "`date` creating table kgProtAlias"
    hgsql mm7 -e "drop table kgProtAlias;"
    hgsql mm7 <~/src/hg/lib/kgProtAlias.sql; 
    hgsql mm7 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  

# Build kgSpAlias table

# Collect (kgID, spID, alias) rows from both the kgAlias and kgProtAlias joins.
# -N suppresses the column-name header line, so no data row can be lost to a
# later pattern-based header filter.
    hgsql mm7 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
# Remove duplicate rows across the two result sets.
    sort -u j.tmp >mm7.kgSpAlias.tab
    rm j.tmp

    hgsql mm7 -e 'drop table kgSpAlias';
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'

# MAKE FOLDUTR TABLES (DONE 2005-11-15 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/mm7/bed
    mkdir rnaStruct.2005-11-14
    rm rnaStruct
    ln -s rnaStruct.2005-11-14 rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm7 knownGene utr3 utr3/utr.fa
    utrFa mm7 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm7/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s
    faSplit sequence utr5/utr.fa 50000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 29251 of 29251 jobs
# CPU time in finished jobs:     732023s   12200.39m   203.34h    8.47d  0.023 y
# IO & Wait Time:                974945s   16249.08m   270.82h   11.28d  0.031 y
# Average job time:                  58s       0.97m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6132s     102.20m     1.70h    0.07d
# Submission to last job:          9125s     152.08m     2.53h    0.11d

# Do cluster run for 5' UTRs
# Same sequence as the 3' UTR run: expand the gsub template over in.lst into
# a job spec, create the batch, try a few jobs, then push the rest.
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
# Completed: 27603 of 27603 jobs
# CPU time in finished jobs:     137185s    2286.42m    38.11h    1.59d  0.004 y
# IO & Wait Time:                582578s    9709.63m   161.83h    6.74d  0.018 y
# Average job time:                  26s       0.43m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           10558s     175.97m     2.93h    0.12d
# Submission to last job:         11704s     195.07m     3.25h    0.14d

# Load database
    ssh hgwdev
    cd /cluster/data/mm7/bed/rnaStruct/utr5
    hgLoadRnaFold mm7 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm7 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Build KEGG pathway tables.  DONE 5/19/05.  Fan.

   ssh hgwdev
   cd /cluster/store11/kg/kgMm7A
# use mkdir rather than a personal "md" alias so the step works for any user
   mkdir kegg
   cd kegg

   ~/src/hg/protein/KGpath.sh kgMm7A mm7 051015

   hgsql mm7 -e "drop table keggMapDesc"
   hgsql mm7 -e "drop table keggPathway"
   hgsql mm7 <~/src/hg/lib/keggMapDesc.sql
   hgsql mm7 <~/src/hg/lib/keggPathway.sql
   hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
   hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'

# Build CGAP pathway tables
# Reloaded cgapAlias.tab file after removing replicate rows - 
# see the other cgap table entry in this document (hartera, 2005-10-07).

   cd ..
   ~/src/hg/protein/KGcgap.sh kgMm7A mm7 051015
   hgsql mm7 -e "drop table cgapAlias"
   hgsql mm7 -e "drop table cgapBiocDesc"
   hgsql mm7 -e "drop table cgapBiocPathway"
   hgsql mm7 <~/src/hg/lib/cgapAlias.sql
   hgsql mm7 <~/src/hg/lib/cgapBiocDesc.sql
   hgsql mm7 <~/src/hg/lib/cgapBiocPathway.sql
   hgsql mm7 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
   hgsql mm7 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
   hgsql mm7 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProtAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/mm7/bed/kgMm7A
     mkdir index
     cd index
     hgKgGetText mm7 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/mm7/bed/kgMm7A/index/knownGene.ix /gbdb/mm7/knownGene.ix
     ln -s /cluster/data/mm7/bed/kgMm7A/index/knownGene.ixx /gbdb/mm7/knownGene.ixx

# BUILD KNOWN GENE LIST FOR GOOGLE.  DONE 11/28/05 Fan.

    cd /cluster/data/mm7/bed
    rm -rf knownGeneList/mm7

# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm7

    hgKnownGeneList mm7

# copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/mm7
    mkdir -p /usr/local/apache/htdocs/knownGeneList/mm7
    cp -Rfp knownGeneList/mm7/* /usr/local/apache/htdocs/knownGeneList/mm7

############################################################################

### MAKE THE affyU74 TRACK - needed for the Gene Sorter 
#                              (DONE - 2005-11-14 - Fan)
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
# target sequences. Recalculate alignments and load data
# ----------------------------------
# Load up semi-local disk with target sequences for Affy mouse U74 chips.
# ssh kkr1u00
# mkdir -p /iscratch/i/affy
#	This /projects filesystem is not available on kkr1u00
#	but it is on kk
# ssh kk
# cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy

ssh kkr1u00
iSync

# Run cluster job to do alignments
ssh kk
mkdir /cluster/data/mm7/bed/affyU74.2005-11-14
cd /cluster/data/mm7/bed/affyU74.2005-11-14
mkdir run
cd run
mkdir psl
echo /cluster/bluearc/mm7/nib/*.nib | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy

gensub2 genome.lst affy.lst gsub jobList
para create jobList
para try
para push
# Completed: 120 of 120 jobs
# CPU time in finished jobs:       6022s     100.37m     1.67h    0.07d  0.000 y
# IO & Wait Time:                  4113s      68.55m     1.14h    0.05d  0.000 y
# Average job time:                  84s       1.41m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             244s       4.07m     0.07h    0.00d
# Submission to last job:           280s       4.67m     0.08h    0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
ssh kk
cd /cluster/data/mm7/bed/affyU74.2005-11-14/run
pslSort dirs raw.psl tmp psl

# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null

# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm7/bed/affyU74.2005-11-14
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl
# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
hgLoadPsl mm7 affyU74.psl
# rm -fr chrom temp run

##   MAKE THE affyGnfU74 TRACKs (DONE - 2005-11-14 - Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.
# Fix broken symlinks to microarray data after directory structure changed
# (DONE, 2005-05-03, hartera)
# ----------------------------------
#This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm7/bed/affyGnf.2005-11-14
cd /cluster/data/mm7/bed/affyGnf.2005-11-14
#	may need to build this command in src/hg/affyGnf
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2005-11-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2005-11-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2005-11-14/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2

# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
mkdir sav
cp *.bed sav -p
cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed
cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed
cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed

# and reload data into table
hgLoadBed mm7 affyGnfU74A affyGnfU74A.bed
hgLoadBed mm7 affyGnfU74B affyGnfU74B.bed
hgLoadBed mm7 affyGnfU74C affyGnfU74C.bed

# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [THE SYM LINKS WERE ALREADY DONE.]
#    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    # fix broken symlinks after directory structure changed
    # /projects/compbiodata ----> /projects/compbio/data
    rm U74*
    # make correct symlinks (hartera, 2005-05-03)
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .

    # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
    # reload sequences with prefix removed so acc matches name used in
    # other dependent tables
                                                    
    hgLoadSeq -abbr=U74Av2: mm7 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
    hgLoadSeq -abbr=U74Bv2: mm7 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
    hgLoadSeq -abbr=U74Cv2: mm7 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa

### GNF ATLAS 2  [DONE Fan 2005-11-14]
    # Align probes from GNF1M chip.
    ssh kk
    cd /cluster/data/mm7/bed
    mkdir -p geneAtlas2/run/psl
    cd geneAtlas2/run
    #mkdir -p /cluster/bluearc/geneAtlas2
    #cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2
    #ls -1 /scratch/mus/mm7/maskedContigs/ > genome.lst
    echo /cluster/bluearc/mm7/nib/*.nib | wordLine stdin > genome.lst

    ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
    echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    para time
# CPU time in finished jobs:      53036s     883.93m    14.73h    0.61d  0.002 y
# IO & Wait Time:                   442s       7.37m     0.12h    0.01d  0.000 y
# Average job time:                1337s      22.28m     0.37h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3862s      64.37m     1.07h    0.04d
# Submission to last job:          3862s      64.37m     1.07h    0.04d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1m.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null

    #rm -r contig.psl raw.psl psl

    # Load probes and alignments from GNF1M into database.
    ssh hgwdev
    cd /cluster/data/mm7/bed/geneAtlas2
#    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
    hgLoadPsl mm7 affyGnf1m.psl
    hgLoadSeq mm7 /gbdb/hgFixed/affyProbes/gnf1m.fa

    # Load up track
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
    	affyGnf1m.psl
    # Note that the unmapped 5000 records are from all-N sequences.
    hgLoadBed mm7 gnfAtlas2 gnfAtlas2.bed

# MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2005-11-14, Fan)
#    mkdir -p /projects/compbio/data/microarray/affyMouse
    # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
#    unzip MOE430*_consensus.zip

    # check for duplicate probes: there are none, all have unique names
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database

#    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
#    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
 
#    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
#       /cluster/bluearc/affy/

    # THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04.

    # Set up cluster job to align MOE430 consensus sequences to mm7
    ssh kkr1u00
    cd /cluster/data/mm7/bed
    mkdir -p affyMOE430
    cd affyMOE430
#    mkdir -p /iscratch/i/affy
#    cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
#    iSync

    ssh kk
    cd /cluster/data/mm7/bed/affyMOE430
    ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
    echo /cluster/bluearc/mm7/nib/*.nib | wordLine stdin > genome.lst

    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 genome.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec
    para try
    para check
    para push
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       9533s     158.88m     2.65h    0.11d  0.000 y
# IO & Wait Time:                   924s      15.40m     0.26h    0.01d  0.000 y
# Average job time:                 261s       4.36m     0.07h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             855s      14.25m     0.24h    0.01d
# Submission to last job:           862s      14.37m     0.24h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyMOE430.psl
    pslSort dirs raw.psl tmp psl

    # only use alignments that cover 30% of sequence and have at least
    # 95% identity in aligned region. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null

    # Load alignments and sequences into database
    ssh hgwdev
    cd /cluster/data/mm7/bed/affyMOE430
    # shorten names in psl file
    sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
    mv affyMOE430.psl.bak affyMOE430.psl

    # load track into database

    hgLoadPsl mm7 affyMOE430.psl
 
    # Add consensus sequences for MOE430
    # Copy sequences to gbdb if they are not there already
#    mkdir -p /gbdb/hgFixed/affyProbes
#    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
#       /gbdb/hgFixed/affyProbes

    hgLoadSeq -abbr=MOE430 mm7 /gbdb/hgFixed/affyProbes/MOE430_all.fa
    
    # Clean up
#    rm batch.bak contig.psl raw.psl 
    
    # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
    # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
    # add affyMOE430.html file and then do make alpha to add to trackDb table

######## BUILD GENE SORTER TABLES #######  (DONE - 2005-11-15 - Fan)
# These are instructions for building the
# Gene Sorter.  Don't start these until
# there is a knownGene track and the affy tracks

# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm7 knownGene knownIsoforms knownCanonical
# Got 18908 clusters, from 31449 genes in 40 chromosomes
#	featureBits mm7 knownCanonical
#     776952836 bases of 2583394090 (30.075%) in intersection
#	featureBits mm6 knownCanonical
# 	764263619 bases of 2597150411 (29.427%) in intersection
#	featureBits mm5 knownCanonical
#	853516995 bases of 2615483787 (32.633%) in intersection
#	featureBits mm4 knownCanonical
#	840021165 bases of 2627444668 (31.971%) in intersection
#	featureBits mm3 knownCanonical
#	825943052 bases of 2505900260 (32.960%) in intersection

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev
mkdir -p  /cluster/data/mm7/bed/geneSorter/blastp
cd /cluster/data/mm7/bed/geneSorter/blastp
pepPredToFa mm7 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known

# Copy over database to bluearc scratch
mkdir /cluster/bluearc/mm7/blastp
cp -p /cluster/data/mm7/bed/geneSorter/blastp/known.* /cluster/bluearc/mm7/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm7/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm7/bed/geneSorter/blastp/self
cd /cluster/data/mm7/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/mm7/blastp/known \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push 

# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      72141s    1202.34m    20.04h    0.83d  0.002 y
# IO & Wait Time:                600180s   10003.01m   166.72h    6.95d  0.019 y
# Average job time:                  87s       1.45m     0.02h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             282s       4.70m     0.08h    0.00d
# Submission to last job:          1745s      29.08m     0.48h    0.02d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/mm7/bed/geneSorter/blastp/self/run/out
hgLoadBlastTab mm7 knownBlastTab *.tab
# Scanning through 7729 files
# Loading database with 3391069 rows

# Create known gene mapping table and expression distance tables
# for GNF Atlas 2.  (The hgExpDistance takes an hour.)

hgMapToGene mm7 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm7 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
# Got 20114 unique elements in hgFixed.gnfMouseAtlas2MedianRatio

# Create table that maps between known genes and RefSeq
hgMapToGene mm7 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm7 affyU74  knownGene knownToU74
hgMapToGene mm7 affyMOE430 knownGene knownToMOE430
hgMapToGene mm7 affyMOE430 -prefix=A: knownGene knownToMOE430A

# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm7/bed/rinnSex
# Use the explicit path rather than the history reference "!$", which only
# works interactively and only if nothing else is run after the mkdir.
cd /cluster/data/mm7/bed/rinnSex
# Map the hgFixed.mouseRinnSexMedianRatio expression data onto the
# MOE430 probe alignments, then load the resulting bed as table rinnSex.
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
../affyMOE430/affyMOE430.psl
hgLoadBed mm7 rinnSex rinnSex.bed

# Format and load the GNF data
mkdir /cluster/data/mm7/bed/affyGnf95
cd /cluster/data/mm7/bed/affyGnf95
~/src/hg/affyGnf/affyPslAndAtlasToBed -newType ../affyU95.psl \
/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm7 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  These commands take about 15 minutes each
#	The affyGnfU74?Exps arguments appear to be unused in 
# hgExpDistance
cd /cluster/data/mm7/bed/geneSorter
hgExpDistance mm7 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
# Have 9621 elements in affyGnfU74A
# Got 11805 unique elements in affyGnfU74A
hgExpDistance mm7 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
# Have 11002 elements in affyGnfU74B
# Got 7173 unique elements in affyGnfU74B
hgExpDistance mm7 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
# Have 7502 elements in affyGnfU74C
# Got 2130 unique elements in affyGnfU74C

# C.ELEGANS BLASTP FOR GENE SORTER 
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh eieio
    mkdir /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version.  Then use that in place of 142 below.
    wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
    formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
    ssh kkr1u00
    if (-e /iscratch/i/ce2/blastp) then
      rm -r /iscratch/i/ce2/blastp
    endif
    mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/ce2/run/out
    cd /cluster/data/mm7/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
# Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
# List the chunk files in the geneSorter split directory, one per line and
# largest first (ls -1S), and rewrite the "kg" file prefix into a path
# relative to this run directory (blastp/ce2/run) so that split.lst holds
# usable input paths for gensub2.
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
# NOTE: the next line is shorthand for running the para subcommands one at
# a time (try, check, push, check, ...); it is not a literal command line.
para try, check, push, check, ...
# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      45124s     752.06m    12.53h    0.52d  0.001 y
# IO & Wait Time:                274207s    4570.12m    76.17h    3.17d  0.009 y
# Average job time:                  41s       0.69m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             354s       5.90m     0.10h    0.00d
# Submission to last job:          1859s      30.98m     0.52h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/ce2/run/out
hgLoadBlastTab mm7 ceBlastTab -maxPer=1 *.tab

# HUMAN BLASTP FOR GENE SORTER 
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already:
#    mkdir /cluster/data/hg17/bed/blastp
#    cd /cluster/data/hg17/bed/blastp
#    pepPredToFa hg17 knownGenePep known.faa
#    formatdb -i known.faa -t known -n known

#   ssh kkr1u00
#    if (-e /iscratch/i/hg17/blastp) then
#      rm -r /iscratch/i/hg17/blastp
#    endif
#    mkdir -p /iscratch/i/hg17/blastp
#    cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
#    iSync
# ABOVE WAS ALREADY DONE DURING MM6 BUILD.
    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/hg17/run/out
    cd /cluster/data/mm7/bed/blastp/hg17/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      89614s    1493.56m    24.89h    1.04d  0.003 y
# IO & Wait Time:                151484s    2524.74m    42.08h    1.75d  0.005 y
# Average job time:                  31s       0.52m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              88s       1.47m     0.02h    0.00d
# Submission to last job:          1605s      26.75m     0.45h    0.02d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/hg17/run/out
hgLoadBlastTab mm7 hgBlastTab -maxPer=1 *.tab

# ZEBRAFISH BLASTP FOR GENE SORTER 
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # The below is done by hg17, that section from makeHg17.doc is copied here.
#    ssh kkstore
#    mkdir /cluster/data/danRer2/bed/blastp
#    cd /cluster/data/danRer2/bed/blastp
#    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.apr.pep.fa.gz
    
#    zcat Dan*.pep.fa.gz > ensembl.faa
#    formatdb -i ensembl.faa -t ensembl -n ensembl
#    ssh kkr1u00
#    if (-e /iscratch/i/danRer2/blastp) then
#      rm -r /iscratch/i/danRer2/blastp
#    endif
#    mkdir -p /iscratch/i/danRer2/blastp
#    cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
#    iSync
# ABOVE WAS ALREADY DONE DURING MM6 BUILD.

# The above is copied from makeHg17.doc.

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/danRer2/run/out
    cd /cluster/data/mm7/bed/blastp/danRer2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      81817s    1363.61m    22.73h    0.95d  0.003 y
# IO & Wait Time:                148583s    2476.39m    41.27h    1.72d  0.005 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              93s       1.55m     0.03h    0.00d
# Submission to last job:          2638s      43.97m     0.73h    0.03d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/danRer2/run/out
hgLoadBlastTab mm7 drBlastTab -maxPer=1 *.tab

# YEAST BLASTP FOR GENE SORTER 
    # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on 
    # RefSeq.  First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
#    mkdir /cluster/data/sacCer1/bed/blastp
#    cd /cluster/data/sacCer1/bed/blastp
#    wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
#    zcat orf_trans.fasta.gz > sgdPep.faa
#    formatdb -i sgdPep.faa -t sgdPep -n sgdPep

#    ssh kkr1u00
    # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
    # fortunately we won't be looking for homologs there.  :)
#    if (-e /iscratch/i/sacCer1/blastp) then
#      rm -r /iscratch/i/sacCer1/blastp
#    endif
#    mkdir -p /iscratch/i/sacCer1/blastp
#    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
#    iSync
# ABOVE WAS ALREADY DONE DURING MM6 BUILD.

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/mm7/bed/blastp/sacCer1/run/out
    cd /cluster/data/mm7/bed/blastp/sacCer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      13337s     222.29m     3.70h    0.15d  0.000 y
# IO & Wait Time:                149594s    2493.23m    41.55h    1.73d  0.005 y
# Average job time:                  21s       0.35m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             121s       2.02m     0.03h    0.00d
# Submission to last job:         15796s     263.27m     4.39h    0.18d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/sacCer1/run/out
hgLoadBlastTab mm7 scBlastTab -maxPer=1 *.tab

# DM1 BLASTP FOR GENE SORTER (DONE 5/30/05, Fan)
    # Make Drosophila melanogaster ortholog column using blastp on FlyBase.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    # This is already done, see makeMm3.doc for procedure
    # the directory: /cluster/bluearc/dm1/blastp should have data

    # ssh kkr1u00
    # if (-e /iscratch/i/dm1/blastp) then
    #   rm -r /iscratch/i/dm1/blastp
    # endif
    # mkdir -p /iscratch/i/dm1/blastp
    # cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
    # iSync
    # THE ABOVE IS ALREADY DONE BY ANGIE
# PLEASE NOTE THE SPLIT DATA WERE COPIED TO STORE3.  SEE BLASTP RUN FOR RN3 FOR DETAILS


    # Make parasol run directory 
    ssh kk
#    mkdir -p /cluster/data/mm7/bed/blastp/dm1/run/out

    mkdir -p /cluster/store3/mm7/bed/blastp/dm1/run/out
    cd /cluster/data/mm7/bed/blastp/dm1/run
    
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
# ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
# |sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst

ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=/cluster/store3/mm7/bed/split/kg=g' >split.lst

gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ..
# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      50313s     838.55m    13.98h    0.58d  0.002 y
# IO & Wait Time:                 49291s     821.52m    13.69h    0.57d  0.002 y
# Average job time:                  13s       0.22m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              81s       1.35m     0.02h    0.00d
# Submission to last job:           341s       5.68m     0.09h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/dm1/run/out
hgLoadBlastTab mm7 dmBlastTab -maxPer=1 *.tab

# Create table that maps between known genes and LocusLink
cd /cluster/data/mm7/bed/geneSorter
# Dump the (mrnaAcc, locusLinkId) pairs from refLink, without a column-name
# header row, into refToLl.txt; that file is passed as the -lookup file to
# hgMapToGene below.
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm7 > refToLl.txt
hgMapToGene mm7 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#       row count is 23074

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm7 knownGene name proteinID Pfam knownToPfam
# row count is 22525 

# Create table to map between known genes and GNF Atlas2
# expression data.
    hgMapToGene mm7 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

# Create table that maps between known genes and genePix database 
    knownToGenePix mm7

# ENABLE GENE SORTER FOR mm7 IN HGCENTRALTEST (already done during first mm7 KG build)
    echo "update dbDb set hgNearOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

# RAT BLASTP FOR GENE SORTER 
    # Make RAT ortholog column using blastp on RAT known genes.
    # First make RAT protein database and copy it to iscratch/i
    # if it doesn't exist already:
#    mkdir /cluster/data/rn3/bed/blastp
#    cd /cluster/data/rn3/bed/blastp
#    pepPredToFa rn3 knownGenePep known.faa
#    formatdb -i known.faa -t known -n known

#   ssh kkr1u00
#    if (-e /iscratch/i/rn3/blastp) then
#      rm -r /iscratch/i/rn3/blastp
#    endif
#    mkdir -p /iscratch/i/rn3/blastp
#    cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
#    iSync
# ABOVE WAS ALREADY DONE DURING MM6 BUILD.

    # Make parasol run directory 
    ssh kk
# PLEASE NOTE THE SPLIT FILES ARE COPIED TO STORE3.  THE NEW KKSTORE-2 FILE SYSTEM,
# WHERE STORE5 RESIDES, HAS AN OS VERSION COMPATIBILITY PROBLEM THAT CAUSED A LOT
# OF I/O PAGE FAULTS.
    # Copy the split fasta chunks from store5 to store3, keep the store5
    # original as split.sav, and leave a symlink in its place that points
    # at the store3 copy.
    mkdir -p /cluster/store3/mm7/bed
    # Name the destination explicitly: no cd into store3 has been done at this
    # point, so copying to "." would put the files in whatever directory the
    # ssh session started in, not where the symlink below expects them.
    cp -Rp /cluster/store5/mm7/bed/geneSorter/blastp/split /cluster/store3/mm7/bed/
    cd /cluster/store5/mm7/bed/geneSorter/blastp
    mv split split.sav
    ln -s /cluster/store3/mm7/bed/split ./split

    mkdir -p /cluster/store3/mm7/bed/blastp/rn3/run/out
    mv /cluster/data/mm7/bed/blastp /cluster/data/mm7/bed/blastp.sav
    ln -s /cluster/store3/mm7/bed/blastp /cluster/data/mm7/bed/blastp
    cd /cluster/data/mm7/bed/blastp/rn3/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
# ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
# |sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
ls -1S /cluster/data/mm7/bed/geneSorter/blastp/split \
|sed -e 's=kg=/cluster/store3/mm7/bed/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7720 of 7720 jobs
# CPU time in finished jobs:      19182s     319.70m     5.33h    0.22d  0.001 y
# IO & Wait Time:                 20477s     341.28m     5.69h    0.24d  0.001 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              23s       0.38m     0.01h    0.00d
# Submission to last job:           140s       2.33m     0.04h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm7/bed/blastp/rn3/run/out
hgLoadBlastTab mm7 rnBlastTab -maxPer=1 *.tab

# END OF GENE SORTER STUFF (DONE 11/17/05, Fan)
#############################################################################

### MM7 PROTEOME BROWSER TABLES BUILD ####  (DONE - 2005-11-14 - Fan)
# These are instructions for re-building tables 
# needed for the Proteome Browser to be used with mm7.  
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.  
# This build is based on proteins DBs dated 051015.

# Create the working directory
   ssh hgwdev
   mkdir /cluster/data/mm7/bed/pb.2005-11-14
   cd /cluster/data/mm7/bed
   rm pb
   ln -s /cluster/data/mm7/bed/pb.2005-11-14 pb
   cd pb

# Define pep* tables in mm7 DB

   cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql

# edit the .sql file and get rid of non-PB table definitions, e.g. pepPred.

   vi pepAll.sql
   hgsql mm7 <pepAll.sql

# Build the pepMwAa table

  hgsql proteins051015 -e \
"select info.acc, molWeight, aaSize from sp051015.info, sp051015.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

hgsql mm7 -e 'load data local infile "pepMwAa.tab" into table mm7.pepMwAa ignore 1 lines;'

# Build the pepPi table

  hgsql proteins051015 -e "select info.acc from sp051015.info, sp051015.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis

  pbCalPi protAcc.lis sp051015 pepPi.tab

  hgsql mm7 -e 'load data local infile "pepPi.tab" into table mm7.pepPi;'

# Calculate and load pep distributions

  pbCalDist sp051015 proteins051015 10090 mm7 >pbCalDist.out

    cat pbCalDist.out
    wc  pbCalDist.out

    hgsql mm7
    load data local infile "pepExonCntDist.tab" into table mm7.pepExonCntDist;
    load data local infile "pepCCntDist.tab" into table mm7.pepCCntDist;
    load data local infile "pepHydroDist.tab" into table mm7.pepHydroDist;
    load data local infile "pepMolWtDist.tab" into table mm7.pepMolWtDist;
    load data local infile "pepResDist.tab" into table mm7.pepResDist;
    load data local infile "pepIPCntDist.tab" into table mm7.pepIPCntDist;
    load data local infile "pepPiDist.tab" into table mm7.pepPiDist;
    quit

# Calculate frequency distributions

    pbCalResStd sp051015 10090 mm7

# Create pbAnomLimit and pbResAvgStd tables

   hgsql mm7 < ~/src/hg/lib/pbAnomLimit.sql
   hgsql mm7 < ~/src/hg/lib/pbResAvgStd.sql

   hgsql mm7 -e 'load data local infile "pbResAvgStd.tab" into table mm7.pbResAvgStd;'
   hgsql mm7 -e 'load data local infile "pbAnomLimit.tab" into table mm7.pbAnomLimit;'

# UPDATE kgSpAlias TABLE TO BE USED BY PB 

    cd /cluster/data/mm7/bed/pb
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
    hgsql mm7 -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    cat j.tmp|sort -u |grep -v 'kgID' >mm7.kgSpAlias.tab
    rm j.tmp

    hgsql mm7 -e 'drop table kgSpAlias';
    hgsql mm7 < ~/src/hg/lib/kgSpAlias.sql
    hgsql mm7 -e 'load data local infile "mm7.kgSpAlias.tab" into table kgSpAlias'
    gzip mm7.kgSpAlias.tab

# Create pbStamp table for PB
   
  hgsql mm7 < ~/src/hg/lib/pbStamp.sql
  hgsql mm6 -N -e 'select * from pbStamp' > pbStamp.tab

  hgsql mm7 -e 'delete from pbStamp'
  hgsql mm7 -e 'load data local infile "pbStamp.tab" into table mm7.pbStamp'

# ENABLE PROTEOME BROWSER FOR mm7 IN HGCENTRALTEST (already done previously)
    echo "update dbDb set hgPbOk = 1 where name = 'mm7';" \
      | hgsql -h genome-testdb hgcentraltest

# Adjust drawing parameters for Proteome Browser stamps

#  Now invoke the Proteome Browser and, if necessary, adjust the drawing
#  parameters (mostly the ymax of each stamp) by updating the pbStamp.tab
#  file and then deleting and reloading the pbStamp table.

# Perform a preliminary review of the Proteome Browser for mm7, then
# notify QA for formal review.

# Update default Browser position
# bring up mySQL on genome-testdb and use hgcentraltest DB (done previously):

   update dbDb set defaultPos="chr2:146923205-146928018" where name="mm7";

# Create QA Push Queue entry with the following tables:

 ceBlastTab 
 cgapAlias 
 cgapBiocDesc 
 cgapBiocPathway 
 dmBlastTab 
 drBlastTab 
 dupSpMrna 
 foldUtr3 
 foldUtr5 
 gnfAtlas2Distance 
 hgBlastTab 
 keggMapDesc 
 keggPathway 
 kgAlias 
 kgProtAlias 
 kgProtMap 
 kgXref 
 knownBlastTab 
 knownCanonical 
 knownGene 
 knownGeneMrna 
 knownGenePep 
 knownIsoforms 
 knownToVisiGene 
 knownToGnf1m 
 knownToGnfAtlas2 
 knownToLocusLink 
 knownToMOE430 
 knownToMOE430A 
 knownToPfam 
 knownToRefSeq 
 knownToU74 
 knownToXmBest 
 rinnSex 
 rnBlastTab 
 scBlastTab 
 spMrna

# END OF mm7 KG/GS/PB RE-BUILD. 11/17/05 Fan.
#####################################################################

############################################################################
## build multiz17way mafFrames (multiz17wayFrames) (markd 2005/11/13)
## rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)

cd /cluster/data/mm7/bed/
mkdir multiz17wayFrames/
cd multiz17wayFrames/

# created Makefile and mkMafFrames based on one for canHg11
# create frames using knownGenes if available, if not refseqs if available,
# otherwise mRNA with CDS annotation
# created for these databases:

   rn3 hg17 panTro1 rheMac1 oryCun1 canFam2 bosTau2 galGal2 xenTro1 fr1 danRer3

nice make getGenes
nice make -j 3 getFrames
nice make loadDb

# need to do a make sanClean after track verified

    ###
    # rebuild frames to get bug fix, using 1-pass maf methodology
    # (2006-06-09 markd)
    ssh kkstore02
    cd /cluster/data/mm7/bed/multiz17wayFrames
    mv mafFrames/ mafFrames.old
    nice tcsh # easy way to get process niced
    (cat  ../maf/*.maf | time genePredToMafFrames mm7 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg17 genes/hg17.gp.gz mm7 genes/mm7.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac1 genes/rheMac1.gp.gz rn3 genes/rn3.gp.gz xenTro1 genes/xenTro1.gp.gz  |  gzip >multiz17way.mafFrames.gz)>&log&
    ssh hgwdev
    cd /cluster/data/mm7/bed/multiz17wayFrames

    hgLoadMafFrames mm7 multiz17wayFrames multiz17way.mafFrames.gz >&frameslog&


############################################################################
#  BLASTZ Chimp panTro1 second time (DONE - 2005-11-20 - 2005-11-22 - Hiram)
#	The first attempt didn't work well when it got into the multiple
#	alignment, it seemed to get lost in gap areas of the randoms.
    ssh pk
    mkdir /cluster/data/mm7/bed/blastzPanTro1.2005-11-20
    cd /cluster/data/mm7/bed
    rm -f blastz.panTro1
    ln -s blastzPanTro1.2005-11-20 blastz.panTro1
    cd blastzPanTro1.2005-11-20

    # Write the blastz DEF parameter file: mouse mm7 target, chimp panTro1
    # query.  The terminating line repeats the quoted delimiter verbatim,
    # which is the csh/tcsh here-document convention used in this doc.
    cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_M=50

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro1
SEQ2_DIR=/scratch/hg/panTro1/nib
SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastzPanTro1.2005-11-20
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-stop=load `pwd`/DEF > blastz.to.load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat -stop=load `pwd`/DEF > cat.to.load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=chainMerge -stop=load \
	`pwd`/DEF > chainMerge.to.load.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -stop=load \
	`pwd`/DEF > swap.to.load.out 2>&1 &

    #	Measurements:
    ssh kolossus
    cd  /cluster/data/mm7/bed/blastzPanTro1.2005-11-20
    time HGDB_CONF=~/.hg.conf.read-only featureBits mm7 \
	chainPanTro1Link > fb.mm7.chainPanTro1Link 2>&1
    cat fb.mm7.chainPanTro1Link
    #	906762987 bases of 2583394090 (35.100%) in intersection
    time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 \
	chainMm7Link > fb.panTro1.chainMm7Link 2>&1
    cat fb.panTro1.chainMm7Link
    #	899743967 bases of 2733948177 (32.910%) in intersection

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=download \
	`pwd`/DEF > download.out 2>&1 &

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -continue=download \
	`pwd`/DEF > downloadSwap.out 2>&1 &

############################################################################
# MM3/MM4/MM5/MM6 -> MM7 LIFTOVER CHAINS (DONE 11/21/2005 Andy)
    # These chains hopefully don't suck.
    # Sorry I only used the makeLoChain-align script from the set of scripts 
    # already created for this task.  I wanted more control.  I should mention 
    # I used a size of 10kb instead of 3kb for the split (blat query) sizes in
    # mm7.  This had a huge effect on the number of hits in the blat, which
    # then had a huge effect on the number of chains.  I should also mention 
    # that mm7 chromosomes chr1, chrX, and chrUn_random were split further 
    # into more than a single query file.  This helped a LOT in avoiding 
    # cluster hippos classically associated with those chroms.
    ######## LIFTOVER PREPARATION
    # Split up mm7
    ssh hgwdev
    cd /san/sanVol1/scratch/mm7
    mkdir -p liftSplits/{split,lift}
    for fa in /cluster/data/mm7/?{,?}/*.fa; do
      c=`basename $fa .fa`
      echo $c
      faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
    done
    mkdir -p biggerSplits/split
    cd biggerSplits/
    ln -s ../liftSplits/lift
    cd split/
    ln -s ../../liftSplits/split/* .
    faSplit sequence chr1.fa 5 chr1_
    faSplit sequence chrX.fa 10 chrX_
    rm chr{1,X}.fa

    # Make some dirs
    # mm6 is included because the MM6 blat step below moves its run directory
    # into /san/sanVol1/scratch/mm6; mkdir -p is a no-op if it already exists.
    cd /san/sanVol1/scratch
    mkdir -p mm{3,4,5,6}
    # Copy 11.ooc files to each of mm3, mm4, mm5, mm6 dirs.

    ######## LIFTOVER BLATING    
    # MM3
    ssh pk
    cd /cluster/data/mm3
    makeLoChain-align mm3 /san/sanVol1/scratch/mm3/nib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split
    cd bed/
    mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm3 
    cd /san/sanVol1/scratch/mm3/blat.mm7.2005-11-17/run/
    sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm3ToMm7"}' > newspec
    para create newspec
    para -maxNode=200 -priority=25 push
    para time
#Completed: 2109 of 2109 jobs
#CPU time in finished jobs:    2322886s   38714.77m   645.25h   26.89d  0.074 y
#IO & Wait Time:                 20395s     339.92m     5.67h    0.24d  0.001 y
#Average job time:                1111s      18.52m     0.31h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:           21949s     365.82m     6.10h    0.25d
#Submission to last job:         41513s     691.88m    11.53h    0.48d

    # MM4
    ssh pk
    cd /cluster/data/mm4
    makeLoChain-align mm4 /scratch/mus/mm4/softNib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split
    cd bed/
    mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm4 
    cd /san/sanVol1/scratch/mm4/blat.mm7.2005-11-17/run/
    sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm4ToMm7"}' > newspec
    para create newspec
    para -maxNode=200 -priority=25 push
    para time
#Completed: 2508 of 2508 jobs
#CPU time in finished jobs:    1351933s   22532.22m   375.54h   15.65d  0.043 y
#IO & Wait Time:                 13885s     231.41m     3.86h    0.16d  0.000 y
#Average job time:                 545s       9.08m     0.15h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            6516s     108.60m     1.81h    0.08d
#Submission to last job:         25881s     431.35m     7.19h    0.30d

    # MM5
    ssh pk
    cd /cluster/data/mm5
    makeLoChain-align mm5 /scratch/mus/mm5/softNib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split
    cd bed/
    mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm5 
    cd /san/sanVol1/scratch/mm5/blat.mm7.2005-11-17/run/
    sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm5ToMm7"}' > newspec
    para create newspec
    para -maxNode=200 -priority=25 push
    para time
#Completed: 2451 of 2451 jobs
#CPU time in finished jobs:    1266001s   21100.02m   351.67h   14.65d  0.040 y
#IO & Wait Time:                 13972s     232.87m     3.88h    0.16d  0.000 y
#Average job time:                 522s       8.70m     0.15h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            6769s     112.82m     1.88h    0.08d
#Submission to last job:         26506s     441.77m     7.36h    0.31d

    # MM6
    cd /cluster/data/mm6
    makeLoChain-align mm6 /scratch/mus/mm6/nib mm7 /san/sanVol1/scratch/mm7/biggerSplits/split
    cd bed/
    mv blat.mm7.2005-11-17/ /san/sanVol1/scratch/mm6 
    cd /san/sanVol1/scratch/mm6/blat.mm7.2005-11-17/run/
    sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/andy/" $0 " mm6ToMm7"}' > newspec
    para create newspec
    para -maxNode=200 -priority=25 push
    para time
#Completed: 2280 of 2280 jobs
#CPU time in finished jobs:    1250929s   20848.81m   347.48h   14.48d  0.040 y
#IO & Wait Time:                 12983s     216.39m     3.61h    0.15d  0.000 y
#Average job time:                 554s       9.24m     0.15h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            6575s     109.58m     1.83h    0.08d
#Submission to last job:         26374s     439.57m     7.33h    0.31d

    ######## LIFTOVER CHAINING
    # LIFTING
    ssh pk
    cd /san/sanVol1/scratch/andy
    # NOTE(review): the here-document below is empty as recorded, so this
    # creates a zero-length mm7SplitLift.sh; the script's contents were not
    # captured in this log.  It is run from each blat.mm7.2005-11-17/raw
    # directory before the chaining step, which reads ../psl/*.psl, so it
    # presumably lifted the raw split-coordinate psl output into psl/ -- TODO
    # recover the original script.
    cat << "EOF" > mm7SplitLift.sh
EOF
    chmod +x mm7SplitLift.sh
    # Helper script: copies chainRaw/ to local /scratch disk, merge-sorts the
    # raw chains and pipes them through chainSplit into a chain/ directory,
    # copies chain/ back to the directory the script was started from (the
    # `dirs +1` entry after the pushd), then removes the local scratch copies.
    cat << "EOF" > mm7ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/andy/mm7Lifts
pushd /scratch/andy/mm7Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
EOF
    chmod +x mm7ChainMergeSplit.sh

    # MM3
    cd /san/sanVol1/scratch/mm3/blat.mm7.2005-11-17/raw
    /san/sanVol1/scratch/andy/mm7SplitLift.sh
    cd ../    
    mkdir chainRun chainRaw
    cd chainRun
    cat > gsub << "EOF"
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /san/sanVol1/scratch/mm3/nib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
EOF
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para push
    para time
#Completed: 40 of 40 jobs
#CPU time in finished jobs:       8687s     144.79m     2.41h    0.10d  0.000 y
#IO & Wait Time:                  2751s      45.85m     0.76h    0.03d  0.000 y
#Average job time:                 286s       4.77m     0.08h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             920s      15.33m     0.26h    0.01d
#Submission to last job:           921s      15.35m     0.26h    0.01d


    # MM4
    cd /san/sanVol1/scratch/mm4/blat.mm7.2005-11-17/raw
    /san/sanVol1/scratch/andy/mm7SplitLift.sh
    cd ../    
    mkdir chainRun chainRaw
    cd chainRun
    cat > gsub << "EOF"
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /scratch/mus/mm4/softNib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
EOF
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para push
    para time
#Completed: 40 of 40 jobs
#CPU time in finished jobs:       7678s     127.96m     2.13h    0.09d  0.000 y
#IO & Wait Time:                  4254s      70.90m     1.18h    0.05d  0.000 y
#Average job time:                 298s       4.97m     0.08h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            3087s      51.45m     0.86h    0.04d
#Submission to last job:          3088s      51.47m     0.86h    0.04d

    # MM5
    cd /san/sanVol1/scratch/mm5/blat.mm7.2005-11-17/raw
    /san/sanVol1/scratch/andy/mm7SplitLift.sh
    cd ../    
    mkdir chainRun chainRaw
    cd chainRun
    cat > gsub << "EOF"
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /scratch/mus/mm5/softNib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
EOF
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para push
    para time
#Completed: 40 of 40 jobs
#CPU time in finished jobs:       8450s     140.83m     2.35h    0.10d  0.000 y
#IO & Wait Time:                  9259s     154.32m     2.57h    0.11d  0.000 y
#Average job time:                 443s       7.38m     0.12h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            2929s      48.82m     0.81h    0.03d
#Submission to last job:          2968s      49.47m     0.82h    0.03d

    # MM6
    cd /san/sanVol1/scratch/mm6/blat.mm7.2005-11-17/raw
    /san/sanVol1/scratch/andy/mm7SplitLift.sh
    cd ../    
    mkdir chainRun chainRaw
    cd chainRun
    cat > gsub << "EOF"
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -psl $(path1) /scratch/mus/mm6/nib /san/sanVol1/scratch/mm7/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
EOF
    ls -1S ../psl/*.psl > in.lst
    gensub2 in.lst single gsub spec
    para create spec
    para push
    para time
#Completed: 40 of 40 jobs
#CPU time in finished jobs:       8140s     135.66m     2.26h    0.09d  0.000 y
#IO & Wait Time:                 13372s     222.87m     3.71h    0.15d  0.000 y
#Average job time:                 538s       8.96m     0.15h    0.01d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            2822s      47.03m     0.78h    0.03d
#Submission to last job:          2857s      47.62m     0.79h    0.03d

    ######### CHAINMERGE/NET/NETSUBSET
    ssh kolossus
    cd /scratch/andy/mm7Lifts
    cp -r ~/san/mm6/blat.mm7.2005-11-17/chainRaw/ .
    mkdir chain
    time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
#real    30m32.406s
#user    25m57.779s
#sys     1m57.769s
    cp -r chain ~/san/mm6/blat.mm7.2005-11-17/
    rm -rf chain*
    cp -r ~/san/mm5/blat.mm7.2005-11-17/chainRaw/ .
    mkdir chain
    time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
#real    31m48.856s
#user    27m19.356s
#sys     2m4.895s
    cp -r chain ~/san/mm5/blat.mm7.2005-11-17/
    rm -rf chain*
    cp -r ~/san/mm4/blat.mm7.2005-11-17/chainRaw/ .
    mkdir chain
    time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
#real    26m57.181s
#user    23m22.516s
#sys     1m45.750s
    # Net the mm4 chains and extract the liftOver subset, one chromosome at a
    # time.  The net/ and over/ output directories must exist before chainNet
    # and netChainSubset write into them.
    mkdir -p net over
    cd chain/
    for chainFile in *.chain; do
       chrom=${chainFile%.chain}
       echo "$chrom"
       /cluster/bin/x86_64/chainNet "$chainFile" /cluster/data/mm4/chrom.sizes \
        /cluster/data/mm7/chrom.sizes "../net/$chrom.net" /dev/null
       /cluster/bin/x86_64/netChainSubset "../net/$chrom.net" "$chainFile" \
        "../over/$chrom.over"
       echo "done $chainFile"
    done
    # Step back out of chain/ so that the chain, net and over directories
    # themselves (not just the loose chain files) are copied to the san; the
    # finishing step reads ~/san/mm4/blat.mm7.2005-11-17/over.
    cd ..
    cp -r chain net over ~/san/mm4/blat.mm7.2005-11-17/
    rm -rf chain* net over
    # Ehhhh... do the other ones in a cluster job
    cp -r ~/san/mm3/blat.mm7.2005-11-17/chainRaw/ .
    mkdir chain
    time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
#real    33m13.555s
#user    28m17.531s
#sys     2m5.165s
    cp -r chain ~/san/mm3/blat.mm7.2005-11-17/
    rm -rf chain*
    ssh pk
    cd ~/san/andy
    cat << "EOF" > netOver.sh
#!/bin/bash
# netOver.sh - net one chromosome's chains against mm7 and extract the
# liftOver subset.
# usage: netOver.sh <chainFile> <oldAssemblyChromSizes>
#   <chainFile> is expected at <blatDir>/chain/<chrom>.chain; results are
#   written to <blatDir>/net/<chrom>.net and <blatDir>/over/<chrom>.over.
# All expansions are quoted so paths containing whitespace are handled.

chain=$1
chrom=$(basename "$chain" .chain)
sizesMMOld=$2
sizesMM7=/cluster/data/mm7/chrom.sizes
chainDir=$(dirname "$chain")
blatDir=$(dirname "$chainDir")
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over

mkdir -p "${blatDir}/over" "${blatDir}/net" || exit 1
# Stop if netting fails rather than subsetting a missing or partial net file.
/cluster/bin/x86_64/chainNet "$chain" "$sizesMMOld" "$sizesMM7" "$net" /dev/null || exit 1
/cluster/bin/x86_64/netChainSubset "$net" "$chain" "$over"
EOF
    # << for emacs
    chmod +x netOver.sh
    mkdir netRun
    cd netRun/
    find /san/sanVol1/scratch/mm3/blat.mm7.2005-11-17/chain -name "*.chain" \
     | awk '{print "/san/sanVol1/scratch/andy/netOver.sh " $1 " /cluster/data/mm3/chrom.sizes"}' >> spec
    find /san/sanVol1/scratch/mm5/blat.mm7.2005-11-17/chain -name "*.chain" \
     | awk '{print "/san/sanVol1/scratch/andy/netOver.sh " $1 " /cluster/data/mm5/chrom.sizes"}' >> spec
    find /san/sanVol1/scratch/mm6/blat.mm7.2005-11-17/chain -name "*.chain" \
     | awk '{print "/san/sanVol1/scratch/andy/netOver.sh " $1 " /cluster/data/mm6/chrom.sizes"}' >> spec
    para create spec
    para push
    para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs:       4826s      80.44m     1.34h    0.06d  0.000 y
#IO & Wait Time:                  6816s     113.59m     1.89h    0.08d  0.000 y
#Average job time:                  97s       1.62m     0.03h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             247s       4.12m     0.07h    0.00d
#Submission to last job:           340s       5.67m     0.09h    0.00d
    
    ########## FINISHING
    ssh hgwdev

    # MM3
    cd ~/san/mm3/blat.mm7.2005-11-17/over
    # Write the merged chain one level up, as the mm4/mm5/mm6 steps do: a file
    # created inside over/ would be matched by the "*" glob itself and would
    # then be deleted by the rm -rf of over/ below.
    cat * >> ../mm3ToMm7.over.chain
    cd ../
    rm -rf psl/ net/ chain/ chainRaw/ over/
    cd ../
    cp -r blat.mm7.2005-11-17/ /cluster/data/mm3/bed
    cd /cluster/data/mm3/bed
    ln -s blat.mm7.2005-11-17 blat.mm7
    ln -s `pwd`/blat.mm7/mm3ToMm7.over.chain liftOver/mm3ToMm7.over.chain
    ln -s `pwd`/liftOver/mm3ToMm7.over.chain /gbdb/mm3/liftOver/mm3ToMm7.over.chain
    # Make sure the download directory exists (no-op if it already does).
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm3/liftOver
    cd /usr/local/apache/htdocs/goldenPath/mm3/liftOver
    cp /gbdb/mm3/liftOver/mm3ToMm7.over.chain .
    gzip mm3ToMm7.over.chain
    hgAddLiftOverChain mm3 mm7 /gbdb/mm3/liftOver/mm3ToMm7.over.chain

    # MM4
    cd ~/san/mm4/blat.mm7.2005-11-17/over
    cat * >> ../mm4ToMm7.over.chain
    cd ../
    rm -rf psl/ net/ chain/ chainRaw/ over/
    cd ../
    cp -r blat.mm7.2005-11-17/ /cluster/data/mm4/bed
    cd /cluster/data/mm4/bed
    ln -s blat.mm7.2005-11-17 blat.mm7
    ln -s `pwd`/blat.mm7/mm4ToMm7.over.chain liftOver/mm4ToMm7.over.chain
    ln -s `pwd`/liftOver/mm4ToMm7.over.chain /gbdb/mm4/liftOver/mm4ToMm7.over.chain
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm4/liftOver
    cd /usr/local/apache/htdocs/goldenPath/mm4/liftOver
    cp /gbdb/mm4/liftOver/mm4ToMm7.over.chain .
    gzip mm4ToMm7.over.chain
    hgAddLiftOverChain mm4 mm7 /gbdb/mm4/liftOver/mm4ToMm7.over.chain

    # MM5
    cd ~/san/mm5/blat.mm7.2005-11-17/over
    cat * >> ../mm5ToMm7.over.chain
    cd ../
    rm -rf psl/ net/ chain/ chainRaw/ over/
    cd ../
    cp -r blat.mm7.2005-11-17/ /cluster/data/mm5/bed
    cd /cluster/data/mm5/bed
    ln -s blat.mm7.2005-11-17 blat.mm7
    ln -s `pwd`/blat.mm7/mm5ToMm7.over.chain liftOver/mm5ToMm7.over.chain
    ln -s `pwd`/liftOver/mm5ToMm7.over.chain /gbdb/mm5/liftOver/mm5ToMm7.over.chain
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/liftOver
    cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver
    cp /gbdb/mm5/liftOver/mm5ToMm7.over.chain .
    gzip mm5ToMm7.over.chain
    hgAddLiftOverChain mm5 mm7 /gbdb/mm5/liftOver/mm5ToMm7.over.chain

    # MM6
    cd ~/san/mm6/blat.mm7.2005-11-17/over
    cat * >> ../mm6ToMm7.over.chain
    cd ../
    rm -rf psl/ net/ chain/ chainRaw/ over/
    cd ../
    cp -r blat.mm7.2005-11-17/ /cluster/data/mm6/bed
    cd /cluster/data/mm6/bed
    ln -s blat.mm7.2005-11-17 blat.mm7
    ln -s `pwd`/blat.mm7/mm6ToMm7.over.chain liftOver/mm6ToMm7.over.chain
    ln -s `pwd`/liftOver/mm6ToMm7.over.chain /gbdb/mm6/liftOver/mm6ToMm7.over.chain
    mkdir -p /usr/local/apache/htdocs/goldenPath/mm6/liftOver
    cd /usr/local/apache/htdocs/goldenPath/mm6/liftOver
    cp /gbdb/mm6/liftOver/mm6ToMm7.over.chain .
    gzip mm6ToMm7.over.chain
    hgAddLiftOverChain mm6 mm7 /gbdb/mm6/liftOver/mm6ToMm7.over.chain

############################################################################
# RIKEN CAGE STUFF (DONE 11-16-2005 Andy)
    cd /cluster/data/mm7/bed
    mkdir rikenCageCtss
    cd rikenCageCtss/
    # Dump the mm5 RIKEN CAGE tables to BED files for lifting.  "tail -n +2"
    # skips the first output line (the column-name header); the obsolete
    # "tail +2" spelling is not accepted by current tail implementations.
    # cut -f2- drops the first column of rikenCageTc (presumably the bin
    # index column -- confirm against the table schema).
    hgsql mm5 -e 'select * from rikenCageTc' | cut -f2- | tail -n +2 > rikenCageTc.mm5.bed
    hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssMinus' \
       | tail -n +2 > minus.mm5.bed
    hgsql mm5 -e 'select chrom,chromStart,chromEnd,dataValue from rikenCageCtssPlus' \
       | tail -n +2 > plus.mm5.bed
    liftOver rikenCageTc.mm5.bed /gbdb/mm5/liftOver/mm5ToMm7.over.chain rikenCageTc.mm7.bed \
       rikenCageTc.mm7.missed
    liftOver plus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm7.over.chain plus.mm7.bed \
       plus.mm7.missed
    liftOver minus.mm5.bed /gbdb/mm5/liftOver/mm5ToMm7.over.chain minus.mm7.bed \
       minus.mm7.missed
    wc -l *.missed
    hgLoadBed mm7 rikenCageTc rikenCageTc.mm7.bed
    hgLoadBed -strict -bedGraph=4 mm7 rikenCageCtssMinus minus.mm7.bed 
    hgLoadBed -strict -bedGraph=4 mm7 rikenCageCtssPlus plus.mm7.bed 

############################################################################
#   Hg17 BLASTZ Mm7 Lineage specific repeats comparison:
    #	using Lineage Specific Repeats:
    #	Mm7 target chunk size: 10,000,000 overlap 10,000
    #	Hg17 query size: 30,000,000
    #	chain minScore=3000 linearGap=medium
    #	Hg17 on Mm7
    featureBits mm7 chainHg17Link
    #	996434728 bases of 2583394090 (38.571%) in intersection
    featureBits refGene:cds mm7 chainHg17Link -enrichment
    #	refGene:cds 1.022%, chainHg17Link 38.571%, both 1.004%,
    #		cover 98.24%, enrich 2.55x
    #	Mm7 on Hg17
    featureBits hg17 chainMm7Link
    #	994737081 bases of 2866216770 (34.706%) in intersection
    featureBits refGene:cds hg17 chainMm7Link -enrichment
    #	refGene:cds 1.060%, chainMm7Link 34.706%, both 1.032%,
    #		cover 97.35%, enrich 2.80x


    #	Not using Lineage Specific Repeats, using BLASTZ dynamic masking
    #	with M=50, chain minScore=1000, linearGap=loose
    #	Hg17 target chunk size: 500,000 overlap 50
    #	Mm7 query chunk size: entire genome
    #	Hg17 on Mm7
    featureBits mm7 chainHg17noLSRLink
    #	952987983 bases of 2583394090 (36.889%) in intersection
    featureBits refGene:cds mm7 chainHg17noLSRLink -enrichment
    #	refGene:cds 1.022%, chainHg17noLSRLink 36.889%, both 0.987%,
    #		cover 96.61%, enrich 2.62x:
    featureBits hg17 chainMm7noLSRLink
    #	955168137 bases of 2866216770 (33.325%) in intersection
    #	Mm7 on Hg17
    featureBits refGene:cds hg17 chainMm7noLSRLink -enrichment
    #	refGene:cds 1.060%, chainMm7noLSRLink 33.325%, both 1.031%,
    #		cover 97.31%, enrich 2.92x

    #	Intersection of the two experiments:
    featureBits hg17 chainMm7noLSRLink chainMm7Link
    #	899176517 bases of 2866216770 (31.372%) in intersection
    featureBits mm7 chainHg17noLSRLink chainHg17Link
    #	896080475 bases of 2583394090 (34.686%) in intersection

    #	Chain length measurements:
    #	With lineage specific repeats
    #	Mm7 on Hg17:
    #	Number of Chains: 3,073,210
    #	min: 32 max: 242 median: 880 stddev: 206,679
    #	Number of Chain Links: 61,184,737
    #	min: 1 max: 17,154 median: 23 stddev: 64.4087
    #	Hg17 on Mm7:
    #	Number of Chains: 3,073,210
    #	min: 32 max: 1.22701e+08 median: 883 stddev: 177,227
    #	Number of Chain Links: 61,184,737
    #	min: 1 max: 17,154 median: 23 stddev: 64.4087

    #	Without lineage specific repeats:
    #	Mm7 on Hg17:
    #	Number of Chains: 990,397
    #	min: 26 max: 1.45906e+08 median: 330 stddev: 458,649
    #	min: 4,262 max: 2.45523e+08 median: 1.35414e+08 stddev: 6.02228e+07
    #	Number of Chain Links: 41,223,632
    #	min: 1 max: 17,154 median: 22 stddev: 43.5466

    #	table sizes:
    #	on hg17:
    #	chainMm7Link 2,586,839,468 bytes 61,184,737 rows
    #	chainMm7noLSRLink 1,743,097,348 bytes 41,223,632 rows
    #	chainMm7 267,042,956 bytes 3,073,210 rows
    #	chainMm7noLSR 86,227,444 bytes 990,397 rows

    #	on mm7:
    #	chainHg17Link 2,593,329,860 bytes 61,184,737 rows
    #	chainHg17noLSRLink 1,958,082,264 bytes 41,223,632 rows
    #	chainHg17 267,023,452 bytes 3,073,210 rows
    #	chainHg17noLSR 89,196,048 bytes 990,397 rows


############################################################################
# ADD LINK TO GENENETWORK (DONE. 2/9/06 Fan).

# Copy data from mm6

    hgsql mm6 -N -e 'select * from geneNetworkId' > geneNetworkId.tab

    hgsql mm7 -e 'drop table geneNetworkId'
    hgsql mm7 < ~/src/hg/lib/geneNetworkId.sql
    hgsql mm7 -e 'load data local infile "geneNetworkId.tab" into table geneNetworkId'
		    

# BLASTZ/CHAIN/NET RN4 (DONE 2/23/06 angie)
    ssh pk
    mkdir /cluster/data/mm7/bed/blastz.rn4.2006-02-23
    cd /cluster/data/mm7/bed/blastz.rn4.2006-02-23
    cat << '_EOF_' > DEF
# mouse vs rat

BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm7
SEQ1_DIR=/scratch/hg/mm7/nib
SEQ1_SMSK=/scratch/hg/mm7/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/mm7/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000

# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/rn4/nib
SEQ2_SMSK=/cluster/bluearc/scratch/hg/rn4/linSpecRep.notInMouse
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0

BASE=/cluster/data/mm7/bed/blastz.rn4.2006-02-23
TMPDIR=/scratch/tmp
'_EOF_'
    # << for emacs
    doBlastzChainNet.pl DEF -chainMinScore 3000 -chainLinearGap medium \
      -bigClusterHub pk \
      -blastzOutRoot /cluster/bluearc/blastzMm7Rn4Out >& do.log &
    tail -f do.log
    rm -f /cluster/data/mm7/bed/blastz.rn4
    ln -s blastz.rn4.2006-02-23 /cluster/data/mm7/bed/blastz.rn4


# Update table maps known genes to visiGene images (2006-03-07 galt)
    knownToVisiGene mm7

# UPDATED mm7.knownToVisiGene (2006-03-14 galt)
ssh hgwdev
knownToVisiGene mm7

# UPDATED mm7.knownToVisiGene (2006-04-05 galt)
ssh hgwdev
knownToVisiGene mm7


###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER (DROPUNDER) CHAINS FROM MM8  (2006-04-06 kate)
    
    ssh kkr1u00
    cd /cluster/data/mm7/bed
    # Already inside bed/, so create liftOver directly here; "bed/liftOver"
    # would have referred to /cluster/data/mm7/bed/bed/liftOver.
    mkdir -p liftOver
    cd liftOver
    # Run the split in the background, logging stdout+stderr to split.log
    # (">&!" is the tcsh overwrite-even-if-noclobber redirection).
    makeLoChain-split mm7 /cluster/data/mm7/nib >&! split.log &
    

#######################################################################
## LIFTOVER To Mm8 (DONE - 2006-04-21 - 2006-04-24 - Hiram)
    ssh kkr1u00
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
      mm8 /cluster/data/mm8/nib
    # as it says, DO THIS NEXT:
    ssh kk
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
	mm7 /scratch/hg/mm7/nib mm8 /iscratch/i/mm8/split10k \
	/cluster/data/mm8/11.ooc
    # as it says, DO THIS NEXT:
    cd /cluster/data/mm7/bed/blat.mm8.2006-04-21/run
    para try, check, push, check, ...
# Completed: 1360 of 1360 jobs
# CPU time in finished jobs:    4087937s   68132.28m  1135.54h   47.31d  0.130 y
# IO & Wait Time:                 15121s     252.02m     4.20h    0.18d  0.000 y
# Average job time:                3017s      50.28m     0.84h    0.03d
# Longest finished job:           25341s     422.35m     7.04h    0.29d
# Submission to last job:         84772s    1412.87m    23.55h    0.98d
    # as it says, DO THIS NEXT:
    ssh kkr1u00
    cd /cluster/data/mm7/bed
    ln -s blat.mm8.2006-04-21 blat.mm8
    #	runs liftUp to create the psl files
    time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm7 mm8
    # as it says, DO THIS NEXT:
    ssh kki
    #	prepares the chain batch job
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
	mm7 /cluster/data/mm7/nib mm8 /cluster/data/mm8/nib
    # as it says, DO THIS NEXT:
    cd /cluster/data/mm7/bed/blat.mm8.2006-04-21/chainRun
    para try, check, push, check, ...
# Completed: 34 of 34 jobs
# CPU time in finished jobs:       5834s      97.23m     1.62h    0.07d  0.000 y
# IO & Wait Time:                   852s      14.20m     0.24h    0.01d  0.000 y
# Average job time:                 197s       3.28m     0.05h    0.00d
# Longest finished job:             622s      10.37m     0.17h    0.01d
# Submission to last job:          1367s      22.78m     0.38h    0.02d
    ssh kkstore02
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm7 mm8
    #	Created /cluster/data/mm7/bed/liftOver/mm7ToMm8.over.chain.gz
    # as it says, DO THIS NEXT:
    ssh hgwdev
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm7 mm8
    #	It says this:
    #	Now, add link for
    #	/usr/local/apache/htdocs/goldenPath/mm7/liftOver/mm7ToMm8.over.chain
    #	to hgLiftOver
    #	But I believe that link was already done:
    cd /gbdb/mm7/liftOver
    ls -og mm7ToMm8*
    #	lrwxrwxrwx    1       53 Apr 24 14:22 mm7ToMm8.over.chain.gz -> \
    #		/cluster/data/mm7/bed/liftOver/mm7ToMm8.over.chain.gz

# ALTGRAPHX TRACK (sugnet) Wed Apr 26 14:00:54 PDT 2006

# First make altGraphX track for hg18 (see makeHg18.doc)
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/
mkdir orthoSplice.mm7
cd orthoSplice.mm7/
cp /cluster/data/mm7/bed/blastz.hg18/axtChain/mm7.hg18.all.chain.gz .
cp /cluster/data/mm7/bed/blastz.hg18/axtChain/mm7.hg18.net.gz .
gunzip *.gz
echo 'select chrom, size from chromInfo' | hgsql mm7 | grep -v chrom > chromSizes.tab
chainSplit chains mm7.hg18.all.chain 
netSplit mm7.hg18.net nets
mkdir agx report logs
# Write makeRun.sh.  Despite the .sh name it is a Perl script (see the
# shebang line).  For every line of chromSizes.tab it splits the line on
# whitespace and prints one orthoSplice command line, using field 0 as the
# chromosome name (-chrom, and in the per-chrom net/chain/agx/report/log
# file names) and field 1 as -chromSize.
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w

open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
    chomp;
    @w = split;
    print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -trumpNum=3 -chrom=$w[0] -altInFile=../mm7/agxs/mm7.$w[0].agx -orthoAgxFile=../hg18/hg18.agx -db=mm7 -orthoDb=hg18 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].mm7.hg18.cons.t3.agx -reportFile=report/$w[0].mm7.report -edgeFile=report/$w[0].mm7.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
    # << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec

cat agx/*.agx > mm7.hg18.t3.agx
cp ~/latestJk/kent/src/hg/lib/altGraphX.sql .

hgLoadBed -notItemRgb -sqlTable=altGraphX.sql mm7 altGraphX mm7.hg18.t3.agx 

# done altGraphX track.

# EXONWALK TRACK (sugnet) Wed Apr 26 14:04:28 PDT 2006

# First build the altGraphX track above.
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice.mm7
mkdir exonWalk
cd exonWalk 
mkdir beds

foreach file (`ls ../agx/*.agx`)
  set base=`basename $file .agx`
  echo "/cluster/home/sugnet/bin/i386/exonWalk db=mm7 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end

para create exonWalk.para.spec
para push
cat beds/*.bed > mm7.hg18.cons.t3.bed
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
splitFile ../../mm7.hg18.cons.t3.bed 500 exonWalk.

cd ..
cat << '_EOF_' > makeFa.sh
#!/bin/sh

for file in "$@"
do
 base=`basename $file`
 echo "Doing $file"
 echo "sequenceForBed -db=mm7 -bedIn=$file -fastaOut=fa/$base.fa "
 sequenceForBed -db=mm7 -bedIn=$file -fastaOut=fa/$base.fa 
done
'_EOF_'
    # << this line keeps emacs coloring happy
chmod 755 makeFa.sh
makeFa.sh beds/*

# Write makeGenePred.sh: for each file named on its command line, take the
# basename and run borfMatcher on beds/<base> and borf/<base>.borf, writing
# bedOrf/<base>.bed and genePred/<base>.gp.
# NOTE(review): this passes -keepNmd exactly like makeGenePredWithNmd.sh;
# presumably the non-NMD set was meant to be built without that flag --
# confirm against borfMatcher usage before re-running.
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh

for file in "$@"
do
  base=`basename $file`
  /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp 
done
'_EOF_'
    # << this line keeps emacs coloring happy
cat << '_EOF_' > makeGenePredWithNmd.sh
#!/bin/sh

for file in "$@"
do
  base=`basename $file`
  /cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrfWithNmd/$base.bed genePredWithNmd/$base.gp 
done
'_EOF_'
    # << this line keeps emacs coloring happy
chmod 755 *.sh
mkdir genePred
mkdir genePredWithNmd
mkdir bedOrfWithNmd

./makeGenePred.sh beds/*
./makeGenePredWithNmd.sh beds/*

# Concatenate the per-chunk results into one file per table.
# The genePred/ output is written to the .noNmd.gp name so that it matches
# the file ldHgGene loads into exonWalk below.
cat beds/* > mm7.hg18.exonWalk.all.bed
cat genePred/* > mm7.hg18.exonWalk.noNmd.gp
cat genePredWithNmd/* > mm7.hg18.exonWalk.withNmd.gp

# Load the bed track and the two genePred tracks.
hgLoadBed mm7 exonWalkAll mm7.hg18.exonWalk.all.bed 
ldHgGene -predTab mm7 exonWalk mm7.hg18.exonWalk.noNmd.gp 
ldHgGene -predTab mm7 exonWalkWithNmd mm7.hg18.exonWalk.withNmd.gp

trackGenome mm7 all refGene:cds trackGenome.spec 
Track Specification      track  overlap track    cov   track   new    cum 
                          size     size  geno  track     cov   cov    cov 
-----------------------------------------------------------------------------
exonWalk:cds          29568035 24053910  1.04%  81.35%  88.02% 88.02% 88.02%
exonWalk              50860916 24360287  1.79%  47.90%  89.14% 89.14% 89.14%
exonWalkWithNmd:cdf   50860916 24360287  1.79%  47.90%  89.14% 89.14% 89.14%
exonWalk              50860916 24360287  1.79%  47.90%  89.14% 89.14% 89.14%
exonWalkAll           58621025 25081327  2.06%  42.79%  91.78% 91.78% 91.78%
mrna                  136069914 26020276  4.78%  19.12%  95.22% 95.22% 95.22%
intronEst             52046578 21219340  1.83%  40.77%  77.65% 77.65% 77.65%
est                   185259362 23838078  6.51%  12.87%  87.23% 87.23% 87.23%
[hgwdev:orfs> trackGenome mm7 all refGene trackGenome.spec 
Track Specification      track  overlap track    cov   track   new    cum 
                          size     size  geno  track     cov   cov    cov 
-----------------------------------------------------------------------------
exonWalk:cds          29568035 24517871  1.04%  82.92%  52.77% 52.77% 52.77%
exonWalk              50860916 38905251  1.79%  76.49%  83.74% 83.74% 83.74%
exonWalkWithNmd:cdf   50860916 38905251  1.79%  76.49%  83.74% 83.74% 83.74%
exonWalk              50860916 38905251  1.79%  76.49%  83.74% 83.74% 83.74%
exonWalkAll           58621025 40837805  2.06%  69.66%  87.90% 87.90% 87.90%
mrna                  136069914 44970708  4.78%  33.05%  96.79% 96.79% 96.79%
intronEst             52046578 28482307  1.83%  54.72%  61.30% 61.30% 61.30%
est                   185259362 41486850  6.51%  22.39%  89.30% 89.30% 89.30%

# done exonWalk.

############################################################################
# SGP GENES (DONE - 2006-05-05 - Fan)
    ssh kkstore02
    #   keep the previous SGP predictions around as sgp_old, then start
    #   over in a fresh sgp directory
    cd /cluster/data/mm7/bed
    mv sgp sgp_old
    mkdir /cluster/data/mm7/bed/sgp
    cd /cluster/data/mm7/bed/sgp
    #   They don't do chrM
    #   (the loop below is sh syntax, hence the switch to bash)
    bash
    for CHR in `awk '{print $1}' ../../chrom.sizes | grep -v chrM`
    do
        wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmDec2005/SGP/humangp200603/${CHR}.gtf" \
        -O "${CHR}.gtf"
    done

    ssh hgwdev
    cd /cluster/data/mm7/bed/sgp
    ldHgGene -gtf -genePredExt mm7 sgpGene chr*.gtf

# Dropped the sgpPep table, just let hgc calculate predict protein sequence.
    hgsql mm7 -e 'drop table sgpPep'
    featureBits mm7 -enrichment refGene:CDS sgpGene
refGene:CDS 1.059%, sgpGene 1.451%, both 0.912%, cover 86.18%, enrich 59.39x

###########################################################################
#### LOAD ENSEMBL GENES (DONE 5/12/06, Fan)
# ADDED STABLE URL TO TRACKDB BLOCK (V38, APR 2006) (2008-01-11, rhead)
#	needed for Gene Sorter procedure below
#	Ensembl released Mouse build 34 the week of August 10th, 2005
   mkdir -p /cluster/store5/mm7/bed/ensGene
#   ln -s /cluster/store5/mm7/bed/ensGene /cluster/data/mm7/bed
   cd /cluster/data/mm7/bed/ensGene

        # Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
        # Choose Ensembl 38 and Mus musculus genes (NCBIM35), click next
        # Follow this sequence through the pages:
        # 1) Select "Known genes" in the Gene section. Hit next.
        #    NOTE: "Known genes" is no longer available in later versions
        #    of BioMart.
        # 2) Uncheck everything on the "Filter" page. Then hit next.
        # 3) Select "Structures".
        # 4) Choose GTF as the output, choose gzip compression, name the
        #    output file ensGeneMm7.gtf.gz and then hit Export.

# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
    gzip -d ensGeneMm7.gtf.gz
    cat ensGeneMm7.gtf | grep -v ^6_DR51 | grep -v NT_ > unrandom.gtf
#	Let's see how much it loses:
# 614986 8609804 87362912 ensGeneMm7.gtf
# 603148 8444072 85668522 unrandom.gtf

# Add "chr" to front of each line in the gene data gtf file to make 
# it compatible with ldHgGene

    sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/"| sed -e 's/\..\"/\"/g' > ensGene.gtf
    ldHgGene mm7 ensGene ensGene.gtf
# Read 33902 transcripts in 603148 lines in 1 files
#   33902 groups 22 seqs 1 sources 4 feature types
# 33902 gene predictions

#	save space, gzip them:
    gzip unrandom.gtf
    gzip ensGene.gtf

# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.tsv.gz
    gunzip ensGtp.tsv.gz
    hgsql mm7 < ~/kent/src/hg/lib/ensGtp.sql
    # the header line of ensGtp.tsv is skipped by "ignore 1 lines" in the load
    echo "load data local infile 'ensGtp.tsv' into table ensGtp ignore 1 lines" | hgsql -N mm7

# Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) check Peptide, then Ensembl Gene ID, Ensembl Transcript ID, 
    # and Ensembl Peptide ID, uncheck others (chrom, bioType, etc).  Format = FASTA.
    # Save file as ensemblPep5.fa.gz (several previous trials failed).
   
   # gunzip leaves ensemblPep5.fa (not .fasta), so read that name below.
   gunzip ensemblPep5.fa.gz
   # Convert to a name/sequence table and drop the records matching
   # SEQXENCEXNAVAILAXLE (presumably Ensembl's "sequence unavailable"
   # placeholder as it appears in faToTab protein output -- TODO confirm).
   cat ensemblPep5.fa | faToTab -type=protein stdin stdout | grep -v SEQXENCEXNAVAILAXLE > j.tab
   # Drop any "Reading" progress line and rebuild a fasta with only the
   # first field of each record as the sequence name.
   cat j.tab |grep -v "Reading"|awk '{print ">" $1;print $2}' > ensPep.fa
   hgPepPred mm7 ensembl ensPep.fa
   rm j.tab
   gzip *

# Create knownToEnsembl column
    hgMapToGene mm7 ensGene knownGene knownToEnsembl

# Compress everything to save space
    gzip *
    
#### BUILD Ensembl cross-reference table, ensemblXref3 (TBD)

# PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSEMBL BIOMART DATA OF MOUSE BUILD 34.
# THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
# SINCE ENSEMBL CHANGED THE DATA FORMAT AGAIN (AS USUAL :-(  ), THERE IS NO VERSION NUMBER
# IN THEIR IDs, A FAKE "0" IS GENERATED FOR EACH ID IN ensemblXref3 TABLE.
    # Get the ensembl gene/protein cross-reference data BioMart
    # Follow this sequence through the pages:
    # 1) Make sure that the Mus musculus choice is selected. Hit next.
    # 2) Choose the "Feature" box, select Ensembl gene, Ensembl transcript, 
    #    and Ensembl peptide IDs, UniProt/SPTREMBL ID, UniProt/Swiss-Prot ID, 
    #    and UniProt/Swiss-Prot Accession 
    # 3) Choose "Text, tab separated".  choose gzip compression.  hit export.
    # Save as ensXref2.tsv.gz
    gzip -d ensXref2.tsv.gz
    hgsql mm7 < ~/hg/lib/ensemblXref3Temp.sql
    hgsql mm7 -e 'load data local infile "ensXref2.tsv" into table ensemblXref3Temp ignore 1 lines'

    hgsql mm7 -N -e 'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
    > ensemblXref3.tab

    hgsql mm7 -e 'drop table ensemblXref3'
    hgsql mm7 <~/src/hg/lib/ensemblXref3.sql
    hgsql mm7 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
    
# load the table into proteome DB also   
    hgsql proteome <~/src/hg/lib/ensemblXref3.sql
    hgsql proteome -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'


########################################################################
### microRNA targets tracks  (DONE - 2006-05-16 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
    ssh hgwdev
    mkdir /cluster/data/mm7/bed/picTar
    cd /cluster/data/mm7/bed/picTar
    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/mouse/mouse_bed' -O mouse_dog.bed

    wget --timestamping \
	'http://pictar.bio.nyu.edu/ucsc/mouse/mouse_chicken_bed' \
	-O mouse_chicken.bed

    grep -v "^track" mouse_dog.bed \
	| hgLoadBed -strict mm7 picTarMiRNADog stdin
    #	Loaded 127119 elements of size 9
    grep -v "^track" mouse_chicken.bed \
	| hgLoadBed -strict mm7 picTarMiRNAChicken stdin
    #	Loaded 12298 elements of size 9

    nice -n +19 featureBits mm7 picTarMiRNADog
    #	449871 bases of 2583394090 (0.017%) in intersection
    nice -n +19 featureBits mm7 picTarMiRNAChicken
    #	35586 bases of 2583394090 (0.001%) in intersection


#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-07 - angie)
### fasta added 2006-06-21
### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab

### NOTE -- as of 2007-03-01 the igtc track will be automatically 
### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and 
### updateIgtc.pl in kent/src/hg/utils/automation/ .

    ssh hgwdev
    mkdir /cluster/data/mm7/bed/igtc
    cd /cluster/data/mm7/bed/igtc
    wget http://www.genetrap.org/blattrack/genetrap_mm7.psl
    grep -v ^track genetrap_mm7.psl \
    | hgLoadPsl mm7 -table=igtc stdin
    # Probe fasta is shared by all assemblies:
    wget http://www.genetrap.org/blattrack/genetrap.fasta
    mkdir /gbdb/mm7/igtc
    ln -s /cluster/data/mm7/bed/igtc/genetrap.fasta /gbdb/mm7/igtc/
    hgLoadSeq -replace mm7 /gbdb/mm7/igtc/genetrap.fasta


#########################################################################
# LIFTOVER TO MM6 (DONE, 2006-06-11 - 2006-06-12, hartera)
# Split of mm6 sequences done using makeLoChain-split and doc'd in 
# makeMm6.doc: SPLIT MM6 SEQUENCES FOR LIFTOVER FROM OTHER ASSEMBLIS section.
# Run on pk as faster than kk.
    ssh pk
    mkdir -p /cluster/data/mm7/bed/liftOver
    cd /cluster/data/mm7/bed/liftOver
cat << '_EOF_' > align.csh
#!/bin/csh -fe
set oldAssembly = $1
set oldNibDir = $2
set newAssembly = $3
set newSplitDir = $4
set ooc = $5
if ("$ooc" != "") then
    set ooc = '-ooc='$ooc
endif

set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
echo "Setting up blat in $blatDir"
rm -fr $blatDir
mkdir $blatDir
cd $blatDir
mkdir raw psl run
cd run

echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
       '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
  >> gsub
echo '#ENDLOOP' >> gsub

# target
ls -1S $oldNibDir/*.{nib,2bit} > old.lst
# query
ls -1S $newSplitDir/*.{nib,fa} > new.lst

gensub2 old.lst new.lst gsub spec
/parasol/bin/para create spec

echo ""
echo "First two lines of para spec:"
head -2 spec
echo ""
echo "DO THIS NEXT:"
echo "    cd $blatDir/run"
echo "    para try, check, push, check, ..."
echo ""
exit 0
'_EOF_'
   # << emacs
   chmod +x align.csh
   align.csh mm7 /san/sanvol1/scratch/mm7/nib mm6 \
       /san/sanvol1/scratch/mm6/split10k \
       /san/sanvol1/scratch/mm6/11.ooc >&! align.log &
   # Took a few seconds.
   # Do what its output says to do next (start cluster job)
   cd /cluster/data/mm7/bed/blat.mm6.2006-06-11/run
   para try, check, push, check, ...
   para time >&! run.time
# Completed: 1600 of 1600 jobs
# CPU time in finished jobs:    1483451s   24724.18m   412.07h   17.17d  0.047 y
# IO & Wait Time:                  6704s     111.74m     1.86h    0.08d  0.000 y
# Average job time:                 931s      15.52m     0.26h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            9049s     150.82m     2.51h    0.10d
# Submission to last job:         12027s     200.45m     3.34h    0.14d
 
   ssh pk
   cd /cluster/data/mm7/bed/liftOver

# Write lift.csh <oldAssembly> <newAssembly>.
# It picks the most recently modified blat.<newAssembly>.20* directory under
# /cluster/data/<oldAssembly>/bed, then for every chromosome listed in the
# new assembly chrom.sizes runs liftUp -pslQ on the raw chr*_<chr>.psl files
# using the split10k .lft files, writing ../psl/<chr>.psl.
# Both sanity checks now exit 1 on failure; previously a missing raw
# directory only printed a message and carried on.
cat << '_EOF_' > lift.csh
#!/bin/csh -ef
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k

set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"

if ( ! -e $blatDir/raw ) then
    echo "Can't find $blatDir/raw"
    exit 1
endif

if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
    echo "Can't find any .lft files in $newLiftDir"
    exit 1
endif
cd $blatDir/raw

foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
    echo $chr
    liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end

set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo "    ssh pk"
echo "    $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
   # << emacs
   chmod +x lift.csh
   lift.csh mm7 mm6 >&! lift.log &
   # makeLoChain-chain can be run on pk. chain alignments

   makeLoChain-chain mm7 /san/sanvol1/scratch/mm7/nib \
                     mm6 /san/sanvol1/scratch/mm6/nib >&! chain.log &
   cd /cluster/data/mm7/bed/blat.mm6.2006-06-11/chainRun
   para try, check, push, check, ...
   para time >&! run.time
# Completed: 40 of 40 jobs
# CPU time in finished jobs:       7800s     130.00m     2.17h    0.09d  0.000 y
# IO & Wait Time:                 15867s     264.45m     4.41h    0.18d  0.001 y
# Average job time:                 592s       9.86m     0.16h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1323s      22.05m     0.37h    0.02d
# Submission to last job:          1323s      22.05m     0.37h    0.02d
   ssh kkstore02
   cd /cluster/data/mm7/bed/liftOver
   makeLoChain-net mm7 mm6 >&! net.log &
   # Took about 50 minutes.
   # load reference to over.chain into database table,
   # and create symlinks  /gbdb  and download area
   ssh hgwdev
   cd /cluster/data/mm7/bed/liftOver
   makeLoChain-load mm7 mm6 >&! load.log &
   # clean up
   rm *.log
   # add md5sum.txt to include this new liftOver file
   cd /usr/local/apache/htdocs/goldenPath/mm7/liftOver
   rm md5sum.txt
   md5sum *.gz > md5sum.txt
   # copy README.txt from another liftOver directory.
   # test by converting a region using the "convert" link on
   # the browser, and comparing to blat of the same region

#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-12 - Hiram)
    #	download data from "James Taylor" <james@bx.psu.edu>
    ssh kkstore02
    cd /cluster/data/mm7/bed
    mkdir /cluster/store11/mm7/bed/regPotential7X
    ln -s /cluster/store11/mm7/bed/regPotential7X .
    cd regPotential7X
    
    #	This is a lot of data
    time for C in 1 2 3 4 5 6 7 8 9 X 10 11 12 13 14 15 16 17 18 19
    do
    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm7/chr${C}.scores.truncated.bz2"
    done
    #	real    77m57.028s

    wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm7/trackDb.html" -O description.html

    time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X
    do
	bzcat chr${C}.scores.truncated.bz2
    done | wigEncode -noOverlap  stdin regPotential7X.wig regPotential7X.wib
    #	Converted stdin, upper limit 1.00, lower limit 0.00
    #	real    22m7.525s

    #	Loading the table on hgwdev
    ssh hgwdev
    cd /cluster/data/mm7/bed/regPotential7X
    ln -s /cluster/data/mm7/bed/regPotential7X/regPotential7X.wib \
	/gbdb/mm7/wib/regPotential7X.wib
    #	using the tmpDir is faster since it is on local disk and it will
    #	clean up any temporary .tab file it creates there
    time hgLoadWiggle -tmpDir=/scratch/tmp \
	mm7 regPotential7X regPotential7X.wig
    #	real    0m28.683s

    #	create a histogram
    ssh kolossus
    cd /cluster/data/mm7/bed/regPotential7X
    time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
	-hMinVal=0.0 -db=mm7 regPotential7X > histogram.data 2>&1
    #	real    1m38.660s
    #	*only* 65% of the data is 0.0

    #	create download gzip files from the bz2 files:
    for F in chr*.scores.truncated.bz2
    do
	C=`echo $F | awk -F'.' '{print $1}'`
	echo -n "${C}.regPotential7X.mm7.gz working ... "
	bzcat ${F} | gzip > ${C}.regPotential7X.mm7.gz
	echo
    done

#############################################################################
# Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt)
# We are creating several things: a psl probe-track for the RR on mouse,
# a link out from kg to the probe to the ABA website, 
# and a set of gene/probe info which visiGene will use.
# (This needs to be done after have created sequences in
# ncbiXm and tigrMgiTc as above.)

# metadata.log and SRGEsequence.log was provided by 
#  Susan Sunkin <SusanS@alleninstitute.org>
# this is an update to the visiGene with 6000 new images.

# See mm6.txt for steps not needing to be repeated.

# copy in the data files (directory already exists from previous build)
    ssh hgwdev
    cd /cluster/data/mm7/bed/allenBrain
    mkdir old
    mv * old/
    cp /cluster/data/mm6/bed/allenBrain/allen20061204.tab .
    cp /cluster/data/mm6/bed/allenBrain/probeSeq.20061204.fasta .
    cp /cluster/data/mm6/bed/allenBrain/allProbes.fa .
    cp /cluster/data/mm6/bed/allenBrain/allProbes.tab .
    cp /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .


# Set up a blat run to align the probes.
    ssh pk
    cd /cluster/data/mm7/bed/allenBrain
    # Split the probe fasta into about 200 pieces named split/rp*.fa
    mkdir split
    faSplit sequence allProbes.fa 200 split/rp
    mkdir run
    cd run
    # One job per (genome nib, probe piece) pair; results go to psl/
    ls -1 ../split/*.fa > mrna.lst
    ls -1 /scratch/hg/mm7/nib/*.nib > genome.lst
    mkdir psl
    cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    gensub2 genome.lst mrna.lst gsub spec
    para create spec

# Then do the usual para try/push/time/check until the run is finished
#Completed: 7760 of 7760 jobs
#CPU time in finished jobs:      27428s     457.13m     7.62h    0.32d  0.001 y
#IO & Wait Time:                 22916s     381.93m     6.37h    0.27d  0.001 y
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              20s       0.33m     0.01h    0.00d
#Submission to last job:           516s       8.60m     0.14h    0.01d


# Then do sorting and near-best-in-genome step on file server
    ssh kkstore
    cd /cluster/data/mm7/bed/allenBrain/run
    pslSort dirs raw.psl tmp psl
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
    sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
    rm raw.psl
    rm -r psl
    rm -r ../split

# Load up database
    ssh hgwdev
    cd /cluster/data/mm7/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.

    hgsql mm7 -e 'drop table allenBrainUrl'
    hgsql mm7 < ~/kent/src/hg/lib/allenBrainUrl.sql
    hgsql mm7 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'

# Make probe alignment table, and load sequence.
    hgLoadPsl mm7 allenBrainAli.psl
    rm /gbdb/mm7/allenBrain/allProbes.fa
    ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/mm7/allenBrain/allProbes.fa
    hgLoadSeq -replace mm7 /gbdb/mm7/allenBrain/allProbes.fa

# Make mapping between known genes and allenBrain
    hgMapToGene mm7 allenBrainAli -type=psl knownGene knownToAllenBrain 

##########################################################################
#  xxBlastTab - Help filter out unwanted paralogs  (Galt 2007-01-11)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# we want to update mm7 for human and rat, 
# so check ./hgGeneData/Mouse/mm7/otherOrgs.ra for current settings

ssh hgwdev

synBlastp.csh mm7 hg17 
#mm7.hgBlastTab:
#new number of unique query values:
#24760
#new number of unique target values
#15354
#old number of unique query values:
#27511
#old number of unique target values
#15931
#cleanup:


synBlastp.csh mm7 rn3
#mm7.rnBlastTab:
#new number of unique query values:
#8603
#new number of unique target values
#5071
#old number of unique query values:
#19217
#old number of unique target values
#5508


#########################################################################
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page

   # genbank release 157.0 now contains misc_diff fields for MGC clones
   # reloading mRNAs results in gbMiscDiff table being created.
   ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm7

# (DONE galt 2007-02-15)
# Create table that maps between known genes and visiGene database 
    knownToVisiGene mm7

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf, adding the line:
#    mm7.upstreamGeneTbl = refGene
#############################################################################
# UPDATE KEGG TABLES (DONE, Fan, 6/18/10)

mkdir -p /hive/data/genomes/mm7/bed/pathways/kegg
cd /hive/data/genomes/mm7/bed/pathways/kegg

# --- keggMapDesc ---------------------------------------------------------
# Fetch the pathway titles and glue "mmu" onto the front of the first
# column (assuming map_title.tab is two tab-separated columns): an "mmu"
# column is inserted after the first tab, moved to the front, and the tab
# that follows it is then removed.
wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab

sed -e 's/\t/\tmmu\t/' map_title.tab > j.tmp
cut -f 2 j.tmp > j.mmu
cut -f 1,3 j.tmp > j.1
paste j.mmu j.1 | sed -e 's/\t//' > keggMapDesc.tab
rm j.mmu j.1 j.tmp

hgsql mm7 -e 'drop table keggMapDesc'
hgsql mm7 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql mm7 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

# --- keggPathway ---------------------------------------------------------
# Fetch the mouse gene-to-pathway list, strip the first "path:" on each
# line and turn the first remaining ":" into a tab, then load that as a
# scratch version of keggPathway.
wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/mmu/mmu_pathway.list

sed -e 's/path://' -e 's/:/\t/' mmu_pathway.list > j.tmp
hgsql mm7 -e 'drop table keggPathway'
hgsql mm7 < ~/kent/src/hg/lib/keggPathway.sql
hgsql mm7 -e 'load data local infile "j.tmp" into table keggPathway'

# Join the scratch rows to knownToLocusLink on locusID = value, then empty
# the table and reload it with the joined (name, locusID, mapID) rows.
hgsql mm7 -N -e \
'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
> keggPathway.tab

hgsql mm7 -e 'delete from keggPathway'

hgsql mm7 -e 'load data local infile "keggPathway.tab" into table keggPathway'

rm j.tmp
#############################################################################
# Add KEGG column to mm7 Gene Sorter (Done, Fan, 6/18/2010)

mkdir -p /hive/data/genomes/mm7/bed/geneSorter
cd /hive/data/genomes/mm7/bed/geneSorter
hgsql mm7 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab

hgsql mm7 -e 'drop table knownToKeggEntrez'

hgsql mm7 < ~/kent/src/hg/lib/knownToKeggEntrez.sql

hgsql mm7 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################
