# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on 
# NCBI build 33 (April, 2003 freeze)

# [For importing GTF tracks, use /projects/compbio/bin/validate_gtf.pl]

# HOW TO BUILD A ASSEMBLY FROM NCBI FILES
# ---------------------------------------

# NOTE: It is best to run most of this stuff on eieio since it
# is not averse to handling files > 2Gb

# 0) Make gs.16 directory, gs.16/build33 directory, and gs.16/ffa directory.
    mkdir /cluster/store5/gs.16
    mkdir /cluster/store5/gs.16/build33
    mkdir /cluster/store5/gs.16/agp
    mkdir /cluster/store5/gs.16/ffa

#    Make a symbolic link from /cluster/store1 to this location
	
    cd /cluster/store1
    ln -s /cluster/store5/gs.16 ./gs.16

#    Make a symbolic link from your home directory to the build dir:

    ln -s /cluster/store5/gs.16/build33 ~/oo

# 1) Download seq_contig.md, ncbi_build33.agp, contig_overlaps.agp 
# and contig fa file into gs.16/build33 directory. 

# Download certificates.txt in gs.16/build33 directory (first time in build33)

# Download all finished agp's and fa's into gs.16/agp

# Download sequence.inf and ncbi_build33.fa files into gs.16/ffa, and unzip
# ncbi_build33.fa.

# *** For build33, files split into reference.agp/reference.fa (main O&O), DR51.agp/DR51.fa,
#     and DR52.agp/DR52.fa. (alternate versions of MHC region).  These were concatenated 
#     to get the ncbi_build33.agp and ncbi_build33.fa

# 2) Sanity check things with 
    /cluster/bin/i386/checkYbr build33/ncbi_build33.agp ffa/ncbi_build33.fa \
      build33/seq_contig.md
#      report any errors back to Richa and Greg at NCBI.

# 3) Convert fa files into UCSC style fa files and place in "contigs" directory
#    inside the gs.16/build33 directory 

    cd build33
    mkdir contigs
    /cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build33.fa \
      contigs

# 3.1) Make a fake chrM contig
    cd ~/oo
    mkdir M
# copy in chrM.fa, chrM.agp and chrM.gl from previous version.
    mkdir M/NT_999999
    cp chrM.fa NT_999999/NT_999999.fa

# copied chrM.fa, chrM.agp, chrM.gl, chrM.trf.bed, lift directory, NT_999999/NT_999999.fa - not sure which ones we need

# 4) Determine the chromosome sizes from agps

    /cluster/bin/scripts/getChromSizes ../agp

# 4.1) Create lift files (this will create chromosome directory structure) and inserts file

    /cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .

# 5) Create contig agp files (will create contig directory structure)
	
    /cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build33.agp .

# 5.1) Create contig gl files

    ~kent/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md

# 6) Create chromsome agp files

    /cluster/bin/scripts/createNcbiChrAgp .

# Since we received final agp's, these were substituted in for the chr agp's
# with all completely commented lines removed.

# 6.1) Copy over jkStuff from previous build
    mkdir jkStuff
    cp /cluster/store1/gs.14/build31/jkStuff/*.sh jkStuff
    /build31/jkStuff/*.csh jkStuff
    cp /cluster/store1/gs.14/build31/jkStuff/*.gsub jkStuff        

# 6.3) Create chromosome gl files
  
    jkStuff/liftGl.sh contig.gl

# 7) Distribute contig .fa to appropriate directory (assumes all files
#    are in "contigs" directory).

    cd ~/hg15
    /cluster/bin/scripts/distNcbiCtgFa contigs .
    rm -r contigs

# 8) Reverse complement NT contig fa files that are flipped in the assembly
#    (uses faRc program)
# Not done for build33 because all contigs on + strand.  It should be this
# way for the rest of the assemblies

    /cluster/bin/scripts/revCompNcbiCtgFa seq_contig.md .


# GET FRESH MRNA/EST AND REFSEQ SEQUENCE FROM GENBANK (DONE 4/10/03)
    # Run this just before the sequence gets here!  It's OK to work on 
    # this in parallel with Terry's steps above, or in parallel with 
    # RepeatMasker below, but DO NOT let this hold up RepeatMasker.  

    # This will create a genbank.134 directory containing compressed
    # GenBank flat files and a mrna.134 containing unpacked sequence
    # info and auxiliary info in a relatively easy to parse (.ra)
    # format.

    # Point your browser to ftp://ftp.ncbi.nih.gov/genbank and look at 
    # the README.genbank.  Figure out the current release number.  (134)
    lynx ftp://ftp.ncbi.nih.gov/genbank/README.genbank
    # Consider deleting one of the older genbank releases.  It's
    # good to at least keep one previous release though.

    # Where there is space make a new genbank directory.  Create a
    # symbolic link to it:
    ssh eieio
    mkdir /cluster/store5/genbank.134
    ln -s /cluster/store5/genbank.134 ~/genbank
    cd ~/genbank
    # ncftp is handy -- it does anonymous login; "prompt" command not needed.
    ncftp ftp.ncbi.nih.gov
      cd genbank
      mget gbpri* gbrod* gbv* gbsts* gbest* gbmam* gbinv* gbbct* gbhtc* gbpat* gbphg* gbpln*
      quit
    # This will take at least 2 hours.

    # Make the refSeq subdir and download files:
    ssh eieio
    mkdir -p /cluster/store5/mrna.134/refSeq/041003
    cd /cluster/store5/mrna.134/refSeq/041003
    ncftp ftp.ncbi.nih.gov
      cd refseq/cumulative
      mget *.Z
      quit
    # Get extra info & human proteins from NCBI:
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc
    wget ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/hs.faa.gz
    gunzip hs.faa.gz
    # Unpack this into species-specific fa files and get extra info with:
    cd /cluster/store5/mrna.134/refSeq/041003
    cp /cluster/store2/mrna.133/*.fil ..
    gunzip -c rscu.gbff.Z \
    | gbToFaRa -byOrganism=org ../../anyRna.fil refSeq.{fa,ra,ta} stdin

    # Now unpack and organize the larger genbank mrna/est sequences...
    ssh eieio
    cd /cluster/store5/mrna.134
    # Make the RNAs for all organisms
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa -byOrganism=org anyRna.fil mrna.{fa,ra,ta} stdin
    # Make the ESTs for all organisms
    gunzip -c /cluster/store5/genbank.134/gbest*.gz \
    | gbToFaRa anyRna.fil est.{fa,ra,ta} stdin -byOrganism=org
    # Make the nonhuman RNAs
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa humanXenoRna.fil humanXenoRna.{fa,ra,ta} stdin
    # Make the nonMouse RNAs
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa mouseXenoRna.fil mouseXenoRna.{fa,ra,ta} stdin
    # Make the nonRat RNAs
    gunzip -c \
      /cluster/store5/genbank.134/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
    | gbToFaRa ratXenoRna.fil ratXenoRna.{fa,ra,ta} stdin
    # Make the nonhuman ESTs
    gunzip -c /cluster/store5/genbank.134/gbest*.gz \
    | gbToFaRa humanXenoRna.fil humanXenoEst.{fa,ra,ta} stdin
    # Split the really large ones into smaller pieces for more efficient 
    # cluster runs.  
    mkdir humanXenoRnaSplit humanXenoEstSplit
    faSplit about humanXenoRna.fa 10000000 humanXenoRnaSplit/xenoRna
    faSplit about humanXenoEst.fa 70000000 humanXenoEstSplit/xenoEst
    cd org/Homo_sapiens
    mkdir estSplit
    faSplit about est.fa 250000000 estSplit/est
    # Distribute the files to /iscratch/i/ so they're all ready to be aligned.
    ssh kkr1u00
    mkdir -p /iscratch/i/mrna.134/Homo_sapiens
    cp -p /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/org/Homo_sapiens/estSplit/*.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/humanXenoRnaSplit/*.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    cp -p /cluster/store5/mrna.134/humanXenoEstSplit/*.fa \
      /iscratch/i/mrna.134/Homo_sapiens/
    ~kent/bin/iSync

# REPEAT MASKING (DONE 041103)
    # Split contigs, run RepeatMasker, lift results
    # Notes: 
    # * If there is a new version of RepeatMasker, build it and ask the admins 
    #   to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
    # * Contigs (*/NT_*/NT_*.fa) are split into 500kb chunks to make 
    #   RepeatMasker runs manageable on the cluster ==> results need lifting.
    # * For the NCBI assembly we repeat mask on the sensitive mode setting
    #   (RepeatMasker -s)

    #- Split contigs into 500kb chunks:
    ssh eieio
    cd ~/hg15
    mkdir contigOut
    cd contigOut
    mkdir split
    foreach d (../contigs/NT_* )
      set contig = $d:t
      faSplit size ../contigs/$contig 500000 split/${contig}_ -lift=split/$contig.lft \
        -maxN=500000
    end

    #- Make the run directory and job list:
    cd ~/hg15
    mkdir RMRun
    rm -f RMRun/RMJobs
    touch RMRun/RMJobs
   foreach f ( /cluster/store5/gs.16/build33/contigOut/split/NT_*.fa )
        set f = $f:t
        echo /cluster/bin/scripts/RMLocalSens \
             /cluster/store5/gs.16/build33/contigOut/split $f \
            '{'check out line+ /cluster/store5/gs.16/build33/contigOut/split/$f.out'}' \
          >> RMRun/RMJobs
    end

    #- Do the run
    ssh kk
    cd ~/hg15/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
    #- Now while that's running, run TRF (simpleRepeat), and RefSeq 
    #- alignments, in parallel.  Also, create the database and the 
    #- tracks that don't rely on cluster runs or on masked sequence.  
    # Distribute the split files and splitting lift files back to our normal
    #  chromosome contig directory tree.
    cd ~/hg15

# Edit script first!
# This copies lft/out/align files from the flat dir structure we were forced to use at first.
# This is a one-time only step.
    /cluster/bin/scripts/distNcbiCtgOut contigOut/split .

    #- Lift up the split-contig .out's to contig-level .out's
    ssh eieio
    cd ~/hg15
    foreach d ( ?{,?}/NT_* )
      cd $d
      set contig = $d:t
      liftUp $contig.fa.out $contig.fa.lft warn ${contig}?*.fa.out > /dev/null
      cd ../..
    end

    #- Lift up RepeatMask .out files to chromosome coordinates via
    tcsh jkStuff/liftOut2.sh
    #- By this point, the database should have been created (below):
    ssh hgwdev
    cd ~/hg15
    hgLoadOut hg15 ?/*.fa.out ??/*.fa.out


# VERIFY REPEATMASKER RESULTS (DONE 041103)

    # Run featureBits on hg15 and on a comparable genome build, and compare:
    ssh hgwdev

    featureBits hg15 rmsk
    # --> 1386879340 bases of 2866468600 (48.383%) in intersection

    featureBits hg13 rmsk
    # --> 1383216615 bases of 2860907679 (48.349%) in intersection

    # Validate the RepeatMasking by randomly selecting a few NT_*.fa files, 
    # manually repeat masking them and matching the .out files with the 
    # related part in the chromosome-level .out files.  For example:
    ssh kkr1u00
    # Pick arbitrary values of $chr and $nt and run these commands: 
    set chr = 1
    set nt  = NT_004321
    mkdir /tmp/RMTest/$nt
    cd /tmp/RMTest/$nt
    cp ~/hg15/$chr/$nt/$nt.fa .
    /scratch/hg/RepeatMasker/RepeatMasker -s $nt.fa
    # Compare $nt.fa.out against the original ~/hg15/$chr/$nt/$nt.fa.out 
    # and against the appropriate part of $chr/chr$chr.fa.out (use the coords 
    # for $nt given in seq_contig.md).  

# CREATING DATABASE  (DONE 04/08/03)
    ssh hgwdev
    # if you haven't already:
    ln -s /cluster/store5/gs.16/build33 ~/oo
    ln -s /cluster/store5/gs.16/build33 ~/hg15
    # Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
    # Create the database.
    echo 'create database hg15' | hgsql hg14
    # make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
        alias hg15 mysql -u hguser -phguserstuff -A hg15
    # Initialize the relational-mrna and external sequence info tables:
    hgLoadRna new hg15
    # Copy over grp table (for track grouping) from another database:
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg14.grp" \
    | hgsql hg15

# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 041103)
    ssh hgwdev
    # Enter hg15 into hgcentraltest.dbDb so test browser knows about it:
    # Note: for next assembly, set scientificName column to "Homo sapiens"
    echo 'insert into dbDb values("hg15", "Human April 2003", \
            "/gbdb/hg15/nib", "Human", "DUSP18", 1, 80, "Human");' \
    | hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P .
    # Edit that makefile to add hg15 in all the right places and do
    make update
    make alpha
    cvs commit makefile

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 04/12/03 -jk)
    ssh hgwdev
    # Substitute BBB with the correct number for the hostname:
    echo 'insert into blatServers values("hg15", "blat2", "17778", "1"); \
          insert into blatServers values("hg15", "blat2", "17779", "0");' \
    | hgsql -h genome-testdb hgcentraltest


# MAKE LIFTALL.LFT, NCBI.LFT (DONE 041103)
    cd ~/hg15
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
    # Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
    # Note: this ncbi.lift will not lift floating contigs to chr_random coords,
    # but it will show the strand orientation of the floating contigs 
    # (grep for '|').
    mdToNcbiLift seq_contig.md jkStuff/ncbi.lft 
    # If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft 
    # to match. If no step 6.2.5 then no editing needed


# SIMPLE REPEAT [TRF] TRACK (DONE 041103)
    # Distribute contigs to /iscratch/i
    ssh kkr1u00
    rm -rf /iscratch/i/gs.16/build33/contigs
    mkdir -p /iscratch/i/gs.16/build33/contigs
    cd ~/hg15
    cp contigs/*.fa /iscratch/i/gs.16/build33/contigs
    # Make sure the total size looks like what you'd expect:
    du -sh /iscratch/i/gs.16/build33/contigs
    ~kent/bin/iSync

    # Create cluster parasol job like so:
    mkdir -p ~/hg15/bed/simpleRepeat
    cd ~/hg15/bed/simpleRepeat
    cp ~/hg14/bed/simpleRepeat/gsub .
    mkdir trf
    ls -1S /iscratch/i/gs.16/build33/contigs/*.fa > genome.lst
    echo "" > dummy.lst
    ssh kk
    cd ~/hg15/bed/simpleRepeat
    gensub2 genome.lst dummy.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check
    # When cluster run is done
    liftUp simpleRepeat.bed ~/hg15/jkStuff/liftAll.lft warn trf/*.bed

    # Load into the database:
    ssh hgwdev
    cd ~/hg15/bed/simpleRepeat
    ~matt/bin/i386/hgLoadBed hg15 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# REFSEQ ALIGNMENTS AND REFGENE TRACK PREP (DONE 041103)
    # Make sure contigs have been distributed to /iscratch/i/ (should have 
    # been done for simpleRepeat/TRF above)
    # Make sure refSeq.fa is under /iscratch/i too (GENBANK above)
    ssh kk
    mkdir ~/hg15/bed/refSeq
    cd ~/hg15/bed/refSeq
    mkdir psl
    ls -1S /iscratch/i/gs.16/agp/*.fa > genome.lst
    # Remove DR51.fa, DR52.fa and bottomDrawer.fa from the list in genome.lst
    # Edit each fa file to say chr1, chr2, etc. in the top comment line.
    ls -1 /iscratch/i/mrna.134/Homo_sapiens/refSeq.fa > mrna.lst
    cp ~/hg13/bed/refSeq/gsub .
    # Edit the file to make it do blat.x -band -q=rna -trimHardA
    gensub2 genome.lst mrna.lst gsub spec
    
    # Run this on the small cluster if needed
    ssh kkr1u00 
    para create spec
    para try, para check, para push, para check....
    para time > time
    # When cluster is done, process refSeq alignments into near best in genome.
    ssh eieio
    cd ~/hg15/bed/refSeq
    pslSort dirs raw.psl /tmp psl
    pslReps -minCover=0.15 -sizeMatters -minAli=0.98 -nearTop=0.001 raw.psl \
      all_refSeq.psl /dev/null
    pslSortAcc nohead chrom /tmp all_refSeq.psl
    pslCat -dir chrom > refSeqAli.psl
    # After the database has been created, go to "LOAD REFGENE" below...


# PROCESS SIMPLE REPEATS INTO MASK (DONE 041103 ms&jk)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd ~/hg15/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/NT_*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd ~/hg15
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end


# MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (done 4/11/03 jk)
    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
    ssh eieio
    cd ~/hg15
    # Make chr*.fa from contig .fa
    tcsh jkStuff/chrFa.sh

    #- Soft-mask (lower-case) the contig and chr .fa's
    tcsh jkStuff/makeFaMasked.sh
    #- Make hard-masked .fa.masked files as well:
    tcsh jkStuff/makeHardMasked.sh
    #- Rebuild the nib, mixedNib, maskedNib files:
    tcsh jkStuff/makeNib.sh

    # Copy the masked contig fa to /iscratch and /scratch:
    ssh kkr1u00
    rm -rf /iscratch/i/gs.16/build33/trfFa
    mkdir -p /iscratch/i/gs.16/build33/trfFa
    cp -p ~/hg15/?{,?}/NT_*/NT_??????.fa /iscratch/i/gs.16/build33/trfFa
    ~kent/bin/iSync
    ssh kkstore
    rm -rf /scratch/hg/gs.16/build33/trfFa
    mkdir -p /scratch/hg/gs.16/build33/trfFa
    cp -p ~/hg15/?{,?}/NT_*/NT_??????.fa /scratch/hg/gs.16/build33/trfFa


# PREPARE CLUSTER FOR BLASTZ RUN (041203)
    # This needs to be done after trf-masking and nib generation.
    ssh eieio
    # Extract lineage-specific repeats using Arian Smit's script:
    mkdir -p ~/hg15/bed/linSpecRep
    cd ~/hg15/bed/linSpecRep
    foreach f (~/hg15/*/*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/primateSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    # Copy files to the kkstore:/scratch
    ssh kkstore
    # lineage-specific repeats:
    cd ~/hg15/bed
    mkdir -p /scratch/hg/gs.16/build33
    rm -rf /scratch/hg/gs.16/build33/linSpecRep
    cp -Rp linSpecRep /scratch/hg/gs.16/build33
    # RepeatMasker .out:
    cd ~/hg15
    rm -rf /scratch/hg/gs.16/build33/rmsk
    mkdir -p /scratch/hg/gs.16/build33/rmsk
    cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/gs.16/build33/rmsk
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /scratch/hg/gs.16/build33/chromTrfMixedNib
    mkdir -p /scratch/hg/gs.16/build33/chromTrfMixedNib
    cd ~/hg15
    cp -p nib/chr*.nib /scratch/hg/gs.16/build33/chromTrfMixedNib

    # Ask cluster-admin@soe.ucsc.edu to binrsync /scratch/hg to clusters
    # Copy to /iscratch as well so we can run blastz before binrsync finishes:
    rm -rf /iscratch/i/gs.16/build33/{linSpecRep,rmsk,chromTrfMixedNib}
    cp -Rp /scratch/hg/gs.16/build33/{linSpecRep,rmsk,chromTrfMixedNib} \
      /iscratch/i/gs.16/build33/
    ssh kkr1u00
    ~kent/bin/iSync

    # Jim's comments Feb 12 '03 about the order in which to run blastz:
    # In general we should do
    # 1) hg/mm
    # 2) mm/rn
    # 3) rn/hg
    # 4) hg/hg
    # 5) mm/mm
    # 6) rn/rn
    # There is now an 'axtSwap' program that might let us
    # get out of having to run the inverse of 1,2 & 3,  though
    # 2 in particular is so fast perhaps it's just as well to
    # do the inverse explicitly.

# SEQUENCE INFO: CHROMINFO (DONE)
    ssh eieio
    cd ~/hg15
    # Sanity-check */lift/ordered.lft length vs. agp length:
    foreach c ( ?{,?} )
      if (-e $c/lift/ordered.lst) then
        set lftLen = `tail -1 $c/lift/ordered.lft | awk '{print $5;}'`
        set agpLen = `tail -1 $c/chr$c.agp | awk '{print $3;}'`
        if ($lftLen != $agpLen) then
          echo "ERROR: chr$c : lftLen=$lftLen, agpLen=$agpLen"
        else
          echo "chr$c : $lftLen"
        endif
      endif
    end
    # Make unmasked nibs -- necessary for building chromInfo.
    mkdir nib
    foreach f (?{,?}/chr?{,?}{,_random}.fa)
      echo making unmasked nib for $f
      faToNib $f nib/$f:t:r.nib
    end
    # Make symbolic links from /gbdb/hg15/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/hg15/nib
    foreach f (/cluster/store5/gs.16/build33/nib/chr*.nib)
      ln -s $f /gbdb/hg15/nib
    end
    # Load /gbdb/hg15/nib paths into database and save size info.
    hgsql hg15  < ~/src/hg/lib/chromInfo.sql
    cd ~/hg15
    hgNibSeq -preMadeNib hg15 /gbdb/hg15/nib ?{,?}/chr?{,?}{,_random}.fa
    echo "select chrom,size from chromInfo" | hgsql -N hg15 > chrom.sizes

# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 4/12/2003 JK)
    ssh eieio
    cd ~/hg15
    #- Build the .zip files
    # Make sure to read and edit jkStuff/zipAll.sh to point to the correct paths.
    ./jkStuff/zipAll.sh |& tee zipAll.log
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    mkdir zip
    mv *.zip* zip
    cd zip
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd ~/hg15/zip
    # Edit cpToWeb.sh to contain the correct destination path.
    ../jkStuff/cpToWeb.sh
    cd /usr/local/apache/htdocs/goldenPath/10april2003
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # Then make the upstream sequence files.
    cd bigZips
    featureBits hg15 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    rm upstream1000.fa
    featureBits hg15 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    rm upstream2000.fa
    featureBits hg15 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa
    rm upstream5000.fa


# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE - 042203) 
    # Store o+o info in database.
    # Note: for build31, Terry specially requested these files from NCBI:
    # finished.finf
    # draft.finf
    # predraft.finf
    # extras.finf
    ssh eieio
    cd /cluster/store5/gs.16/build33
    if (-f contig_overlaps.agp) then
      jkStuff/liftGl.sh contig.gl
    else
      ssh hgwdev
      hgGoldGapGl -noGl hg15 /cluster/store5/gs.16 build33 
      echo ""
      echo "*** Note from makeHg15.doc:"
      echo "Come back to this step later when we have contig_overlaps.agp\!"
    endif
    ssh hgwdev
    cd /cluster/store5/gs.16/build33
    if (-f contig_overlaps.agp) then
      hgGoldGapGl hg15 /cluster/store5/gs.16 build33 
      cd /cluster/store5/gs.16
      /cluster/bin/i386/hgClonePos hg15 build33 ffa/sequence.inf /cluster/store5/gs.16 -maxErr=3
    end 
    cd /cluster/store5/gs.16
    hgCtgPos hg15 build33 


# LOAD REFGENE (DONE 041103)
    # Do this after the database has been created and the RefSeq alignments 
    # are done (above)
    # Load refSeq alignments into database
    ssh hgwdev
    cd ~/hg15/bed/refSeq
    hgLoadPsl hg15 -tNameIx refSeqAli.psl
    # Make /gbdb symlinks for refSeq.fa (not .ra)
    mkdir -p /gbdb/hg15/mrna.134
    cd /gbdb/hg15/mrna.134
    ln -s /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.fa
    # Load the refSeq mRNA
    cd /cluster/store2/tmp
    hgLoadRna add -type=refSeq hg15 /gbdb/hg15/mrna.134/refSeq.fa \
      /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.ra
    cd ~/hg15/bed/refSeq
    hgRefSeqMrna hg15 /gbdb/hg15/mrna.134/refSeq.fa \
      /cluster/store5/mrna.134/refSeq/041003/org/Homo_sapiens/refSeq.ra \
      all_refSeq.psl \
      /cluster/store5/mrna.134/refSeq/041003/loc2ref \
      /cluster/store5/mrna.134/refSeq/041003/hs.faa \
      /cluster/store5/mrna.134/refSeq/041003/mim2loc
    # Don't worry about the "No gene name" errors
    # Add RefSeq status info
    hgRefSeqStatus hg15 /cluster/store5/mrna.134/refSeq/041003/loc2ref
    # Create precomputed join of refFlat and refGene:
    echo 'CREATE TABLE refFlat \
          (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \
          SELECT refLink.name as geneName, refGene.* \
          FROM refLink,refGene \
          WHERE refLink.mrnaAcc = refGene.name' \
    | hgsql hg15


# GC PERCENT (DONE 041203)
     ssh hgwdev
     mkdir -p ~/hg15/bed/gcPercent
     cd ~/hg15/bed/gcPercent
     hgsql hg15  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent hg15 ../../nib


# PRELOAD MRNA/EST SEQUENCE INFO INTO DATABASE (DONE 04/08/03)
    # Make /gbdb symlinks for sequence .fa (not .ra)
    mkdir -p /gbdb/hg15/mrna.134
    cd /gbdb/hg15/mrna.134
    ln -s /cluster/store5/mrna.134/org/Homo_sapiens/mrna.fa
    ln -s /cluster/store5/mrna.134/org/Homo_sapiens/est.fa
    ln -s /cluster/store5/mrna.134/humanXenoRna.fa
    ln -s /cluster/store5/mrna.134/humanXenoEst.fa
    # Store the sequence (non-alignment) info in database.
    cd /cluster/store2/tmp
    hgLoadRna add -type=mRNA hg15 /gbdb/hg15/mrna.134/mrna.fa \
      /cluster/store5/mrna.134/org/Homo_sapiens/mrna.ra
    hgLoadRna add -type=EST hg15 /gbdb/hg15/mrna.134/est.fa \
      /cluster/store5/mrna.134/org/Homo_sapiens/est.ra
    hgLoadRna add -type=xenoRna hg15 /gbdb/hg15/mrna.134/humanXenoRna.fa \
      /cluster/store5/mrna.134/humanXenoRna.ra
    hgLoadRna add -type=xenoEst hg15 /gbdb/hg15/mrna.134/humanXenoEst.fa \
      /cluster/store5/mrna.134/humanXenoEst.ra


# MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 4/12/2002 MS&JK ESTs Partially Redone)
    # Make sure that /scratch/hg/gs.16/build33/trfFa is loaded with NT_*.fa 
    # and has been pushed to the big cluster nodes.  (MASK SEQUENCE above)
    # Make sure mrna/est .fa's are under /iscratch/i too (GENBANK above)
mrna is done against unmasked 041203
    ssh kk
    mkdir -p ~/hg15/bed/{mrna,est}/psl
    cd ~/hg15/bed/mrna
    ls -1S /scratch/hg/gs.16/build33/trfFa/* > genome.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna.fa > mrna.lst
    cp ~/hg13/bed/mrna/gsub .
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    cd ~/hg15/bed/est
    ls -1S /scratch/hg/gs.16/build33/trfFa/* > genome.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst
    echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /scratch/hg/h/11.ooc} {check out line+ psl/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    # In each dir (~/hg15/bed/mrna, ~/hg15/bed/est):
    para check, para push, para check....
    # para time > time
      
    # Process mRNA and EST alignments into near best in genome.
    cd ~/hg15/bed/mrna
    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /tmp all_mrna.psl

    ssh eieio
    cd ~/hg15/bed/est
    pslSort dirs raw.psl /tmp/psl psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store3/tmp all_est.psl

    # Load mRNA alignments into database. 
    ssh hgwdev
    cd ~/hg15/bed/mrna/chrom
    rm -f *_mrna.psl
    foreach i (*.psl)
      mv $i $i:r_mrna.psl
    end
    hgLoadPsl hg15 *.psl
    cd ..
    hgLoadPsl hg15 all_mrna.psl -nobin

    # Load EST alignments into database.
    ssh hgwdev
    cd ~/hg15/bed/est/chrom
    rm -f *_est.psl
    foreach i (*.psl)
      mv $i $i:r_est.psl
    end
    hgLoadPsl hg15 *.psl
    cd ..
    hgLoadPsl hg15 all_est.psl -nobin
    # Sequence info should have already been loaded into database (PRELOAD)


# SPLICED ESTS (INTRONEST) (DONE 4/12/03 JK)
    # Create subset of ESTs with introns and load into database.
    ssh eieio
    cd ~/hg15
    tcsh jkStuff/makeIntronEst.sh
    ssh hgwdev
    cd ~/hg15/bed/est/intronEst
    hgLoadPsl hg15 *.psl

# ESTORIENTINFO, MRNAORIENTINFO, GENE BOUNDS (RNACLUSTER) (DONE 041303 MS+JK)
    # Put orientation info on ESTs and mRNAs into database:
    ssh eieio
    cd ~/hg15/bed/est
    pslSortAcc nohead contig /tmp/psl contig.psl
    cd ~/hg15/bed/mrna
    pslSortAcc nohead contig /tmp/psl contig.psl
    # Distribute the est and mrna psl files to /iscratch/i
    ssh kkr1u00
    rm -rf /iscratch/i/gs.16/build33/bed
    mkdir -p /iscratch/i/gs.16/build33/bed
    cp -r ~/hg15/bed/est/contig /iscratch/i/gs.16/build33/bed/est
    cp -r ~/hg15/bed/mrna/contig /iscratch/i/gs.16/build33/bed/mrna
    ~kent/bin/iSync
    # mrna: use big cluster.
    ssh kk
    mkdir -p ~/hg15/bed/mrnaOrientInfo/oi
    cd ~/hg15/bed/mrnaOrientInfo
    ls -1S /iscratch/i/gs.16/build33/bed/mrna/*.psl > psl.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/mrna*.fa > mrna.lst
    cp ~/hg14/bed/mrnaOrientInfo/gsub .
    # Edit gsub to point to the correct paths.
    gensub2 psl.lst mrna.lst gsub spec
    para create spec
    para try 
    para check, para push, para check, ....

    # When the cluster run is done do:
    ssh hgwdev
    cd ~/hg15/bed/mrnaOrientInfo
    liftUp mrnaOrientInfo.bed ~/hg15/jkStuff/liftAll.lft warn oi/*.tab
    hgLoadBed hg15 mrnaOrientInfo mrnaOrientInfo.bed \
      -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql > /dev/null

    # est: use small cluster (I/O intensive).  Use 2-level output dir 
    # (input est.fa has been split into multiple files).  
    ssh kkr1u00
    mkdir -p ~/hg15/bed/estOrientInfo/oi
    cd ~/hg15/bed/estOrientInfo
    foreach f (`cat mrna.lst`)
      mkdir oi/$f:t:r
    end
    ls -1S /iscratch/i/gs.16/build33/bed/est/*.psl > psl.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/est*.fa > mrna.lst
    cp ~/hg14/bed/estOrientInfo/gsub .
    # Edit gsub to point to the correct paths.
    gensub2 psl.lst mrna.lst gsub spec
    para create spec
    para try 
    para check, para push, para check, ....

    # When the cluster run is done do:
    ssh hgwdev
    cd ~/hg15/bed/estOrientInfo
    # oi/*/*.tab -> argument list too long... so cat the lowest level together:
    foreach d (oi/*)
      cat $d/*.tab > $d.tab
    end
    liftUp estOrientInfo.bed ~/hg15/jkStuff/liftAll.lft warn oi/*.tab
    bedSort estOrientInfo.bed estOrientInfo.bed
    hgLoadBed hg15 estOrientInfo estOrientInfo.bed \
      -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql > /dev/null

    # Create rnaCluster table (depends on {est,mrna}OrientInfo above)
    cd ~/hg15
    # Create a list of accessions that come from RAGE libraries and need to
    # be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
    ~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg15 \
      rage.libs
    mkdir -p ~/hg15/bed/rnaCluster/chrom
    # Exclude accesions in the RAGE file
    foreach f (?{,?}/chr*.fa)
      set c = $f:t:r
      set out = bed/rnaCluster/chrom/$c.bed
      echo clusterRna -mrnaExclude=hg15.rage.libs hg15 /dev/null $out -chrom=$c
      clusterRna -mrnaExclude=hg15.rage.libs hg15 /dev/null $out -chrom=$c
    end

    cd bed/rnaCluster
    hgLoadBed hg15 rnaCluster chrom/*.bed > /dev/null


# GENEBANDS (DONE 4/13/03 JK)
    # Create precomputed geneBands table:
    ssh hgwdev
    hgGeneBands hg15 geneBands.txt
    hgsql hg15 < ~/kent/src/hg/lib/geneBands.sql
    echo "load data local infile 'geneBands.txt' into table geneBands;" \
    | hgsql hg15
    rm geneBands.txt


# PRODUCING GENSCAN PREDICTIONS (DONE 4/13/03 MS&JK)

    ssh eieio
    mkdir -p ~/hg15/bed/genscan
    cd ~/hg15/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir -p gtf pep subopt
    # Generate a list file, genome.list, of all the contigs
    # *that do not have pure Ns* (due to heterochromatin, unsequencable 
    # stuff) which would cause genscan to run forever.
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store5/gs.16/build33/?{,?}/NT_*/NT_??????.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
        
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 -kkr8u00. Genscan has problem running on the
    # big cluster, due to limitation of memory and swap space on each
    # processing node).
    ssh kkr1u00
    cd ~/hg15
    cd bed/genscan
    # Create template file, gsub, for gensub2.  For example (3-line file):
#LOOP    rm -f genome.list
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push
    # Issue either one of the following two commands to check the
    # status of the cluster and your jobs, until they are done.
    parasol status
    para check
    # If there were out-of-memory problems (run "para problems"), then 
    # re-run those jobs by hand but change the -window arg from 2400000
    # to 1200000.  In build33, this was 22/NT_011519.
    # Convert these to chromosome level files as so:     
    ssh eieio
    cd ~/hg15/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/NT*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/NT*.bed > \
      /dev/null
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd ~/hg15/bed/genscan
    ldHgGene hg15 genscan genscan.gtf
    hgPepPred hg15 generic genscanPep genscan.pep
    hgLoadBed hg15 genscanSubopt genscanSubopt.bed > /dev/null


# CPGISLANDS (DONE 041203)
    ssh eieio
    mkdir -p ~/hg15/bed/cpgIsland
    cd ~/hg15/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    tar xvf cpg_dist.tar 
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ..
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
      set fout=$f:t:r:r.cpg
      echo producing $fout...
      ./cpg_dist/cpglh.exe $f > $fout
    end
    cp ~/hg13/bed/cpgIsland/filter.awk .
    awk -f filter.awk chr*.cpg > cpgIsland.bed
    ssh hgwdev
    cd ~/hg15/bed/cpgIsland
    hgLoadBed hg15 cpgIsland -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed


CREATE GOLDEN TRIANGLE (todo)

# Make sure that rnaCluster table is in place.   Then extract Affy 
# expression info into a form suitable for Eisen's clustering program with:
      cd ~/hg15/bed
      mkdir triangle
      cd triangle
      eisenInput hg15 affyHg10.txt
Transfer this to Windows and do k-means clustering
with k=200 with cluster.  Transfer results file back
to ~/hg15/bed/triangle/affyCluster_K_G200.kgg.  Then
do
      promoSeqFromCluster hg15 1000 affyCluster_K_G200.kgg kg200.unmasked
Then RepeatMask the .fa file inkg200.unmasked, and copy masked versions
to kg200.   Then
      cat kg200/*.fa > all1000.fa
and set up cluster Improbizer run to do 100 controls for every real
run on each - putting the output in imp.200.1000.e.  When improbizer
run is done make a file summarizing the runs as so:
      cd imp.200.1000.e
      motifSig ../imp.200.1000.e.iri ../kg200 motif control*
get rid of insignificant motifs with:
      cd ..
      awk '{if ($2 > $3) print; }' imp.200.1000.e.iri > sig.200.1000.e.iri
turn rest into just dnaMotifs with
      iriToDnaMotif sig.200.1000.e.iri motif.200.1000.e.txt
Extract all promoters with
      featureBits hg15 rnaCluster:upstream:1000 -bed=upstream1000.bed -fa=upstream1000.fa
Locate motifs on all promoters with
      dnaMotifFind motif.200.1000.e.txt upstream1000.fa hits.200.1000.e.txt -rc -markov=2
      liftPromoHits upstream1000.bed hits.200.1000.e.txt triangle.bed

# CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP (DONE 4/8/2003)

# Create directory structure to hold information for these tracks
        cd /projects/hg2/booch/psl/

# Change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
        make new

# Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files
# Makefile in:
#     /gs.16/build33/
#     /gs.16/build33/bacends
#     /gs.16/build33/cytobands
#     /gs.16/build33/cytoPlots
#     /gs.16/build33/fish
#     /gs.16/build33/fosends
#     /gs.16/build33/g2g
#     /gs.16/build33/geneticPlots
#     /gs.16/build33/primers
#     /gs.16/build33/recombrate
#     /gs.16/build33/sts
#     /gs.16/build33/stsPlots
	
# Create accession_info file
	make accession_info.rdb

# UPDATE STS INFORMATION (TODO 4/7/2003)
# Download and unpack updated information from dbSTS:
# In a web browser, go to ftp://ftp.ncbi.nih.gov/repository/dbSTS/.  
# Download dbSTS.sts, dbSTS.aliases, and dbSTS.FASTA.dailydump.Z to /projects/hg2/booch/psl/update
	cd /projects/hg2/booch/psl/update
	wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
	wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
	wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.FASTA.dailydump.Z
        gunzip dbSTS.FASTA.dailydump.Z

# Edit Makefile to latest sts.X version from PREV, and update STS files
        make update

# Make new directory for this info and move files there
        mkdir /cluster/store1/sts.7
        cp all.STS.fa /cluster/store1/sts.7
        cp all.primers /cluster/store1/sts.7
        cp all.primers.fa /cluster/store1/sts.7

# Copy new files to cluster
        ssh kkstore
        cd /cluster/store1/sts.7
        cp /cluster/store1/sts.7/*.* /scratch/hg/STS

# Ask for propagation from sysadmin

# Load the sequences (change sts.# to match correct location)
	ssh hgwdev
	mkdir /gbdb/hg15/sts.7
	cd /gbdb/hg15/sts.7
	ln -s /cluster/store1/sts.7/all.STS.fa ./all.STS.fa
	ln -s /cluster/store1/sts.7/all.primers.fa ./all.primers.fa
	cd /cluster/store2/tmp
	hgLoadRna addSeq hg15 /gbdb/hg15/sts.7/all.STS.fa
	hgLoadRna addSeq hg15 /gbdb/hg15/sts.7/all.primers.fa


# CREATE STS MARKER ALIGNMENTS (DONE TSF 4/12/2003)

# Create full sequence alignments
        ssh kk
        cd /cluster/home/booch/sts

# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
        make new
        make jobList 
        para create jobList
        para push 
	# wait until alignments done
        make stsMarkers.psl

# Copy files to final destination and remove originals
        make copy.assembly
        make clean

# Create primer alignments
        ssh kk
        cd /cluster/home/booch/primers
        
# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
        make new
        make jobList.scratch
        para create jobList
        para push

# Do an initial quick filter of results (takes a while, still) and create 
# final file - best done on eieio since disks local
	ssh eieio
	make filter
        make primers.psl

# Copy files to final destination and remove
        make copy.assembly
        make clean
        
# Create ePCR alignments
        ssh kk
        cd /cluster/home/booch/epcr

# Update Makefile with latest OOVERS and GSVERS
        make new
        make jobList
        para create jobList
        para push
        make all.epcr

# Copy files to final destination and remove
        make copy.assembly
        make clean
        

# CREATE AND LOAD STS MARKERS TRACK (DONE TSF 4/12/2003)

# Copy in current stsInfo2.bed and stsAlias.bed files
        cd /projects/hg2/booch/psl/gs.16/build33
        cp ../update/stsInfo2.bed .
        cp ../update/stsAlias.bed .

# Create final version of sts sequence placements
        ssh kks00
        cd /projects/hg2/booch/psl/gs.16/build33/sts
        make stsMarkers.final

# Create final version of primers placements
# Make sure PRIMERS variable in Makefile is pointing to current version
        cd /projects/hg2/booch/psl/gs.16/build33/primers
        make primers.final

# Create bed file
        cd /projects/hg2/booch/psl/gs.16/build33
        make stsMap.bed

# Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < all_sts_primer.sql
        hgsql hg15 < all_sts_seq.sql
        hgsql hg15 < stsAlias.sql
        hgsql hg15 < stsInfo2.sql
        hgsql hg15 < stsMap.sql

# Load the tables
	cd /projects/hg2/booch/psl/gs.16/build33/sts/
        echo 'load data local infile "stsMarkers.psl.filter.lifted" into table all_sts_seq;' | hgsql hg15
	cd /projects/hg2/booch/psl/gs.16/build33/primers/
        echo 'load data local infile "primers.psl.filter.lifted" into table all_sts_primer;' | hgsql hg15
	cd /projects/hg2/booch/psl/gs.16/build33/
        echo 'load data local infile "stsAlias.bed" into table stsAlias;' | hgsql hg15
        echo 'load data local infile "stsInfo2.bed" into table stsInfo2;' | hgsql hg15
	echo 'load data local infile "stsMap.bed" into table stsMap;' | hgsql hg15


# UPDATE BACEND SEQUENCES (DONE TSF 4/10/2003)

# Download new files. Make new directory based on previous one
	mkdir /cluster/store1/bacends.4
	cd /cluster/store1/bacends.4
	wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/AllBACends.mfa.gz
	wget ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/BACENDS/cl_acc_gi_len.gz
	gunzip AllBACends.mfa.gz
	gunzip cl_acc_gi_len.gz

# Convert fa file
	cp /cluster/store1/bacends.3/convert.pl .
	convert.pl < AllBACends.mfa > BACends.fa

# Create new pairs files
	convertBacEndPairInfo cl_acc_gi_len

# Split file into pieces and copy to cluster to propagate
	/cluster/bin/i386/faSplit sequence BACends.fa 100 BACends
        ssh kkstore
        cd /cluster/store1/bacends.4
	rm /scratch/hg/bacEnds/hs/BACends*.fa
        mv /cluster/store1/bacends.4/BACends???.fa /scratch/hg/bacEnds/hs/
	cp /cluster/store1/bacends.4/BACends.fa /scratch/hg/bacEnds/hs/

# Ask for propagation from sysadmin

# Load the sequences (change bacends.# to match correct location)
	ssh hgwdev
	mkdir /gbdb/hg15/bacends.4
	cd /gbdb/hg15/bacends.4
	ln -s /cluster/store1/bacends.4/BACends.fa ./BACends.fa
	cd /cluster/store2/tmp
	hgLoadRna addSeq hg15 /gbdb/hg15/bacends.4/BACends.fa
                
# BACEND SEQUENCE ALIGNMENTS (DONE 041703 - TERRY)
# (alignments done without RepeatMasking)

# Create full sequence alignments
	ssh kk
        cd /cluster/home/booch/bacends

# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
        make new
        make jobList
        para create jobList
        para push 

# Compile alignments and lift the files (takes a while)
	ssh eieio
	make bacEnds.psl.lifted
# Copy files to final destination and remove
        make copy.assembly
        make clean # DONT'T DO THIS (may want to wait until sure they're OK)


# BACEND PAIRS TRACK (DONE 041703 - TERRY)

# Add /projects/compbiousr/booch/booch/scripts to your path

# Update Makefile with new location of pairs/singles 
# files, if necessary (DONE)
        cd /projects/hg2/booch/psl/gs.16/build33/bacends

# Make initial file of alignments
	make bacEnds.rdb
 
# Try to fish out more pairs
	make bacEndsMiss.psl

# Re-make bacEnds.rdb with new info
	make bacEnds.rdb
 
# Create bacEndPairs track file
        make bacEndPairs.bed

# Create bacEndPairsBad and bacEndPairsLong files
	make bacEndPairsBad.bed

# Create psl file to load
	make bacEnds.load.psl

# Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < all_bacends.sql
        hgsql hg15 < bacEndPairs.sql
        hgsql hg15 < bacEndPairsBad.sql
        hgsql hg15 < bacEndPairsLong.sql

# Load the tables
	cd /projects/hg2/booch/psl/gs.16/build33/bacends/
        echo 'load data local infile "bacEnds.load.psl" into table all_bacends;' | hgsql hg15        
        echo 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' | hgsql hg15
        echo 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' | hgsql hg15
        echo 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' | hgsql hg15



# FOSEND SEQUENCE ALIGNMENTS (DONE 051503)

# Create full sequence alignments
        ssh kk
        cd /cluster/home/booch/fosends

# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
        make new
        make jobList
	para create jobList
        para push

# Compile alignments and lift the files (takes a while)
	ssh eieio
        cd /cluster/home/booch/fosends
	make fosEnds.psl.lifted

# Copy files to final destination and remove
        make copy.assembly
        make clean

# FOSEND PAIRS TRACK (DONE 051603)

# Update Makefile with location of pairs files, if necessary
        ssh kks00
        cd /projects/hg2/booch/psl/gs.16/build33/fosends

# Make initial file of alignments
	make fosEnds.rdb

# Try to fish out more pairs
	make fosEndsMiss.psl

# Re-make bacEnds.rdb with new info
	make fosEnds.rdb
 
# Create bacEndPairs track file
        make fosEndPairs.bed

# Create bacEndPairsBad and bacEndPairsLong files
	make fosEndPairsBad.bed

# Create psl file to load
	make fosEnds.load.psl

# Create database tables
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < all_fosends.sql
        hgsql hg15 < fosEndPairs.sql
        hgsql hg15 < fosEndPairsBad.sql
        hgsql hg15 < fosEndPairsLong.sql

# Load the tables
	cd /projects/hg2/booch/psl/gs.16/build33/fosends/
        echo 'load data local infile "fosEnds.load.psl" into table all_fosends;' | hgsql hg15        
        echo 'load data local infile "fosEndPairs.bed" into table fosEndPairs;' | hgsql hg15
        echo 'load data local infile "fosEndPairsBad.bed" into table fosEndPairsBad;' | hgsql hg15
        echo 'load data local infile "fosEndPairsLong.bed" into table fosEndPairsLong;' | hgsql hg15

# Load the sequences (change fosends.# to match correct location) (done for hg15 early 4/9/2003)
	mkdir /gbdb/hg15/fosends.3
	cd /gbdb/hg15/fosends.3
	ln -s /cluster/store1/fosends.3/fosEnds.fa ./fosEnds.fa
	cd /cluster/store2/tmp
	hgLoadRna addSeq hg15 /gbdb/hg15/fosends.3/fosEnds.fa
                

# UPDATE FISH CLONES INFORMATION (DONE 4/9/2003)

# Download the latest info from NCBI
        # point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
        # change "Show details on sequence-tag" to "yes"
        # change "Download or Display" to "Download table for UNIX"
        # press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.YYYYMMDD.table

# Format file just downloaded.  
        cd /projects/hg2/booch/psl/fish/

# Edit Makefile to point at file just downloaded (variables HBRC, HBRCFORMAT)
        make HBRC

# Copy it to the new freeze location
        cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.16/build33/fish/


# CREATE AND LOAD FISH CLONES TRACK (DONE 4/23/2003)
# (must be done after STS markers track and BAC end pairs track)

# Extract the file with clone positions from database
        ssh hgwdev
        echo 'select * into outfile "/tmp/booch/clonePos.txt" from clonePos' | hgsql hg15
        mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.16/build33/fish

# Get current clone/accession information
	ssh kks00
        cd /projects/hg2/booch/psl/gs.16/build33/fish
	wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out

# Create initial placement file
	make cyto.markers.bed

# Get sequences for accessions not in genome
	# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
	# select file "/projects/hg2/booch/psl/gs.16/build33/fish/not.found.acc
	# change output to FASTA format
	# download results to "/projects/hg2/booch/psl/gs.16/build33/fish/not.found.fa"

# Place sequences against genome
	make blat

# Try to incorporate new placements
	make cyto.markers.bed2

# Create bed file
        make bed

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < fishClones.sql

# Load the table
	cd /projects/hg2/booch/psl/gs.16/build33/fish/
        echo 'load data local infile "fishClones.bed" into table fishClones;' | hgsql hg15
        

# CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE - 4/23/2003)
# (must be done after FISH Clones track) 

# Create bed file
        ssh hgwdev
        make setBands.txt   # NOTE: may get errors if inserts file out-of-sync with pctSetBands file 
        make cytobands.pct.ranges
        make predict

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < cytoBand.sql
        
# Load the table
	cd /projects/hg2/booch/psl/gs.16/build33/cytobands/
	echo 'load data local infile "cytobands.bed" into table cytoBand;' | hgsql hg15


# CREATE AND LOAD RECOMBINATION RATE TRACK (DONE 05/01/2003)
# (must be done after STS Markers track) 

# Create bed file
	cd /projects/hg2/booch/psl/gs.16/build33/recombrate
	make recombRate.bed

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < recombRate.sql
        
# Load the table
	cd /projects/hg2/booch/psl/gs.16/build33/recombrate/
	echo 'load data local infile "recombRate.bed" into table recombRate;' | hgsql hg15


# CREATE AND LOAD GENMAPDB TRACK (TO DO)

# Get file from UPenn and name upenn.txt (Contact: Mike Morley <mmorley@mail.med.upenn.edu>)
# Move file to /projects/hg2/booch/psl/gs.16/build33/genMapDb

# Create bed file
	cd /projects/hg2/booch/psl/gs.16/build33/genMapDb
	make genMapDb.bed

# Create database table
        ssh hgwdev
        cd /projects/hg2/booch/psl/tables
        hgsql hg15 < genMapDb.sql
        
# Load the table
	cd /projects/hg2/booch/psl/gs.16/build33/genMapDb/
	echo 'load data local infile "genMapDb.bed" into table genMapDb;' | hgsql hg15


# CREATE STS MAP COMPARISON PLOTS AND GENETIC PLOTS (DONE 4/28/2003)
# (Must wait until after the STS Map track has been finished for STS and genetic plots)
# (Must wait until after the CytoBand, FISH tracks have been finished for cytogenetic plots)

# Create sts plots
        cd /projects/hg2/booch/psl/gs.16/build33/stsPlots
        make stsplots

# Create genetic plots
        cd /projects/hg2/booch/psl/gs.16/build33/geneticPlots
        make all
        matlab -nodesktop
        >> allplot_ncbi('/cse/grads/booch/tracks/gs.16/build33/geneticPlots/','build33', 'jpg');
        >> quit

# Create cytogenetic plots
        cd /projects/hg2/booch/psl/gs.16/build33/cytoPlots
        make bandplots
	

# Set up directories where these will end up
        ssh hgwdev
        cd /usr/local/apache/htdocs/goldenPath/mapPlots

# Update Makefile with OOVERS, GSVERS, and FREEZE date
        make new

# Copy over files
        make sts
	make genetic
	make band

# Update the index.html to include links to these new plots, and delete oldest set
# Update the arch.html with the oldest set just removed from index.html
# Check changes to index.html and arch.html into CVS
# Ask for plots to be pushed to hgwbeta (and RR eventually) - push with Chromosome Reports!!!


# CREATE CHROMOSOME REPORTS (DONE 04/29/2003)

# Create bacends file
	cd /projects/hg2/booch/psl/gs.16/build33/bacends
	make bacends.pairs.rdb

# Create overlap file
	cd /projects/hg2/booch/psl/gs.16/build33/g2g
	make g2g.rdb

# Set up directory 
	cd /projects/hg2/booch/psl/chromReports
	cp -r template/ 10Apr2003/

# Change index.html files in all directories to update date of assembly
	# index.html
	# assembly/index.html
	# assembly/bacends/index.html
	# assembly/genetic/index.html
	# assembly/overlap/index.html
	# assembly/rh/index.html
	# assembly/summary/index.html
	# assembly/yac/index.html

# Create files
	cd /projects/hg2/booch/psl/chromReports
	make all.assem

# Move to correct location
	ssh hgwdev
	cd /usr/local/apache/htdocs/goldenPath/chromReports/
	mv /projects/hg2/booch/psl/chromReports/10Apr2003 .

# Set up link to plots (must have been previously made - see above)
	cd /usr/local/apache/htdocs/goldenPath/chromReports/10Apr2003/assembly
	ln -s /usr/local/apache/htdocs/goldenPath/mapPlots/Apr10/RHgenetic ./plots

# Update index.html
	cd /usr/local/apache/htdocs/goldenPath/chromReports/10Apr2003/
	# add link to new directory in index.html
	cvs commit -m "added new assembly 10Apr2003" index.html

# Ask for reports to be pushed to hgwbeta (and RR eventually) - push with Map Plots!!!



# PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (DONE 4/12/03 JK)
    # Make sure masked contigs are in /scratch/hg/gs.16/build33/trfFa 
    # Make sure split-up xenoRna sequence is under /iscratch too (GENBANK)
    ssh kkstore
    mkdir -p ~/hg15/bed/xenoMrna
    cd ~/hg15/bed/xenoMrna
    mkdir psl
    ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoRna*.fa > mrna.lst
    # Using split fa -- so create separate output dirs and special gsub:
    foreach f (`cat mrna.lst`)
      mkdir psl/$f:t:r
    end
    echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 human.lst mrna.lst gsub spec
    para create spec
    ssh kk
    cd ~/hg15/bed/xenoMrna
    para try
    para check
    para push 
    # Do para check until the run is done, doing para push if necessary
    # Sort xeno mRNA alignments as so:
    ssh eieio
    cd ~/hg15/bed/xenoMrna
    pslSort dirs raw.psl /cluster/store2/temp psl/xenoRna*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.25
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > xenoMrna.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load into database as so:
    ssh hgwdev
    cd ~/hg15/bed/xenoMrna
    hgLoadPsl hg15 xenoMrna.psl -tNameIx
    # Sequence info should have already been loaded into database (PRELOAD)


# PRODUCING CROSS_SPECIES EST ALIGNMENTS (DONE 4/28/03)
    # Make sure masked contigs are in /scratch/hg/gs.16/build33/trfFa 
    # Make sure split-up xenoRna sequence is under /iscratch too (GENBANK)
    ssh kkstore
    mkdir -p ~/hg15/bed/xenoEst
    cd ~/hg15/bed/xenoEst
    mkdir psl
    ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
    ls -1S /iscratch/i/mrna.134/Homo_sapiens/xenoEst*.fa > mrna.lst
    # Using split fa -- so create separate output dirs and special gsub:
    foreach f (`cat mrna.lst`)
      mkdir psl/$f:t:r
    end

    echo '#LOOP \
/cluster/home/kent/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -q=dnax -t=dnax -mask=lower {check out line+ psl/$(root2)/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 human.lst mrna.lst gsub spec
    ssh kk
    cd ~/hg15/bed/xenoEst
    para create spec
    para try, para check, para push, para check, ...
    # Sort xenoEst alignments:
    ssh eieio
    cd ~/hg15/bed/xenoEst
    pslSort dirs raw.psl /cluster/store2/temp psl/xenoEst*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.10
    liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
    pslCat -dir chrom > xenoEst.psl
    rm -r chrom raw.psl cooked.psl chrom.psl
    # Load into database as so:
    ssh hgwdev
    cd ~/hg15/bed/xenoEst
    hgLoadPsl hg15 xenoEst.psl -tNameIx
    # Sequence info should have already been loaded into database (PRELOAD)

# PRODUCING SQUIRT ALIGNMENTS  (DONE 06/04/03)
    ssh kkstore
    mkdir -p ~/hg15/bed/blatCi1
    cd ~/hg15/bed/blatCi1
    mkdir psl
    foreach f (~/hg15/?{,?}/NT_??????/NT_??????.fa)
      set c=$f:t:r
      mkdir -p psl/$c
    end
    ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst
    ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
    gensub2 human.lst squirt.lst gsub2D spec

    ssh kk
    para create spec
    ....
    # When cluster run is done, sort alignments:
    ssh eieio
    cd ~/hg15/bed/blatCi1
    mkdir /tmp/$LOGNAME
    pslSort dirs raw.psl /tmp/$LOGNAME psl/*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.05
    liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl

    # Rename to correspond with tables as so and load into database:
    ssh hgwdev
    cd ~/hg15/bed/blatCi1/chrom
    rm -f chr*_blatCi1.psl
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatCi1.psl
    end
    hgLoadPsl hg15 *.psl

    # Make squirt /gbdb/ symlink and load squirt sequence data.
    mkdir /gbdb/hg15/squirtSeq
    cd /gbdb/hg15/squirtSeq
    ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta

# PRODUCING FUGU ALIGNMENTS (DONE 041203)
    # Distribute fugu sequence to /iscratch/i/fugu/ (if it isn't already there)
    ssh kkr1u00
    rm -rf /iscratch/i/fugu
    mkdir /iscratch/i/fugu
    cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu
    ~kent/bin/iSync

    ssh kk
    mkdir ~/hg15/bed/blatFugu
    cd ~/hg15/bed/blatFugu
    mkdir psl
    foreach f (~/hg15/?{,?}/NT_??????/NT_??????.fa)
      set c=$f:t:r
      mkdir -p psl/$c
    end
    ls -1S /iscratch/i/fugu/*.fa > fugu.lst
    ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > human.lst
    cp ~/hg14/bed/blatFugu gsub .
    gensub2 human.lst fugu.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check

    # When cluster run is done, sort alignments:
    ssh eieio
    cd ~/hg15/bed/blatFugu
    pslCat -dir psl/NT_?????? | \
      liftUp -type=.psl stdout ~/hg15/jkStuff/liftAll.lft warn stdin | \
      pslSortAcc nohead chrom temp stdin

    # Rename to correspond with tables as so and load into database:
    ssh hgwdev
    cd ~/hg15/bed/blatFugu/chrom
    rm -f chr*_blatFugu.psl
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatFugu.psl
    end
    hgLoadPsl hg15 *.psl

    # Make fugu /gbdb/ symlink and load Fugu sequence data.
    mkdir /gbdb/hg15/fuguSeq
    cd /gbdb/hg15/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
    cd /cluster/store2/tmp
    hgLoadRna addSeq hg15 /gbdb/hg15/fuguSeq/fugu_v3_mask.fasta

# BLASTZ C elegans ALIGNMENTS (DONE - 2003-06-11 - Hiram)
    #  This has been DONE, but there is some kind of problem in the
    #  browser display because a click on the track produces the error:
    #  Expecting two ranges in loadPslFromRangePair
    #  instead of taking you to the alignment.
    #  I believe some more needs to be done, something like the axtBest
    #  business as seen below in the RAT BLASTZ
    ssh kk
    mkdir -p ~/hg15/bed/blastzCe1
    cp ~angie/hummus/DEF.ce1-hg15.030611 ./DEF
    . ./DEF
    mkdir -p $BASE/run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    # that created xdir.sh and joblist run/j
    sh $BASE/xdir.sh
    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    cd $BASE/run
    # now edit j because it is not correct:
    sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
    wc -l j*
    head j2
    # *** make sure the j2 edits are OK, then use it:
    mv j2 j
    # para create will create the file: 'batch' for the cluster run
    para create j
    para try
    para check
    para push
    ... etc ...
# Average job time:                 239s       3.98m     0.07h    0.00d
# Longest job:                     1578s      26.30m     0.44h    0.02d

    # When that cluster run is done, results are in $BASE/raw/chr*/*
    # continuing with ~angie/hummus/Notes:
    # --- normalize and single_cov
    cd ~/hg15/bed/blastzCe1
    #   source the DEF file again in case you are coming back to this
    . ./DEF
    # a new run directory
    mkdir -p $BASE/run.1
    # another obscure script creates a new job list:
    ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
    cd $BASE/run.1
    # the job list is once again incorrect, edit it:
    sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
    wc -l j*
    head j2
    # make sure the edited j2 is OK, then use it:
    mv j2 j
    para create j
    para try
    para push;para check; ... etc ...
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest job:                       13s       0.22m     0.00h    0.00d

    # Translate the .lav files
    # into axt files
    ssh eieio
    set base="/cluster/store5/gs.16/build33/bed/blastzCe1"
    set seq1_dir="/cluster/store5/gs.16/build33/nib"
    set seq2_dir="/cluster/store5/worm/ce1/nib"
    set tbl="blastzCe1"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | /cluster/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | /cluster/bin/i386/axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # load these blastz results
    # next machine
    ssh hgwdev
    cd ~/hg15/bed/blastzCe1/pslChrom
    /cluster/bin/i386/hgLoadPsl hg15 chr*_*.psl
    # create trackDb/human/hg15/trackDb.ra entry:
# track blastzCe1
# shortLabel BLASTZ C elegans
# longLabel All BLASTZ C elegans alignments
# group compGeno
# priority 111
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno ce1

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    tcsh
    set base="/cluster/store5/gs.16/build33/bed/blastzCe1"
    set tbl="blastzBestCe1"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store5/gs.16/build33/bed/blastzCe1"
    set tbl="blastzBestCe1"
    cd $base/pslBest
    hgLoadPsl hg15 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     mkdir -p /gbdb/hg15/axtBestCe1
     cd /gbdb/hg15/axtBestCe1
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg15/axtBestCe1/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('ce1','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     #  only perform the following if the axtInfo table does not yet
     #		exist in hg15:
     #  hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg15 < axtInfoInserts.sql


# CHAINING elegans blastz
    # next machine
    ssh kk
    cd /cluster/store5/worm/hg15/bed/blastzSelf
    mkdir -p axtChain/run1
    cd /cluster/store5/worm/ce1/bed/blastzSelf/axtChain/run1
    ls -1S /cluster/store5/gs.16/build33/bed/blastzCe1/axtBest/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    #  worm nibs are on /cluster/bluearc now
    cat << '_EOF_' > doChain
#!/bin/csh
    axtChain $1 /cluster/store5/gs.16/build33/nib /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib $2 > $3
'_EOF_'
    chmod a+x doChain

    mkdir out chain
    gensub2 input.lst single gsub spec
    para create spec
    para try
    para push ... etc ...
Average job time:                 502s       8.37m     0.14h    0.01d
Longest job:                      874s      14.57m     0.24h    0.01d

    # now on the cluster server, sort chains
    # next machine
    ssh eieio
    cd ~/ce1/bed/blastzSelf/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # optionally: rm run1/chain/*.chain

    # Load chains into database
    # next machine
    ssh hgwdev
    cd ~/ce1/bed/blastzSelf/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain ce1 ${c}_chainSelf $i
        echo done $c
    end
#  Loading 845504 chains into ce1.chrI_chainSelf
#  Loading 845307 chains into ce1.chrII_chainSelf
#  Loading 1152790 chains into ce1.chrIII_chainSelf
#  Loading 859566 chains into ce1.chrIV_chainSelf
#  Loading 2 chains into ce1.chrM_chainSelf
#  Loading 1114939 chains into ce1.chrV_chainSelf
#  Loading 604293 chains into ce1.chrX_chainSelf

END OF BLASTZ C elegans ####################################################

TIGR GENE INDEX (TODO)
  o mkdir -p ~/hg15/bed/tigr    
    cd ~/hg15/bed/tigr  
    wget ftp://ftp.tigr.org/private/HGI_ren/TGI_track_HumanGenome_build33.tgz
    tar xvzf TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    foreach o (mouse cow human pig rat)
      setenv O $o
      foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC hg15 tigrGeneIndex *.gff


LOAD MOUSEREF TRACK (todo)
    First copy in data from eieio to ~/hg15/bed/mouseRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb hg15 mouseRef *.alignments

LOAD AVID MOUSE TRACK (todo)
      ssh cc98
      cd ~/hg15/bed
      mkdir avidMouse
      cd avidMouse
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      hgLoadBed avidRepeat avidRepeat.bed
      hgLoadBed avidUnique avidUnique.bed

LOAD SNPS (Done.  Daryl Thomas; August 28, 2003)
    # SNP processing has been condensed into a single script,
    # which makes snpNih, snpTsc, and snpMap
    #   ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh

    # snpBuild = 116
    # Run from directory $oo/bed/snp/build$snpBuild/snpMap
    mkdir -p $oo/bed/snp/build$snpBuild/snpMap
    cd       $oo/bed/snp/build$snpBuild/snpMap
    processSnpLocations.csh hg15 human 33 116 >& log &

    # check data:
    # wc -l snpTsc.bed; hg15 -e "select count(*) from snpTsc; 
    # wc -l snpNih.bed; hg15 -e "select count(*) from snpNih; 
    # wc -l snpMap.bed; hg15 -e "select count(*) from snpMap; 
    # select * from snpNih limit 5; desc snpNih; show indexes from snpNih"
    # select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc"
    # select * from snpMap limit 5; desc snpMap; show indexes from snpMap"

    # remove temp files
    # rm human* *bed.gz

LOAD SNP DETAILS (Done.  Daryl Thomas; February 18, 2004)
    # SNP processing has been condensed into a single script,
    # which makes dbSnpRsHg
    #   ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh

    # snpBuild = 119
    # Run from directory $oo/bed/snp/build$snpBuild/snpMap
    mkdir -p $oo/bed/snp/build$snpBuild/details/Done
    mkdir -p $oo/bed/snp/build$snpBuild/details/Observed
    cd       $oo/bed/snp/build$snpBuild/details
    processSnpDetails.csh hg15 human 119 >& log &
    load data local infile "$fileBase.out" into table $database.$table
    gzip $fileBase.out

    # check data:
    # hgFixed -e "select count(*) from dbSnpRsHg; 
    # select * from dbSnpRSHg limit 5; desc dbSnpRsHg; show indexes from dbSnpRSHg"

    # remove temp files
    # rm dbSnpRs*


LOAD ENSEMBL GENES (Finished 2003-09-26 braney)
     cd ~/hg15/bed
     mkdir ensembl
     cd ensembl
     # Get the ensembl protein data from http://www.ensembl.org/Homo_sapiens/martview
        # Follow this sequence through the pages:
        # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
        # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
        # Page 3) Choose the "Structures" box. 

        # Page 4) Choose GTF as the ouput
        #  choose gzip compression and then hit export.
        # Save as ensemblGene.gtf.gz

     gunzip ensemblGene.gtf.gz

    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
     grep -v ^6_DR51 ensemblGene.gtf | grep -v _NT_ > unrandom.gtf

    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with ldHgGene 
    # Note: these scripts copied out of ~matt/bin  7/3/03 KRR
     ../../jkStuff/ensemblAddChr.pl unrandom.gtf ensGene.gtf
     # Not necessary for this build
     #../../jkStuff/ensemblFixEns.pl ensGene.gtf ensFixed.gtf 

     # get rid of the ".1" or ".2" after the name
     sed "s/\..\"/\"/g" ensGene.gtf > ensFixed.gtf
     mv ensFixed.gtf ensGene.gtf

     /cluster/bin/i386/ldHgGene hg15 ensGene ensGene.gtf

    # Load Ensembl peptides:
     # Get them from ensembl as above in the gene section except for
     # Page 3) Choose the "Sequences" box. 
     # Page 4) Transcripts/Proteins. 
     # Save file as ensemblePep.fa.gz

     # Here is FTP access
     wget ftp://ftp.ensembl.org/pub/human-15.33/data/fasta/pep/Homo_sapiens.NCBI33.pep.fa.gz
     gunzip Homo_sapiens.NCBI33.pep.fa.gz
     cp Homo_sapiens.NCBI33.pep.fa ensemblPep.fa

     # Go use the data mart link on www.ensembl.org to
     # download a tab-separated file with the columns
     # geneId/transcriptId/protienId and put it in
     # ensGtp.txt.  Then load it into database as so:
     hgsql hg15 < ~/kent/src/hg/lib/ensGtp.sql
     hgsql -N hg15 <<end
     load data local infile 'ensGtp.txt' into table ensGtp
end

     hgPepPred hg15 ensembl ensemblPep.fa

    # LOAD SANGER 22 Pseudogenes  (TODO)
    cd ~/hg15/bed/sanger22
    cp ~/hg10/bed/sanger22/cChr22.3.lx.pseudogene.gff .
    replace ^chr22 with hg10:chr22 in Chr22.3.lx.pseudogene.gff
    liftUp -type=.gff pseudo.gff hg15.lft Chr22.3.lx.pseudogene.gff
    ldHgGene hg15 sanger22pseudo pseudo.gff

LOAD SANGER22 GENES  (TODO)
      cd ~/hg15/bed
      mkdir sanger22
      cd sanger22
      not sure where these files were downloaded from
      grep -v Pseudogene Chr22*.genes.gff | hgSanger22 hg15 stdin Chr22*.cds.gff *.genes.dna *.cds.pep 0
          | ldHgGene hg15 sanger22pseudo stdin
  # Note: this creates sanger22extras, but doesn't currently create
  # a correct sanger22 table, which are replaced in the next steps
      sanger22-gff-doctor Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \
          | ldHgGene hg15 sanger22 stdin
      sanger22-gff-doctor -pseudogenes Chr22.3.1x.cds.gff Chr22.3.1x.genes.gff \
          | ldHgGene hg15 sanger22pseudo stdin

      hgPepPred hg15 generic sanger22pep *.pep

              
              LOAD SANGER 20 GENES (todo)
     # First download files from James Gilbert's email to ~/hg15/bed/sanger20 and
     # go to that directory while logged onto hgwdev.  Then:
        grep -v Pseudogene chr_20*.gtf | ldHgGene hg15 sanger20 stdin
        hgSanger20 hg15 *.gtf *.info


# JAX ORTHOLOG (table only) (  Done braney 2003-07-01 )
    # Add Jackson labs info
    cd ~/hg15/bed
    mkdir jaxOrtholog
    cd jaxOrtholog
    wget ftp://ftp.informatics.jax.org/pub/reports/HMD_Human4.rpt

    cp /cluster/store1/gs.12/build29/bed/jaxOrtholog/filter.awk .
    awk -f filter.awk *.rpt > jaxOrtholog.tab
    # Drop (just in case), create and load the table like this:
    echo 'drop table jaxOrtholog;' | hgsql hg15
    hgsql hg15 < ~/kent/src/hg/lib/jaxOrtholog.sql
    set table=`pwd`/jaxOrtholog.tab
    set cmd="load data local infile '$table' into table jaxOrtholog;" 
    echo $cmd | hgsql hg15

    # fixes by Heather and Bob (2003-07-02)
    # filter.awk had order of columns wrong
    # create HMD_Human4.rpt.noheadfoot
    # changed 2 humanSymbols from " - " to "-"
    split.pl < HMD_Human4.rpt.noheadfoot > jaxOrtholog2.tab
    set cmd="delete from jaxOrtholog;"
    set cmd="load data local infile '$table' into table jaxOrtholog;" 


LOAD RNAGENES
      ssh hgwdev
      mkdir -p ~/hg15/bed/rnaGene
      cd ~/hg15/bed/rnaGene
      wget ftp://ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-hg15.gff.gz
      gunzip -c ncrna-hg15.gff.gz | grep -v '^#' > contig.gff
      liftUp chrom.gff ../../jkStuff/liftAll.lft warn contig.gff
      echo 'drop table hgRnaGene;' | hgsql hg15
      hgsql hg15 < ~/kent/src/hg/lib/rnaGene.sql
      hgRnaGenes hg15 chrom.gff

LOAD EXOFISH (todo)
     - login to hgwdev
     - cd /cluster/store5/gs.16/build33/bed
     - mkdir exoFish
     - cd exoFish
     - hg15 < ~kent/src/hg/lib/exoFish.sql
     - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store5/gs.16/build33/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed hg15 exoFish exoFish.bed

LOAD MOUSE SYNTENY (TODO)
     ssh hgwdev
     mkdir -p ~/hg15/bed/mouseSyn
     cd ~/hg15/bed/mouseSyn
     # Saved Michael Kamal's email attachment: allDirectedSegmentsBySize300.txt
     # Process the .txt file (minus header) into a bed 6 + file:
     grep -v "^#" allDirectedSegmentsBySize300.txt \
       | awk '($6 > $5) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $5-1, $6, $1, 999, $7, $2-1, $3, $8;} \
              ($5 > $6) {printf "%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\n", $4, $6-1, $5, $1, 999, $7, $2-1, $3, $8;}' \
       > mouseSynWhd.bed
     hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/mouseSynWhd.sql \
       hg15 mouseSynWhd mouseSynWhd.bed

LOAD GENIE (todo)
     - cat */ctg*/ctg*.affymetrix.gtf > predContigs.gtf
     - liftUp predChrom.gtf ../../jkStuff/liftAll.lft warn predContigs.gtf
     - ldHgGene hg15 genieAlt predChrom.gtf

     - cat */ctg*/ctg*.affymetrix.aa > pred.aa
     - hgPepPred hg15 genie pred.aa 

     - hg15
         mysql> delete * from genieAlt where name like 'RS.%';
         mysql> delete * from genieAlt where name like 'C.%';

# LOAD SOFTBERRY GENES (4/25/03 KRR)
     mkdir -p ~/hg15/bed/softberry
     cd ~/hg15/bed/softberry
     set file = Softb_fgenesh_models_apr03.tar.gz
     wget ftp://www.softberry.com/pub/SC_HUM_APR03/$file
     gunzip -c $file | tar xvf -
     ldHgGene hg15 softberryGene chr*.gff
     hgPepPred hg15 softberry *.protein
     hgSoftberryHom hg15 *.protein

# LOAD GENEID GENES (4/30/03 KRR RELOADED -gtf 2004-04-02 kate)
    mkdir ~/hg15/bed/geneid
    cd ~/hg15/bed/geneid
    mkdir download
    cd download
    # Now download *.gtf and *.prot from 
    set dir = www1.imim.es/genepredictions/H.sapiens/golden_path_20030410/geneid_v1.1/
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un)
      wget http://$dir/chr$c.gtf
      wget http://$dir/chr${c}_random.gtf
      wget http://$dir/chr$c.prot
      wget http://$dir/chr${c}_random.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg15 geneid download/*.gtf -gtf
    # 32381 gene predictions
    runGeneCheck .
    hgPepPred hg15 generic geneidPep download/*-fixed.prot

LOAD ACEMBLY (5/30/03 KRR)
    mkdir -p ~/hg15/bed/acembly
    cd ~/hg15/bed/acembly
    # Get acembly*gene.gff from Jean and Danielle Thierry-Mieg
    set build = ncbi_33
    wget ftp://ftp.ncbi.nih.gov/repository/acedb/$build.human.genes/acembly.$build.genes.proteins.fasta.tar.gz
    wget ftp://ftp.ncbi.nih.gov/repository/acedb/$build.human.genes/acembly.$build.genes.gff.tar.gz
    gunzip -c acembly.$build.genes.gff.tar.gz | tar xvf -
    gunzip -c acembly.$build.genes.proteins.fasta.tar.gz | tar xvf -
    cd acembly.$build.genes.gff
    # Save just the floating-contig features to different files for lifting 
    # and lift up the floating-contig features to chr*_random coords:
    # NOTE: file prefix (x1) has been added since build 31
    foreach f (x1.acemblygenes.*.gff)
      set c=$f:r:e
      egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
        perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
      if (-e ../../../$c/lift/random.lft) then
        liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
          ctg-chr${c}_random.gff
      endif
      # Strip out _random or floating contig lines from the normal chrom gff,
      # and add the "chr" prefix:
      grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' > chr$c.gff
    end

    cd ../acembly.$build.genes.proteins.fasta
    #- Remove G_t*_ prefixes from acemblyproteins.*.fasta:
    foreach f (x1.acemblyproteins.*.fasta)
      perl -wpe 's/^\>G_t[\da-zA-Z]+_/\>/' $f > chr$f:r:e.fa
    end

    #- Load into database:
    cd ..
    ldHgGene hg15 acembly acembly.$build.genes.gff/chr*.gff
    hgPepPred hg15 generic acemblyPep \
      acembly.$build.genes.proteins.fasta/chr*.fa
    unset build

LOAD GENOMIC DUPES (DONE - 2003-09-02 - Hiram)
o - Load genomic dupes
    ssh hgwdev
    mkdir -p /cluster/data/hg15/bed/segmentalDups
    cd /cluster/data/hg15/bed/segmentalDups
    wget http://humanparalogy.gene.cwru.edu/build33/files_for_ucsc/genomicSuperDups.txt.gz
    unzip *.zip
    hgsql hg15 < ~kent/src/hg/lib/genomicSuperDups.sql
    hgsql -e 'load data local infile "genomicSuperDups.txt" into table genomicSuperDups;' hg15
    #  The standard hgLoad commands do not work on this file.  This data
    #  is organized differently than any of our standard formats.

LOAD NCI60 (DONE: sugnet - Wed Apr 30 10:25:02 PDT 2003 )
o - # ssh hgwdev
    cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
    mkdir hg15
    cd hg15
    findStanAlignments hg15 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg15.image.psl >& hg15.image.log 
    cp ../experimentOrder.txt ./
    sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
    egrep -v unknown hg15.image.psl > hg15.image.good.psl
    stanToBedAndExpRecs  hg15.image.good.psl hg15.nci60.exp hg15.nci60.bed `cat epo.txt`
    hg15S -A < ../../scripts/nci60.sql
    echo "load data local infile 'hg15.nci60.bed' into table nci60" | hg15S -A
    mkdir /cluster/store5/gs.16/build33/bed/nci60
    mv hg15.nci60.bed /cluster/store5/gs.16/build33/bed/nci60
    rm *.psl 

CREATE AND LOAD AFFYU95 [ DONE: hartera 2004-03-30 ]

# Set up cluster job to align consensus/exemplar sequences for Affymetrix
# HG-U95Av2 chip to hg15
    cp /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /cluster/bluearc/affyGnf
    ssh kkr1u00
    cd /cluster/data/hg15/bed
    rm -rf affyRatio.2004-03-19
    mkdir affyRatio.2004-03-19
    cd affyRatio.2004-03-19
    mkdir -p /iscratch/i/affy
    cp /cluster/bluearc/affyGnf/HG-U95Av2_all.fa /iscratch/i/affy
    iSync
    rm /cluster/bluearc/affyGnf/HG-U95Av2_all.fa

    ssh kk
    cd /cluster/data/hg15/bed/affyRatio.2004-03-19
    ls -1 /iscratch/i/affy/HG-U95Av2_all.fa > affy.lst
    ls -1 /scratch/hg/gs.16/build33/trfFa/ > allctg.lst
    echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  /scratch/hg/gs.16/build33/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst affy.lst template.sub para.spec
    mkdir psl
    para create para.spec

# Actually do the job with usual para try/check/push/time etc.
# para time  2004-03-19
#Completed: 546 of 546 jobs
#CPU time in finished jobs:       9072s     151.20m     2.52h    0.10d  0.000 y
#IO & Wait Time:                  5409s      90.15m     1.50h    0.06d  0.000 y
#Average job time:                  27s       0.44m     0.01h    0.00d
#Longest job:                      272s       4.53m     0.08h    0.00d
#Submission to last job:           310s       5.17m     0.09h    0.00d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyU95.psl
    pslSort dirs raw.psl tmp psl

    # use these filter parameters for these sequences. only use alignments that
    # cover 30% of sequence and have at least 95% identity in aligned region.
    # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl

    # eliminate the long names to keep names in dependent tables in sync
    sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl

    hgLoadPsl hg15 -table=affyU95 affyU95shortQname.psl
    
    # LOAD SEQUENCE DATA FOR AFFY U95Av2 FOR ALIGNMENT DISPLAY

    # Copy probe sequence to /gbdb if it isn't already (they are already there
    # for hg16)
    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes    ln -s /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa .
    
    # load sequences with "U95Av2:" prefix removed so acc matches name used    
    # in other dependent tables for affy U95Av2   
    hgLoadSeq -abbr=U95Av2: hg15 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa

# UPDATE AFFYRATIO (GNF) [DONE, hartera, 2004-03-30]
    # Merge with spot data and load into database. added -chip flag to
    # affyPslAndAtlasToBed to allow correct parsing. affyRatio needs updating
    # because the consensus/exemplar sequences are now being used for the
    # alignments in affyU95. Affy probe targe:t regions were previously used
    # for alignments in affyRatio

    ssh hgwdev
    cd /cluster/data/hg15/bed/affyRatio.2004-03-19
    /cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 affyU95shortQname.psl /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log
     
    hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg15 affyRatio affyRatio.bed

    # Clean up
    rm -r psl tmp err *.tab *.debug batch.bak contig.psl raw.psl batch template.sub *.lst para*

# LOAD SAGE DATA (DONE - 2003-05-14 - Hiram)
    ssh hgwdev
    cd ~/kent/src/hg/sage
    make
    # XXX = uniGene build for which SAGE was built -- not necessarily current!
    # Figure out the build number by peeking at this file:
    wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
#   set Version = XXX
    set Version=160
    mkdir /projects/cc/hg/sugnet/sage/sage.$Version
    cd /projects/cc/hg/sugnet/sage/sage.$Version
    ncftp ftp://ftp.ncbi.nih.gov/pub/sage
      mget -R map/readme.txt map/info.txt extr info map/Hs
      quit
    mkdir map
    mv Hs map
    cd map/Hs/NlaIII
    unzip -j SAGEmap_tag_ug-rel.zip
    cd ../../../extr/
    ../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
    ../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
    ../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
      ./SAGE_*
    ../../scripts/countsPerExp.pl expCounts.tab expList.tab
    cd ../map/Hs/NlaIII/ 
    perl -e 'while (<>) { \
               chomp($_); \
               @p = split(/\t/, $_); \
               print "$p[2]\t$p[3]\t$p[0]\n"\
             }' \
      < SAGEmap_tag_ug-rel | sort | sed -e 's/ /_/g' \
      > SAGEmap_ug_tag-rel_Hs
    cd -
    createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
      tagExpArrays.tab sageSummary.sage
    # Create the uniGene alignments 
    # ~/hg15/uniGene/hg15.uniGene.lifted.pslReps.psl
    # -- see "MAKE UNIGENE ALIGNMENTS" below
    cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
    addAveMedScoreToPsls \
      ~/hg15/bed/uniGene.$Version/hg15.uniGene.lifted.pslReps.psl \
      sageSummary.sage  uniGene.wscores.bed
    hgLoadBed hg15 uniGene_2 uniGene.wscores.bed
    hgsql hg15 < ~kent/src/hg/lib/sage.sql 
    echo "load data local infile 'sageSummary.sage' into table sage" \
        | hgsql hg15
    cd ../info
    ../../scripts/parseRecords.pl ../extr/expList.tab  > sageExp.tab
    hgsql hg15 < ~/kent/src/hg/lib/sageExp.sql 
    echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg15
    # update ~/kent/src/hg/makeDb/trackDb/human/hg15/uniGene_2.html 
    # with current uniGene date. 

    
# MAKE UNIGENE ALIGNMENTS (DONE - 2003-05-15 - Hiram)
    # Download of the latest UniGene version is now automated by a 
    # cron job -- see /cluster/home/angie/crontab , 
    # /cluster/home/angie/unigeneVers/unigene.csh .  
    # If hgwdev gets rebooted, that needs to be restarted... maybe there's 
    # a more stable place to set up that cron job.  

    # substitute XXX -> the uniGene version used by SAGE, if building the 
    # uniGene/SAGE track;  or just the latest uniGene version in 
    # /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
    # set Version = XXX
    set Version = 160
    cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
    gunzip Hs.seq.uniq.gz Hs.data.gz
    ../countSeqsInCluster.pl Hs.data counts.tab
    ../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
    # Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
    ssh kkstore
    set Version = 160 # same as above
    mkdir -p /iscratch/i/uniGene.$Version
    cp -p \
  /projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
      /iscratch/i/uniGene.$Version
    ssh kkr1u00
    ~kent/bin/iSync
    ssh kk
    set Version = 160 # same as above
    mkdir -p ~/hg15/bed/uniGene.$Version
    cd ~/hg15/bed/uniGene.$Version
    ls -1S /scratch/hg/gs.16/build33/trfFa/*.fa > allctg.lst
#   ls -1S /cluster/store5/gs.16/build33/trfFa/* > allctg.lst
    ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
      > uniGene.lst
    echo '#LOOP\n/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
    gensub2 allctg.lst uniGene.lst template.sub para.spec
    para create para.spec
    mkdir psl
    para try
    para check
    para push
    # ssh eieio
    set Version = 160 # same as above
    cd ~/hg15/bed/uniGene.$Version
    pslSort dirs raw.psl tmp psl >& pslSort.log
    liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
      stdin hg15.uniGene.lifted.pslReps.psl /dev/null
    # use hg15.uniGene.lifted.pslReps.psl for building SAGE track (above).


# LOADING MOUSE MM3 BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 041303)

    # Translate Penn State .lav files into sorted axt:
    ssh eieio
    set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
    set seq1_dir="/cluster/store5/gs.16/build33/nib/"
    set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set tbl="blastzMm3"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo $c
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
    set tbl="blastzMm3"
    cd $base/pslChrom
    hgLoadPsl hg15 chr*_${tbl}.psl

# MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (DONE - 041303)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    tcsh
    set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
    set tbl="blastzBestMm3"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store5/gs.16/build33/bed/blastz.mm3.2003-04-12-03-MS"
     set tbl="blastzBestMm3"
     cd $base/pslBest
     hgLoadPsl hg15 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     mkdir -p /gbdb/hg15/axtBestMm3
     cd /gbdb/hg15/axtBestMm3
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg15/axtBestMm3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg15 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (DONE 041203)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg15/bed/blastz.mm3.2003-04-12-03-MS/axtBest
    mkdir -p ../axtTight
    tcsh
    foreach i (*.axt)
      echo $i
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end

    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
    end

    # Load tables into database
    ssh hgwdev
    cd ~/hg15/bed/blastz.mm3.2003-04-12-03-MS/pslTight
    hgLoadPsl hg15 chr*_blastzTightMm3.psl

# RUNNING MM4 BLASTZ  (IN PROGRESS 2003-07-10 kate)

    ssh eieio
    mkdir -p /cluster/data/hg15/bed/blastz.mm4
    cd /cluster/data/hg15/bed/blastz.mm4
    cat << '_EOF_' > DEF
# human/mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
SEQ1_DIR=/iscratch/i/gs.16/build33/chromTrfMixedNib/
SEQ1_RMSK=/iscratch/i/gs.16/build33/rmsk/
SEQ1_SMSK=/iscratch/i/gs.16/build33/linSpecRep/
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
SEQ2_DIR=/cluster/bluearc/mm4/mixedNib/
# not currently used
SEQ2_RMSK=/cluster/bluearc/mm4/rmsk/
SEQ2_SMSK=/cluster/bluearc/mm4/linSpecRep/
SEQ2_FLAG=-rodent
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store5/gs.16/build33/bed/blastz.mm4

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
                                                
    # save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.hg15-mm4.2003-07-10

    ssh kk
    cd ~/hg15/bed/blastz.mm4
    
    # source the DEF file
    bash
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    
    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run
    
    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
    wc -l j*
    head j2

    # make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
        # 44460 jobs written to batch
    para try
    para check
    para push
    # ... etc ...
       # 1.3 hr longest job

    # post-process blastz
    ssh eieio
    cd ~/hg15/bed/blastz.mm4
        #   source the DEF file again in case you are coming back to this
    bash
    . ./DEF
    
    # a new run directory
    mkdir -p $BASE/run.1

    # copy files to cluster fileserver
    clusterdir=/cluster/bluearc/hg/gs.16/build33/bed/blastz.mm4
    mkdir -p $clusterdir/{raw,lav}
    cp -r $BASE/raw $clusterdir
    
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $clusterdir \
                        > $BASE/run.1/jobList
    cd $BASE/run.1

    # make sure the job list is OK
    wc -l jobList
       # 342 jobs 
    head jobList

    # run on cluster
    ssh kkr9u01
    cd ~/hg15/bed/blastz.mm4/run.1
    para create jobList
    para try
    para check
    para push
    # etc.
    # 10 minutes total 

    # convert lav files to axt
    ssh eieio
    set base="/cluster/bluearc/hg/gs.16/build33/bed/blastz.mm4"
    set seq1_dir="/cluster/data/hg15/nib"
    set seq2_dir="/cluster/data/mm4/softNib"
    /cluster/bin/scripts/blastz-lav2axt $base $seq1_dir $seq2_dir
    

# BEGINNING OF RAT BLASTZ

# SET UP RAT BLASTZ CLUSTER RUNS
# FIRST FOLLOW INSTRUCTIONS IN ~angie/hummus/Notes

# LOADING RAT RN2 BLASTZ ALIGNMENTS FROM PENN STATE:
#		DONE - 2003-05-09 - Hiram, to be verified
    # Translate Penn State .lav files into sorted axt:
    ssh eieio
    set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
    set seq1_dir="/cluster/store5/gs.16/build33/nib/"
    set seq2_dir="/cluster/store4/rn2/nib/"
    set tbl="blastzRn2"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load tables
    ssh hgwdev
    set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
    set tbl="blastzRn2"
    cd $base/pslChrom
    /cluster/bin/i386/hgLoadPsl hg15 chr*_${tbl}.psl


# MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE RN2 AXT FILES
#		DONE - 2003-05-09 - Hiram, to be verified

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
    set tbl="blastzBestRn2"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      /cluster/bin/i386/axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      /cluster/bin/i386/axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end


    # Load tables
     ssh hgwdev
     set base="/cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06"
     set tbl="blastzBestRn2"
     cd $base/pslBest
     /cluster/bin/i386/hgLoadPsl hg15 chr*_${tbl}.psl

# Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     mkdir -p /gbdb/hg15/axtBestRn2
     cd /gbdb/hg15/axtBestRn2
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg15/axtBestRn2/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     /cluster/bin/i386/hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
     /cluster/bin/i386/hgsql hg15 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST
#		DONE - 2003-05-09 - Hiram, to be verified
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd /cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn2.psl
    end
    # Load tables into database
    ssh hgwdev
    cd /cluster/store5/gs.16/build33/bed/blastz.rn2.2003-05-06/pslTight
    hgLoadPsl hg15 chr*_blastzTightRn2.psl


# BLASTZ RAT RN3  (DONE 2003-07-09 kate)
# Note: these alignments are swapped and displayed on Rn3 browser

    ssh eieio
    mkdir -p /cluster/data/hg15/bed/blastz.rn3
    cd /cluster/data/hg15/bed/blastz.rn3
    cat << '_EOF_' > DEF
# human vs. rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1

# TARGET
SEQ1_DIR=/iscratch/i/gs.16/build33/chromTrfMixedNib/
SEQ1_RMSK=/iscratch/i/gs.16/build33/rmsk/
SEQ1_SMSK=/iscratch/i/gs.16/build33/linSpecRep/
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
SEQ2_DIR=/cluster/bluearc/rat/rn3/softNib/
# not currently used
SEQ2_RMSK=/cluster/bluearc/rat/rn3/rmsk/
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep/
SEQ2_FLAG=-rodent
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/store5/gs.16/build33/bed/blastz.rn3

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
                                                
    # save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.hg15-rn3.2003-07-09

    ssh kk
    cd ~/hg15/bed/blastz.rn3
    
    # source the DEF file
    bash
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    
    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run
    
    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
    wc -l j*
    head j2

    # make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
        # 1400 jobs written to batch
    para try
    para check
    para push
    # ... etc ...
       # 1.3 hr longest job

    # post-process blastz
    ssh eieio
    cd ~/hg15/bed/blastz.rn3
        #   source the DEF file again in case you are coming back to this
    bash
    . ./DEF
    
    # a new run directory
    mkdir -p $BASE/run.1

    # copy files to cluster fileserver
    clusterdir=/cluster/bluearc/hg/gs.16/build33/bed/blastz.rn3
    mkdir -p $clusterdir/{raw,lav}
    cp -r $BASE/raw $clusterdir
    
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $clusterdir \
                        > $BASE/run.1/jobList
    cd $BASE/run.1

    # make sure the job list is OK
    wc -l jobList
       # 342 jobs 
    head jobList

    # run on cluster
    ssh kkr9u01
    cd ~/hg15/bed/blastz.rn3/run.1
    para create jobList
    para try
    para check
    para push
    # etc.
    # 10 minutes total 

    # convert lav files to axt and sort
    ssh kkstore
    # create template for gensub2
    cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'

    
    cd /cluster/data/hg15/bed/blastz.rn3
    set base="/cluster/bluearc/hg/gs.16/build33/bed/blastz.rn3"
    set seq1_dir="/cluster/data/hg15/nib"
    set seq2_dir="/cluster/data/rn3/softNib"
    /cluster/bin/scripts/blastz-lav2axt $base $seq1_dir $seq2_dir >&! \
                lav2axt.log &
    tail -100f lav2axt.log
    # 3 hours total ?  No problems with axtSort memory
    # copy axt's from cluster data area to permanent storage area
    rm -fr axtChrom
    cp -r $base/axtChrom .

    # translate sorted axt files into psl
    # (just for featureBits alignment quality check)
    # 2003-09-04 kate
    ssh eieio
    cd /cluster/data/hg15/bed/blastz.rn3
    mkdir -p pslChrom
    set tbl="blastzRn3"
    foreach f (axtChrom/chr*.axt)
      echo $c
      set c=$f:t:r
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    #   Takes about 20 minutes


# BEST RAT RN3 ALIGNMENTS (DONE 2003-07-26 kate)

    # Consolidate AXT files to chrom level, sort, pick best
    ssh eieio
    cd /cluster/data/hg15/bed/blastz.rn3

    mkdir -p axtBest pslBest
    mkdir run.2
    cd run.2

    # create script to filter files 
    cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
#/cluster/bin/i386/axtBest $2 $1 $3 -minScore-300
# incorrect option above, should be:
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg15/bed/blastz.rn3/S1.len /cluster/data/hg15/bed/blastz.rn3/S2.len $4
'_EOF_'
    chmod +x doBestAxt
    cd axtChrom
    ls -1S | sed 's/.axt$//' > ../run.2/chrom.list
    cd ../run.2

    # create template for cluster job
    cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) /cluster/bluearc/hg/hg15/bed/blastz.rn3/axtChrom/$(root1).axt {check out line+ /cluster/data/hg15/bed/blastz.rn3/axtBest/$(root1).axt} {check out line+  /cluster/data/hg15/bed/blastz.rn3/pslBest/$(root1)_blastzBestRn3.psl}
#ENDLOOP
'_EOF_'
    gensub2 chrom.list single gsub jobList
    wc -l jobList
        # 44 jobs
    head jobList

    ssh kk
    cd /cluster/data/hg15/bed/blastz.rn3
    cd run.2
    para create jobList
    para try
    para check
    para push
        # 1 hour ?

    # load into database
    ssh hgwdev
    cd /cluster/data/hg15/bed/blastz.rn3
    cd pslBest
    set tbl="blastzBestRn3"
    hgLoadPsl hg15 chr*_${tbl}.psl



# BEGINNING OF HUMAN BLASTZ

# LOADING HUMAN HG15 (SELF) BLASTZ ALIGNMENTS: (TO DO)

    # Translate Penn State .lav files into sorted axt, with alignments 
    # to self/diagonal dropped:
    ssh eieio
    set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-06-06"
    set seq1_dir="/cluster/store5/gs.16/build33/mixedNib/"
    set seq2_dir="/cluster/store5/gs.16/build33/mixedNib/"
    set tbl="blastzHuman"
    cd $base
    mkdir -p axtChrom
    # sometimes alignments are so huge that they cause axtSort to run out 
    # of memory.  Run them in two passes like this:
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
      end
      cat `ls -1 *.lav.axt | sort -g` \
        > $out
      popd
    end

# STOPPED HERE -- big data, low demand.
    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo translating $c.axt to psl
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-03-18-ASH"
    set tbl="blastzHuman"
    cd $base/pslChrom
    hgLoadPsl hg15 chr*_${tbl}.psl

# MAKING THE BLASTZBESTHUMAN TRACK FROM UNFILTERED AXT FILES (TO DO)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-03-18-ASH"
    set tbl="blastzBestHuman"
    cd $base
    mkdir -p axtBest pslBest
    # run axtBest in 2 passes to reduce size of the input to final axtBest:
    foreach chrdir (lav/*)
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      foreach a ($chrdir/*.axt)
        axtBest $a $chr $a:r.axtBest
      end
      cat `ls -1 $chrdir/*.axtBest | sort -g` \
        > $chrdir/$chr.axtBestPieces
      axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store5/gs.16/build33/bed/blastz.hg15.2003-03-18-ASH"
     set tbl="blastzBestHuman"
     cd $base/pslBest
     hgLoadPsl hg15 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
# Not done for build 32:
     mkdir -p /gbdb/hg15/axtBestHg15
     cd /gbdb/hg15/axtBestHg15
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/hg15/axtBestHg15/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('hg15','Blastz Best Human Self','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql hg15 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql hg15 < axtInfoInserts.sql

# MAKING THE AXTTIGHT FROM AXTBEST (TO DO)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/hg15/bed/blastz.hg15.2003-03-18-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/hg15/bed/blastz.hg15.2003-03-18-ASH/pslTight
    hgLoadPsl hg15 chr*_blastzTightHuman.psl


# XXX END OF HUMAN BLASTZ


# LIFTING REPEATMASKER .ALIGN FILES (TODO)

foreach d (?{,?}/NT_??????)
  set c=$d:t
  cd $d
  echo $c to $c.fa.align
  /cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
  cd ../..
end

foreach chr (?{,?})
  cd $chr
  echo making symbolic links for chr$chr NT .fa.align files
  foreach ctg (NT_??????)
    ln -s $ctg/$ctg.fa.align
  end
  cd ..
  if (-e $chr/lift/ordered.lft) then
    echo making $chr/chr$chr.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
      > $chr/chr$chr.fa.align
  endif
  if (-e $chr/lift/random.lft) then
    echo making $chr/chr${chr}_random.fa.align
    /cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
      > $chr/chr${chr}_random.fa.align
  endif
  echo removing symbolic links for chr$chr NT .fa.align files
  rm $chr/NT_??????.fa.align
end


# TWINSCAN GENE PREDICTIONS (5/1/03 KRR RELOADED -gtf 2004-04-02 kate)

    mkdir -p ~/hg15/bed/twinscan
    cd ~/hg15/bed/twinscan
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
      rm -f chr$c.{gtf,ptx}
      set dir = genome.cs.wustl.edu/~bio/human/NCBI33 
      wget http://$dir/gtf/chr$c.gtf
      wget http://$dir/ptx/chr$c.ptx
      # replace first field (sequence identifier) with chrom name
      perl -wpe "s/^\d+/chr$c/" < chr$c.gtf > chr$c-fixed.gtf
      # pare down protein FASTA header to id and add missing .a:
      perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr$c.ptx > chr$c-fixed.fa
    end
    ldHgGene hg15 twinscan chr*-fixed.gtf -gtf
    # 25434 gene predictions
    runGeneCheck .
     # 44 badFrame
     # 29 noStart
     # 18 gap
     # 5 noStop
    hgPepPred hg15 generic twinscanPep chr*-fixed.fa


# LOAD CHIMP DATA (TODO)
    # Download the chimp sequence and distribute to /iscratch/i
    ssh hgwdev
    mkdir /cluster/store1/chimpSeq
    cd /cluster/store1/chimpSeq
    wget http://www.cs.uni-duesseldorf.de/~ebersber/annotation_track_chimp/downloads/mpi-aligned_seqparts_jun02.fa.gz
    gunzip *.gz
    ssh kkr1u00
    mkdir /iscratch/i/chimp
    cp -p /cluster/store1/chimpSeq/*.fa /iscratch/i/chimp/
    # Make sure it unpacked OK
    ~kent/bin/iSync
    ssh kk
    mkdir ~/hg15/bed/blatChimp
    cd ~/hg15/bed/blatChimp
    cp ~/hg13/bed/blatChimp/gsub .
    ls -1S /iscratch/i/chimp/*.fa > chimp.lst
    ls -1S /scratch/hg/gs.16/build33/trfFa.1204/*.fa.trf > human.lst
    mkdir psl
    gensub2 human.lst chimp.lst gsub spec
    para create spec
    para try
    para push
    para check

    # Sort alignments as so
    ssh eieio
    cd ~/hg15/bed/blatChimp
    pslCat -dir psl \
    | liftUp -type=.psl stdout ~/hg15/jkStuff/liftAll.lft warn stdin \
    | pslSortAcc nohead chrom temp stdin
    pslCat -dir chrom > blatChimp.psl

    ssh hgwdev
    cd ~/hg15/bed/blatChimp
    hgLoadPsl hg15 blatChimp.psl         


# SGP GENE PREDICTIONS (DONE - 2003-05-14 - Hiram - to be verified)
    mkdir -p ~/hg15/bed/sgp/download
    cd ~/hg15/bed/sgp/download
    foreach f (~/hg15/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/$chr.gtf
      wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/$chr.prot
    end
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/chrUn.gtf -O chrUn_random.gtf
    wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_20030410/SGP/chrUn.prot -O chrUn_random.prot
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene hg15 sgpGene download/*.gtf -exon=CDS
    hgPepPred hg15 generic sgpPep download/*-fixed.prot
 
# SGP GENES (UPDATE 1/18/2006)
    sgpPep table dropped, replaced by hgc generated protein seq in browser

# ALIGNED ANCIENT REPEATS FROM MOUSE BLASTZ (TODO)
    ssh eieio
    mkdir -p ~/hg15/bed/aarMm2
    cd ~/hg15/bed/aarMm2
    set mmdir=../blastz.mm2.2002-12-5-ASH
    foreach aar ($mmdir/aar/*.aar.gz)
      set c = $aar:t:r:r
      echo translating chr$c aar to axt
      zcat $aar \
      | $HOME/kent/src/hg/makeDb/hgAar/aarToAxt \
      | axtToPsl stdin $mmdir/S1.len $mmdir/S2.len stdout \
      > chr${c}_aarMm2.psl
    end   
    ssh hgwdev
    cd ~/hg15/bed/aarMm2
    hgLoadPsl hg15 *.psl

# ALIGNMENT COUNTS FOR WIGGLE TRACK
    # this needs to be updated to reflected the full process.
 
    - Generate BED table of AARs used to select regions.
        cat ../bed/aarMm2/*.psl | awk 'BEGIN{OFS="\t"} {print $14,$16,$17,"aar"}' >aarMm2.bed
    
    - Generate background counts with windows that have a 6kb counts,
      with a maximum windows size of 512kb and sliding the windows by
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.6kb-aar.cnts  (??? need better name ???)
           hgCountAlign -selectBed=aarMm2.bed -winSize=512000 -winSlide=1000 -fixedNumCounts=6000 -countCoords $axt $tab
        end

    - Generate counts for AARs with 50b windows, slide by 5b
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.50b-aar.cnts  (??? need better name ???)
           hgCountAlign -selectBed=aarMm2.bed -winSize=50 -winSlide=5 $axt $tab
        end

    - Generate counts for all with 50b windows, slide by 5b
        foreach axt (../../blastz.mm2.2002-08-01/axtBest/chr*.axt)
           set chr=$axt:t:r
           set tab=$chr.50b.cnts  (??? need better name ???)
           hgCountAlign -winSize=50 -winSlide=5 $axt $tab
        end


REFFULL (TODO)

o ssh to eieio
 mkdir -p /cluster/store5/gs.16/build33/bed/refFull
 cd /cluster/store5/gs.16/build33/bed/refFull

 Download the sequence: wget ftp://blue3.ims.u-tokyo.ac.jp/pub/db/hgc/dbtss/ref-full.fa.gz

 gunzip it and split the ref-rull.fa file into about 200 pieces
 gunzip ref-full.fa.gz
 faSplit sequence ref-full.fa 50 splitRefFull
 ssh kkstore
 cd /cluster/store5/gs.16/build33/bed/refFull
 mkdir /scratch/hg/refFull
 splitRefFull* /scratch/hg/refFull/
 ls -1S /scratch/hg/gs.16/build33/contig.0729/*.fa > genome.lst
 ls -1S /scratch/hg/refFull/split*.fa > refFull.lst

o - Request the admins to do a binrsync to the cluster of /scratch/hg/refFull

o - Use BLAT to generate refFull alignments as so:
      Make sure that /scratch/hg/gs.16/build33/contig/ is loaded
      with NT_*.fa and pushed to the cluster nodes.

          ssh kk
          cd /cluster/store5/gs.16/build33/bed/refFull
          mkdir -p psl
#          run mkdirs.sh script to create sudirs in the psl directory
#               in order to modularize the blat job.
          gensub2 genome.lst refFull.lst gsub spec
          para create spec
          
    Now run a para try/push and para check in each one.

o - Process refFull alignments into near best in genome.
      cd ~/hg15/bed
      cd refFull
      pslSort dirs raw.psl /tmp psl/*
      pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.001 raw.psl contig.psl /dev/null
      liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /tmp all_refFull.psl

o - Load refFull alignments into database
      ssh hgwdev
      cd /cluster/store5/gs.16/build33/bed/refFull
      pslCat -dir chrom > refFullAli.psl
      hgLoadPsl hg15 -tNameIx refFullAli.psl


MAKING PROMOTER FILES
    cd /usr/local/apache/htdocs/goldenPath/14nov2002/bigZips
    featureBits hg15 -fa=upstream1000.fa refGene:upstream:1000
    zip upstream1000.zip upstream1000.fa
    featureBits hg15 -fa=upstream2000.fa refGene:upstream:2000
    zip upstream2000.zip upstream2000.fa
    featureBits hg15 -fa=upstream5000.fa refGene:upstream:5000
    zip upstream5000.zip upstream5000.fa
    rm upstream*.fa
    
# MAKING MOUSE AND RAT SYNTENY (MOUSE DONE 041403)(RAT DONE 2003-05-19)

cd ~/hg15/bed
mkdir syntenyMouse
# Copy all the needed scripts from ~/hg14

./syntenicBest.pl -db=hg15 -table=blastzBestMm3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg15 -table=blastzBestMm3
./synteny2bed.pl
hgLoadBed hg15 syntenyMouse ucsc100k.bed
# syntenicBest.pl will overwrite the genomeBest1phase created by the
#	the previous run of this above.  It takes several hours to run.
syntenicBest.pl -db=hg15 -table=blastzBestRn2
#	smooth.pl overwrites genomeBest2phase created by the previous
#	run of this above.  Runs quickly.
smooth.pl
# joinsmallgaps.pl overwrites genomeBest3phase created above. Runs quickly.
joinsmallgaps.pl
# fillgap.pl creates genomeBestFinal
fillgap.pl -db=hg15 -table=blastzBestRn2
# synteny2bed.pl creates ucsc100k.bed
synteny2bed.pl
hgLoadBed hg15 syntenyRat ucsc100k.bed

# create MGC Genes track (markd 13 may 2003)
# this will be automated in the incremental genbank update
/cluster/store5/genbank/bin/i386/mgcDbLoad hg15  /cluster/store5/genbank/data/download/mgc/2003.05.06/mgcFullStatus.tab.gz 


#Creating the hg15Mm3L track on hg15
kent/src/hg/sampleTracks/makeHg15Mm3.csh

# Create firstEF track from Zhang lab at CSHL
# contacts
# Gengxin Chen <cheng@cshl.edu>
# Ivo Grosse <grosse@ipk-gatersleben.de>
# Michael Zhang <mzhang@cshl.edu>
# Got firstef.tar.gz from Gengzin 6/30/03
# Done 2003-07-02 braney
# Got firstef.tar.gz from Gengzin 7/30/03
# Done again 2003-07-31 braney
ssh kk
set db=/cluster/data/hg15
cd $db
mkdir -p bed/firstEF
cd bed/firstEF
# copy firstef.tar.gz to .
zcat firstef.tar.gz | tar xf -
cd firstef
rm -rf *.alpha *.iris4d *.sun4 examples *.i386
ln firstef.i386-linux firstef.i686
# set up script parameters
cat <<  '_EOF_' > firstef
perl ./make_fasta.pl $1 ./tmp/list_file > /dev/null;
./firstef.$HOSTTYPE 1500 ./tmp/list_file ./parameters/ 0 $3 $4 $5 > /dev/null;
perl ./FirstEF_parser.pl ./tmp/list_file_out $2 > /dev/null;
perl ./file_erase.pl ./tmp/list_file;
rm ./tmp/list_file*;
'_EOF_'
cat <<  '_EOF_' > sedit.1
/^No/d
/^$/d
s/\.\./ /g
'_EOF_'
cat <<  '_EOF_' > sedit.2
/0 0 CpGIsland/d
/-1 1 CpGIsland/d
s/ /\010/g
'_EOF_'
cat <<  '_EOF_' > tawk
BEGIN { strand = "-"}
/^>.*$/ { if (strand=="-") strand="+"; else strand = "-"}
/^[^>].*$/ {
  rank=$11
  if (rank > 1)
      extra="#"rank
  else 
    extra=""
  if (strand == "+") 
    printf "chrom %d %d promoter(%d+)%s %d +\nchrom %d %d exon(%d+)%s %d +\nchrom %d %d CpGWindow(%d+)%s %d +\n", $2-1, $3, $1, extra, $4 * 1000 , $5-1, $6, $1, extra, $7 * 1000 , $9-1, $10,  $1, extra, 1000
  else
    printf "chrom %d %d promoter(%d-)%s %d -\nchrom %d %d exon(%d-)%s %d -\nchrom %d %d CpGWindow(%d-)%s %d -\n", $3-1, $2, $1, extra, $4 * 1000 , $6-1, $5, $1, extra, $7  * 1000 , $10-1, $9,  $1, extra, 1000  
}
'_EOF_'
cat <<  '_EOF_' > runProg
suffix=$1
inputFile=$2
chrom=`basename $2 $1`
output=$3 
./firstef $inputFile /tmp/$chrom.tmp $4 $5 $6 > /dev/null 2>&1
cat /tmp/$chrom.tmp | sed -f sedit.1 | awk -f tawk | sed "s/chrom/$chrom/" | sed -f sedit.2 > $output
rm -f /tmp/$chrom.tmp
'_EOF_'
chmod a+x runProg
# since firstef isn't built to have more than one
# copy executing at a time in a single directory
# we build a tar file to extract in each run directory
cd ..
tar cf prog.tar firstef
ls -1S /cluster/store5/gs.16/build33/*/*.fa.masked > human.lst
cat <<  '_EOF_' > doit
cp $1 /tmp
f=`basename $1 .fa.masked`
cd out/$f/firstef
./runProg .fa.masked /tmp/$f.fa.masked ../$f.bed $3 $4 $5 
'_EOF_'
chmod a+x doit
set pExon=0.4
set pPromoter=0.3
set pDonor=0.4
set param="$pPromoter $pDonor $pExon"
echo "#LOOP " > gsub
echo "./doit \044(path1) {check out line+ out/\044(root1)/\044(root1).bed} $param" >> gsub
echo "#ENDLOOP" >> gsub
gensub2 human.lst single gsub temp
cat temp | sed "s/\.fa\//\//" | sed "s/\.fa\.bed/.bed/"  > spec
rm temp
# make directories for cluster output files
rm -rf out
mkdir -p out
cd out
foreach i ( `cat ../human.lst`) 
set f=`basename $i .fa.masked` 
mkdir $f; cd $f; tar xf ../../prog.tar; cd ..
end
cd ..
para create spec
para try
para check
para push
para times

#Average job time:                 547s       9.11m     0.15h    0.01d
#Longest job:                     1739s      28.98m     0.48h    0.02d
# Two jobs fail 
# out/chrY_random/chrY_random.bed is empty
# out/chrM/chrM.bed is empty
ssh hgwdev
cat out/*/*.bed > firstEF.bed
echo "drop table firstEF;" | hgsql hg15
hgLoadBed hg15 firstEF firstEF.bed
rm -rf out

# COW SYNTENY (Done, Heather, Mar. 2005)
# Data from Harris A. Lewin <h-lewin@uiuc.edu>

ssh hgwdev
cd /cluster/data/hg15/bed
mkdir syntenyCow
cd syntenyCow

# Obtain bed5 file from
# http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg15&position=chr1&hgt.customText=http://cagst.animal.uiuc.edu/BACcontig/larkin_et_al_track 
# Save as in.bed

hgLoadBed -noBin hg15 syntenyCow in.bed

# New data from UIUC (using chrom names)
# Reloaded from in2.bed 

hgLoadBed -noBin hg15 syntenyCow in2.bed

# New data again (Dec. 2005)

fixChr.pl < in3.bed > in4.bed
hgLoadBed -noBin hg15 syntenyCow in4.bed

# add to kent/src/hg/makeDb/trackDb/human/hg15/trackDb.ra

# COW BACENDS (Done, Heather, Mar. 8, 2005)

ssh hgwdev
cd /cluster/data/hg15/bed
mkdir bacendsCow
cd bacendsCow

# Obtain bed4 file from
#http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg15&position=chr1&hgt.customText=http://cagst.animal.uiuc.edu/BACcontig/map_and_bac_ends
# Save as in.bed

hgLoadBed -noBin hg15 bacendsCow in.bed

# add to kent/src/hg/makeDb/trackDb/human/hg15/trackDb.ra

# COW BACENDS (Updated, Heather, Mar. 21, 2005)

ssh hgwdev
cd /cluster/data/hg15/bed/bacendsCow

# Obtain GFF file from Denis; unzip into BACendhg15.gff

# Convert into BED 6:

  makebed.pl < BACendhg15.gff > BACendhg15.bed
  hgLoadBed -noBin hg15 bacendsCow BACendhg15.bed
  # 54262 warnings


# RUN 3-WAY ALIGNMENTS HUMAN/RAT/MOUSE USING MULTIZ (DONE kate)

    set multizDir = /cluster/data/hg15/bed/multiz.rn3-mm3.2003-07-26-KRR

    # Edit hg/ratStuff/stageMultiz/stage.csh with correct paths
    # Saved in $multizDir
    # Perform steps in stage.csh (still must be done manually)
    # to set up staging directories and run multiz on cluster.

    # setup up external files for database to reference
    ssh hgwdev
    mkdir -p /gbdb/hg15/multizRn3Mm3
    cd /gbdb/hg15/multizRn3Mm3
    foreach f ($multizDir/hmr/*.maf)
        ln -s $f .
    end

    # load into database
    /cluster/bin/i386/hgLoadMaf -warn hg15 multizRn3Mm3

    # check with previous run -- note fewer alignments in latest run
    featureBits hg15 multizRn3Mm3
        # 1127632252 bases of 2866466359 (39.339%) in intersection
    featureBits hg15 multizMm3Rn2
        # 1166321938 bases of 2866466359 (40.688%) in intersection


# RUN 3-WAY ALIGNMENTS HUMAN/MOUSE/RAT USING MULTIZ (IN PROGRESS kate)

    set multizDir = /cluster/data/hg15/bed/multiz.mm3-rn3.2003-08-06-KRR
    mkdir $multizDir

    # Edit hg/ratStuff/stageMultiz/stage.csh with correct paths
    # Saved in $multizDir
    # Perform steps in stage.csh (still must be done manually)
    # to set up staging directories and run multiz on cluster.

    # setup up external files for database to reference
    ssh hgwdev
    mkdir -p /gbdb/hg15/multizMm3Rn3
    cd /gbdb/hg15/multizMm3Rn3
    foreach f ($multizDir/hmr/*.maf)
        ln -s $f .
    end

    # load into database
    /cluster/bin/i386/hgLoadMaf -warn hg15 multizMm3Rn3

    # check with previous run 
    featureBits hg15 multizMm3Rn3
    featureBits hg15 multizMm3Rn2


# Making Humor 3 way alignment using axtNet 300  (baertsch)
# Approximate directions below

    cd /cluster/data/hg15/bed/blastz.mm3/axtChain/mouseNet
    mkdir -p ../../axtNet300
    forEachChrom
        netToAxt -maxGap=300 $i.net ../chain/$i.chain /cluster/data/hg15/nib /cluster/data/mm3/nib ../../axtNet300/$i.axt
    end

    cd /cluster/data/hg15/bed/blastz.rn3/axtChain/ratNet
    mkdir -p ../../axtNet300
    forEachChrom
        netToAxt -maxGap=300 $i.net ../chain/$i.chain /cluster/data/hg15/nib /cluster/data/rn3/nib ../../axtNet300/$i.axt
    end

    set multizDir = /cluster/data/hg15/bed/humor300
    mkdir $multizDir
    cd /cluster/data/hg15
    forEachChrom
        axtSort blastz.mm3/axtNet300/ tmp
        mv tmp blastz.mm3/axtNet300
        axtToMaf blastz.mm3/axtNet300 /cluster/data/hg15/chrom.sizes /cluster/data/mm3/chrom.sizes axtNet300/$i.mm3.maf -tPrefix=hg15. -qPrefix=mm3. 
        axtSort blastz.rn3/axtNet300/ tmp
        mv tmp blastz.rn3/axtNet300
        axtToMaf blastz.rn3/axtNet300 /cluster/data/hg15/chrom.sizes /cluster/data/rn3/chrom.sizes axtNet300/$i.rn3.maf -tPrefix=hg15. -qPrefix=rn3. 
        /cluster/bin/penn/humor.v4 axtNet300/$i.mm3.maf axtNet300/$i.rn3.maf > $multizDir/$i.hmr.axtNet300.maf
    end
run on kkr9 cluster:
CPU time in finished jobs:      18977s     316.28m     5.27h    0.22d  0.001 y
IO & Wait Time:                  1237s      20.62m     0.34h    0.01d  0.000 y
Average job time:                 470s       7.83m     0.13h    0.01d
Longest job:                     1762s      29.37m     0.49h    0.02d
Submission to last job:          1817s      30.28m     0.50h    0.02d

    mkdir -p /gbdb/hg15/humor300
    cd /gbdb/hg15/humor300
    foreach f ($multizDir/*.maf)
        ln -s $f .
    end
    # load into database
    /cluster/bin/i386/hgLoadMaf -warn hg15 humor300
    rm *.tab

# Creating knownToSuper (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from 
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc.  Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
    zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg15 hs stdin


# RELOAD ENSEMBL GENES to fix confusion in current track (2004-02-01 - Hiram)
#	Turns out this wasn't necessary.  The correct data was already
#	in Hg15, it just hadn't been pushed out to the RR.
#	Leaving these instructions here for future reference.
#	This is the way is should be done
    # save current tables, just in case.
    hgsql -e "rename table  ensGene to ensGene_old;" hg15
    hgsql -e "rename table  ensGtp  to ensGtp_old;" hg15
    hgsql -e "rename table  ensPep  to ensPep_old;" hg15

    #	You do not want ensembl34a data for this assembly.
    #	Ensembl does not yet make it convenient to fetch archived
    #	assemblies.
    mkdir /cluster/data/hg15/bed/ensembl34a
    cd /cluster/data/hg15/bed/ensembl34a
    # Get the ensembl protein data from 
    # http://www.ensembl.org/Homo_sapiens/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" tab. 
    # Page 4) In "Select the output format:" Choose GTF as the output,
    #	 choose gzip compression.  enter the name ensembleGene.gtf.gz
    #	in the download file name entry, hit export.
    # Save as ensemblGene.gtf.gz - about 7.8 Mb
-
    # Ensembl handles random chromosomes differently than us, so we
    # strip this data.  Fortunately it just loses a couple of genes.
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    # Finally, get rid of the ".1" or ".2" after the name
    zcat ensemblGene.gtf.gz \
    | grep -v ^6_DR51 \
    | grep -v ^DR51 \
    | grep -v _NT_ \
    | perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/
                 || die "Line $. doesnt start with human chrom:\n$_"' \
    | sed -e 's/\..\"/\"/g' \
    > ensGene.gtf
    ssh hgwdev
    /cluster/bin/i386/ldHgGene hg15 ensGene \
      /cluster/data/hg15/bed/ensembl34a/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql hg15 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg15
    gzip ensGtp.txt

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" tab. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz - about 7.7 Mb
    zcat ensemblPep.fa.gz | hgPepPred hg15 ensembl stdin

    # compare size of old and new tables as a sanity check
    foreach t (ensGene ensGtp ensPep)
	echo -n "${t}: "
	hgsql -e "select count(*) from ${t};" hg15 | tail -1
	echo -n "${t}_old: "
	hgsql -e "select count(*) from ${t}_old;" hg15 | tail -1
    end
    # if reasonable OK, can drop old tables
    drop table ensGene_old;
    drop table ensGtp_old;  
    drop table ensPep_old;  

    # Create knownToEnsembl column and knownToSuperfamily column
    hgMapToGene hg15 ensGene knownGene knownToEnsembl
    zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg15 hs stdin


#  KNOWN GENE UPDATE (DONE - 2004-02-24 - Hiram
#	using proteins and swissProt as created for Hg16 recently
    ssh hgwdev
    mkdir /cluster/data/kgDB/bed/kgHg15
    cd /cluster/data/kgDB/bed/kgHg15
    ~/kent/src/hg/protein/KGprocess.sh kgHg15 hg15 040115
#	that runs to the first cluster run, then on kk
    ssh kk
    cd /cluster/data/kgDB/bed/kgHg15/kgBestMrna
    para create jobList
    para try
    para push ... etc ...
# Completed: 37972 of 37972 jobs
# CPU time in finished jobs:     106405s    1773.41m    29.56h    1.23d  0.003 y
# IO & Wait Time:                104497s    1741.62m    29.03h    1.21d  0.003 y
# Average job time:                   6s       0.09m     0.00h    0.00d
# Longest job:                       12s       0.20m     0.00h    0.00d
# Submission to last job:           238s       3.97m     0.07h    0.00d

#  Continuing back on hgwdev
    ssh hgwdev
    cd /cluster/data/kgDB/bed/kgHg15
    ~/kent/src/hg/protein/KGprocess.sh kgHg15 hg15 040115
#	that runs until it prepares the second cluster run, then
    ssh kk
    cd /cluster/data/kgDB/bed/kgHg15/kgProtMap
    para create jobList
    para try
    para push ... etc
# Completed: 4953 of 4953 jobs
# CPU time in finished jobs:    1612079s   26867.98m   447.80h   18.66d  0.051 y
# IO & Wait Time:                218411s    3640.18m    60.67h    2.53d  0.007 y
# Average job time:                 370s       6.16m     0.10h    0.00d
# Longest job:                     1939s      32.32m     0.54h    0.02d
# Submission to last job:          4682s      78.03m     1.30h    0.05d

#  Continuing back on hgwdev
    ssh hgwdev
    cd /cluster/data/kgDB/bed/kgHg15
    ~/kent/src/hg/protein/KGprocess.sh kgHg15 hg15 040115
#  That runs to completion
#  Being more than thorough, record status of all tables currently in
#	hg15 before loading tables, then backup all existing tables from
#	hg15 to kgHg15_2003_06 via a dump since a rename does not work
#	because hg15 is on a different filesystem than kgHg15_2003_06
mkdir /cluster/data/hg15/bed/famBro.2004-02-23
ln -l /cluster/data/hg15/bed/famBro.2004-02-23 /cluster/data/hg15/bed/famBro
cd /cluster/data/hg15/bed/famBro
    ~hiram/bin/tblData.pl -verbose hg15 > tblData.before
    cat << '_EOF_' > loadNew.sh
#!/bin/sh
#
#
OldDB=hg15
NewDB=kgHg15
mkdir -p newKg
#
for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna keggMapDesc \
        keggPathway kgAlias kgProtMap kgProtAlias kgXref knownGene \
        knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
do
        echo "copy table data from ${NewDB}.${T} to ${OldDB}.${T};"
        rm -f newKg/${T}.sql newKg/${T}.txt newKg/${T}.txt.gz
        hgsqldump --all -T newKg ${NewDB} ${T}
        hgsql -e "drop table ${T};" ${OldDB}
        cd newKg
        hgsql ${OldDB} < ${T}.sql
        mysqlimport --user=hgcat --password="Man-0-W4r" ${OldDB} ${T}.txt
        gzip ${T}.txt &
        cd ..
done
'_EOF_'

#	Eventually ended saving all of the following tables during the
#	family browser update process below:  (~400 Mb of data)
#	affyUcla             kgProtAlias           knownToCdsSnp    
#	ceBlastTab           kgXref                knownToEnsembl    
#	cgapAlias            knownBlastTab         knownToPfam    
#	cgapBiocDesc         knownCanonical        knownToRefSeq    
#	cgapBiocPathway      knownExpDistance      knownToU133    
#	dmBlastTab           knownGeneLink         knownToU95    
#	drBlastTab           knownGeneMrna         mmBlastTab    
#	dupSpMrna            knownGenePep          mrnaRefseq    
#	keggMapDesc          knownGene             scBlastTab    
#	keggPathway          gnfU95Distance        spMrna    
#	kgAlias              knownIsoforms    


#	continue with the FAMILY BROWSER UPDATE to complete this process

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/hg15/bed/knownGene
     mkdir index
     cd index
     hgKgGetText hg15 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/hg15/bed/knownGene/index/knownGene.ix /gbdb/hg15/knownGene.ix
     ln -s /cluster/data/hg15/bed/knownGene/index/knownGene.ixx /gbdb/hg15/knownGene.ixx


#	FAMILY BROWSER UPDATE	(DONE - 2004-02-24 - Hiram)
#	to be done after knownGene tables are complete from known gene
#	process.
#
# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg15/bed/famBro.2004-02-23
ln -l /cluster/data/hg15/bed/famBro.2004-02-23 /cluster/data/hg15/bed/famBro
cd /cluster/data/hg15/bed/famBro
hgClusterGenes hg15 knownGene knownIsoforms knownCanonical
# Got 19654 clusters, from 36011 genes in 44 chromosomes

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg15/bed/famBro/blastp
cd /cluster/data/hg15/bed/famBro/blastp
pepPredToFa hg15 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
#	This command is in /projects/compbio/bin/$MACH/formatdb

# Copy over database to bluearc
rm -fr /cluster/bluearc/hg15/blastp
mkdir -p /cluster/bluearc/hg15/blastp
cp -p /cluster/data/hg15/bed/famBro/blastp/known.* /cluster/bluearc/hg15/blastp


# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg15/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

#	/scratch/blast should already exist and we want to use it
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/self
cd /cluster/data/hg15/bed/famBro/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/hg15/blastp/known -i $1 -o $2 \
	-e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push
# This should finish in ~10 minutes if the cluster is free.

# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:     118480s    1974.67m    32.91h    1.37d  0.004 y
# IO & Wait Time:                 72240s    1203.99m    20.07h    0.84d  0.002 y
# Average job time:                  25s       0.41m     0.01h    0.00d
# Longest job:                      271s       4.52m     0.08h    0.00d
# Submission to last job:           470s       7.83m     0.13h    0.01d

# Load into database.  This takes about 20 minutes
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/self/run/out
time hgLoadBlastTab hg15 knownBlastTab *.tab
# Scanning through 7753 files
# Loading database with 8832851 rows
#	previously had  7103098 rows
#	real    19m53.778s
#	user    3m38.460s
#	sys     0m24.020s

cd /cluster/data/hg15/bed/famBro
# Create table that maps between known genes and RefSeq
hgMapToGene hg15 refGene knownGene knownToRefSeq
#	may need to build this command in src/hg/near/hgMapToGene
#	row count changed from 29278 to 29296

# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg15 \
	> refToLl.txt
hgMapToGene hg15 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#	this table did not exist in previous version, rows now: 29296

# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg15 knownGene name proteinID Pfam knownToPfam
#	row count went from 30689 to 31337

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg15 affyUcla knownGene knownToU133
#	row count changed from 32009 to 32067

# Create expression distance table.  This will take about an hour.
#	run this in /tmp as it makes a huge (~800Mb) temp file
#	-rw-rw-r--    1 848350571 Feb 23 14:08 /tmp/knownExpDistance.tab
cd ~/kent/src/hg/near/hgExpDistance
cp -p affyUcla.weight /tmp
cd /tmp
time hgExpDistance hg15 affyUcla affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133
# Have 43344 elements in affyUcla
# Got 32067 unique elements in affyUcla
# real    65m16.947s
# user    50m47.980s
# sys     0m48.870s
#	row count went from 31504000 to 32067000

cd /cluster/data/hg15/bed/famBro
# Create table that maps between known genes and 
# the GNF data.
hgMapToGene hg15 affyRatio knownGene knownToU95
#	row count changed from 13403 to 10949
cd /tmp
#	hgFixed.gnfHumanU95Exps argument is unused, no need to exist
time hgExpDistance hg15 hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95Exps gnfU95Distance    -lookup=knownToU95
#	row count went from 11718000 to 10333000
# Got 10333 unique elements in hgFixed.gnfHumanU95MedianRatio
#	real    10m42.446s
#	user    6m0.980s
#	sys     0m12.430s

# Make sure that GO database is up to date.
See README in /cluster/store1/geneOntology.
#	I update this GO database very carefully, checking that all
#	structures in it remain the same from release to release and
#	backing up the current go DB in a backup database.  In this case
#	the backup is go040107 - when it was loaded for Mm4, and the new
#	go database is based on data from Dec 17th 2003 and Feb 2004 according
#	to the time stamp on the fetched data.  This build was done in
#	/cluster/store1/geneOntology/20040217

cd /cluster/data/hg15/bed/famBro
#	create knownToSuper table
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg15 hs stdin
#	row count changed from 34558 to 34334

# Create knownToEnsembl column
hgMapToGene hg15 ensGene knownGene knownToEnsembl
#	row count changed from 33465 to 33561

# Make knownToCdsSnp column.  This is a little complicated by
# having to merge data form the snpTsc and the snpNih tracks.
hgMapToGene hg15 snpTsc knownGene knownToCdsSnp -createOnly -all -cds
hgMapToGene hg15 snpTsc knownGene snp1 -noLoad -all -cds
hgMapToGene hg15 snpNih knownGene snp2 -noLoad -all -cds
sort snp1.tab snp2.tab > knownToCdsSnp.tab
rm snp1.tab snp2.tab
hgsql \
    -e 'load data local infile "knownToCdsSnp.tab" into table knownToCdsSnp;' \
	hg15
#	row count changed from 82592 to 83425

#
#	The five organisms below can be run through quickly.  Get one kluster
#	run started and go on to the next.  Takes about 7 minutes to get
#	them all running and by then one of them is already done.
#	Keep track of where you are at during this.
#
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/ce1/blastp should have data

# Create the ceBlastTab
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/ce1
cd /cluster/data/hg15/bed/famBro/blastp/ce1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/ce1/blastp/wormPep \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      53399s     889.99m    14.83h    0.62d  0.002 y
# IO & Wait Time:                 20094s     334.90m     5.58h    0.23d  0.001 y
# Average job time:                   9s       0.16m     0.00h    0.00d
# Longest job:                      140s       2.33m     0.04h    0.00d
# Submission to last job:           247s       4.12m     0.07h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/ce1/run/out
hgLoadBlastTab hg15 ceBlastTab -maxPer=1 *.tab
#	row count went from 23504 to 23714

# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeMm4.doc for procedure
#	the directory: /cluster/bluearc/mm4/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/mm4
cd /cluster/data/hg15/bed/famBro/blastp/mm4
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/mm4/blastp/known \
	-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls

echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:     102603s    1710.06m    28.50h    1.19d  0.003 y
# IO & Wait Time:                 45327s     755.44m    12.59h    0.52d  0.001 y
# Average job time:                  19s       0.32m     0.01h    0.00d
# Longest job:                      213s       3.55m     0.06h    0.00d
# Submission to last job:           530s       8.83m     0.15h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/mm4/run/out
hgLoadBlastTab hg15 mmBlastTab -maxPer=1 *.tab
#	row count went from 30536 to 31071

# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dr1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/dr1
cd /cluster/data/hg15/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/dr1/blastp/ensembl \
	-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      76565s    1276.09m    21.27h    0.89d  0.002 y
# IO & Wait Time:                 35019s     583.64m     9.73h    0.41d  0.001 y
# Average job time:                  14s       0.24m     0.00h    0.00d
# Longest job:                      208s       3.47m     0.06h    0.00d
# Submission to last job:           508s       8.47m     0.14h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab hg15 drBlastTab -maxPer=1 *.tab
#	row count went from 27914 to 28141

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/sc1
cd /cluster/data/hg15/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/sc1/blastp/sgd \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      15864s     264.39m     4.41h    0.18d  0.001 y
# IO & Wait Time:                 22424s     373.74m     6.23h    0.26d  0.001 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest job:                       31s       0.52m     0.01h    0.00d
# Submission to last job:           172s       2.87m     0.05h    0.00d

ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab hg15 scBlastTab -maxPer=1 *.tab
#	row count went from 15686 to 15840

# Make Drosophila melanagaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dm1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/hg15/bed/famBro/blastp/dm1
cd /cluster/data/hg15/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/dm1/blastp/flyBase \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7753 of 7753 jobs
# CPU time in finished jobs:      62430s    1040.51m    17.34h    0.72d  0.002 y
# IO & Wait Time:                 23718s     395.29m     6.59h    0.27d  0.001 y
# Average job time:                  11s       0.19m     0.00h    0.00d
# Longest job:                      153s       2.55m     0.04h    0.00d
# Submission to last job:           386s       6.43m     0.11h    0.00d

# Load into database.  
ssh hgwdev
cd /cluster/data/hg15/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab hg15 dmBlastTab -maxPer=1 *.tab
#	row count went from 24969 to 25152

    # Update the proteins link into gdbPdb.hgcentraltest:
hgsql \
 -e 'update gdbPdb set proteomeDb = "proteins040115" where genomeDb = "hg15";' \
        -h genome-testdb hgcentraltest

#
# Load tfbsCons track DONE 2004-03-19 braney
#

set humordir=/gbdb/hg15/humor25
set transfacdir=/projects/compbio/data/transfac
set outdir=hg15_tfbsCons

ssh hgwdev
mkdir /cluster/data/hg15/bed/tfbsCons
cd /cluster/data/hg15/bed/tfbsCons
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts  weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg15/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
# the following takes days (says Matt)
nice getTfbsConsData.pl `pwd` $humordir $transfacdir ./IDS.txt $outdir -over &
cd $outdir
rm chr*.bed
hgLoadBed -noSort hg15 tfbsCons -sqlTable=$HOME/kent/src/hg/lib/tfbsCons.sql tfbs.bed -tab

# Get mapping of ID's from Matt so we can link into the TRANSFAC database
set idmap=/cluster/data/hg15/bed/tfbsCons/tfbsConsMap
hgsql hg15 < ~/kent/src/hg/lib/tfbsConsMap.sql
echo "load data local infile '$idmap' into table tfbsConsMap;" | hgsql hg15


# PREP FOR CREATING LIFTOVER CHAINS TO HG15 (2004-04-12 kate)

# split into 3K chunks
ssh eieio
set tempDir = /cluster/bluearc/hg/gs.16/build33/liftOver
#set split = /cluster/bluearc/hg/gs.16/build33/liftOver/split
cd $tempDir
mkdir lift 
cat > split.csh << 'EOF'
set scratch = /iscratch/i/gs.16/build33/liftOver/split
mkdir -p $scratch
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
    echo chr$i
    faSplit -lift=lift/chr$i.lft size /cluster/data/hg15/$i/chr$i.fa -oneFile 3000 $scratch/chr$i
end
'EOF'
csh split.csh >&! split.log &
tail -100f split.log
/cluster/bin/iSync


# LIFTOVER CHAINS TO HG16 (2004-04-14 kate)
# doc'ed in hg16/bed/bedOver/liftOver.doc ?

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/hg15
    mkdir -p liftOver
    cp /cluster/data/hg16/bed/bedOver/over.chain liftOver/hg15ToHg16.over.chain
    mkdir -p /cluster/data/hg15/liftOver
    cp liftOver/hg15ToHg16.over.chain /cluster/data/hg15/liftOver
    gzip liftOver/hg15ToHg16.over.chain


# RECIPROCAL BEST LIFTOVER CHAINS TO HG16 (WORKING 2004-11-18 kate, sugnet)

    # Start with hg15Tohg16 liftover chains, which are best hg16 alignments to hg15
    # Net these chains, and extract chains from the hg16-reference net to
    # get reciprocal best

    ssh kolossus
    cd /cluster/data/hg15/bed/liftOver
    grep chain hg15ToHg16.over.chain | wc -l
        #   3938
    chainPreNet hg15ToHg16.over.chain /cluster/data/hg15/chrom.sizes \
                                            /cluster/data/hg16/chrom.sizes stdout | \
        chainNet stdin -minSpace=1 /cluster/data/hg15/chrom.sizes \
                                   /cluster/data/hg16/chrom.sizes \
                                        /dev/null hg15ToHg16.rbest.net
    chainSwap hg15ToHg16.over.chain stdout | \
        chainSort -target stdin hg15ToHg16.swp.chain
    netChainSubset hg15ToHg16.rbest.net hg16ToHg16.swp.chain stdout | \
        chainSort -target stdin hg15ToHg16.best.chain
    rm hg16ToHg16.rbest.net hg15ToHg16.swp.chain


#### AFFYTRANSFRAG AND AFFYTRANCRIPTION TRACKS - (2004-07-21 sugnet)
# tracks covering about 1/3 of genome with probes
# every 5bp and hybridized to RNA from SK-N-AS cell line.

# affyTranfrag track: SK_phase2_tfgs_final file emailed from 
# Hari at Affymetrix.
    cd /cluster/store6/weber/affy/transfrags/transfragsLabeled
    hgLoadBed hg15 affyTransfrags SK_phase2_tfgs_final.biggerThan50bp.tab

# affyTranscription track: Wiggle track of hybridization levels
# for every 5bp in genome. Quite a lot of data.
    cd /cluster/store6/weber/affy/graph/hg15/gz
    # Run wigAsciiToBinary for each chrom. I used small shell script "runWiggles.sh"
    wigAsciiToBinary -chrom=chr6 -wibFile=chr6.affyTranscription -minVal=0 chr6.sk.signal 
    wigAsciiToBinary -chrom=chr7 -wibFile=chr7.affyTranscription -minVal=0 chr7.sk.signal 
    wigAsciiToBinary -chrom=chr13 -wibFile=chr13.affyTranscription -minVal=0 chr13.sk.signal 
    wigAsciiToBinary -chrom=chr14 -wibFile=chr14.affyTranscription -minVal=0 chr14.sk.signal 
    wigAsciiToBinary -chrom=chr19 -wibFile=chr19.affyTranscription -minVal=0 chr19.sk.signal 
    wigAsciiToBinary -chrom=chr20 -wibFile=chr20.affyTranscription -minVal=0 chr20.sk.signal 
    wigAsciiToBinary -chrom=chr21 -wibFile=chr21.affyTranscription -minVal=0 chr21.sk.signal 
    wigAsciiToBinary -chrom=chr22 -wibFile=chr22.affyTranscription -minVal=0 chr22.sk.signal 
    wigAsciiToBinary -chrom=chrX -wibFile=chrX.affyTranscription -minVal=0 chrX.sk.signal 
    wigAsciiToBinary -chrom=chrY -wibFile=chrY.affyTranscription -minVal=0 chrY.sk.signal 
    hgLoadWiggle -pathPrefix=/gbdb/hg15/wib/affyTranscription hg15 affyTranscription *.wig
    # Connected to database hg15 for track affyTranscription
    # Creating table definition with 13 columns in hg15.affyTranscription
    # Saving wiggle.tab
    # Loading hg15
    cp *.wib /cluster/data/hg15/bed/affyTranscription/wib/
    cd  /gbdb/hg15/wib/affyTranscription/
    ln -s /cluster/data/hg15/bed/affyTranscription/wib/*.wib ./
    cd /cluster/data/hg15/bed/affyTranscription/wib/
    chmod 664 *.wib
    cd /cluster/store6/weber/affy/graph/hg15/gz
    rm *.tab *.wib *.wig
    gzip *.signal

# Creating Zoom to 10 base view of the affyTranscription data above
#		(DONE - 2004-08-06 - Hiram)
    ssh eieio
    cd /cluster/data/hg15/bed/affyTranscription
    ln -s /cluster/store6/weber/affy/graph/hg15/gz/*.signal.gz .
    cat << '_EOF_' > runZoom.sh
#!/bin/sh
#

mkdir -p zoom10wib
mkdir -p zoom10limits

for F in *.signal.gz
do
    C=${F/.sk.signal.gz/}
    echo "wigZoom -dataSpan=10 $F | wigAsciiToBinary -dataSpan=10 -chrom=${C}"
    zcat $F | awk '{if ($2 < 0) { printf "%d\t0\n", $1 } else { print }}' | \
        wigZoom -dataSpan=10 stdin | \
	    wigAsciiToBinary -dataSpan=10 -chrom=${C} \
		-wibFile=zoom10wib/${C}.zoom10 stdin 2> zoom10limits/${C}
done
'_EOF_'
    # << keep emacs happy
    chmod +x runZoom.sh
    time ./runZoom.sh
    #	takes about 15 minutes
    #	the awk statement takes the negative values and brings them up
    #	to zero as was done for the original data with the -minVal=0
    #	argument to the wigAsciiToBinary  (would need to add this arg to
    #	wigZoom to do the same thing here without the awk.)

    #	Load this zoom 10 view
    ssh hgwdev
    cd /gbdb/hg15/wib/affyTranscription
    ln -s /cluster/data/hg15/bed/affyTranscription/zoom10wib/*.wib .

    cd /cluster/data/hg15/bed/affyTranscription/zoom10wib
    #	Let's make sure we don't ruin the table:
    hgsql -N -e "select count(*) from affyTranscription;" hg15
    #	+--------+
    #	| 575775 |
    #	+--------+
    hgLoadWiggle -oldTable -pathPrefix=/gbdb/hg15/wib/affyTranscription hg15 \
	affyTranscription *.wig
    hgsql -N -e "select count(*) from affyTranscription;" hg15
    #	+--------+
    #	| 649503 |
    #	+--------+

    #	To test, at a browser image width of 1123 and a 10,000 bp display
    #	at:
    #	chr7:26,828,631-26,838,630
    #	That is exactly 10 bases per pixel.
    #	go plus or minus one base from there to go under and over
    #	the threshold, no dramatic change should be seen in the
    #	display (while in "Mean" windowing viewing mode which isn't the
    #	default for this track.)

# LOADING AFFYTXNPHASE2 track (sugnet)
# cd to where data is downloaded.
cd /cluster/store10/sugnet/affyTranscription/graphs/transcriptome.affymetrix.com/download/publication/polyA_minus/graphs

# Affymetrix uses names with underscores (which we don't) and has the annoying tendency to 
# put multiple data points at the same position in the genome which some of our tools 
# (like wigEncode) don't like. This script changes the format, changes the names and
# does averaging of the points and puts them in single chromosome files.
./makeWigFiles.sh # takes a long time to run...

# Use wigEncode to make the .wib and .wig files.
./makeWibWigHg15.sh # takes a long time to run.

# Copy .wib files to /cluster/data/hg15 files
mkdir /cluster/data/hg15/bed/affyTranscriptionPhase2/
cp `find ./ -name "*.wib"` /cluster/data/hg15/bed/affyTranscriptionPhase2/
chmod 775 /cluster/data/hg15/bed/affyTranscriptionPhase2/
chmod 664 /cluster/data/hg15/bed/affyTranscriptionPhase2/*

# link /gbdb/hg15 to data
mkdir /gbdb/hg15/wib/affyTranscriptionPhase2/
chmod 775 /gbdb/hg15/wib/affyTranscriptionPhase2/
cd /gbdb/hg15/wib/affyTranscriptionPhase2/
ln -s /cluster/data/hg15/bed/affyTranscriptionPhase2/* .

cd -
# Load the database tables (using bash) this takes a while
for file in `find ./ -name "*.wig"`; do
  base=`basename $file .wig`
  echo "Doing ${base}Txn"
  hgLoadWiggle -pathPrefix=/gbdb/hg15/wib/affyTranscriptionPhase2/  hg15 ${base}Txn $file
done

# Do the transfrags
cd ../transfrags

# Make names consistent.
./changeNames2.sh
foreach file (`ls -1 *PlusTnFg *MinusTnFg | grep -v '_'`) 
 set base=`echo $file | sed -e 's/@//g'`
 echo Doing $base
 hgLoadBed hg15 $base $base
end

# MYTOUCH FIX - jen - 2006-01-24
  sudo mytouch hg15 geneidPep 0404031400.00
  sudo mytouch hg15 twinscanPep 0404031400.00
  sudo mytouch hg15 dupSpMrna 0402242100.00
  sudo mytouch hg15 keggPathway 0402242100.00
  sudo mytouch hg15 kgAlias 0402242100.00
  sudo mytouch hg15 kgProtAlias 0402242100.00
  sudo mytouch hg15 kgXref 0402242100.00

