This file describes how we made the browser database on the Rattus 
# Norvegicus genome, November 2002 build.

CREATING DATABASE (DONE 11/25/02)

    # Create the database.
    ssh hgwdev
    # Enter mysql via:
    mysql -u hgcat -pbigsecret
    # At mysql prompt type:
      create database rn1;
      quit
    # make a semi-permanent read-only alias:
    alias rn1 "mysql -u hguser -phguserstuff -A rn1"
    # Use df to ake sure there is at least 5 gig free on 
    # hgwdev:/var/lib/mysql


COPY OVER JKSTUFF SCRIPTS DIRECTORY (DONE 11/25/02)

    ssh eieio
    ln -s /cluster/store4/rn1 ~/rn1
    # This is the first rat, so use mm2 as the previous "rat" build:
    ln -s /cluster/store2/mm.2002.02/mm2 ~/lastRn
    cd ~/rn1
    cp -Rp ~/lastRn/jkStuff .
    rm jkStuff/*.{out,lst,lft} jkStuff/*~


BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON_BRIDGED CONTIGS (DONE 11/25/02)

    # This version of the rat sequence data is in /cluster/store4/rn1.
    # The version of the sequence downloaded 11/25/02 is multi-record FA 
    # with AGP.  Use agpToFA to make chromosome .fa files, then use 
    # splitFaIntoContigs to make contig chromosome files.
    ssh eieio
    cd ~/rn1
    foreach c (?{,?})
      agpToFa -simpleMulti $c/chr$c.agp chr$c $c/chr$c.fa $c/chr$c.contig.fa
      if (-e $c/chr${c}.random.agp) then
        agpToFa -simpleMulti $c/chr$c.random.agp chr${c}_random \
          $c/chr${c}_random.fa $c/chr$c.random.contig.fa
      endif
    end
    # Check that the size of each chromosome .fa file is equal to the 
    # last coord of the .agp:
    foreach f ( */*.agp )
      set agpLen = `tail -1 $f | awk '{print $3;}'`
      set g = `echo $f:r | sed -e 's/\.random/_random/'`
      set faLen = `faSize $g.fa | awk '{print $1;}'`
      if ($agpLen == $faLen) then
        echo $f length = $g length = $faLen
      else
        echo Error\!\!\!  $f length = $agpLen, but $g length = $faLen
      endif
    end

    ssh hgwdev
    cd into your CVS source tree under kent/src/hg/splitFaIntoContigs
    make

    # This will split the rat sequence into approx. 5 Mbase
    # supercontigs between non-bridged clone contigs and drop the
    # resulting dir structure in /cluster/store4/rn1.  The resulting
    # dir structure will include 1 dir for each chromosome, each of
    # which has a set of subdirectories, one subdir per supercontig.
    ssh eieio
    cd ~/rn1
    foreach c (?{,?})
      cp -p $c/chr$c.agp $c/chr$c.agp.bak
      cp -p $c/chr$c.fa $c/chr$c.fa.bak
      splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000
      if (-e $c/chr${c}_random.fa) then
        cp -p $c/chr$c.random.agp $c/chr$c.random.agp.bak
        cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak
        splitFaIntoContigs $c/chr$c.random.agp $c/chr${c}_random.fa . \
          -nSize=5000000
        mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
        mv ${c}_random/lift/ordered.lft $c/lift/random.lft
        mv ${c}_random/lift/ordered.lst $c/lift/random.lst
        rmdir ${c}_random/lift
        rm ${c}_random/chr${c}_random.{agp,fa}
        mv ${c}_random/* $c
        rmdir ${c}_random
      endif
    end
    # Make sure the reconstructed .fa jives with the original:
    foreach f ( */*.fa.bak )
      echo $f:r
      diff $f $f:r | wc -l
    end
    # The .agp goes through a slight format change, but make sure it 
    # at least ends up with the same number of lines:
    foreach f ( */*.agp.bak )
      set l1 = `wc -l $f | awk '{print $1;}'`
      set l2 = `wc -l $f:r | awk '{print $1;}'`
      if ($l1 == $l2) then
        echo "$f and $f:r have the same #lines"
      else
        echo Error\!\!\!  $f has $l1 lines, but $f:r has $l2
      endif
    end



REPEAT MASKING (DONE 11/26/02)
   Split contigs, run RepeatMasker, lift results
   Notes: 
   * If there is a new version of RepeatMasker, build it and ask the admins 
     to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
   * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
     RepeatMasker runs manageable on the cluster ==> results need lifting.
   * For the NCBI assembly we repeat mask on the sensitive mode setting
     (RepeatMasker -m -s)

        #- Split contigs into 500kb chunks:
        ssh eieio
        cd ~/rn1
        foreach d ( */chr*_?{,?} )
          cd $d
          set contig = $d:t
          faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -maxN=500000
          cd ../..
        end

        #- Make the run directory and job list:
        cd ~/rn1
        mkdir RMRun
        rm -f RMRun/RMJobs
        touch RMRun/RMJobs
        foreach d ( ?{,?}/chr*_?{,?} )
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/bin/scripts/RMRat \
                 /cluster/store4/rn1/$d $f \
               '{'check out line+ /cluster/store4/rn1/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
        end

        #- Do the run
        ssh kk
        cd ~/rn1/RMRun
        para create RMJobs
        para try, para check, para check, para push, para check,...

        #- Lift up the split-contig .out's to contig-level .out's
        ssh eieio
        cd ~/rn1
        foreach d ( ?{,?}/chr*_?{,?} )
          cd $d
          set contig = $d:t
          liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
          cd ../..
        end

        #- Lift up the contig-level .out's to chr-level
        cd ~/rn1
        ./jkStuff/liftOut5.sh

        # soft-mask contig .fa's with .out's
        foreach i (?{,?})
            foreach j ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa \
                       $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa)
                maskOutFa $j $j.out $j -soft
            end
            echo done $i
        end

        #- copy the contig .fa's to the appropriate place on /scratch
        ssh kkstore
        mkdir -p /scratch/hg/rn1/contigs.1126
        cp -p ~/rn1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa \
          /scratch/hg/rn1/contigs.1126

        #- Load the .out files into the database with:
        ssh hgwdev
        cd ~/rn1
        hgLoadOut rn1 ?{,?}/*.fa.out


VERIFY REPEATMASKER RESULTS (DONE 11/26/02)

    # Run featureBits on rn1 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits rn1 rmsk
    # --> 1081814344 bases of 2852382926 (37.927%) in intersection
    featureBits mm2 rmsk
    # --> 1037964664 bases of 2726995854 (38.063%) in intersection

    # Run RepeatMasker manually on a couple mini-contigs, verify that 
    # they appear in the chrom .out and are masked in the sequence file. 

    # Look at the DNA sequence of some RepeatMasker elements in the 
    # genome browser;  visually compare them to the RepeatMasker library 
    # elements and make sure an alignment is plausible.  


MAKE LIFTALL.LFT (DONE 11/26/02)

    cd ~/rn1
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft


STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 11/26/02)

     # Load chromosome sequence info into database and save size info.
     ssh hgwdev
     hgsql rn1  < ~/src/hg/lib/chromInfo.sql
     cd ~/rn1
     hgNibSeq -preMadeNib rn1 /cluster/store4/rn1/nib \
       ?{,?}/chr?{,?}{,_random}.fa
     echo "select chrom,size from chromInfo" | hgsql rn1 > chrom.sizes
     # Make /gbdb/rn1/nib directory
     mkdir -p /gbdb/rn1/nib
     cd /gbdb/rn1/nib
     foreach f ( /cluster/store4/rn1/nib/chr*.nib )
       ln -s $f
     end


MAKE GCPERCENT (DONE 11/26/02)
     ssh hgwdev
     cd /cluster/store4/rn1/bed
     mkdir gcPercent
     cd gcPercent
     hgsql rn1  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent rn1 ../../nib


MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR RN1 (DONE 11/26/02)
    # Enter rn1 into hgcentraltest.dbDb so test browser knows about it:
    ssh genome-testdb
    mysql -u root -pbigSecret hgcentral
      insert into dbDb values("rn1", "Rat Nov. 2002",
        "/cluster/store4/rn1/nib", "Rat", "chr1:15647901-18981566", 1,
        10, "Rat");
      quit
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/hgTrackDb
    cvs up -d -P
    # Edit that makefile to add rn1 in all the right places and do
    make
    make update
    cvs commit makefile


MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR RN1 (DONE 12/4/02)
    ssh hgwdev
    mysql -h genome-testdb -u hgcat -pBIGSECRET -A hgcentraltest
      insert into blatServers values("rn1", "blat7", "17778", "1");
      insert into blatServers values("rn1", "blat7", "17779", "0");
      quit


COPY GENBANK MRNA DATA TO THE CLUSTER (DONE for mrna.132)
   ssh kkstore
   # Copy the rna data to the cluster if it isn't there already:
   mkdir /scratch/hg/mrna.132
   mkdir /scratch/hg/mrna.132/Rattus_norvegicus
   cp /cluster/store2/mrna.132/org/Rattus_norvegicus/*.fa /scratch/hg/mrna.132/Rattus_norvegicus
   # Distribute this data to the local nodes: ask sysadmins, or run:
   sudo /cluster/install/utilities/updateLocal


GOLD AND GAP TRACKS (DONE 11/26/02)
    ssh hgwdev
    cd ~/rn1
    hgGoldGapGl -noGl rn1 /cluster/store4/rn1 .


CREATE RELATIONAL MRNA DATABASE (DONE 11/27/02)
    ssh hgwdev

cd /cluster/store2/mrna.132

Make the anyRna.fil file using this text:

restrict mol=mRNA
hide mol

# Make the RNAs
        gunzip -c /cluster/store2/genbank.132/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa -byOrganism=org anyRna.fil mrna.fa mrna.ra mrna.ta stdin

# Make the ESTs
        gunzip -c /cluster/store2/genbank.132/gbest*.gz | \
        gbToFaRa anyRna.fil est.fa est.ra est.ta stdin -byOrganism=org


    # Make symbolic links from hgwdev's /gbdb to the real .fa locations:
    mkdir -p /gbdb/rn1/mrna.132

    cd /gbdb/rn1/mrna.132
    ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/mrna.fa
    ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/mrna.ra
    ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/est.fa
    ln -s /cluster/store2/mrna.132/org/Rattus_norvegicus/est.ra
    ln -s /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.fa
    ln -s /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.ra

    # Call hgLoadRna with the /gbdb locations:
    cd /cluster/store2/temp
    hgLoadRna new rn1
    hgLoadRna add -type=mRNA rn1 /gbdb/rn1/mrna.132/mrna.fa /gbdb/rn1/mrna.132/mrna.ra
    hgLoadRna add -type=EST rn1 /gbdb/rn1/mrna.132/est.fa /gbdb/rn1/mrna.132/est.ra

MAKING AND STORING mRNA AND EST ALIGNMENTS  (DONE 11/28/02)


    # Use BLAT to generate refSeq, mRNA and EST alignments as so:
    #  Make sure that /scratch/hg/rn1/contigs is loaded
    #  with chr*_*.fa and pushed to the cluster nodes.  
    ssh kk
    cd ~/rn1/bed
    foreach i (refSeq mrna est)
      mkdir -p $i
      cd $i
      ls -1S /scratch/hg/rn1/contigs.1126/* > genome.lst
      ls -1 /scratch/hg/mrna.132/Rattus_norvegicus/$i.fa > mrna.lst
      cp ~/mm2/bed/$i/gsub .
      mkdir psl
      gensub2 genome.lst mrna.lst gsub spec
      cd ..
    end 

    # check on progress with para check in mrna, est, and refSeq directories.
    # report on runtime when finished:
      para create spec
      para try
      para push
    para time > time
      
    # Process refSeq, mRNA, and EST alignments into near best in genome.
    ssh eieio
    cd ~/rn1/bed
    cd refSeq
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl \
      contig.psl /dev/null
    liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_refSeq.psl
    cd ..

    cd mrna
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_mrna.psl
    cd ..

    cd est
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_est.psl
    cd ..

    # Load mRNA alignments into database.
    ssh hgwdev
    cd /cluster/store4/rn1/bed/mrna/chrom
    foreach i (chr?{,?}{,_random}.psl)
        mv $i $i:r_mrna.psl
    end
    hgLoadPsl rn1 *.psl
    cd ..
    hgLoadPsl rn1 all_mrna.psl -nobin

    # Load EST alignments into database.
    ssh hgwdev
    cd /cluster/store4/rn1/bed/est/chrom
    foreach i (chr?{,?}{,_random}.psl)
        mv $i $i:r_est.psl
    end
    hgLoadPsl rn1 *.psl
    cd ..
    hgLoadPsl rn1 all_est.psl -nobin

    # Create subset of ESTs with introns and load into database.
    ssh eieio
    cd ~/rn1
    tcsh jkStuff/makeIntronEst.sh
    ssh hgwdev
    cd ~/rn1/bed/est/intronEst
    hgLoadPsl rn1 *.psl

    # Load refSeq alignments into database
    ssh hgwdev
    cd ~/rn1/bed/refSeq
    pslCat -dir chrom > refSeqAli.psl
    hgLoadPsl rn1 -tNameIx refSeqAli.psl


PRODUCING ESTORIENTINFO TABLE (DONE 12/2/02 - MATT)

This table is needed for proper orientation of ESTs in the
browser.  Many will appear on the wrong strand without it.
This involves a cluster run.  First load the EST psl files
as so:
     ssh eieio
     cd ~/rn1/bed/est
     pslSortAcc nohead contigs /cluster/store2/temp contig.psl
     ssh kkstore
     mkdir /scratch/hg/rn1/est
     cd ~/rn1/bed/est
     cp -r contigs /scratch/hg/rn1/est

Wait for these to finish.
     cd ..
     mkdir estOrientInfo
     cd estOrientInfo
     mkdir ei
     ls -1S /scratch/hg/rn1/est/contigs/* > psl.lst
     cp ~/mm2/bed/estOrientInfo/gsub .
Update gsub to refer to rat contig sequence currently on
/scratch/hg/rn1/contigs.1126, and rat ESTs on /scratch/hg/rn1/est/contigs
and the rat est in ./scratch/hg/mrna.132/Rattus_norvegicus/est/fa.
     gensub2 psl.lst single gsub spec
     ssh kk
     para create spec
Then run the  job on the cluster
     cd ~/rn1/bed/estOrientInfo
     para try
     sleep 60
     para check
If things look good
     para push
Wait for this to finish then
     liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab
Load them into database as so:
     ssh hgwdev
     cd ~/rn1/bed/estOrientInfo
     hgLoadBed rn1 estOrientInfo estOrientInfo.bed -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql
     
CREATE RNACLUSTER TABLE (DONE 12/2/02 - MATT)
 Make sure that refSeqAli and estOrientInfo tables are made already
 (see above).

   ssh hgwdev
   cd ~/rn1/bed
   mkdir rnaCluster
   cd rnaCluster
   mkdir rna est
   foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
       clusterRna rn1 rna/chr$i.bed est/chr$i.bed -chrom=chr$i
       echo done $i
   end
   hgLoadBed rn1 rnaCluster est/*.bed

PRODUCING KNOWN GENES (DONE 11/26/02 - MATT)

o - Download everything from ftp://ftp.ncbi.nih.gov/refseq/cumulative
    into /cluster/store1/refSeqCum/112602
        cd /cluster/store1/refSeqCum/112602
        uncompress rscu.gbff.Z
        Put the following lines into anyRna.fil
        restrict mol=mRNA
        hide mol        
        gbToFaRa anyRna.fil refSeq.fa refSeq.ra refSeq.ta -byOrganism=org
        This will create fasta and annotations for the rat in refSeq/org
        in refSeq.fa and refSeq.ra under the org/Rattus_norvegicus directory
o - Get extra info from NCBI and produce refGene table as so:
       wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref 
       wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc

o -   hgLoadRna add -type=refSeq rn1 /gbdb/rn1/mrna.132/refSeq.fa /gbdb/rn1/mrna.132/refSeq.ra

o - Produce refGene, refPep, refMrna, and refLink tables as so:
Get the rat proteins:
        wget ftp://ftp.ncbi.nih.gov/refseq/R_norvegicus/mRNA_Prot/rat.faa.gz
        gunzip rat.faa.gz
        cd ~/rn1/bed/refSeq
       hgRefSeqMrna rn1 /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.fa \
        /cluster/store1/refSeqCum/112602/org/Rattus_norvegicus/refSeq.ra \
        all_refSeq.psl /cluster/store1/refSeqCum/112602/loc2ref \
        /cluster/store1/refSeqCum/112602/rat.faa \
        /cluster/store1/refSeqCum/112602/mim2loc
o - Add RefSeq status info 
    hgRefSeqStatus rn1 /cluster/store1/refSeqCum/112602/loc2refE

o - create precomputed join of refFlat and refGene:
      echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql rn1


SIMPLE REPEAT TRACK (DONE* 11/26/02)
 * 18 of the 590 contigs were re-run with TRF version 2 (trf.2) due to 
   crash/hang problems with TRF v3.01 (trf).  

    ssh kk
    mkdir ~/rn1/bed/simpleRepeat
    cd ~/rn1/bed/simpleRepeat
    cp ~/lastRn/bed/simpleRepeat/gsub .
    mkdir trf
    ls -1 /scratch/hg/rn1/contigs.1126/*.fa > genome.lst
    touch single
    gensub2 genome.lst single gsub spec
    para create spec
    para try
    para push 
    # When job is done do:
    liftUp simpleRepeat.bed ~/rn1/jkStuff/liftAll.lft warn trf/*.bed

    # Load this into the database as so
    ssh hgwdev
    cd ~/rn1/bed/simpleRepeat
    hgLoadBed rn1 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


PREPARING SEQUENCE FOR CROSS SPECIES ALIGNMENTS (DONE 11/27/02)

    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd ~/rn1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd ~/rn1
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
        $c/lift/ordered.lst > $c/lift/oTrf.lst
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
      endif
      liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
        jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      if (-e $c/lift/rTrf.lst) then
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end
    # Make a trfMasked version of the contigs:
    cd ~/rn1
    mkdir trfFa
    foreach f (?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa)
      set c = $f:t:r
      maskOutFa -softAdd $f bed/simpleRepeat/trfMask/$c.bed trfFa/$c.fa.trf
    end
    # Then make sure there is enough space available on /scratch and do
    ssh kkstore
    cp -Rp ~/rn1/trfFa /scratch/hg/rn1/trfFa.1127
    # ask for binrsync, or sudo /cluster/install/utilities/updateLocal


MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/27/02)

    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PREPARING SEQUENCE" step above.
    ssh eieio
    cd ~/rn1
    #- Soft-mask (lower-case) the contig and chr .fa's
    ./jkStuff/makeFaMasked.sh
    #- Make hard-masked .fa.masked files as well:
    ./jkStuff/makeHardMasked.sh
    #- Rebuild the nib, mixedNib, maskedNib files:
    ./jkStuff/makeNib.sh
    #- Rebuild the .zip files
    ./jkStuff/zipAll.sh |& tee zipAll.log
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd ~/rn1
    ./jkStuff/cpToWeb.sh
    cd /usr/local/apache/htdocs/goldenPath/rnNov2002
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's


PRODUCING GENSCAN PREDICTIONS (DONE 12/3/02)
    
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 -kkr8u00. (genscan has problem running on the
    # big cluster, due to limitation of memory and swap space on each
    # processing node).
    ssh kkr1u00
    mkdir -p ~/rn1/bed/genscan
    cd ~/rn1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store4/rn1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    # Create template file, gsub, for gensub2.  For example (3-line file):
#LOOP
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=2400000" instead of "-window=1200000".
    
    # Convert these to chromosome level files as so:
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd ~/rn1/bed/genscan
    ldHgGene rn1 genscan genscan.gtf
    hgPepPred rn1 generic genscanPep genscan.pep
    hgLoadBed rn1 genscanSubopt genscanSubopt.bed


PREPARE CLUSTER FOR BLASTZ RUN (DONE 12/3/02)

    ssh eieio
    # Extract lineage-specific repeats using Arian Smit's script:
    mkdir -p ~/rn1/bed/linSpecRep
    cd ~/rn1/bed/linSpecRep
    foreach f (~/rn1/*/*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/rodentSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    # Copy files to the kkstore:/scratch
    ssh kkstore
    # lineage-specific repeats:
    cd ~/rn1/bed
    rm -rf /scratch/hg/rn1/linSpecRep
    cp -Rp linSpecRep /scratch/hg/rn1
    # RepeatMasker .out:
    cd ~/rn1
    rm -rf /scratch/hg/rn1/rmsk
    mkdir -p /scratch/hg/rn1/rmsk
    cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/rn1/rmsk
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /scratch/hg/rn1/chromTrfMixedNib
    mkdir -p /scratch/hg/rn1/chromTrfMixedNib
    cp -p mixedNib/chr*.nib /scratch/hg/rn1/chromTrfMixedNib
    # Ask cluster-admin@soe.ucsc.edu to binrsync /scratch/hg to clusters


MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE AXT FILES (TODO)
    # When Scott Schwartz is done generating .axt's for the blastz 
    # alignments...

    # Create tSizes (human chrom size) and qSizes (rat) for axtToPsl.
      ssh hgwdev
      cd /cluster/store1/gs.13/build30/blastz.rn1gs13.2002-08-30
      In mysql:
         use rn1
         select chrom,size from chromInfo;
         use hg12
         select chrom,size from chromInfo;
      Edit the results of the first select into a tab-separated tSizes,
      edit the results of the second select into a tab-separated qSizes.

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store1/gs.13/build30/blastz.rn1gs13.2002-08-30"
    set tbl="blastzBestHuman_08_30"
    cd $base
    mkdir -p axtBest pslBest
    foreach f (run.2/chr*.axt.gz)
      set chr=$f:t:r:r
      gunzip $f
      axtBest -minScore=300 $f:r $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt tSizes qSizes pslBest/${chr}_${tbl}.psl
      gzip $f:r
      echo Done with $chr.
    end
    # If axtBest runs out of memory, split those chromosome files into 
    # smaller pieces and run axtBest in 2 passes.  
    foreach f (run.2/chr{7,Un}.axt.gz)
      set chr=$f:t:r:r
      rm -f xx*
      gunzip -c $f \
        | perl -we '$/="";  $seq=0; \
                    while (<>) { \
                      if (($. % 1000) == 1) { \
                        close(OUT) if ($. > 1000); \
                        $seq++; \
                        open(OUT, ">xx$seq"); \
                      } \
                      print OUT; \
                    } \
                    close(OUT);'
      foreach s (xx*)
        axtBest -minScore=300 $s $chr $s.axtBest
      end
      cat xx?*.axtBest > xx.axtBest
      axtBest -minScore=300 xx.axtBest $chr axtBest/$chr.axt
      rm xx*
      axtToPsl axtBest/$chr.axt tSizes qSizes \
        pslBest/${chr}_${tbl}.psl
      echo Done with $chr.
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store1/gs.13/build30/blastz.rn1gs13.2002-08-30"
     cd $base/pslBest
     hgLoadPsl rn1 *.psl
     rm -f ~/kent/src/hg/lib/axtInfoFillRn1.sql
     touch ~/kent/src/hg/lib/axtInfoFillRn1.sql
     foreach f ($base/axtBest/chr*.axt)
       set chr=$f:t:r
       echo 'insert into axtInfo values ("hg12","Blastz Best in Genome","'$chr'","'$f'");' \
         >> ~/kent/src/hg/lib/axtInfoFillRn1.sql
     end
     hgsql rn1 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql rn1 < ~/kent/src/hg/lib/axtInfoFillRn1.sql
     cvs add ~/kent/src/hg/lib/axtInfoFillRn1.sql
     cvs ci ~/kent/src/hg/lib/axtInfoFillRn1.sql

TWINSCAN GENE PREDICTIONS (DONE 01/27/03)

    mkdir -p ~/rn1/bed/twinscan
    cd ~/rn1/bed/twinscan
    wget http://genome.cse.wustl.edu/~bio/rat/Nov02/1-22-03/rat_Nov02_1-22-03.tgz
    gunzip -c *.tgz | tar xvf -
    rm -r chr_tx
    # clean up chrom field of GTF files
    foreach f (chr_gtf/chr*.gtf)
      set chr = $f:t:r
      sed -e "s/^[0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf
    end
    # pare down protein FASTA header to id and add missing .a:
    foreach f (chr_ptx/chr*.ptx)
      set chr = $f:t:r
      perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \
        chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa
    end
    ldHgGene rn1 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
    hgPepPred rn1 generic twinscanPep chr_ptx/chr*-fixed.fa

NCBI GENE MODELS (TODO)

    mkdir -p ~/rn1/bed/ncbiGenes
    cd ~/rn1/bed/ncbiGenes
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_geneos.gtf.gz
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/protein.fa.gz
    gunzip chr_genes.gtf.gz
    gunzip protein.fa.gz
    - Process the .gtf and .fa together to join IDs
    ../../jkStuff/mungeNCBIids chr_genes.gtf protein.fa |& uniq
    ldHgGene rn1 ncbiGenes chr_genes-fixed.gtf
    hgPepPred rn1 generic ncbiPep protein-fixed.fa

NCBI GENOMESCAN MODELS (TODO)

    mkdir -p ~/rn1/bed/genomeScan
    cd ~/rn1/bed/genomeScan
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_GenomeScan.gtf.gz
    # Remove the ".1" at the end of transcript_id's:
    gunzip -c chr_GenomeScan.gtf.gz | \
      perl -wpe 's/transcript_id "([^\"]+)\.1"/transcript_id "$1"/' > \
      chr_GenomeScan-fixed.gtf
    ldHgGene rn1 genomeScan chr_GenomeScan-fixed.gtf
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/GS_prot.fsa.gz
    hgPepPred rn1 generic genomeScanPep GS_prot.fsa

DOING HUMAN/RAT ALIGMENTS (todo)

o - Download the lower-case-masked assembly and put it in
    eieio:/cluster/store1/a2ms.
   
o - Download the assembled rat genome in lower-case
    masked form to /cluster/store1/arachne.3/whole.  
    Execute the script splitAndCopy.csh to chop it
    into roughly 50M pieces in arachne.3/parts
o - Set up the para job to do the alignment as so:
       ssh eieio
       cd /cluster/store4/rn1
       mkdir blatRat.phusion
       cd blatRat.phusion
       ls -1S /scratch/hg/gs.13/build30/contigTrf/* > human.lst
       ls -1 /scratch/hg/rn1/contigs.1126/* > rat.lst
    Make a file 'gsub' with the following three lines in it
#LOOP
/cluster/home/kent/bin/i386/blat -q=dnax -t=dnax {check in line+ $(path2)} {check in line+ $(path1)}  {check out line+ psl/$(root2)_$(root1).psl} -minScore=20 -minIdentity=20 -tileSize=4 -minMatch=2 -oneOff=0 -ooc={check in exists /scratch/hg/h/4.pooc} -qMask=lower -mask=lower
#ENDLOOP
    Process this into a para file and launch the first set
    of jobs (10,000 out of 70,000) as so:
        gensub2 rat.lst human.lst gsub spec
        para create spec
        para push
    Do a 'para check' after a few minutes and make sure everything is right.
        para time > time

o - Sort alignments as so 
       ssh eieio
       cd /cluster/store4/rn1/blatRat
       pslCat -dir -check psl | liftUp -type=.psl stdout ../liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin
o - Get rid of big pile-ups due to contamination as so:
       cd chrom
       foreach i (*.psl)
           echo $i
           mv $i xxx
           pslUnpile -maxPile=600 xxx $i
       rm xxx
       end
o - Remove long redundant bits from read names by making a file
    called subs.in with the following line:
        gnl|ti^ti
        contig_^tig_
    and running the commands
        cd ~/rat/vsOo33/blatRat.phusion/chrom
        subs -e -c ^ *.psl > /dev/null
o - Copy over to network where database is:
        ssh kks00
        cd ~/rn1/bed
        mkdir blatRat
        mkdir blatRat/ph.chrom600
        cd !$
        cp /cluster/store4/rn1/blatRat.phusion/chrom/*.psl .
o - Rename to correspond with tables as so and load into database:
       ssh hgwdev
       cd ~/rn1/bed/blatRat/ph.chrom600
       foreach i (chr?{,?}{,_random}.psl)
           set r = $i:r
           mv $i ${r}_blatRat.psl
       end
       hgLoadPsl rn1 *.psl
o - load sequence into database as so:
        ssh kks00
        faSplit about /projects/hg3/rat/arachne.3/whole/Unplaced.mfa 1200000000 /projects/hg3/rat/arachne.3/whole/unplaced
        ssh hgwdev
        hgLoadRna addSeq '-abbr=gnl|' rn1 /projects/hg3/rat/arachne.3/whole/unpla*.fa
        hgLoadRna addSeq '-abbr=con' rn1 /projects/hg3/rat/arachne.3/whole/SET*.mfa
    This will take quite some time.  Perhaps an hour .

o - Produce 'best in genome' filtered version:
        ssh kks00
        cd ~/rat/vsOo33
        pslSort dirs blatRatAll.psl temp blatRat
        pslReps blatRatAll.psl bestRatAll.psl /dev/null -singleHit -minCover=0.3 -minIdentity=0.1
        pslSortAcc nohead bestRat temp bestRatAll.psl
        cd bestRat
        foreach i (chr?{,?}{,_random}.psl)
           set r = $i:r
           mv $i ${r}_bestRat.psl
        end
o - Load best in genome into database as so:
        ssh hgwdev
        cd ~/rat/vsOo33/bestRat
        hgLoadPsl rn1 *.psl

PRODUCING CROSS_SPECIES mRNA ALIGMENTS (DONE 12/03/02 - MATT)

Here you align vertebrate mRNAs against the masked genome on the
cluster you set up during the previous step.

Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from Genbank into
/cluster/store2/genbank.132 and unpacked by organism into /cluster/store2/mrna.132/org. 

Set up cluster run more or less as so:
      ssh kk
      cd ~/rn1/bed
      mkdir xenoMrna
      cd xenoMrna
      mkdir psl
      ls -1S /scratch/hg/rn1/contigs.1126/* > genome.lst
      # If it hasn't been done already do:
      # cp -r /cluster/store2/mrna.132/orgFa /scratch/hg/mrna.132/org
      find /scratch/hg/mrna.132/org/ -name mrna.fa | xargs ls -lS | awk {'print $5 " " $9'} \
        | sort -nr | awk {'print $2'} > xenoRna.lst
This produces a size-sorted file of all organisms mrna.
Then edit xenoRna.lst removing the Rattus_norvegicus line, and writing
the first line into 1.org, the second line into 2.org,  and
so forth.  After the 6th line just leave the rest in 7.org.
Then
      ls -1 *.org > ratXenoRna.lst
      cp ~/rn1/bed/xenoMrna/gsub .
      gensub2 genome.lst ratZenoRna.lst gsub spec
      para create spec
      para try
      para check
If all looks well do
      para push.

Sort xeno mRNA alignments as so:
       ssh eieio
       cd ~/rn1/bed/xenoMrna
       pslSort dirs raw.psl /cluster/store2/temp psl
       pslReps raw.psl cooked.psl /dev/null -minAli=0.25
       liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
       pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
       pslCat -dir chrom > xenoMrna.psl
       rm -r chrom raw.psl cooked.psl chrom.psl

Load into database as so:
       ssh hgwdev
       cd ~/rn1/bed/xenoMrna
       hgLoadPsl rn1 xenoMrna.psl -tNameIx

Make the xenoRna file
        cd /cluster/store2/mrna.132
        gunzip -c /cluster/store2/genbank.132/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa ratXenoRna.fil ratXenoRna.fa ratXenoRna.ra ratXenoRna.ta stdin

       Make a /gbdb symlink
       cd /gbdb/rn1/mrna.132
       ln -s /cluster/store2/mrna.132/ratXenoRna.fa ratXenoRna.fa
       ln -s /cluster/store2/mrna.132/ratXenoRna.ra ratXenoRna.ra
       hgLoadRna add -type=xenoRna rn1 /gbdb/rn1/mrna.132/ratXenoRna.fa  /gbdb/rn1/mrna.132/ratXenoRna.ra


PRODUCING TETRAODON FISH ALIGNMENTS (TODO)

o - Download sequence from ... and put it on the cluster local disk
    at
       /scratch/hg/fish
o - Do fish/rat alignments.
       ssh kk
       cd ~/rn1/bed
       mkdir blatFish
       cd blatFish
       mkdir psl
       ls -1S /scratch/hg/fish/* > fish.lst
       ls -1S /scratch/hg/rn1/trfFa/* > rat.lst
       cp ~/lastRn/blatFish/gsub .
       gensub2 rat.lst fish.lst gsub spec
       para create spec
       para try
     Make sure jobs are going ok with para check.  Then
       para push
     wait about 2 hours and do another
       para push
     do para checks and if necessary para pushes until done
     or use para shove.
o - Sort alignments as so 
       pslCat -dir psl | liftUp -type=.psl stdout ~/rn1/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin
o - Copy to hgwdev:/scratch.  Rename to correspond with tables as so and 
    load into database:
       ssh hgwdev
       cd ~/rn1/bed/blatFish/chrom
       foreach i (chr?{,?}{,_random}.psl)
           set r = $i:r
           mv $i ${r}_blatFish.psl
       end
       hgLoadPsl rn1 *.psl
       hgLoadRna addSeq rn1 /cluster/store2/fish/seq15jun2001/*.fa


PRODUCING FUGU FISH ALIGNMENTS (DONE 11/27/02)

    # (Already done, for mm2:)
    # Download sequence to /cluster/store3/fuguSeq from ... and put it on the 
    # cluster local disk at /scratch/hg/fugu on kkstore.
    # Sequence was downloaded from:
    # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z
    # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z
    #
    # faSplit sequence ../fugu_v3_mask.fasta 1000 fuguSplit

    ssh kk
    cd ~/rn1/bed
    mkdir blatFugu
    cd blatFugu
    ls -1S /scratch/hg/fugu/* > fugu.lst
    ls -1S /scratch/hg/rn1/trfFa.1127/* > rat.lst
    cp ~/lastRn/bed/blatFugu/gsub .
    mkdir psl
    foreach f (~/rn1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa)
      set c=$f:t
      mkdir psl/$c
    end
    gensub2 rat.lst fugu.lst gsub spec
    para create spec
    para try
    para check
    para push
    # do another para check in about 2 hours; if it looks good:
    para push -maxPush=500000
    # do para checks and if necessary para pushes until done or use para shove.

    # Sort alignments:
    ssh eieio
    cd ~/rn1/bed/blatFugu
    pslCat -dir psl/* \
      | liftUp -type=.psl stdout ~/rn1/jkStuff/liftAll.lft warn stdin \
      | pslSortAcc nohead chrom /cluster/store2/temp stdin

    # load into database:
    ssh hgwdev
    cd ~/rn1/bed/blatFugu/chrom
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatFugu.psl
    end
    hgLoadPsl rn1 *.psl
    mkdir -p /gbdb/rn1/fuguSeq
    cd /gbdb/rn1/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
    cd /cluster/store2/temp
    hgLoadRna addSeq rn1 /gbdb/rn1/fuguSeq/fugu_v3_mask.fasta


MAKE LIFT FILE FOR AGPS (DONE 12/02/02)
    ssh eieio
    cd ~/rn1/jkStuff
    ./jkStuff/agpToLift.pl chrom.sizes ?{,?}/chr?{,?}{,.random}.agp \
      > jkStuff/liftRNOR.lft


CONTIG POSITIONS (DONE 12/02/02)
    # hgCtgPos creates a table of our macro-contigs (chr*_*), which are 
    # not the contigs that people will be searching for in hgFind (RNOR*).
    # So create ctgPos with RNOR coords from jkStuff/liftRNOR.lft.
    ssh hgwdev
    mkdir -p ~/rn1/bed/ctgPos
    cd ~/rn1/bed/ctgPos
    perl -wpe 'chop;  @words = split(/\s/); \
               $start = $words[0]; \
               $ctg = $words[1]; \
               $size = $words[2]; \
               $chr = $words[3]; \
               $end = $start + $size; \
               $_ = join("\t", $ctg, $size, $chr, $start, $end) . "\t\n"; \
              ' ../../jkStuff/liftRNOR.lft \
      > ctgPos.tab
    # Drop just in case it's already there (this might give error if not):
    echo 'drop table ctgPos' | hgsql rn1
    hgsql rn1 < $HOME/kent/src/hg/lib/ctgPos.sql
    echo 'LOAD data local infile "ctgPos.tab" into table ctgPos' | hgsql rn1


LOAD BACTIG POSITIONS (DONE 12/13/02)

    ssh hgwdev
    mkdir -p ~/rn1/bed/bactigPos
    cd ~/rn1/bed/bactigPos
    # Paul Havlak havlak@swan.hgsc.bcm.tmc.edu sent us a BED 4+ email 
    # attachment.  
    # Save the attachment as ~/rn1/bed/bactigPos/bactigPos.orig.bed
    # Fix the 1-based starts to 0-based:
    awk "-F\t" '{printf "%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6;}' < bactigPos.orig.bed > bactigPos.bed
    hgLoadBed rn1 bactigPos bactigPos.bed \
      -noBin -sqlTable=$HOME/kent/src/hg/lib/bactigPos.sql


LOAD CPGISSLANDS (DONE 12/5/02)
    ssh eieio
    mkdir -p ~/rn1/bed/cpgIsland
    cd ~/rn1/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    tar xvf cpg_dist.tar 
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ..
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpg_dist/cpglh.exe $f > $fout.cpg
    end
    # copy filter.awk from a previous release
    cp ~/lastRn/bed/cpgIsland/filter.awk .
    awk -f filter.awk chr*.cpg > cpgIsland.bed
    # load into database:
    ssh hgwdev
    cd ~/rn1/bed/cpgIsland
    hgLoadBed rn1 cpgIsland -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed


LOAD ENSEMBL PHUSIONBLAST RAT-MOUSE HOMOLOGY (DONE 12/03/02)
    ssh eieio
    mkdir -p ~/rn1/bed/ensRatMusHom
    cd ~/rn1/bed/ensRatMusHom
    wget ftp://ftp.ensembl.org/pub/repository/rat/PBratMouse.gz
    gunzip -c PBratMouse.gz | tail +2 \
      | perl -wpe 'chop; @words = split(/\s/); \
                   if ($words[1] =~ /chr\S+\.(RNOR\d+):(\d+)-(\d+)/) { \
                     $rCtg = $1;  $rStart = $2;  $rEnd = $3; \
                   } else { die "Bad format for second word: $words[1]"; } \
                   $mChr   = "mm2.chr" . $words[4]; \
                   $mStart = $words[5]; \
                   $mEnd   = $words[6]; \
                   $strand = $words[7]; \
                   $score  = $words[9]; \
                   $_ = join("\t", $rCtg, $rStart-1, $rEnd, $mChr, $score, \
                             $strand, $mStart-1, $mEnd) . "\n"; \
                   ' \
      | liftUp ensRatMusHom.bed ../../jkStuff/liftRNOR.lft warn stdin
    sed -e 's/ensPhusionBlast/ensRatMusHom/g' \
      $HOME/kent/src/hg/lib/ensPhusionBlast.sql \
      > ensRatMusHom.sql
    ssh hgwdev
    cd ~/rn1/bed/ensRatMusHom
    hgLoadBed rn1 ensRatMusHom ensRatMusHom.bed -sqlTable=ensRatMusHom.sql


LOAD GENEID GENES (DONE 01/28/03)
    mkdir -p ~/rn1/bed/geneid/download
    cd ~/rn1/bed/geneid/download
    foreach f (~/rn1/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/geneid_v1.1/$chr.gtf
      wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/geneid_v1.1/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene rn1 geneid download/*.gtf -exon=CDS
    hgPepPred rn1 generic geneidPep download/*-fixed.prot

SGP GENE PREDICTIONS (DONE 02/10/03)
    mkdir -p ~/rn1/bed/sgp/download
    cd ~/rn1/bed/sgp/download
    foreach f (~/rn1/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/SGP/humangp20020628/$chr.gtf
      wget http://genome.imim.es/genepredictions/R.norvegicus/rnNov2002/SGP/humangp20020628/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene rn1 sgpGene download/*.gtf -exon=CDS
    hgPepPred rn1 generic sgpPep download/*-fixed.prot


TIGR GENE INDEX (DONE 2/26/03)
  o mkdir -p ~/rn1/bed/tigr    
    cd ~/rn1/bed/tigr  
    wget ftp://ftp.tigr.org/private/NHGI_rgi_dashu/TGI_track_RatGenome_Feb2003.tgz 
    tar xvzf TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    foreach o (mouse cow human pig rat)
      setenv O $o
      foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC rn1 tigrGeneIndex *.gff


LOAD STS MAP (todo)
     - login to hgwdev
      cd ~/rn1/bed
      rn1 < ~/src/hg/lib/stsMap.sql
      mkdir stsMap
      cd stsMap
      bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed
      - Enter database with "rn1" command.
      - At mysql> prompt type in:
          load data local infile 'stsMap.bed' into table stsMap;
      - At mysql> prompt type

LOAD MGI IDs (TODO)
      - The Locuslink ID to MGI IDs converstion data file,
        LL2MGI.txt, from Jackson Lab should be found under
        ~/rn1/bed/refSeq
      - login to hgwdev
      
      cd ~/rn1/bed/refSeq
      rn1 < ~/src/hg/lib/mgiID.sql
      - Enter database with "rn1" command.
      - At mysql> prompt type in:
          load data local infile 'LL2MGI.txt' into table MGIid;
      - At mysql> prompt type
          quit

LOAD CHROMOSOME BANDS (todo)
      - login to hgwdev
      cd /cluster/store4/rn1/bed
      mkdir cytoBands
      cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands
      rn1 < ~/src/hg/lib/cytoBand.sql
      Enter database with "rn1" command.
      - At mysql> prompt type in:
          load data local infile 'cytobands.bed' into table cytoBand;
      - At mysql> prompt type
          quit

LOAD RATREF TRACK (todo)
    First copy in data from kkstore to ~/rn1/bed/ratRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb rn1 ratRef *.alignments

LOAD AVID RAT TRACK (todo)
      ssh cc98
      cd ~/rn1/bed
      mkdir avidRat
      cd avidRat
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      hgLoadBed avidRepeat avidRepeat.bed
      hgLoadBed avidUnique avidUnique.bed

LOAD SNPS (TODO)
      - ssh hgwdev
      - cd ~/rn1/bed
      - mkdir snp
      - cd snp
      - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/rat.b27.out.gz
      - Unpack.
        createBed < rat.b27.out > snpNih.bed
        hgLoadBed rn1 snpNih snpNih.bed

LOAD ENSEMBL ESTs (TODO)
     ln -s /cluster/store4/rn1 ~/rn1
     mkdir -p ~/rn1/bed/ensembl
     cd ~/rn1/bed/ensembl
     wget http://www.ebi.ac.uk/~stabenau/rat-est.gz
     wget http://www.ebi.ac.uk/~stabenau/rat-est.pep.gz
     gunzip -c rat-est.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > rat-est-fixed.gtf
     ldHgGene rn1 ensEst rat-est-fixed.gtf
> The id behind '>' is internal and was not in our gtf dump, so
> you have to do some more parsing.
     # pick out the transcript= attribute -- that's the id to use:
     # also remove the first line:
     gunzip -c rat-est.pep.gz | tail +2 | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       rat-est-fixed.pep
     hgPepPred rn1 generic ensEstPep rat-est-fixed.pep

LOAD ENSEMBLE GENES (TODO)
     mkdir -p ~/rn1/bed/ensembl
     cd ~/rn1/bed/ensembl
     wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.gz
     wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.pep.gz
     gunzip -c rat-ensembl.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > rat-ensembl-fixed.gtf
     ldHgGene rn1 ensGene rat-ensembl-fixed.gtf
> rat-ensembl contains stopcodons, due to some glitches in our
> genebuild. The id behind '>' is internal and was not in our gtf dump, so
> you have to do some more parsing.
# pick out the transcript= attribute -- that's the id to use:
# also remove the first line:
     tail +2 rat-ensembl.pep | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       rat-ensembl-fixed.pep
     hgPepPred rn1 generic ensPep rat-ensembl-fixed.pep

LOAD RNAGENES (todo)
      - login to hgwdev
      - cd ~kent/src/hg/lib
      - rn1 < rnaGene.sql
      - cd /cluster/store4/rn1/bed
      - mkdir rnaGene
      - cd rnaGene
      - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz
      - gunzip *.gz
      - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff
      - hgRnaGenes rn1 chrom.gff

LOAD EXOFISH (todo)
     - login to hgwdev
     - cd /cluster/store4/rn1/bed
     - mkdir exoFish
     - cd exoFish
     - rn1 < ~kent/src/hg/lib/exoFish.sql
     - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store4/rn1/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed rn1 exoFish exoFish.bed

LOAD GENIE (TODO)
     mkdir -p ~/rn1/bed/genieAlt
     cd ~/rn1/bed/genieAlt
     wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz
     gunzip -c mgscv3.genie.gtf.tgz | tar xvf -
     ldHgGene rn1 genieAlt mgscv3.genie.gtf/chr*.gtf
     wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz
     gunzip -c mgscv3.aa.tgz | tar xvf -
     hgPepPred rn1 genie geniePep chr*.aa.fa

LOAD GENIE CLONE BOUNDS (TODO)
     mkdir -p ~/rn1/bed/genieBounds
     cd ~/rn1/bed/genieBounds
     wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz
     gunzip -c mgscv3_cb.bed.tgz | tar xvf -
     - Trim the track definition from each file (these are actually custom 
       track files):
     foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
       tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed
     end
     hgLoadBed rn1 genieBounds *-fixed.bed

LOAD JACKSON LABS QTL (TODO)
     Carol Bult at Jackson Labs sent us tab-separated QTL info -- 
     See RT289.  
     - Reformat QTL info into bed 6 + (really bed 5, no strand).
       There will be some complaints about lines that are missing 
       chromStart, chromEnd (they're on MIT Markers that we don't have 
       in our stsMapRat table...?)
     cd ~/rn1/bed/qtl
     ./fixit.pl mgi_qtl.rpt > mgi_qtl.bed
     hgLoadBed -noBin -tab -sqlTable=$HOME/kent/src/hg/lib/jaxQTL.sql \
       rn1 jaxQTL mgi_qtl.bed

LOAD SOFTBERRY GENES (todo)
     - ln -s /cluster/store4/rn1 ~/rn1
     - cd ~/rn1/bed
     - mkdir softberry
     - cd softberry
     - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz
     ldHgGene rn1 softberryGene chr*.gff
     hgPepPred rn1 softberry *.protein
     hgSoftberryHom rn1 *.protein

LOAD GENOMIC DUPES (todo)
o - Load genomic dupes
    ssh hgwdev
    cd ~/rn1/bed
    mkdir genomicDups
    cd genomicDups
    wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip
    unzip *.zip
    awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    hgsql rn1 < ~/src/hg/lib/genomicDups.sql
    hgLoadBed rn1 -oldTable genomicDups genomicDupes.bed

LOAD RGD CURATED GENES TRACK
    - cd rn1
    - cd bed
    - mkdir rgdGene
    - Browse to http://zephyrus.brc.mcw.edu/cgi-bin/pub/viewcvs.cgi/pub_gbrowse/gff_files/RGD_curated_genes.gff
        This is a web-based CVS page. Click the download link and save the file to ~/rn1/bed/RGD_curated_genes.gff
     - Now massage the data format using:
         rn1/bed/rgdGene/massage.pl
     - Load the data:
        ldHgGene rn1 rgdGene Fixed_RGD_Curated_genes.gff
     - Create the link table for searching
        In mysql for the rn1 database do:
        create table rgdLink (id int primary key, name varchar(32) not null);
        LOAD DATA LOCAL INFILE 'RGD.links' into table rgdLink; 
     

FAKING DATA FROM PREVIOUS VERSION
(This is just for until proper track arrives.  Rescues about
97% of data  Just an experiment, not really followed through on).

o - Rescuing STS track:
     - log onto hgwdev
     - mkdir ~/rn1/rescue
     - cd !$
     - mkdir sts
     - cd sts
     - bedDown hg3 mapGenethon sts.fa sts.tab
     - echo ~/rn1/sts.fa > fa.lst
     - pslOoJobs ~/rn1 ~/rn1/rescue/sts/fa.lst ~/rn1/rescue/sts g2g
     - log onto cc01
     - cc ~/rn1/rescue/sts
     - split all.con into 3 parts and condor_submit each part
     - wait for assembly to finish
     - cd psl
     - mkdir all
     - ln ?/*.psl ??/*.psl *.psl all
     - pslSort dirs raw.psl temp all
     - pslReps raw.psl contig.psl /dev/null
     - rm raw.psl
     - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl
     - rm contig.psl
     - mv chrom.psl ../convert.psl


LOAD SLAM GENES (hg12)
     cd /cluster/store3/gs.13/build30/bed/slam
     wget http://bio.math.berkeley.edu/slam/rat/gff/UCSC/rnCDS.gff.gz
     wget http://bio.math.berkeley.edu/slam/rat/gff/UCSC/rnCNS.gff.gz
     gunzip *
     ldHgGene -exon=CDS hg12 slam rnCDS.gff
     mv genePred.tab genePred.rn1

     sed -e 's/;//g' -e 's/"//g' rnCNS.bed > rnCNS.bed.2
     sort -n -k 5,5 rnCNS.bed.2 > rnCNS.bed.sort
     examine head and tail of sorted file for range of scores
     rm rnCNS.bed.sort
     size.pl < rnCNS.bed.2 > rnCNS.bed.2.size
     sort -n -k 2,2 rnCNS.bed.2.size > rnCNS.bed.2.size.sort
     examine head and tail of sorted file for range of sizes
     rm rnCNS.bed.2.size.sort
     expand.pl < rnCNS.bed.2 > rnCNS.bed.2.expand

SLAM (hg13)
     cd /cluster/store4/rn1/bed/slam
     mkdir hg13
     cd hg13

     wget http://baboon.math.berkeley.edu/~cdewey/slam/hs_31_Nov2002_rn_2_Nov2002/gff/rnCDS.gff.gz
     gunzip rnCDS.gff.gz
     mv rnCDS.gff ratFromHumanCDS.gff
     ldHgGene -exon=CDS rn1 slamHuman

     wget http://baboon.math.berkeley.edu/~scawley/slam/hs_31_Nov2002_rn_2_Nov2002/gff/rnCNS.bed.gz
     gunzip rnCNS.bed.gz
     mv rnCNS.bed ratFromHumanCNS.bed
     expand.pl < ratFromHumanCNS.bed > ratFromHumanCNS.bed.expand
     hgLoadBed -tab rn1 slamNonCodingHuman ratFromHumanCNS.bed.expand    

     
