
# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on the
# Chicken (Gallus gallus) May 2006 release.

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | refGene     | genePred refPep refMrna         |
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+



#########################################################################
# CREATE BUILD DIRECTORY (DONE 5/14/06 angie)
    ssh kkstore03
    mkdir /cluster/store6/galGal3
    ln -s /cluster/store6/galGal3 /cluster/data/


#########################################################################
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 5/14/06 angie)
    mkdir /cluster/data/galGal3/M
    cd /cluster/data/galGal3/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for
    # "gallus mitochondrion genome".  That shows the gi number:
    # 5834843
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the
    # Gallus gallus mitochondrion complete genome, and then replace the
    # header line with just ">chrM".


#########################################################################
# DOWNLOAD, UNPACK AND CHECK CHROM FASTA & AGP (DONE 5/14/06 angie)
    mkdir /cluster/data/galGal3/downloads
    cd /cluster/data/galGal3/downloads
    wget ftp://genome.wustl.edu/pub/user/lhillier/private/chicken_060412.tar.gz
    tar xvzf chicken_060412.tar.gz
    cp /dev/null ../chrom.lst
    foreach agp (*.agp)
      set chr = $agp:r
      echo $chr
      set fa = $chr.fa
      if (! -e $fa) then
        echo "*** No fasta for $agp"
        break
      endif
      set c = `echo $chr | sed -e 's/^chr//; s/_random$//;'`
      if (! -d ../$c) mkdir ../$c
      mv $agp $fa ../$c
    end
    cd ..
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    ls -1d ?{,?} E* | sed -e 's@/$@@' > chrom.lst
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo checking consistency of $agp and $fa
          checkAgpAndFa $agp $fa | tail -1
        endif
      end
    end
    faSize */chr*.fa
#Total size: mean 19306674.4 sd 39157213.7 min 1028 (chr32) max 200994015 (chr1) median 2031799


#########################################################################
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 5/14/06 angie)
    ssh kkstore03
    cd /cluster/data/galGal3
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end
    # splitFaIntoContigs makes new dirs for _randoms.  Move their contents
    # back into the main chrom dirs and get rid of the _random dirs.
    foreach d (*_random)
      set base = `echo $d | sed -e 's/_random$//'`
      mv $d/lift/oOut.lst $base/lift/rOut.lst
      mv $d/lift/ordered.lft $base/lift/random.lft
      mv $d/lift/ordered.lst $base/lift/random.lst
      rmdir $d/lift
      mv $d/* $base
      rmdir $d
    end
    # Un/ has Un_random but for some reason o* files were created.  Rename:
    foreach f (Un/lift/*)
      set g = `echo $f | sed -e 's/ordered/random/; s@/o@/r@;'`
      mv $f $g
    end
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    set mSize = `faSize M/chrM.fa | awk '{print $1;}'`
    echo "0	M/chrM_1	$mSize	chrM	$mSize" > M/lift/ordered.lft


#########################################################################
# REPEAT MASKING (DONE 5/15/06 angie)

    # RepeatMasker version and library version:
#    March 20 2006 (open-3-1-5) version of RepeatMasker
    grep RELEASE /cluster/bluearc/RepeatMasker060320/Libraries/RepeatMaskerLib.embl
#CC   RELEASE 20060315;                                            *

    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kkstore03
    cd /cluster/data/galGal3
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/galGal3
    mkdir jkStuff
    cat << '_EOF_' > jkStuff/RMChicken
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/galGal3/$2
/bin/cp $2 /tmp/galGal3/$2/
cd /tmp/galGal3/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2
popd
/bin/cp /tmp/galGal3/$2/$2.out ./
if (-e /tmp/galGal3/$2/$2.align) /bin/cp /tmp/galGal3/$2/$2.align ./
if (-e /tmp/galGal3/$2/$2.tbl) /bin/cp /tmp/galGal3/$2/$2.tbl ./
if (-e /tmp/galGal3/$2/$2.cat) /bin/cp /tmp/galGal3/$2/$2.cat ./
/bin/rm -fr /tmp/galGal3/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3
'_EOF_'
    # << emacs
    chmod +x jkStuff/RMChicken
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}{,_random}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/galGal3/jkStuff/RMChicken \
                 /cluster/data/galGal3/$d $f \
               '{'check out line+ /cluster/data/galGal3/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh pk
    cd /cluster/data/galGal3/RMRun
    para make RMJobs
    para time
#Completed: 2492 of 2492 jobs
#CPU time in finished jobs:    2613579s   43559.64m   725.99h   30.25d  0.083 y
#IO & Wait Time:                 17626s     293.77m     4.90h    0.20d  0.001 y
#Average job time:                1056s      17.60m     0.29h    0.01d
#Longest finished job:            1335s      22.25m     0.37h    0.02d
#Submission to last job:          8741s     145.68m     2.43h    0.10d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kkstore03
    cd /cluster/data/galGal3
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
        > /dev/null
      endif
      if (-e lift/random.lft && ! -z lift/random.lft) then
        liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
        > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/galGal3
    hgLoadOut galGal3 */chr*.fa.out

    # Compare coverage to previous release:
    featureBits galGal3 rmsk
#102214417 bases of 1042591351 (9.804%) in intersection
    # galGal2 coverage:
#104249260 bases of 1054197620 (9.889%) in intersection
    # Uh-oh -- in the past, a drop in coverage has meant an RM problem.
    # However, spot-checking the per-chrom coverage of galGal2 vs. galGal3,
    # it seems like many small or random chroms have simply had a lot
    # of repetitive sequence cut from them (significant drop in size as well
    # as drop in coverage), while for many of the large chroms, the coverage
    # has gone up.  Some chroms grew but the rmsk coverage did not keep up.
    # chrUn is quite a bit smaller (121M --> 53M).  I saved the per-chrom
    # coverage figures in /cluster/data/galGal3/RMCompare/ .
#galGal2:chr1    24219568        183744490       13.181
#galGal3:chr1    25804441        195192300       13.220
#galGal2:chr1_random     186317  1261352 14.771
#galGal3:chr1_random     12930   213918  6.044
#galGal2:chr10   807202  18954178        4.259
#galGal3:chr10   858376  20484937        4.190
#galGal2:chr10_random    525320  3515812 14.942
#galGal3:chr10_random    644     13679   4.708
#galGal2:chr2    16579726        143798269       11.530	[exception.
#galGal3:chr2    17182793        150358687       11.428	[
#galGal2:chr2_random     4670    53846   8.673		[
#galGal3:chr2_random     16589   127706  12.990		[
#galGal2:chr18   461248  8797585  5.243
#galGal3:chr18   507614  10513347 4.828

#galGal2:chrE22C19W28    2628    47202   5.568
#galGal2:chrE50C23       690     10171   6.784
#galGal3:chrE22C19W28_E50C23     42486   822662  5.164

#galGal2:chrUn   18697113        121198700       15.427
#galGal3:chrUn_random    11075578        53400422        20.741
#galGal2:chrW    530067  4135691 12.817
#galGal3:chrW    133707  233885  57.168
#galGal2:chrW_random     109531  229903  47.642
#galGal3:chrW_random     311315  625108  49.802
#galGal2:chrZ    4629721 30832492        15.016
#galGal3:chrZ    10599358        67536383        15.694
#galGal2:chrZ_random     2172389 14348615 15.140
#galGal3:chrZ_random     115568  309836   37.300
    # So I'm inclined to think that it's most likely not a RepeatMasker bug
    # but a change in the underlying assembly.  Something to run by LaDeana
    # after reading the release notes...


#########################################################################
# MAKE LIFTALL.LFT (DONE 5/15/06 angie)
    ssh kkstore03
    cd /cluster/data/galGal3
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft


#########################################################################
# CREATING DATABASE (DONE 5/15/06 angie)
    ssh hgwdev
    echo 'create database galGal3' | hgsql ''
    # old hgwdev 5/15:
    # We are awfully tight here but at least not out of space:
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  1.6T   65G  97% /var/lib/mysql
    # New hgwdev 5/18:
    df -h /data/mysql
#/dev/sdc1             1.8T  685G  1.1T  40% /data/mysql
    # Copy over the grp table from previous release:
    echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
      | hgsql galGal3


#########################################################################
# GOLD AND GAP TRACKS (DONE 5/15/06 angie)
    ssh hgwdev
    cd /cluster/data/galGal3
    hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 .
    # featureBits complains if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql galGal3
    # oops, that won't work until v4.1, so do this for the time being:
    echo "create table chrM_gap select * from chr1_gap where 0=1" \
    | hgsql galGal3


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 5/15/06 angie)
    # TRF runs pretty quickly now... it takes a few hours total runtime,
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed
    mkdir /cluster/data/galGal3/bed/simpleRepeat
    cd /cluster/data/galGal3/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/galGal3/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    csh -efx jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
#259 jobs.csh
    ls -1 trf | wc -l
#259
    endsInLf trf/*
#trf/chr12_random_1.bed zero length
#trf/chr13_random_1.bed zero length
    # That's OK.
    # When job is done do:
    liftUp simpleRepeat.bed /cluster/data/galGal3/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed galGal3 simpleRepeat \
      /cluster/data/galGal3/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    featureBits galGal3 simpleRepeat
#9650062 bases of 1042591351 (0.926%) in intersection
    # galGal2 coverage:
#8434365 bases of 1054197620 (0.800%) in intersection

###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
     ssh hgwdev
     cd /cluster/data/galGal3/bed
     mkdir microsat
     cd microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
    /cluster/bin/i386/hgLoadBed galGal3 microsat microsat.bed


#########################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE 5/15/06 angie)
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore03
    cd /cluster/data/galGal3/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/galGal3
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end
    # Here's the coverage for the filtered TRF:
    ssh hgwdev
    cat /cluster/data/galGal3/bed/simpleRepeat/trfMaskChrom/*.bed \
      > /tmp/filtTrf.bed
    featureBits galGal3 /tmp/filtTrf.bed
#4110365 bases of 1042591351 (0.394%) in intersection
    # galGal2 coverage:
#4510381 bases of 1054197620 (0.428%) in intersection


#########################################################################
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 5/15/06 angie)
    # Note: just to keep things consistent, redid chr1 and chr2 2/26 with
    # the ProcessRepeats-only rerun results (only masking changes were
    # 5bp in chr1 and 3bp in chr2)
    ssh kkstore03
    cd /cluster/data/galGal3
    # Soft-mask (lower-case) the contig and chr .fa's,
    # then make hard-masked versions from the soft-masked.
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
    # Make 2bit for blat/browser usage:
    faToTwoBit */chr*.fa galGal3.2bit
    # Make soft-masked nib for blastz:
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end


#########################################################################
# MAKE CHROMINFO TABLE WITH 2BIT (DONE 5/15/06 angie)
    ssh kkstore03
    cd /cluster/data/galGal3
    mkdir bed/chromInfo
    twoBitInfo galGal3.2bit stdout \
    | awk '{print $1 "\t" $2 "\t/gbdb/galGal3/galGal3.2bit";}' \
      > bed/chromInfo/chromInfo.tab

    # Link to 2bit from /gbdb/galGal3/:
    ssh hgwdev
    mkdir /gbdb/galGal3
    ln -s /cluster/data/galGal3/galGal3.2bit /gbdb/galGal3/
    # Load /gbdb/galGal3/galGal3.2bit paths into database and save size info.
    # Make a special chromInfo.sql with large index size so that
    # chrE22C19W28_E50C23 and chrE22C19W28_E50C23_random don't get collapsed.
    cd /cluster/data/galGal3/bed/chromInfo
    perl -wpe 's/chrom\([0-9]+/chrom\(21/' \
      $HOME/kent/src/hg/lib/chromInfo.sql > chromInfo.sql
    hgLoadSqlTab galGal3 chromInfo chromInfo.sql chromInfo.tab

    hgsql -N galGal3 -e "select chrom,size from chromInfo" \
      > /cluster/data/galGal3/chrom.sizes
    # take a look at chrom.sizes size
    wc chrom.sizes
#     57     114     947 ../../chrom.sizes



#########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 5/15/06 angie)
    # Use -repMatch=380 (based on size -- for human we use 1024, and
    # chicken size is ~37% of human)
    ssh kkr1u00
    cd /cluster/data/galGal3
    mkdir /cluster/bluearc/galGal3
    blat galGal3.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/galGal3/11.ooc -repMatch=380
#Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc
    blat galGal3.2bit /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/galGal3/10.ooc -repMatch=380
#Wrote 166633 overused 10-mers to /cluster/bluearc/galGal3/10.ooc
    mkdir /iscratch/i/galGal3
    cp -p /cluster/bluearc/galGal3/*.ooc /iscratch/i/galGal3/
    iSync


#########################################################################
# PUT NIBS ON /SCRATCH (DONE 5/15/06 angie)
    ssh kkstore03
    mkdir /cluster/bluearc/scratch/hg/galGal3
    rsync -av /cluster/data/galGal3/nib/* /cluster/bluearc/scratch/hg/galGal3/nib/
    cp -p /cluster/data/galGal3/galGal3.2bit /cluster/bluearc/scratch/hg/galGal3/
    # Ask cluster-admin to distribute to /scratch on big & small cluster


#########################################################################
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 5/15/06 angie)
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvsup

    # Add trackDb directories and a description.html
    mkdir chicken/galGal3
    cvs add chicken/galGal3
    cvs add chicken/galGal3/description.html
    cvs ci -m "Initial description for galGal3." chicken/galGal3
    # Edit that makefile to add galGal3 in all the right places and do
    make update DBS=galGal3

    mkdir /gbdb/galGal3/html
    mkdir /cluster/data/galGal3/html
    ln -s /cluster/data/galGal3/html/description.html /gbdb/galGal3/html/
    cvs ci -m "Added galGal3." makefile
    # Go public on genome-test.  In a clean tree (no mods, up-to-date):
    cvs up makefile
    make alpha
    # Note: hgcentral*.genome values must correspond
    # with defaultDb.genome values
    hgsql -h genome-testdb hgcentraltest \
      -e 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("galGal3", "May 2006", "/gbdb/galGal3", "Chicken", \
               "chr5:57710001-57780000", 1, 30, "Chicken", \
                "Gallus Gallus", "/gbdb/galGal3/html/description.html", \
               0, 0, "Chicken Genome Sequencing Consortium May 2006 release");'


#########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE 5/16/06 angie)
    ssh hgwdev
    mkdir /cluster/data/galGal3/bed/genscan
    cd /cluster/data/galGal3/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/galGal3/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that
    # *do not* consist of all-N's (which would cause genscan to blow up)
    cp /dev/null genome.list
    foreach f ( `ls -1S /cluster/data/galGal3/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
#    259 genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para make jobList
    para time
#Completed: 255 of 259 jobs
#Crashed: 4 jobs
#CPU time in finished jobs:      82258s    1370.97m    22.85h    0.95d  0.003 y
#IO & Wait Time:                  1053s      17.55m     0.29h    0.01d  0.000 y
#Average job time:                 327s       5.45m     0.09h    0.00d
#Longest finished job:           20131s     335.52m     5.59h    0.23d
#Submission to last job:         23171s     386.18m     6.44h    0.27d

    # If there are crashes, diagnose with "para problems" / "para crashed".
    # If a job crashes due to genscan running out of memory, re-run it
    # manually with "-window=1200000" instead of "-window=2400000".
    ssh kkr5u00
    cd /cluster/data/galGal3/bed/genscan
    /cluster/bin/x86_64/gsBig /cluster/data/galGal3/4/chr4_13/chr4_13.fa.masked gtf/chr4_13.fa.gtf -trans=pep/chr4_13.fa.pep -subopt=subopt/chr4_13.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig /cluster/data/galGal3/3/chr3_14/chr3_14.fa.masked gtf/chr3_14.fa.gtf -trans=pep/chr3_14.fa.pep -subopt=subopt/chr3_14.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig /cluster/data/galGal3/2/chr2_22/chr2_22.fa.masked gtf/chr2_22.fa.gtf -trans=pep/chr2_22.fa.pep -subopt=subopt/chr2_22.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/x86_64/gsBig /cluster/data/galGal3/Un/chrUn_random_3/chrUn_random_3.fa.masked gtf/chrUn_random_3.fa.gtf -trans=pep/chrUn_random_3.fa.pep -subopt=subopt/chrUn_random_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000

    # Convert these to chromosome level files as so:
    ssh kkstore03
    cd /cluster/data/galGal3/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/galGal3/bed/genscan
    ldHgGene galGal3 genscan genscan.gtf
    hgPepPred galGal3 generic genscanPep genscan.pep
    hgLoadBed galGal3 genscanSubopt genscanSubopt.bed


#########################################################################
# MAKE GCPERCENT (DONE 5/15/06 angie)
    ssh kolossus
    mkdir /cluster/data/galGal3/bed/gc5Base
    cd /cluster/data/galGal3/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 galGal3 \
       /cluster/data/galGal3 \
    | wigEncode stdin gc5Base.wig gc5Base.wib
    ssh hgwdev
    mkdir /gbdb/galGal3/wib
    cd /cluster/data/galGal3/bed/gc5Base
    ln -s `pwd`/gc5Base.wib /gbdb/galGal3/wib
    hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 gc5Base gc5Base.wig


#########################################################################
# CPGISSLANDS (WUSTL) (DONE 5/15/06 angie)
    ssh hgwdev
    mkdir /cluster/data/galGal3/bed/cpgIsland
    cd /cluster/data/galGal3/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/galGal3/bed/cpgIsland/

    ssh kolossus
    cd /cluster/data/galGal3/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      nice ./cpglh.exe $f > $fout
    end
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/galGal3/bed/cpgIsland
    hgLoadBed galGal3 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed


#########################################################################
# CPGISLANDS (ANDY LAW) (DONE 5/15/06 angie)
    # See notes in makeGalGal2.doc
    ssh kolossus
    mkdir /cluster/data/galGal3/bed/cpgIslandGgfAndy
    cd /cluster/data/galGal3/bed/cpgIslandGgfAndy
    #	Build the preProcGgfAndy program in
    #	kent/src/oneShot/preProcGgfAndy into your ~/bin/
    # Use unmasked sequence since this is not a mammal...
    cp /dev/null cpgIslandGgfAndy.bed
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r:r
      echo preproc and run on unmasked $chr
      ~/bin/x86_64/preProcGgfAndy $f \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndy.bed
    end
    wc -l ../cpgIsland/cpgIsland.bed *bed
#  22806 ../cpgIsland/cpgIsland.bed
#  76488 cpgIslandGgfAndy.bed
    # load into database:
    ssh hgwdev
    cd /cluster/data/galGal3/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
    hgLoadBed galGal3 cpgIslandGgfAndy -tab \
      -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
    featureBits galGal3 cpgIslandExt
#15533065 bases of 1042591351 (1.490%) in intersection
    featureBits galGal3 cpgIslandGgfAndy
#62026174 bases of 1042591351 (5.949%) in intersection


#########################################################################
# MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 5/22/06 angie)
    # In an email 2/13/04, Arian said we could treat all human repeats as
    # lineage-specific, but could exclude these from chicken as ancestral:
    # L3, L3a, L3b, MIR, MIR3, MIRb, MIRm
    ssh kkstore03
    cd /cluster/data/galGal3
    mkdir -p /san/sanvol1/galGal3/linSpecRep
    foreach f (*/chr*.fa.out)
      awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
      > /san/sanvol1/galGal3/linSpecRep/$f:t:r:r.out.spec
    end

    #	rebuild these as they appear to have become lost
    mkdir /hive/data/staging/data/galGal3/linSpecRep
    foreach f (*/chr*.fa.out)
      awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
	> /hive/data/staging/data/galGal3/linSpecRep/$f:t:r:r.out.spec
    end

#########################################################################
# SWAP CHAINS/NET HG18 (DONE 5/23/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.hg18.swap
    cd /cluster/data/galGal3/bed/blastz.hg18.swap
    doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.galGal3/DEF \
      >& do.log & tail -f do.log
    ln -s blastz.hg18.swap /cluster/data/galGal3/bed/blastz.hg18


#########################################################################
# SWAP CHAINS/NET MM8 (DONE 5/24/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.mm8.swap
    cd /cluster/data/galGal3/bed/blastz.mm8.swap
    doBlastzChainNet.pl -swap /cluster/data/mm8/bed/blastz.galGal3/DEF \
      >& do.log & tail -f do.log
    ln -s blastz.mm8.swap /cluster/data/galGal3/bed/blastz.mm8


#########################################################################
# SWAP CHAINS/NET RN4 (DONE 5/25/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.rn4.swap
    cd /cluster/data/galGal3/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.galGal3/DEF \
      >& do.log & tail -f do.log
    ln -s blastz.rn4.swap /cluster/data/galGal3/bed/blastz.rn4


#########################################################################
# SWAP CHAINS/NET MONDOM4 (DONE 5/26/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.monDom4.swap
    cd /cluster/data/galGal3/bed/blastz.monDom4.swap
    doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.galGal3/DEF \
      >& do.log & tail -f do.log
    ln -s blastz.monDom4.swap /cluster/data/galGal3/bed/blastz.monDom4


#########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 5/27/06 angie)
    ssh pk
    mkdir /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
    cd /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
    cat << '_EOF_' > DEF
# chicken vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64

# Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Frog xenTro2 - single chunk big enough to run two of the
#               largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
'_EOF_'
    # << emacs
    doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/galGal3XenTro2 \
      -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
      >& do.log & tail -f do.log
    ln -s blastz.xenTro2.2006-05-26 /cluster/data/galGal3/bed/blastz.xenTro2


#########################################################################
# SWAP CHAINS/NET DANRER4 (DONE 5/31/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.danRer4.swap
    cd /cluster/data/galGal3/bed/blastz.danRer4.swap
    doBlastzChainNet.pl -swap /cluster/data/danRer4/bed/blastz.galGal3/DEF \
      >& do.log & tail -f do.log
    ln -s blastz.danRer4.swap /cluster/data/galGal3/bed/blastz.danRer4


#########################################################################
# GENBANK AUTO UPDATE (DONE 6/9/06 angie)
    # align with revised genbank process. drop xeno ESTs.
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add galGal3

# galGal3
galGal3.serverGenome = /cluster/data/galGal3/galGal3.2bit
galGal3.clusterGenome = /scratch/hg/galGal3/galGal3.2bit
galGal3.ooc = /cluster/bluearc/galGal3/11.ooc
galGal3.align.unplacedChroms = chr*_random
galGal3.lift = /cluster/data/galGal3/jkStuff/liftAll.lft
galGal3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
galGal3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
galGal3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
galGal3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
galGal3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
galGal3.refseq.mrna.native.load = yes
galGal3.refseq.mrna.xeno.load = yes
galGal3.genbank.mrna.xeno.load = yes
galGal3.downloadDir = galGal3

    cvs ci etc/genbank.conf
    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial galGal3 &
    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad galGal3 &

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/makeDb/genbank
    cvsup
    # add galGal3 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs commit
    make etc-update


###########################################################################
# HUMAN (hg18) PROTEINS TRACK FOR hg18 (DONE,  2006-06-16, braney)
    ssh kkstore03
    bash # if not using bash shell already
    # make Blast database for non-random chrom sequences
    mkdir -p /cluster/data/galGal3/blastDb
    cd /cluster/data/galGal3/blastDb
    ls ../*/*/*.lft | grep -v lift | sed 's/lft/fa/' > list
    ln -s `cat list` .
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.log *.fa list

    cd /cluster/data/galGal3
    for i in */*/*.lft
       do cat  $i  ;
    done > jkStuff/subChr.lft

    mkdir -p  /san/sanvol1/scratch/galGal3/blastDb;
    cd /cluster/data/galGal3/blastDb
    for i in nhr nin nsq;
    do
	cp *.$i /san/sanvol1/scratch/galGal3/blastDb;
    done

   mkdir /cluster/data/galGal3/bed/tblastnHg18KG
   cd  /cluster/data/galGal3/bed/tblastnHg18KG
   echo  /san/sanvol1/scratch/galGal3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
   wc -l query.lst
# 259 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/259) = 38.049172

   mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa
   split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do
     nice pslxToFa $i $i.fa;
     rm $i;
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/galGal3/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/galGal3/jkStuff/liftAll.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    # then run the Blast cluster jobs
    ssh kk
    cd /cluster/data/galGal3/bed/tblastn.hg18KG
    para create blastSpec
    para try, check, push, check etc.
    # pushed 100,000 jobs at a time so need to do para push again later
    para time
# Completed: 158767 of 158767 jobs
# CPU time in finished jobs:    9352133s  155868.88m  2597.81h  108.24d  0.297 y
# IO & Wait Time:                526347s    8772.45m   146.21h    6.09d  0.017 y
# Average job time:                  62s       1.04m     0.02h    0.00d
# Longest finished job:             221s       3.68m     0.06h    0.00d
# Submission to last job:        104854s    1747.57m    29.13h    1.21d

    ssh kkstore03
    cd /cluster/data/galGal3/bed/tblastn.hg18KG
    tcsh
    mkdir chainRun
    cd chainRun
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS \
      /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/galGal3/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para try, check, push, check etc.
# Completed: 613 of 613 jobs
# CPU time in finished jobs:      26409s     440.15m     7.34h    0.31d  0.001 y
# IO & Wait Time:                  7787s     129.79m     2.16h    0.09d  0.000 y
# Average job time:                  56s       0.93m     0.02h    0.00d
# Longest finished job:             553s       9.22m     0.15h    0.01d
# Submission to last job:          1834s      30.57m     0.51h    0.02d

    ssh kkstore03
    cd /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut
    bash # if using another shell
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl
    pslCheck /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl
    # this is ok.
    # load table
    ssh hgwdev
    cd /cluster/data/galGal3/bed/tblastn.hg18KG
    hgLoadPsl galGal3 blastHg18KG.psl
    # check coverage
    featureBits galGal3 blastHg18KG
# 19638830 bases of 1042591351 (1.884%) in intersection

    featureBits galGal2 blastHg16KG
# 17057637 bases of 1054197620 (1.618%) in intersection

    featureBits galGal3 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.504%, blastHg18KG 1.884%, both 0.417%, cover 82.71%, enrich 43.91x

    featureBits galGal2 refGene:cds blastHg16KG -enrichment
# refGene:cds 0.497%, blastHg16KG 1.618%, both 0.335%, cover 67.42%, enrich 41.66x

    ssh kkstore04
    rm -rf /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut

#end human proteins


#########################################################################
# BACEND PAIRS TRACK
# DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 6/20/06 angie)
    ssh kkstore03
    cd /cluster/data/galGal3
    # Plenty of room at the moment:
    df -h .
#                      2.0T  1.6T  280G  86% /cluster/store6
    mkdir -p bed/cloneend/ncbi
    cd bed/cloneend/ncbi
    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/gallus_gallus/\*
    cd ..
    # Extract unversioned accessions from headers and combine into one
    # uncompressed file which we will link to from /gbdb/:
    zcat ncbi/*ends*.mfa.gz \
    | perl -wpe 'if (/^>/) { \
       s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \
      > cloneEnds.fa
    # Make sure the sequences are intact after the header-munging:
    faSize ncbi/*.mfa.gz
#146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 8 files
#Total size: mean 1084.6 sd 183.8 min 64 (gi|30716935|gb|CC322877.1|) max 2431 (gi|30609044|gb|CC263290.1|) median 1095
    faSize cloneEnds.fa
#146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 1 files
#Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095

    # Extract pairings from info files:
    zcat ncbi/*info*.txt.gz \
    | /cluster/bin/scripts/convertCloneEndInfo stdin
#55099 pairs and 22655 singles

    # split for cluster run
    mkdir /cluster/bluearc/galGal3/cloneEnds
    faSplit sequence cloneEnds.fa 100 /cluster/bluearc/galGal3/cloneEnds/cloneEnds
    #	Check to ensure no breakage:
    faSize /cluster/bluearc/galGal3/cloneEnds/*.fa
#146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 99 files
#Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095
    #	same numbers as before

    # load sequences
    ssh hgwdev
    mkdir /gbdb/galGal3/cloneend
    ln -s /cluster/data/galGal3/bed/cloneend/cloneEnds.fa /gbdb/galGal3/cloneend/
    cd /tmp
    hgLoadSeq galGal3 /gbdb/galGal3/cloneend/cloneEnds.fa
#135327 sequences


# BACEND SEQUENCE ALIGNMENTS (DONE 7/17/06 angie)
    ssh kkstore03
    # Make unmasked fasta.
    cd /cluster/data/galGal3
    mkdir /cluster/bluearc/galGal3/noMask
    foreach f (?{,?}/chr*.fa)
      echo $f:t:r
      perl -wpe 'tr/a-z/A-Z/ if (! /^>/);' $f \
        > /cluster/bluearc/galGal3/noMask/$f:t
    end

    # kluster run  [san is down today so use kk this time]
    ssh kk
    mkdir /cluster/data/galGal3/bed/bacends
    cd /cluster/data/galGal3/bed/bacends
    mkdir out
    # allow blat to run politely in /tmp while it writes output, then
    # copy results to results file:
    cat << '_EOF_' > runBlat.csh
#!/bin/csh -ef
set root1 = $1
set root2 = $2
set result = $3
rm -fr /scratch/tmp/${root1}_${root2}
mkdir /scratch/tmp/${root1}_${root2}
pushd /scratch/tmp/${root1}_${root2}
/cluster/bin/i386/blat /cluster/bluearc/galGal3/noMask/${root1}.fa \
  /cluster/bluearc/galGal3/cloneEnds/${root2}.fa ${root1}.${root2}.psl \
  -ooc=/cluster/bluearc/galGal3/10.ooc -tileSize=10
popd
mkdir -p out/${root2}
rm -f ${result}
mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /scratch/tmp/${root1}_${root2}
'_EOF_'
    #	<< emacs
    chmod +x runBlat.csh

    cat << '_EOF_' > template
#LOOP
./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << emacs
    ls -1S /cluster/bluearc/galGal3/cloneEnds/cloneEnds???.fa > bacEnds.lst
    ls -1S /cluster/bluearc/galGal3/noMask/chr*.fa > contig.lst
    gensub2 contig.lst bacEnds.lst template jobList
    para make jobList
#Completed: 5247 of 5247 jobs
#CPU time in finished jobs:      69864s    1164.39m    19.41h    0.81d  0.002 y
#IO & Wait Time:                614130s   10235.51m   170.59h    7.11d  0.019 y
#Average job time:                 130s       2.17m     0.04h    0.00d
#Longest finished job:            3306s      55.10m     0.92h    0.04d
#Submission to last job:          3674s      61.23m     1.02h    0.04d

    ssh kkstore03
    cd /cluster/data/galGal3/bed/bacends
    mkdir temp
    time pslSort dirs raw.psl temp out/*
#9.349u 1.074s 0:10.54 98.7%     0+0k 0+0io 2pf+0w
    time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \
      bacEnds.psl /dev/null
#4.838u 0.158s 0:05.09 97.8%     0+0k 0+0io 2pf+0w


# BACEND PAIRS TRACK (DONE 7/17/06 angie)
    ssh kolossus
    cd /cluster/data/galGal3/bed/bacends
    time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
      -minId=0.91 -noBin -min=25000 \
      -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
      -mismatch -verbose bacEnds.psl \
        ../cloneend/cloneEndPairs.txt all_bacends bacEnds
#1.076u 0.292s 0:01.68 80.9%     0+0k 0+0io 3pf+0w

    # Filter by score and sort by {chrom,chromStart}:
    awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
    cat bacEnds.{slop,short,long,mismatch,orphan} \
    | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed

    #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
    awk '{print $5}' bacEndPairs.bed | sort -u

    /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
      bacEndPairsBad.bed \
    | sorttbl tname tstart | headchg -del \
    > bacEnds.load.psl
    wc -l bacEnds.*
#  136054 bacEnds.psl
#  106769 bacEnds.load.psl
#   29879 bacEnds.pairs
#       5 bacEnds.long
#      99 bacEnds.lst
#      56 bacEnds.mismatch
#   20125 bacEnds.orphan
#     106 bacEnds.short
#      78 bacEnds.slop

    # load into database
    ssh hgwdev
    cd /cluster/data/galGal3/bed/bacends
    hgLoadBed -strict -notItemRgb galGal3 bacEndPairs bacEndPairs.bed \
        -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#Loaded 29879 elements of size 11
    # note - the "Bad" track isn't pushed to RR, just used for assembly QA.
    hgLoadBed -strict -notItemRgb galGal3 bacEndPairsBad bacEndPairsBad.bed \
      -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#Loaded 20360 elements of size 11
    time hgLoadPsl galGal3 -table=all_bacends bacEnds.load.psl
#1.123u 0.104s 0:04.23 28.8%     0+0k 0+0io 3pf+0w

    # Compares favorably to previous release (good cov up, bad cov down):
    featureBits galGal3 all_bacends
#58629720 bases of 1042591351 (5.623%) in intersection
    featureBits galGal2 all_bacends
#52184234 bases of 1054197620 (4.950%) in intersection
    featureBits galGal3 bacEndPairs
#43710407 bases of 1042591351 (4.192%) in intersection
    featureBits galGal2 bacEndPairs
#39206208 bases of 1054197620 (3.719%) in intersection
    featureBits galGal3 bacEndPairsBad
#15784430 bases of 1042591351 (1.514%) in intersection
    featureBits galGal2 bacEndPairsBad
#27538232 bases of 1054197620 (2.612%) in intersection

    # Clean up.
    rm psl.tab psl.tab.loaded raw.psl bed.tab
    rm -r out
    rmdir temp
    rm -r /cluster/bluearc/galGal3/noMask/ /cluster/bluearc/galGal3/cloneEnds/

# end BACEND PAIRS TRACK


#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 7/13/06 angie - REDONE 8/4)
# REDONE 8/4 after makeDownloads.pl was fixed to not try to include complete
# paths in the tar files.
    ssh hgwdev
    cd /cluster/data/galGal3
    ~/kent/src/utils/makeDownloads.pl galGal3 -verbose=2 \
      >& jkStuff/downloads.log & tail -f jkStuff/downloads.log
    # Edit README.txt files when done:
# *** Edit each README.txt to resolve any notes marked with "***":
#     /cluster/data/galGal3/goldenPath/database/README.txt
#     /cluster/data/galGal3/goldenPath/bigZips/README.txt
#     /cluster/data/galGal3/goldenPath/chromosomes/README.txt
#     (The htdocs/goldenPath/galGal3/*/README.txt "files" are just links to those.)



#########################################################################
# MAKE EMPTY CHRM_GOLD (DONE 7/14/06 angie)
# Kayla found that hgTracks was complaining about missing chrM_gold.
# We can use hgFakeAgp to make a fake chrM.agp, and load that.
# However, then the chrM Assembly track shows faked contents that do not
# match the track/assembly description.  So remove the faked contents,
# leaving an empty table which is fine -- chrM is not part of the assembly.
    ssh hgwdev
    cd /cluster/data/galGal3
    hgFakeAgp M/chrM.fa M/chrM.agp
    hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 .
    hgsql galGal3 -e 'delete from chrM_gold'


#########################################################################
# MULTIZ7WAY (DONE 7/18/06 angie)
# (((galGal3 ((hg18 (mm8 rn4)) monDom4)) xenTro2) danRer4)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18

    # copy MAF's to cluster-friendly server
    mkdir /san/sanvol1/scratch/galGal3/mafNet
    foreach s (rn4 mm8 hg18 monDom4 xenTro2 danRer4)
      echo $s
      rsync -av /cluster/data/galGal3/bed/blastz.$s/mafNet/* \
        /san/sanvol1/scratch/galGal3/mafNet/$s/
    end

    # Prune the hg17 17way tree to just these 9 and update db names:
    /cluster/bin/phast/tree_doctor \
      --prune-all-but=rat_rn3,mouse_mm7,human_hg17,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \
      --rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4 ; chicken_galGal2 -> chicken_galGal3 ; zebrafish_danRer3 -> zebrafish_danRer4 ; xenopus_xenTro1 -> xenopus_xenTro2" \
      /cluster/data/hg17/bed/multiz17way/17way.nh > 7way.nh
    # *carefully* edit 7way.nh to put galGal3 first.

    /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
    grep galGal3 7way.distances.txt | sort -k3,3n | \
        awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
    cat distances.txt
#0.9847  human_hg18
#1.0149  monodelphis_monDom4
#1.3425  mouse_mm8
#1.3472  rat_rn4
#1.3604  xenopus_xenTro2
#1.6727  zebrafish_danRer4
    # the order in the browser display will be by tree topology,
    # not by distance, but in this case (unlike human) they happen to match.

    # create species list and stripped down tree for autoMZ
    sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 7way.nh \
      > tree-commas.nh
    sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
    sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst

    ssh pk
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
    mkdir maf run
    cd run

    # stash binaries
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn

cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
    set db = galGal3
    set c = $1
    set maf = $2
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /san/sanvol1/scratch/$db/mafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../{tree.nh,species.lst} $tmp
    pushd $tmp
    foreach s (`cat species.lst`)
        set in = $pairs/$s/$c.maf
        set out = $db.$s.sing.maf
        if ($s == $db) then
            continue
        endif
        if (-e $in.gz) then
            zcat $in.gz > $out
        else if (-e $in) then
            cp $in $out
        else
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $maf
    rm -fr $tmp
'EOF'
    # << emacs
    chmod +x autoMultiz.csh

cat  << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf/$(root1).maf}
#ENDLOOP
'EOF'
    # << emacs

    awk '{print $1}' /cluster/data/galGal3/chrom.sizes > chrom.lst
    gensub2 chrom.lst single spec jobList
    para make jobList
    para time
#Completed: 57 of 57 jobs
#CPU time in finished jobs:       6545s     109.08m     1.82h    0.08d  0.000 y
#IO & Wait Time:                   206s       3.43m     0.06h    0.00d  0.000 y
#Average job time:                 118s       1.97m     0.03h    0.00d
#Longest finished job:             890s      14.83m     0.25h    0.01d
#Submission to last job:           891s      14.85m     0.25h    0.01d

    # Make .gif for tree by pasting the contents of 7way.nh into Galt's
    # cgi-bin/phyloGif tool.  Save the .gif in your checkout of the browser/
    # CVS tree as browser/images/phylo/galGal3_7way.gif .
    ssh hgwdev
    cd ~/browser/images/phylo
    cvs add galGal3_7way.gif
    # note:  markd says it is best to use "cvs add -kb" for binary files - b0b
    cvs ci -m "phyloGif-generated galGal3 7way tree image." galGal3_7way.gif
    # Do a "make alpha" to install the file in htdocs/images/phylo/ :
    cd ~/browser
    cvs up -d -P
    make alpha
    # Include the file in the push request, and include
    # "treeImage phylo/galGal3_7way.gif" in the multiz7way trackDb.ra entry.


#########################################################################
# ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE 7/18/2006 angie - REDONE 7/28)
    # REDONE 7/28/06 after Kayla found overlaps and Brian fixed mafAddIRows.
    ssh kolossus
    mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno
    mkdir maf run
    cd run
    rm -f sizes nBeds
    foreach db (`cat /cluster/data/galGal3/bed/multiz7way.2006-07-18/species.lst`)
      ln -s  /cluster/data/$db/chrom.sizes $db.len
      if (! -e /cluster/data/$db/$db.N.bed) then
        twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
      endif
      ln -s  /cluster/data/$db/$db.N.bed $db.bed
      echo $db.bed  >> nBeds
      echo $db.len  >> sizes
    end
    echo date > jobs.csh
    # do smaller jobs first:
    foreach f (`ls -1rS ../../maf/*.maf`)
      echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
        /cluster/data/galGal3/galGal3.2bit ../maf/$f:t  >> jobs.csh
    end
    echo date >> jobs.csh
    csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
    # 23min
    # Run mafFilter -overlap to make sure everything is cool before loading:
    mafFilter -overlap -reject=/tmp/chr1.rej.maf ../maf/chr1.maf > /dev/null
#rejected 0 blocks

    # Load anno/maf
    ssh hgwdev
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
    mkdir -p /gbdb/galGal3/multiz7way/anno/maf
    ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf/*.maf \
      /gbdb/galGal3/multiz7way/anno/maf
    nice hgLoadMaf -pathPrefix=/gbdb/galGal3/multiz7way/anno/maf \
      galGal3 multiz7way
#Loaded 1488826 mafs in 57 files from /gbdb/galGal3/multiz7way/anno/maf
    # Do the computation-intensive part of hgLoadMafSummary on a workhorse
    # machine and then load on hgwdev:
    ssh kolossus
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
    cat *.maf \
    | nice hgLoadMafSummary galGal3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
             -test multiz7waySummary stdin
#Created 518561 summary blocks from 3179814 components and 1488826 mafs from stdin
    ssh hgwdev
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
    sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
      > /tmp/multiz7waySummary.sql
    time nice hgLoadSqlTab galGal3 multiz7waySummary /tmp/multiz7waySummary.sql \
      multiz7waySummary.tab
#0.000u 0.002s 0:08.00 0.0%      0+0k 0+0io 3pf+0w
    rm *.tab
    ln -s multiz7way.2006-07-18 /cluster/data/galGal3/bed/multiz7way


#########################################################################
# MULTIZ7WAY DOWNLOADABLES (DONE 7/18/2006 angie - REDONE 7/28)
    # Annotated MAF is now documented, so use anno/maf for downloads.
    ssh hgwdev
    mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
    # upstream mafs
cat > mafFrags.csh << 'EOF'
    date
    foreach i (1000 2000 5000)
        echo "making upstream$i.maf"
        nice featureBits galGal3 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
        rm up.bad
        nice mafFrags galGal3 multiz7way up.bed upstream$i.maf \
                -orgs=../species.lst
        rm up.bed
    end
    date
'EOF'
    # << emacs
    time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#10.144u 15.154s 0:52.22 48.4%   0+0k 0+0io 5pf+0w

    ssh kkstore03
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
cat > downloads.csh << 'EOF'
    date
    foreach f (../anno/maf/chr*.maf)
	set c = $f:t:r
        echo $c
	nice gzip -c $f > $c.maf.gz
    end
    md5sum *.gz > md5sum.txt
    date
'EOF'
    # << emacs
    time csh -efx downloads.csh >&! downloads.log
#355.340u 3.718s 5:59.55 99.8%   0+0k 0+0io 1pf+0w
    nice gzip up*.maf
    nice md5sum up*.maf.gz >> md5sum.txt
    # Make a README.txt

    ssh hgwdev
    set dir = /usr/local/apache/htdocs/goldenPath/galGal3/multiz7way
    mkdir $dir
    ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads/{*.gz,*.txt} $dir



#########################################################################
# MULTIZ7WAY MAF FRAMES (DONE 7/18/2006 angie - REDONE 7/31)
    ssh hgwdev
    mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
    # The following is adapted from MarkD's Makefile used for mm7...

    #------------------------------------------------------------------------
    # get the genes for all genomes
    # mRNAs with CDS.  single select to get cds+psl, then split that up and
    # create genePred
    # using mrna table as genes: danRer4
    mkdir genes
    foreach queryDb (danRer4)
      set tmpExt = `mktemp temp.XXXXXX`
      set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
      set tmpMrna = ${queryDb}.mrna.${tmpExt}
      set tmpCds = ${queryDb}.cds.${tmpExt}
      echo $queryDb
      hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                   from all_mrna,gbCdnaInfo,cds \
                   where (all_mrna.qName = gbCdnaInfo.acc) and \
                     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
       ${queryDb} > ${tmpMrnaCds}
      cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
      cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
      mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
        stdout \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
      rm -f $tmpExt
    end
    # using knownGene for rn4 mm8 hg18
    # using refGene for galGal3
    # using mgcGenes for xenTro2
    # no genes for monDom4
    # genePreds; (must keep only the first 10 columns for knownGene)
    foreach queryDb (galGal3 rn4 mm8 hg18 xenTro2)
      unset geneTbl range
      if ($queryDb == "xenTro2") then
        set geneTbl = mgcGenes
        set range = 1-10
      else if ($queryDb == "galGal3") then
        set geneTbl = refGene
        set range = 2-11
      else
        set geneTbl = knownGene
        set range = 1-10
      endif
      echo $queryDb
      hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f $range \
      | genePredSingleCover stdin stdout \
      | gzip -2c \
        > /scratch/tmp/$queryDb.tmp.gz
      mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
    end

    #------------------------------------------------------------------------
    # create frames
    ssh kkstore03
    nice tcsh # easy way to get process niced
    cd /cluster/data/galGal3/bed/multiz7way/frames
    (cat  ../anno/maf/*.maf \
     | time genePredToMafFrames galGal3 stdin stdout \
         danRer4 genes/danRer4.gp.gz \
         rn4 genes/rn4.gp.gz \
         hg18 genes/hg18.gp.gz \
         mm8 genes/mm8.gp.gz \
         xenTro2 genes/xenTro2.gp.gz \
         galGal3 genes/galGal3.gp.gz \
     | gzip > multiz7way.mafFrames.gz) >& mafFrames.log & tail -f mafFrames.log

    #------------------------------------------------------------------------
    # load the database
    ssh hgwdev
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
    hgLoadMafFrames galGal3 multiz7wayFrames multiz7way.mafFrames.gz


#########################################################################
# PHASTCONS (DONE 7/20/2006 angie)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track.  Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
    ssh kkstore03
    cd /cluster/data/galGal3
    foreach f (`cat chrom.lst`)
      echo $f
      cp -p $f/*.fa /san/sanvol1/scratch/galGal3/chrom/
    end

    # Split chromosome MAF's into windows and use to generate
    # "sufficient statistics" (ss) files for phastCons input
    # NOTE: as the SAN fs has lotsa space, we're leaving these
    # big (temp) files unzipped, to save time during phastCons run.
    # Note also the larger chunk sizes from previous runs -- this
    # reduces run-time on the split, slows down the actual phastCons
    # enough so jobs don't crash (jobs are very quick, just a minute
    # or so), and according to Adam, will produce better results.
    # The previous small chunks were probably required by
    # the phyloFit step, which we are no longer using for the
    # human alignments.
    ssh pk
    mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
    /cluster/bin/phast/tree_doctor \
      --prune-all-but=rn3,mm7,hg17,monDom2,galGal2,xenTro1,danRer3 \
      --rename="galGal2 -> galGal3 ; rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4 ; xenTro1 -> xenTro2 ; danRer3 -> danRer4" \
      /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \
      > elliotsEncodePruned.mod
    mkdir run.split
    cd run.split
    set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss
    rm -fr $WINDOWS
    mkdir -p $WINDOWS

    cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
    set MAFS = /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf
    set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss
    cd $WINDOWS
    set c = $1
    echo $c
    rm -fr $c
    mkdir $c
    /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
        -M /san/sanvol1/scratch/galGal3/chrom/$c.fa \
        -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
    echo "Done" >> $c.done
'EOF'
# << emacs
    chmod +x doSplit.csh

    rm -f jobList
    foreach f (../../maf/*.maf)
      set c = $f:t:r
      echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
    end

    para make jobList
    para time
#Completed: 54 of 57 jobs
#Crashed: 3 jobs
#CPU time in finished jobs:        570s       9.49m     0.16h    0.01d  0.000 y
#IO & Wait Time:                   229s       3.82m     0.06h    0.00d  0.000 y
#Average job time:                  15s       0.25m     0.00h    0.00d
#Longest finished job:             112s       1.87m     0.03h    0.00d
#Submission to last job:           223s       3.72m     0.06h    0.00d
    para crashed
#chr12_random, chr17_random and chr32 crashed because their maf's contain
#nothing but comments -- no alignments.  That's OK, no windows for them.

    # check tree model on 5MB chunk, using params recommended by Adam,
    # (to verify branch lengths on 2X species)
    ssh kolossus
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
        --tree "`cat ../tree-commas.nh`" \
        /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \
        -o phyloFit.tree
    /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s REV \
        --tree "`cat ../tree-commas.nh`" \
        /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \
        -o phyloFit.rev.tree
    # Comment from makeHg17.doc:
    # # he ok'ed the results -- not necessary for next human run
    # TODO: maybe run these by Adam... the numbers in the RATE_MAT and
    # TREE are different between the phyloFit and elliotsEncode models,
    # not sure if the differences are significant.  I will proceed with
    # elliotsEncode as in makeHg17.doc.

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
    cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \
  --rho $rho --expected-length $len --target-coverage $cov --quiet \
  --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
    # << emacs
    chmod a+x doPhast.csh

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create gsub file
    cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 12 .05 .20
#ENDLOOP
'EOF'
    # << emacs

    # Create parasol batch and run it
    ssh pk
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
    pushd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
    ls -1S ss/chr*/chr*.ss \
    | sed 's/.ss$//' \
      > /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons/in.list
    popd

    gensub2 in.list single template jobList
    para make jobList
    para time
#Completed: 151 of 151 jobs
#CPU time in finished jobs:       2832s      47.20m     0.79h    0.03d  0.000 y
#IO & Wait Time:                   644s      10.74m     0.18h    0.01d  0.000 y
#Average job time:                  23s       0.38m     0.01h    0.00d
#Longest finished job:              34s       0.57m     0.01h    0.00d
#Submission to last job:           181s       3.02m     0.05h    0.00d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
    #	The sed's and the sort get the file names in chrom,start order
    # (Hiram tricks -- split into columns on [.-/] with
    #    identifying x,y,z, to allow column sorting and
    #    restoring the filename.  Warning: the sort column
    # will depend on how deep you are in the dir
    find ./bed -name "chr*.bed" \
    | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
    | sort -k7,7 -k9,9n \
    | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
    | xargs cat \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements7way.bed
    cp -p phastConsElements7way.bed /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons

    # Measure coverage.  If good, load elements into database and proceed with wiggle.
    # Try for 5% overall cov, and 70% CDS cov.
    ssh hgwdev
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
    featureBits galGal3 -enrichment refGene:cds phastConsElements7way.bed
    featureBits galGal3 -enrichment xenoRefGene:cds phastConsElements7way.bed

    # FIRST ITERATION: doPhast (len cov rho) = (12 .1 .27)
#refGene:cds 0.504%, phastConsElements7way.bed 7.891%, both 0.414%, cover 82.18%, enrich 10.41x
#xenoRefGene:cds 1.785%, phastConsElements7way_12_100_27.bed 7.891%, both 1.610%, cover 90.22%, enrich 11.43x
    mv phastConsElements7way.bed phastConsElements7way_12_100_27.bed
    # SECOND ITERATION: doPhast (len cov rho) = (12 .1 .3)
#refGene:cds 0.504%, phastConsElements7way.bed 8.336%, both 0.421%, cover 83.61%, enrich 10.03x
#xenoRefGene:cds 1.785%, phastConsElements7way.bed 8.336%, both 1.635%, cover 91.59%, enrich 10.99x
    mv phastConsElements7way.bed phastConsElements7way_12_100_30.bed
    # THIRD ITERATION: doPhast (len cov rho) = (12 .05 .27)
#refGene:cds 0.504%, phastConsElements7way.bed 7.650%, both 0.415%, cover 82.26%, enrich 10.75x
#xenoRefGene:cds 1.785%, phastConsElements7way.bed 7.650%, both 1.615%, cover 90.49%, enrich 11.83x
    mv phastConsElements7way.bed phastConsElements7way_12_050_27.bed
    # FOURTH ITERATION: doPhast (len cov rho) = (12 .05 .20)
#refGene:cds 0.504%, phastConsElements7way.bed 6.480%, both 0.392%, cover 77.82%, enrich 12.01x
#xenoRefGene:cds 1.785%, phastConsElements7way.bed 6.480%, both 1.535%, cover 86.02%, enrich 13.28x
    mv phastConsElements7way.bed phastConsElements7way_12_050_20.bed
    # The coverage doesn't want to drop too much and maybe it shouldn't --
    # I don't like the idea of throwing extreme parameters in order to get
    # the desired coverage figure from a model that doesn't want to go there.
    # Load this up and see how it looks...


    # When happy:
    hgLoadBed -strict galGal3 phastConsElements7way \
      phastConsElements7way_12_050_20.bed

    # Create merged posterier probability file and wiggle track data files
    ssh kolossus
    cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/
    # sort by chromName, chromStart so that items are in numerical order
    #  for wigEncode
    #next time try Angie's simpler sort, below
    time find ./pp -name "chr*.pp" | \
        sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
        sort -k7,7 -k9,9n | \
        sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
        xargs cat | \
        nice wigEncode stdin phastCons7way.wig phastCons7way.wib
    cp -p phastCons7way.wi? \
      /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
    ln -s `pwd`/phastCons7way.wib /gbdb/galGal3/multiz7way/phastCons7way.wib
    hgLoadWiggle -pathPrefix=/gbdb/galGal3/multiz7way galGal3 \
        phastCons7way phastCons7way.wig


#########################################################################
# PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE 7/20/2006 angie)
    ssh kolossus
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
    mkdir phastConsDownloads
    cd phastConsDownloads
    cat > downloads.csh << 'EOF'
    date
    cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/pp
    foreach chr (`awk '{print $1}' /cluster/data/galGal3/chrom.sizes`)
      echo $chr
      cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
        | nice gzip -c \
            > /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/$chr.gz
    end
    date
'EOF'
    # << emacs
    csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
    # ~3min
    md5sum *.gz > md5sum.txt
    # Make a README.txt

    ssh hgwdev
    cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads
    set dir = /usr/local/apache/htdocs/goldenPath/galGal3/phastCons7way
    mkdir $dir
    ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/{*.gz,*.txt} $dir
    # Clean up after phastCons run.
    ssh kkstore03
    rm /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/*.tab
    rm -r /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons


#########################################################################
# SELF CHAINS (DONE 9/11/06 angie)
    # requested by user
    ssh kk
    mkdir /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
    cd /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
    cat << '_EOF_' > DEF
# chicken vs chicken

BLASTZ=/cluster/bin/penn/i386/blastz
BLASTZ_H=2000
BLASTZ_M=200

# TARGET: chicken galGal3
SEQ1_DIR=/scratch/hg/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: chicken galGal3
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
'_EOF_'
    # << emacs
    doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \
      -blastzOutRoot=/cluster/bluearc/galGal3Self \
      -bigClusterHub=kk -workhorse=kkr7u00 DEF \
      >& do.log & tail -f do.log
    ln -s blastz.galGal3.2006-09-08 /cluster/data/galGal3/bed/blastz.galGal3


#########################################################################
# SNP & GENES FROM BEIJING GENOME INST. (DONE 12/4/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/bgi
    cd /cluster/data/galGal3/bed/bgi
    wget --timestamping \
      'ftp://61.50.158.101/BGI/snps_by_strain/2006-11-28/*'
    # (username & password in email from Yong Zhang 11/29)
    # Each .tar.gz unpacks into identical sets of files so they will
    # overwrite each other if unpacked into the same directory.  So create
    # a subdir for each strain, and unpack its files in the subdir:
    foreach f (*.tar.gz)
      set strain = `echo $f:r:r:r | perl -wpe 's/^(\w)/\u$1/'`
      mkdir -p $strain
      pushd $strain
      tar -xvzf ../$f
      popd
    end
    # Make coverage bed4:
    cp /dev/null bgiCov.bed
    foreach strain (Broiler Layer Silkie)
      foreach f ($strain/ContigCov/ContigCov-chr*.txt)
        set chr = `echo $f:t:r | sed -e 's/ContigCov-//'`
        grep -v ^Covered $f \
        | perl -wpe 'if (/^(\d+)\s+(\d+)\s*$/) { \
              $s = $1-1;  $_ = "'$chr'\t$s\t$2\t'$strain'.'$chr'.$1.$2\n"; \
            } else {die};' \
          >> bgiCov.bed
      end
    end
    # Make bgiSnp bed+:
    ~/kent/src/hg/snp/bgi/bgiSnp.pl */SNPtables/*.txt
    # The Layer files had the wrong strain ID code (last digit) --  the
    # script substituted in the correct strain ID (Yong Zhang's suggestion).

    # Load tables
    ssh hgwdev
    cd /cluster/data/galGal3/bed/bgi
    hgLoadBed galGal3 bgiCov bgiCov.bed
    hgLoadBed galGal3 bgiSnp bgiSnp.bed \
      -sqlTable=$HOME/kent/src/hg/lib/bgiSnp.sql
    # Got this warning but I think it's because the final column is always
    # empty so bed.tab doesn't have the final column... table looks OK.
#load of bgiSnp did not go as planned: 3306633 record(s), 0 row(s) skipped, 3306633 warning(s) loading bed.tab



#########################################################################
# STS?
#########################################################################
#########################################################################
# SWAP STICKLEBACK GASACU1
    ssh pk
    mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap
    cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
	-smallClusterHub=pk \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -swap -bigClusterHub=pk > swap.log 2>&1 &
    #	failed on the net step:
# HgStepManager: executing step 'net'.
# netChains: looks like previous stage was not successful
#	(can't find [galGal3.gasAcu1.]all.chain[.gz]).
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
	-smallClusterHub=pk \
        -chainMinScore=2000 -chainLinearGap=loose \
        -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
        -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
    #	real    7m11.286s

    featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt
    #	32781658 bases of 1042591351 (3.144%) in intersection

#######################################################################
## BLASTZ Chicken chroms vs Stickleback chroms and random contigs
##  The above swap of gasAcu1 does *not* include the stickleback chrUn
##  To get alignments on the stickleback chrUn, do this alignment of
##  stickleback with chrUn to chicken, and swap them back to stickleback.
##  Don't do any Db loading, pick out the stickleback randoms after the swap
##	(WORKIN - 2007-01-11 - Hiram)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
    cd /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
    cat << '_EOF_' > DEF
# Chicken chroms vs. Stickleback chroms and chrUn in contigs

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3 chroms only, no random sequence
#       The largest is 200 million bases, there are 34 of them
SEQ1_DIR=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000

# QUERY: Stickleback gasAcu1 chroms and chrUn in contigs
#       The largest chrom is 32M bases, the largest contig 418,670 bases
#       The smallest chrom chrM is 15,742 bases, smallest contig 60 bases
#       there are 5,115 chroms and contig pieces
#       total size 1,089,690,673 bases
#       47107538 N's 1042583135 real 834274605 upper 208308530 lower
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=20
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
	-smallClusterHub=pk \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/galGal3ChrGasAcu1Un \
	-stop=net -bigClusterHub=pk > do.log 2>&1 &
    #	real    116m59.888s
    #	And swapping:
    mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.swap
    cd /cluster/data/gasAcu1/bed/blastz.galGal3.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11/DEF \
	-smallClusterHub=pk \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &

#######################################################################
## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
    cd /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18

    cat << '_EOF_' > DEF
# chicken vs. lizard

# Use same params as used for galGal3-xenTro2
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > do.log 2>&1

    time doBlastzChainNet.pl -verbose=2 DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net \
	-blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > net.log 2>&1 &

    time nice -n +19 featureBits galGal3 chainAnoCar1Link \
	> fb.galGal3.chainAnoCar1Link.txt 2>&1
    #	real    0m43.752s
    #	106743952 bases of 1042591351 (10.238%) in intersection

    ##	swap documented in anoCar1.txt
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log 2>&1 &

#######################################################################
## BLASTZ FUGU FR2 - (DONE - 2007-03-09 - 2007-03-12 - Hiram)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.fr2.2007-03-09
    cd /cluster/data/galGal3/bed/blastz.fr2.2007-03-09

    cat << '_EOF_' > DEF
# chicken vs. fugu

# Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Fugu fr2
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.fr2.2007-03-09
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
    -qRepeats=windowmaskerSdust \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-blastzOutRoot=/cluster/bluearc/galGal3Fr2 > do.log  2>&1
    cat fb.galGal3.chainFr2Link.txt
    #	30935323 bases of 1042591351 (2.967%) in intersection

    mkdir /cluster/data/fr2/bed/blastz.galGal3.swap
    cd /cluster/data/fr2/bed/blastz.galGal3.swap
    time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
	/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log  2>&1 &
    time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
	/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	-continue=net -swap > swap_net.log  2>&1 &
    #	real    3m1.239s
    cat fb.fr2.chainGalGal3Link.txt
    #	36175581 bases of 393312790 (9.198%) in intersection

#######################################################################
## BLASTZ FUGU FR1 - (DONE - 2007-03-09 - 2007-03-12 - Hiram)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.fr1.2007-03-09
    cd /cluster/data/galGal3/bed/blastz.fr1.2007-03-09

    cat << '_EOF_' > DEF
# chicken vs. fugu

# Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/san/sanvol1/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Fugu fr1
SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.fr1.2007-03-09
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
    -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-blastzOutRoot=/cluster/bluearc/galGal3Fr1 > do.log  2>&1
    cat fb.galGal3.chainFr1Link.txt
    #	29604522 bases of 1042591351 (2.840%) in intersection

    mkdir /cluster/data/fr1/bed/blastz.galGal3.swap
    cd /cluster/data/fr1/bed/blastz.galGal3.swap
    time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
	/cluster/data/galGal3/bed/blastz.fr1.2007-03-09/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log  2>&1 &
    #	real    3m14.494s
    cat fb.fr1.chainGalGal3Link.txt
    #	33536400 bases of 315518167 (10.629%) in intersection


#######################################################################
# BLASTZ PLATYPUS ORNANA1 (DONE 4/9/07 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
    cd /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
    cat << '_EOF_' > DEF
# chicken vs. platypus

# Use same params as used for galGal3-xenTro2 but back off L to 6000.
# If that causes us to get swamped, we can go back up to 8000.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/iscratch/i/galGal3/galGal3.2bit
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Platypus ornAna1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs
    doBlastzChainNet.pl DEF \
      >& do.log & tail -f do.log
    ln -s blastz.ornAna1.2007-04-06 /cluster/data/galGal3/bed/blastz.ornAna1

#########################################################################
# BLASTZ/CHAIN/NET HORSE AND CHICKEN (DONE 2/27/07 Fan)
    ssh kkstore05
    mkdir /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
    cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
    cat << '_EOF_' > DEF
# Horse vs. Chicken

BLASTZ_M=50

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3
SEQ2_DIR=/scratch/hg/galGal3/galGal3.2bit
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
TMPDIR=/scratch/tmp
'_EOF_'

    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -bigClusterHub pk \
      -chainMinScore=5000 -chainLinearGap=loose \
      -blastzOutRoot /cluster/bluearc/equCab1/blastz.galGal3 >& do.log &
    tail -f do.log

    ssh hgwdev
    cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
    ln -s blastz.galGal3.2007-02-22 /cluster/data/equCab1/bed/blastz.galGal3
    nice featureBits equCab1 -chrom=chr1 chainGalGal3Link
# 9136777 bases of 177498097 (5.148%) in intersection

    bash
    time nice -n 19 featureBits equCab1 chainGalGal3Link \
	> fb.equCab1.chainGalGal3Link.txt 2>&1
# 114216918 bases of 2421923695 (4.716%) in intersection

    ssh kkstore05
    mkdir /cluster/data/galGal3/bed/blastz.equCab1.swap
    cd /cluster/data/galGal3/bed/blastz.equCab1.swap
    bash
    time doBlastzChainNet.pl \
	/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
    tail -f swap.log

# the above script stopped with the following lines in the .log file:

# netToAxt axtChain/net/chrM.net axtChain/chain/chrM.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
# axtSort stdin stdout
# gzip -c
# Processing chrM
# end
# netToAxt axtChain/net/chrUn_random.net axtChain/chain/chrUn_random.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
# axtSort stdin stdout
# gzip -c
# Processing chrUn_random

# Hiram helped to created the following temp .csh:

    cat finiNet.csh
#!/bin/csh -efx
# This script was automatically generated by /cluster/bin/scripts/doBlastzChainNet.pl
# from /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF
# It is to be executed on kolossus in /cluster/data/galGal3/bed/blastz.equCab1.swap/axtChain .
# It generates nets (without repeat/gap stats -- those are added later on
# hgwdev) from chains, and generates axt, maf and .over.chain from the nets.
# This script will fail if any of its commands fail.

cd /cluster/data/galGal3/bed/blastz.equCab1.swap

foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
  /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout \
  | axtSort stdin stdout \
  | gzip -c > axtNet/$f:t:r.galGal3.equCab1.net.axt.gz
end

# Make mafNet for multiz: one .maf per galGal3 seq.
mkdir mafNet
foreach f (axtNet/*.galGal3.equCab1.net.axt.gz)
  axtToMaf -tPrefix=galGal3. -qPrefix=equCab1. $f \
        /cluster/data/galGal3/chrom.sizes  /san/sanvol1/scratch/equCab1/chrom.sizes        \
        stdout \
  | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end

    ssh kolossus
    cd /cluster/data/galGal3/bed/blastz.equCab1.swap
    fininet.csh

    ssh hgwdev
    cd /cluster/data/galGal3/bed/blastz.equCab1.swap
    bash
    time nice -n 19 featureBits galGal3 chainEquCab1Link \
	> fb.galGal3.chainEquCab1Link.txt 2>&1
    # 104418042 bases of 1042591351 (10.015%) in intersection

###########################################################################
# ENSEMBL GENES (DONE 11/2/07 angie)
    ssh hgwdev
    mkdir /cluster/data/galGal3/bed/ensembl
    cd /cluster/data/galGal3/bed/ensembl
    # Go to www.ensembl.org and click around their evolving interface
    # to get the following types of data:
    # 1. a tab-separated file that relates Ensembl gene, transcript and
    #    protein IDs.  Save as ensGtp.txt.gz
    # 2. peptide fasta with only transcript IDs in header -> ensPep.fa.gz

    # They have started dumping GTF files so we can download that directly:
    # wget
    # 'ftp://ftp.ensembl.org/pub/current_gallus_gallus/data/gtf/*'
    # -- but I forgot to do that and just grabbed the GTF from Ensembl
    # BioMart along with the GTP and Pep.

    # Add "chr" to chrom names in the gene data gtf file to make
    # it compatible with our software.  Also change MT --> M.
    zcat ensembl.gff.gz | sed -e 's/^MT/M/; s/^/chr/;' > ensGene.gtf
    ldHgGene -gtf -genePredExt galGal3 ensGene ensGene.gtf
    nice featureBits galGal3 ensGene
#30850267 bases of 1042591351 (2.959%) in intersection

    # strip header line:
    zcat ensGtp.txt.gz | tail +2 \
    | hgLoadSqlTab galGal3 ensGtp ~/kent/src/hg/lib/ensGtp.sql \
      /dev/stdin -notOnServer

    zcat ensPep.fa.gz | sed -e 's/^>.*ENSGALT/>ENSGALT/' \
    | hgPepPred galGal3 generic ensPep stdin

############################################################################
#  BLASTZ petMar1 Lamprey (WORKING - 2008-04-11 - Hiram)
    ssh kkstore03
    screen	# use a screen to control this multi-day job
    mkdir /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
    cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11

    cat << '_EOF_' > DEF
# Chicken vs Lamprey

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=memk -verbose=2 > do.log 2>&1 &
# Completed: 83468 of 83468 jobs
# CPU time in finished jobs:    2199234s   36653.91m   610.90h   25.45d  0.070 y
# IO & Wait Time:                426454s    7107.56m   118.46h    4.94d  0.014 y
# Average job time:                  31s       0.52m     0.01h    0.00d
# Longest finished job:             414s       6.90m     0.12h    0.00d
# Submission to last job:        103466s    1724.43m    28.74h    1.20d
    #	continuing after some kluster interruptions
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-continue=cat -bigClusterHub=memk -verbose=2 > cat.log 2>&1 &
    #	real    13m54.957s
    cat fb.galGal3.chainPetMar1Link.txt
    #	20134896 bases of 1042591351 (1.931%) in intersection

    #	and the swap
    mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap
    cd /cluster/data/petMar1/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 &
    #	real    33m41.587s
    cat fb.petMar1.chainGalGal3Link.txt
    #	22308118 bases of 831696438 (2.682%) in intersection

############################################################################
#  BLASTZ petMar1 Lanclet (WORKING - 2008-04-15 - Hiram)
    ssh kkstore03
    screen	# use a screen to control this multi-day job
    mkdir /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
    cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15

    cat << '_EOF_' > DEF
# Chicken vs Lanclet

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
#       Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit
SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes
SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit
SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes
SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
    #	real    168m49.357s
    cat fb.galGal3.chainBraFlo1Link.txt
    #	19795687 bases of 1042591351 (1.899%) in intersection

    #	and the swap
    mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap
    cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 &
    #	real    4m45.054s
    cat fb.braFlo1.chainGalGal3Link.txt
    #	30287175 bases of 923355587 (3.280%) in intersection

#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym)
    ssh kkstore04
    screen #	use screen to control this multi-day job
    mkdir /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
    cd /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
    cat << '_EOF_' > DEF
# Chicken vs. Horse

BLASTZ_M=50

# TARGET: Chicken calGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl `pwd`/DEF \
	-verbose=2 -bigClusterHub=pk  \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log &

failed b/c para.results got currupted:

    para recover jobList recoverJobList
    para create recoverJobList
    Checking input files
    0 jobs written to batch

so all jobs had successfully completed.

Generate run.time file manually:

    time doBlastzChainNet.pl `pwd`/DEF -continue=cat \
	-verbose=2 -bigClusterHub=pk  \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >>& do.log &

but the batch file has disappeared ... I just decided to remove the psl
files and re-run from scratch:

    time doBlastzChainNet.pl `pwd`/DEF \
	-verbose=2 -bigClusterHub=pk  \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log &
0.248u 0.190s 6:10:43.78 0.0%   0+0k 0+0io 26pf+0w

    ln -s blastz.equCab2.2008-04-18 /cluster/data/galGal3/bed/blastz.equCab2
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.


#####################################################
# build 2bit with contigs from random chroms and chrUn (DONE braney 2008-08-30)
    cd /cluster/data/galGal3
    fgrep -h 'random
Un' */*.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6}' \
    | while read chr start stop contig; \
    do \
	echo ">$contig"; twoBitToFa galGal3.2bit:$chr:$start-$stop stdout \
	| tail -n +2; \
    done > unRandom.fa

    fgrep -v 'random
Un' chrom.sizes | awk '{print $1}' | \
    twoBitToFa galGal3.2bit -seqList=stdin noUn.fa
    cat noUn.fa unRandom.fa | faToTwoBit stdin galGal3.blastz.2bit

    twoBitInfo galGal3.blastz.2bit galGal3.blastz.sizes

#########################################################################
# BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-04)
    ssh swarm
    screen
    mkdir /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06
    cd /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06
    cat << _EOF_ > DEF
# chicken vs. zebra finch
BLASTZ_T=2
BLASTZ_M=50

# TARGET: Chicken galGal3
# SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit
# SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_DIR=/hive/data/genomes/galGal3/galGal3.2bit
SEQ1_LEN=/hive/data/genomes/galGal3/chrom.sizes
# SEQ1_CTGDIR=/hive/data/genomes/galGal3/galGal3.blastz.2bit
# SEQ1_CTGLEN=/hive/data/genomes/galGal3/galGal3.blastz.sizes
# SEQ1_LIFT=/hive/data/genomes/galGal3/jkStuff/liftAll.lft
# one chrom at a time
SEQ1_CHUNK=200000000
SEQ1_LAP=0

# QUERY: Zebra finch taeGut1
# SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
# SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit
SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/galGal3/bed/blastz.taeGut1.2008-09-06
_EOF_
    # << emacs
     doBlastzChainNet.pl -syntenicNet \
     -bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \
     -smallClusterHub=swarm DEF -workhorse=swarm \
     -qRepeats=windowmaskerSdust > do.log 2>&1 &

    cd /cluster/data/galGal3/bed
    rm -f blastz.taeGut1
    ln -s blastz.taeGut1.2008-09-06 /cluster/data/galGal3/bed/blastz.taeGut1

############

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
galGal3.upstreamGeneTbl = refGene
galGal3.upstreamMaf = multiz7way /hive/data/genomes/galGal3/bed/multiz7way/species.lst

############################################################################
# QUALITY TRACK (REDONE 12/2/09 angie)
# originally done 2008-11-24 - Hiram)
    cd /hive/data/genomes/galGal3
    # create an agp file
for C in `cut -f1 chrom.sizes | grep -v random | sed -e "s/chr//" | sort`
do
    cat $C/chr${C}.agp
done > galGal3.agp

for C in `cut -f1 chrom.sizes | grep random | sed -e "s/chr//; s/_random//" | sort`
do
    F=$C/chr${C}_random.agp
    if [ -s $F ]; then
        cat $F
    fi
done >> galGal3.agp

    cd /hive/data/genomes/galGal3/downloads
    qaToQac contigs.fa.qual.gz assembly.qac
    qacAgpLift -mScore=99 -verbose=2 ../galGal3.agp assembly.qac scaffolds.qac

    mkdir /hive/data/genomes/galGal3/bed/qual
    cd /hive/data/genomes/galGal3/bed/qual
    qacToWig -fixed ../../downloads/scaffolds.qac stdout \
        | wigEncode stdin qual.wig qual.wib
#Converted stdin, upper limit 99.00, lower limit 0.00
    ln -s `pwd`/qual.wib /gbdb/galGal3/wib
    hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 quality qual.wig

#############################################################################
# N-SCAN gene predictions (nscanGene) - (2009-03-13 markd)

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    cd /cluster/data/galGal3/bed/nscan/
    http://mblab.wustl.edu/predictions/chicken/galGal3/

    wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.gtf
    wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.fa
    wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/README
    bzip2 galGal3.*
    chmod a-w *
    # load track
    ldHgGene -bin -gtf -genePredExt galGal3 nscanGene galGal3.gtf.bz2
    hgPepPred galGal3 generic nscanPep  galGal3.fa.bz2
    rm *.tab

    # chicken/galGal3/trackDb.ra, add:
    track nscanGene override
    informant Chicken N-SCAN uses Zerba Finch (taeGut1) as the informant, updated with PASA clusters of chicken cDNAs.

    searchTable nscanGene
    searchType genePred
    termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)\.[0-9]+(\.[0-9a-z]+)?
    searchPriority 50

############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-01 - Hiram)
    mkdir /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29
    cd /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29

    cat << '_EOF_' > DEF
# Chicken vs. Horse

BLASTZ_M=50

# TARGET: Human galGal3
SEQ1_DIR=/scratch/data/galGal3/bothMaskedNibs
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl `pwd`/DEF \
	-noLoadChainSplit -verbose=2 -bigClusterHub=pk \
	-workhorse=hgwdev \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    220m54.623s
    #	 cat fb.galGal3.chainEquCab2Link.txt
    #	68476046 bases of 1042591351 (6.568%) in intersection

    mkdir /hive/data/genomes/equCab2/bed/blastz.galGal3.swap
    cd /hive/data/genomes/equCab2/bed/blastz.galGal3.swap
    time doBlastzChainNet.pl \
	/hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29/DEF \
	-noLoadChainSplit -verbose=2 -bigClusterHub=pk \
	-swap -workhorse=hgwdev \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    10m1.381s
    cat fb.equCab2.chainGalGal3Link.txt
    #	73336799 bases of 2428790173 (3.019%) in intersection

############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
## caCondor 454 alignments (2009-12-08 markd)
############################################################################
      mkdir /hive/data/genomes/galGal3/bed/caCondor454

    # get WUGSC 454 cDNA sequences from the trace archives
       cd data
       wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/gymnogyps_californianus/anc.gymnogyps_californianus.001.gz
       wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/gymnogyps_californianus/fasta.gymnogyps_californianus.001.gz
       cd ..

       # get trace ids for this project, and then extract fasta
        CENTER_NAME = WUGSC column $6
        CENTER_PROJECT = U_FC-MATT_TECHD2 column $7
        TI column $57

        zcat data/anc.gymnogyps_californianus.001.gz |tawk '$6=="WUGSC" && $7=="U_FC-MATT_TECHD2" {print "gnl|ti|"$57}' >data/wugsc-454-cdna.ids
        faSomeRecords data/fasta.gymnogyps_californianus.001.gz data/wugsc-454-cdna.ids data/wugsc-454-cdna.fa

     # obtained PASA cDNA clusters for galGal3 from the Brent lab at WUStL
          http://mblab.wustl.edu/~jeltje/for_Mark/ -> data/pasa-pre-2009-28/
          bzcat data/pasa-pre-2009-28/chr*.gtf.bz2 | gtfToGenePred stdin data/pasa.gp
          getRnaPred galGal3 data/pasa.gp all data/pasa.fa -pslOut=data/pasa.psl

     # obtain N-SCAN gene predictiosn
         getRnaPred galGal3 nscanGene all data/nscan.fa   -pslOut=data/nscan.psl

     # combine to make files for transMap
        cat data/pasa.fa  data/nscan.fa  >data/nscanPasa.fa
        cat data/pasa.psl  data/nscan.psl  >data/nscanPasa.psl

     # modify makefiles that were create during mapping experiments for alignments
     # these run two cluster batches. The first aligns the reads
     make >&log&

     # load tracks
     hgLoadPsl galGal3 -table=caCondor454 results/nscanPasaUnmappedRealign/combined.psl.gz
     ln -s /hive/data/genomes/galGal3/bed/caCondor454/data/wugsc-454-cdna.fa  /gbdb/galGal3/caCondor454.fa
     hgLoadSeq galGal3 /gbdb/galGal3/caCondor454.fa

#######################################################################
#  BLASTZ/CHAIN/NET Turkey vs Chicken (DONE - 2011-03-24 - Chin)
# use a screen to manage this multi-day job

    mkdir /hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24
    cd /hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24

    cat << '_EOF_' > DEF
# Chicken vs. Turkey
BLASTZ_M=50

# TARGET: Chicken GalGal3
SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Turkey MelGal1
SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy
    # screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    # real    120m58.775s
    #  *** All done !  Elapsed time: 120m59s
    #  *** Make sure that goldenPath/galGal3/vsMelGal1/README.txt is accurate.
    #  *** Add {chain,net}MelGal1 tracks to trackDb.ra if necessary.
    cd /hive/data/genomes/galGal3/bed
    ln -s lastzMelGal1.2011-03-24 lastz.melGal3

    cat fb.galGal3.chainMelGal1Link.txt
    # 924755061 bases of 1042591351 (88.698%) in intersection
    cd /hive/data/genomes/galGal3/bed
    ln -s lastzMelGal1.2011-03-24 lastz.melGal3

    #   and then swap
    mkdir /hive/data/genomes/melGal1/bed/blastz.galGal3.swap
    cd /hive/data/genomes/melGal1/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/galGal3/bed/lastzMelGal1.2011-03-24/DEF \
        -swap -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    # real    38m4.406s
    cat fb.melGal1.chainGalGal3Link.txt
    # 874579772 bases of 935922386 (93.446%) in intersection
    cd /hive/data/genomes/melGal1/bed

############################################################################
# LASTZ Lizard AnoCar2 (DONE - 2011-04-25 - Hiram)
    mkdir /hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25
    cd /hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25

    cat << '_EOF_' > DEF
# Chicken vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Chicken galGal3
SEQ1_DIR=/scratch/data/galGal3/nib
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lizard anoCar2
SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit
SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +25 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
	-bigClusterHub=swarm -tRepeats=windowmaskerSdust \
	-qRepeats=windowmaskerSdust > do.log 2>&1 &
    #	real    210m1.468s
    cat fb.galGal3.chainAnoCar2Link.txt
    #	113851392 bases of 1042591351 (10.920%) in intersection

    #	running the swap - DONE - 2011-04-26
    mkdir /hive/data/genomes/anoCar2/bed/blastz.galGal3.swap
    cd /hive/data/genomes/anoCar2/bed/blastz.galGal3.swap
    time nice -n +25 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/galGal3/bed/lastzAnoCar2.2011-04-25/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet -swap -qRepeats=windowmaskerSdust \
	-tRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    11m8.229s
    cat fb.anoCar2.chainGalGal3Link.txt
    #	116069635 bases of 1701353770 (6.822%) in intersection

##############################################################################
# LASTZ Xenopus xenTro3 (DONE - 2010-12-17 - Hiram)
    mkdir /hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20
    cd /hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20

    cat << '_EOF_' > DEF
# chicken vs. X. tropicalis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Zebrafish galGal3
SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit
SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=40

# QUERY: X. tropicalis xenTro3
SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit
SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    209m23.536s

    cat fb.galGal3.chainXenTro3Link.txt
    #	54978665 bases of 1042591351 (5.273%) in intersection

    mkdir /hive/data/genomes/xenTro3/bed/blastz.galGal3.swap
    cd /hive/data/genomes/xenTro3/bed/blastz.galGal3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/galGal3/bed/lastzXenTro3.2011-09-20/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-swap -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    13m36.708s

    cat fb.xenTro3.chainGalGal3Link.txt
    #	64250673 bases of 1358334882 (4.730%) in intersection

#########################################################################
# construct liftOver to galGal4 (DONE - 2012-05-25 - Hiram)
    screen -S galGal3	# manage this longish running job in a screen
    mkdir /hive/data/genomes/galGal3/bed/blat.galGal4.2012-05-25
    cd /hive/data/genomes/galGal3/bed/blat.galGal4.2012-05-25
    # check it with -debug first to see if it is going to work:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/scratch/data/galGal3/11.ooc \
	-debug -dbHost=hgwdev -workhorse=hgwdev galGal3 galGal4 > do.log 2>&1
    # if that is OK, then run it:
    time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \
	-ooc=/scratch/data/galGal3/11.ooc \
	-dbHost=hgwdev -workhorse=hgwdev galGal3 galGal4 > do.log 2>&1
    #   real    20m44.646s

    # verify this file exists:
    #	/gbdb/galGal3/liftOver/galGal3ToGalGal4.over.chain.gz
    # and try out the conversion on genome-test from galGal3 to galGal4

############################################################################
# NUMTS TRACK (DONE 2013-06-15 - Chin)

# copy and unzip
# http://193.204.182.50/files/tracks.zip and
# http://193.204.182.50/files/other%20NumtS%20tracks.zip
# to /hive/data/outside/NumtS/2012/tracks

    mkdir /hive/data/genomes/galGal3/bed/NumtS
    cd /hive/data/genomes/galGal3/bed/NumtS
    cp /hive/data/outside/NumtS/2012/tracks/Gallus_gallus_3/* .
    /hive/data/genomes/galGal3/bed/NumtS] cat tracks_nuclear_NumtS_GalGal3 | grep -v "^track" > galGal3NumtS.bed
    cat tracks_NumtS-assembled_GalGal3 | grep -v "^track" > galGal3NumtSAssembled.bed
    cat tracks_mit_NumtS_galgal3 | grep -v "^track" > galGal3NumtSMitochondrion.bed

    # load the 3 bed files to galGal3
    hgLoadBed galGal3  numtSAssembled  galGal3NumtSAssembled.bed
    hgLoadBed galGal3 numtS galGal3NumtS.bed
    hgLoadBed galGal3 numtSMitochondrion galGal3NumtSMitochondrion.bed

    # Make /gbdb/ links and load bam
    mkdir /gbdb/galGal3/NumtS
    ln -s `pwd`/GG-3_NumtS_seqs.sorted.bam{,.bai} /gbdb/galGal3/NumtS/
    hgBbiDbLink galGal3 bamAllNumtSSorted /gbdb/galGal3/NumtS/GG3_NumtS_seqs.sorted.bam
    # setup trackDb for galGal3 and search rules



############################################################################
# LIFTOVER TO galGal5 (DONE - 2016-10-25 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/galGal3/bed/blat.galGal5.2016-10-25
    cd /hive/data/genomes/galGal3/bed/blat.galGal5.2016-10-25
    doSameSpeciesLiftOver.pl -verbose=2 \
        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/scratch/data/galGal3/11.ooc \
         galGal3 galGal5
    time (doSameSpeciesLiftOver.pl -verbose=2 \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -ooc=/scratch/data/galGal3/11.ooc \
         galGal3 galGal5) > doLiftOverToGalGal5.log 2>&1
    # real    135m29.733s

    # see if the liftOver menus function in the browser from galGal3 to
    # galGal5

##############################################################################
