# for emacs: -*- mode: sh; -*-

# Danio Rerio (zebrafish) from Sanger, version Zv7 (released 07/13/07)
#  Project website:
#    http://www.sanger.ac.uk/Projects/D_rerio/
#  Assembly notes:
#    http://www.sanger.ac.uk/Projects/D_rerio/
#    ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_assembl_information.shmtl

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#

###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2007-07-16, hartera)
# MOVE FILES TO SEPARATE DIRECTORY (DONE, 2007-07-20, hartera)
     ssh kkstore06
     mkdir /cluster/store4/danRer5 
     ln -s /cluster/store4/danRer5 /cluster/data
     cd /cluster/data/danRer5
     wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/README
     wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_chr.agp
     wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_contigs.fa
     wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffold.agp
     wget --timestamp \
      ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffolds.fa
     # move assembly files to a separate directory
     mkdir Zv7release
     mv *.agp *.fa README ./Zv7release/

###########################################################################
# CREATE AGP FILES FOR RANDOMS (chrNA and chrUn) 
# (2007-08-16 - 2007-08-18, hartera)
    ssh kkstore06
    mkdir /cluster/data/danRer5/Zv7release/randoms
    cd /cluster/data/danRer5/Zv7release/randoms
    # first find the contigs assigned to chromosomes:
    awk '{if ($5 !~ /N/) print $6}' ../Zv7_chr.agp | sort | uniq \
        > chromContigs.txt
    # get list of all contigs:
    awk '{if ($5 !~ /N/) print $6}' ../Zv7_scaffold.agp | sort | uniq \
        > allContigs.txt
    # find contigs not assigned to a chromosome
    comm -23 allContigs.txt chromContigs.txt > contigsRandomsAndAccs.txt
    # get all those that are in the Zv7_scaffold.agp and get a list of 
    # scaffold names.
    foreach c (`cat contigsRandomsAndAccs.txt`)
        grep -w $c ../Zv7_scaffold.agp >> contigsRandomsScafs.agp
    end
    # check that all the contigs/clones names in contigsRandomsAndAccs.txt
    # are also in contigsRandomsScafs.agp
    awk '{if ($6 != 100) print $6}' contigsRandomsScafs.agp \
        | sort | uniq > names.sort
    wc -l names.sort contigsRandomsAndAccs.txt
    # 19845 names.sort
    # 19845 contigsRandomsAndAccs.txt 
    comm -12 contigsRandomsAndAccs.txt names.sort | wc -l
    # 19845
    # get list of scaffolds names
    awk '{print $1}' contigsRandomsScafs.agp | sort -k 1.7,1n | uniq \
        > scaffoldsRandoms.txt
    # make an AGP of the randoms scaffolds only
    grep -w -f scaffoldsRandoms.txt ../Zv7_scaffold.agp > randomsScaffold.agp
    # check that we have all the scaffolds to be selected
    awk '{print $1}' randomsScaffold.agp | sort | uniq > names2.sort
    sort scaffoldsRandoms.txt > scafsTmp.sort
    wc -l scafsTmp.sort names2.sort
    # 5010 scafsTmp.sort
    # 5010 names2.sort
    comm -12 scafsTmp.sort names2.sort | wc -l
    # 5010
    # all scaffolds are in the new AGP file that occur in scaffoldsRandoms.txt
    # get the list of contigs from the agp
    awk '{if ($5 !~ /N/) print $6}' randomsScaffold.agp | sort | uniq \
        > randomContigs.list 
    # extract the FASTA sequences for just these contigs
    faSomeRecords ../Zv7_contigs.fa randomContigs.list Zv7contigs_random.fa
    # remove excess part of the names for the contigs with accessions 
    perl -pi.bak -e 's/^(>[A-Z]{2}[0-9]+\.[0-9]+)\s.+$/$1/' Zv7contigs_random.fa
    # check that all the contigs from the list are in the FASTA file
    grep '>' Zv7contigs_random.fa | sed -e 's/^>//' | sort | uniq \
         > Zv7contigs.list
    wc -l *.list
    # 19736 Zv7contigs.list
    # 19845 randomContigs.list
    
    comm -13 Zv7contigs.list randomContigs.list > notFound
    wc -l notFound
    # 109 notFound
    # (2007-08-06, hartera)
    # RE-DONE (2007-08-18, hartera)
    # These are present in the original, but there are several sequeunces for
    # each accession since there are Ns in the sequence. Need to fix the names
    # in the AGP to match each part of the sequence in the FASTA file.
    # First, get the list of headers from the FASTA for all contigs
    grep '>' ../Zv7_contigs.fa > contigs.headers
    # remove ">"
    perl -pi.bak -e 's/>//' contigs.headers
    foreach a (`cat notFound`)
        grep $a contigs.headers >> contigsHeaders.notFound
    end
    awk '{print $1}' contigsHeaders.notFound > contigsHeaders.notFound.accs
    perl -pi.bak -e 's/_.+$//' contigsHeaders.notFound.accs
    sort contigsHeaders.notFound.accs | uniq \
         > contigsHeaders.notFound.accs.sort
    sort notFound | uniq > notFound.sort
    wc -l notFound.sort 
    # 109 notFound.sort
    wc -l contigsHeaders.notFound.accs.sort
    # 109 contigsHeaders.notFound.accs.sort
    comm -12 notFound.sort contigsHeaders.notFound.accs.sort | wc -l
    # 109
    # So all the not Found accessions are in the list of contig headers 
    # accessions: contigsHeaders.notFound.accs.sort
    # Then extract the names for the accession parts 
    # e.g. BX649254.13_01364 zK228P6.01364 BX649254 1 32480
    # and add them to the AGP file for the correct lines for these 
    # components. The last 2 fields above are the start and end coordinates
    # relative to the accession (BX649254.13 in this case). Also, add
    # 50,000 Ns between scaffolds.
    # Wrote program to do this: agpAddCtgNamesAndGaps.c
    /cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
            contigsHeaders.notFound randomsScaffold.agp \
            randomsScafFixed.agp >& agpScafs.log
    awk '{print $1}' randomsScaffold.agp | sort | uniq > scafs.lst.uniq
    wc -l scafs.lst.uniq
    # 5010 scafs.lst.uniq
    wc -l *.agp
    # 46688 contigsRandomsScafs.agp
    # 40641 randomsScafFixed.agp
    # 35633 randomsScaffold.agp

    # 35633 + 5010 = 40643 but there are 2 less gap rows since there are none
    # at the ends of the random chromosomes. So the number of lines in the 
    # AGP file is correct: 40641.
    # get the list of contigs again for the randoms from the FASTA headers
    # and check all of these are present in the AGP file and vice versa.
    cp contigs.headers contigs.names
    # remove excess part of the names for the contigs with accessions 
    perl -pi.bak -e \
         's/^([A-Z]{2}[0-9]+\.[0-9]+_?[0-9]*\.?[0-9]*)\s.+$/$1/' \
         contigs.names
    sort contigs.names | uniq > contigs.names.sort
    awk '{print $6}' randomsScafFixed.agp | sort | uniq > contigsFromAgp.sort
    wc -l contigs*.sort
    # 60092 contigs.names.sort
    # 20351 contigsFromAgp.sort
    comm -13 contigs.names.sort contigsFromAgp.sort
# CR293502.4
# CR356227.33
# CR388165.16
# CR854948.10
# CR931788.11
# CR954226.7
# CT573234.3
# CT573263.4
# CT737182.2
# CT997808.4

# These accessions are not matched from the headers to the AGP file. 
# e.g. CR293502.4
# CR293502.4_00001 zK31A1.00001 CR293502 1 27131
# CR293502.4_02478 zK31A1.02478 CR293502 27232 29631
# CR293502.4_01210.0 zK31A1.01210.0 CR293502 119649 233137
# The last 2 fields above are coordinates relative to the accession and should
# match to fields 7 and 8 in the relevant lines of the AGP file but, in this
# case, they do not.
# in the AGP file:
# Zv7_scaffold2558        1944730 1960624 49      D       CR293502.4      11237
# 27131   +
# Zv7_scaffold2558        1960625 1960724 50      N       100     fragment no
# Zv7_scaffold2558        1960725 1963124 51      D       CR293502.4      27232
# 29631   +
# Zv7_scaffold2558        1963125 1963224 52      N       100     fragment no
# Zv7_scaffold2558        1963225 1995007 53      D       CR293502.4      201355
# 233137  -
# Co-ordinates are relative to CR293502.4 in fields 7 and 8
    grep CR293502.4 randomsScaffold.agp > CR293502.4.agp
# E-mailed Tina Eyre (te3@sanger.ac.uk) and Ian Sealy (is1@sanger.ac.uk) at 
# Sanger to ask them about these discrepancies and how to fix it (2007-08-09).
# Received the Zv7_all_scaffolds.agp file on 2007-08-14 from Ian Sealy
# (is1@sanger.ac.uk). This contains the unfinished clones and should not be
# shared with the public. The contig/clone names match up to those for clone
# name fragments in the Zv7_contigs.fa file.
# Received Zv7_all_scaffolds.agp:
    grep CR293502.4 ../Zv7_all_scaffolds.agp > CR293502.4.allScafs.agp 
# Zv7_scaffold2558        1944730 1960624 49      U       CR293502.4_00001
# 11237    27131   +       zK31A1.00001    15895   27131
# Zv7_scaffold2558        1960725 1963124 51      U       CR293502.4_02478
# 12400    +       zK31A1.02478    2400    2400
# Zv7_scaffold2558        1963225 1995007 53      U       CR293502.4_01210.0
# 81707    113489  -       zK31A1.01210.0  31783   113489
   # Coordinates in fields 7 and 8 of this file are relative to the clone  
   # fragment names in field 6.
    foreach f (CR293502*.agp)
      awk \
      '{if ($5 !~ /N/) print "faFrag ", $6".fa", $7-1, $8, $6$7"-"$8".fa";}' \
       $f >> faFrag${f}
    end  
    chmod +x faFrag*
    awk '{print $6}' CR293502.4.allScafs.agp > ctgList.txt
    foreach f (`cat ctgList.txt`)
       echo $f > list
       faSomeRecords ../Zv7_contigs.fa list ${f}.fa
    end  
    faFragCR293502.4.agp
# Wrote 15895 bases to CR293502.411237-27131.fa
# Wrote 2400 bases to CR293502.427232-29631.fa
# Wrote 31783 bases to CR293502.4201355-233137.fa
    faFragCR293502.4.allScafs.agp
# Wrote 15895 bases to CR293502.4_0000111237-27131.fa
# Wrote 2400 bases to CR293502.4_024781-2400.fa
# Wrote 31783 bases to CR293502.4_01210.081707-113489.fa
# When diff on each pair of files of the same size, the sequence is 
# identical, only the headers are different.
    
     # Decided to base assembly on scaffolds not contigs (2007-08-22)

##########################################################################
# ALL CHROMS AGP (2007-08-18, hartera)
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release
    awk '{if ($5 !~ /N/) print $6;}' Zv7_all_scaffolds.agp | sort | uniq \
        > contigNamesFromAllScafs.sort
    # compare to contig names from FASTA file
    comm -13 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort
    # no difference: all contig names from AGP file are in the FASTA file
    comm -23 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort \
         > notInAllScafs
    wc -l notInAllScafs
    # 3924 notInAllScafs
    grep "Zv7_NA" notInAllScafs | wc -l
    # 3924
    # So the only ones not in FASTA file are the 3924 Zv7_NA contigs 
    # get clone names without underscore and extension
    # remove excess part of the names for the contigs with accessions
    cp contigNamesFromAllScafs.sort contigNamesFromAllScafs2.sort 
    perl -pi.bak -e \
         's/^([A-Z]{2}[0-9]+\.[0-9]+)_?[0-9]*\.?[0-9]*$/$1/' \
         contigNamesFromAllScafs2.sort
    grep -v "Zv7_NA" contigNamesFromAllScafs2.sort \
         | sort | uniq > contigNamesFromAllScafs2NoNAs.sort
    # get list of contigs and clones in randoms only
    awk '{if ($5 !~ /N/) print $6;}' ./randoms/randomsScaffold.agp \
        | sort | uniq > randomsContigsNames.sort
    # remove randoms scaffolds from the list of contigs/clones from 
    # Zv7_all_scaffolds.agp
    comm -13 randomsContigsNames.sort contigNamesFromAllScafs2NoNAs.sort \
         > contigNamesFromAllScafs2NoRandoms.txt
    sort contigNamesFromAllScafs2NoRandoms.txt | uniq \
         > contigNamesFromAllScafs2NoRandoms.sort
    # then get the compare this list to a list of clones/contigs 
    # from Zv7_chr.agp
    awk '{if ($5 !~ /N/) print $6;}' Zv7_chr.agp | sort | uniq \
        > chromsAgpContigs.sort
    comm -23 contigNamesFromAllScafs2NoRandoms.sort chromsAgpContigs.sort \
         | wc -l
    # 0
    # So there are no new contigs in the Zv7_all_scaffolds.agp file that
    # are not randoms or Zv7_NA.
    # Try agpAddCtgNamesAndGaps.c on Zv7_chr.agp and see if all 
    # clone fragments can be found in the FASTA file.
    cp ./randoms/contigs.headers .
    # get the names from the headers
    awk '{print $1}' contigs.headers | sort | uniq > contigsNames.headers.sort
    # get the contig/clone names from the Zv7_chr.agp file: sorted file is
    # chromContigs.txt
    comm -13 contigsNames.headers.sort ./randoms/chromContigs.txt \
         > contigsInChromAgpOnly.txt
    wc -l contigsInChromAgpOnly.txt
    # 575 contigsInChromAgpOnly.txt
    # Get FASTA file headers for just this set of contigs. These are ones
    # with fragment that are named XXNNNN_NN e.g. BX511136.3_00285.0
    grep -f contigsInChromAgpOnly.txt contigs.headers \
         > contigsInChromAgpOnly.headers
    /cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
           contigsInChromAgpOnly.headers Zv7_chr.agp \
           chrFixed.agp >& agpChroms.log
    # check if all the contig/clone names in the AGP file have now been 
    # found in the FASTA file.

    sort contigs.names | uniq > contigs.names.sort
    # get contig/clone names from fixed AGP file
    awk '{if ($5 !~ /N/) print $6}' chrFixed.agp | sort | uniq \
        > contigsFromChrFixedAgp.sort
    # get list of names in the FASTA contig headers
    cp ./randoms/contigs.names.sort .

    wc -l contigs*.sort
    # 60092 contigs.names.sort
    # 39659 contigsFromChrFixedAgp.sort
    comm -13 contigs.names.sort contigsFromChrFixedAgp.sort \
         > notFoundInChrFixedAgp.txt
    wc -l notFoundInChrFixedAgp.txt
    # 334 notFoundInChrFixedAgp.txt
    echo BX005112.16 > list
    nice faSomeRecords Zv7_contigs.fa list BX005112.16_02613.fa
    faSize BX005112.16_02613.fa 
    # 171673 bases (0 N's 171673 real 171673 upper 0 lower) in 1 sequences in
    # 1 files
    grep BX005112.16 Zv7_chr.agp 
# chr21   31909678        32080531        1296    D       BX005112.16     820
# 171673   +
# chr21   32080632        32084318        1298    D       BX005112.16     171774
# 175460   +
    grep BX005112.16 chrFixed.agp
# chr21   1104388161      1104559014      1296    D       BX005112.16     820
# 171673   +
# chr21   1104559115      1104562801      1298    D       BX005112.16_02934
# 171774   175460  +
    grep BX005112.16 Zv7_all_scaffolds.agp
# Zv7_scaffold2077        678529  849382  54      U       BX005112.16_02613
# 820      171673  +       zK85G15.02613   170854  171673
# Zv7_scaffold2077        849483  853169  56      U       BX005112.16_02934
# 1 3687    +       zK85G15.02934   3687    3687
    grep BX005112.16 contigs.headers
# BX005112.16_02613 zK85G15.02613 BX005112 1 171673
# BX005112.16_02934 zK85G15.02934 BX005112 171774 175460
    echo BX005112.16 > list2

    nice faSomeRecords Zv7_scaffolds.fa list2 BX005112.fa
    # not found, these accedssions are not in the FASTA file.
    # In order to create the chroms, need to extract the relevant coords from 
    # the accessions to create the FASTA file.
    # Now basing assembly on scaffolds instead of contigs so create
    # AGP files for scaffolds.

##########################################################################
# CREATE A SCAFFOLDS AGP WITH SCAFFOLDS ONLY FOR RANDOMS 
# (2007-08-22 and 2007-08-24, hartera) 
# Make AGP using just the unmapped scaffolds and not making virtual 
# chromosomes for unmapped scaffolds so that this is the same as the 
# way Ensembl handles these. Therefore there will be 25 chromosomes, chrM, 
# plus 5010 unmapped scaffolds.  
    ssh kkstore06
    # make agps and fasta directories
    mkdir -p /cluster/data/danRer5/Zv7release/agps
    mkdir -p /cluster/data/danRer5/Zv7release/fasta
    # move assemmbly FASTA files to this directory
    cd /cluster/data/danRer5/Zv7release
    mv Zv7_*.fa ./fasta/
    cd /cluster/data/danRer5/Zv7release/randoms
    # get list of scaffolds for randoms - one for Un_random and one for 
    # NA_random scaffolds.
    awk '{if ($1 ~ /Zv7_scaffold/) print $1;}' randomsScaffold.agp \
        | sort -k 1.7,1n | uniq > Un_randomScafs.list
    awk '{if ($1 ~ /Zv7_NA/) print $1;}' randomsScaffold.agp \
        | sort -k 1.7,1n | uniq > NA_randomScafs.list
    wc -l *randomScafs.list
    # 4844 NA_randomScafs.list
    # 166 Un_randomScafs.list
    # 5010 total

    # get sequences for just these scaffolds
    foreach f (NA Un)
       faSomeRecords ../fasta/Zv7_scaffolds.fa ${f}_randomScafs.list \
                     Zv7${f}_randomScafs.fa
    end
    # check that they are all there
    foreach f (NA Un)
       grep '>' Zv7${f}_randomScafs.fa > ${f}_Random.headers
    end
    wc -l *Random.headers
    # 4844 NA_Random.headers
    # 166 Un_Random.headers
    # 5010 total
    perl -pi.bak -e 's/>//' *Random.headers
    foreach f (NA Un)
       sort -k 1.7,1n ${f}_Random.headers | uniq > ${f}_Random.headers.sort
    end 
    comm -12 NA_Random.headers.sort NA_randomScafs.list | wc -l 
    # 4844
    comm -12 Un_Random.headers.sort Un_randomScafs.list | wc -l
    # 166
    # Total is 4844 + 166 = 5010 
    # so all the sequences in the scaffolds lists are in the FASTA files.
    
    # Make an AGP from these scaffolds FASTA sequences with 50000 Ns 
    # inserted between scaffolds.
    foreach c (NA_random Un_random)
        scaffoldFaToAgp -scaffoldGapSize=0 Zv7${c}Scafs.fa 
    end
    
# scaffold gap size is 0, total scaffolds: 4844
# chrom size is 117689868
# writing Zv7NA_randomScafs.agp
# writing Zv7NA_randomScafs.gap
# writing Zv7NA_randomScafs.lft
# scaffold gap size is 0, total scaffolds: 166
# chrom size is 45800611
# writing Zv7Un_randomScafs.agp
# writing Zv7Un_randomScafs.gap
# writing Zv7Un_randomScafs.lft
   
   # Create AGP with just the scaffolds
 
   # sort NA by scaffold number:
   # first remove gap lines:
   grep -w -v "N" Zv7NA_randomScafs.agp > Zv7NA_randomScafsNoGaps.agp
   grep -w -v "N" Zv7Un_randomScafs.agp > Zv7Un_randomScafsNoGaps.agp
   sort -k 6.7n -k 6.8n -k 6.9,6.10n -k 6.10,6.11n \
        Zv7NA_randomScafsNoGaps.agp > Zv7NA_randomScafsSorted.agp
   foreach f (Zv7Un_randomScafsNoGaps.agp Zv7NA_randomScafsSorted.agp)
      set g = $f:r
      echo $g 
      awk 'BEGIN {OFS="\t"} {print $6, $7, $8, "1", "W", $6, $7, $8, "+"}' \
          $f > ${g}2.agp
   end
   wc -l Zv7*_randomScafs*agp
   # 9687 Zv7NA_randomScafs.agp
   # 4844 Zv7NA_randomScafsNoGaps.agp
   # 4844 Zv7NA_randomScafsSorted.agp
   # 4844 Zv7NA_randomScafsSorted2.agp
   # 331 Zv7Un_randomScafs.agp
   # 166 Zv7Un_randomScafsNoGaps.agp
   # 166 Zv7Un_randomScafsNoGaps2.agp
   # 4844 + 166 = 5010 -> total unmapped scaffolds
   cat Zv7NA_randomScafsSorted2.agp Zv7Un_randomScafsNoGaps2.agp \
       > Zv7AllRandomScafs.agp
   wc -l Zv7AllRandomScafs.agp
   # 5010 Zv7AllRandomScafs.agp
   # move scaffolds AGP to agps directory:
   mv Zv7AllRandomScafs.agp /cluster/data/danRer5/Zv7release/agps/
   # move sequences to FASTA directory
   mv Zv7*_randomScafs.fa \
      /cluster/data/danRer5/Zv7release/fasta/

#######################################################################
# PROCESS CHROMOSME AGP AND SCAFFOLDS AGP TO CREATE A SCAFFOLDS TO 
# CHROMOSOMES AGP FILE (DONE, 2007-08-24, hartera) 

   # The Zv7_chr.agp file contains a mapping of contigs to chromsomes
   # and the Zv7_scaffold.agp file contains a mapping of contigs to scaffolds.
   # To build the Browser and assemble chromosomes, we need an AGP file
   # that maps scaffolds to chromsomes. 
   # A program, createScaffoldsAgp.c was written to create this AGP file. 
   # in kent/src/hg/oneShot/createScaffoldsAgp/.
   ssh kkstore06
   cd /cluster/data/danRer5/Zv7release/agps
   createScaffoldsAgp ../Zv7_scaffold.agp ../Zv7_chr.agp \
                      Zv7ScafToChrom.agp >& createAgp.log
   wc -l *.agp
   # 5010 Zv7AllRandomScafs.agp
   # 4943 Zv7ScafToChrom.agp
   # 9953 total

###########################################################################
# CHECK AGP FILES AND FASTA CONTENTS AND SIZE CONSISTENCY 
# (DONE, 2007-08-22, hartera)
# 
   # Check that all the scaffolds in AGPs are in the FASTA file and vice versa
   # get list of scaffolds in AGP files: Zv7ScafToChrom.agp,
   # chrNA_random.scaffolds.agp and chrUn_random.scaffolds.agp
   ssh kkstor06
   cd /cluster/data/danRer5/Zv7release/assembly/agps
   
   foreach f (*.agp)
      awk '{if ($5 !~ /N/) print $6}' $f >> allScafsFromAgps.txt
   end
   sort allScafsFromAgps.txt | uniq > allScafsFromAgps.sort
   grep '>' ../fasta/Zv7_scaffolds.fa | sed -e 's/>//' | sort | uniq \
        > scafsHeaders.sort
   wc -l *.sort
   # 7494 allScafsFromAgps.sort
   # 7494 scafsHeaders.sort
   comm -12 allScafsFromAgps.sort scafsHeaders.sort | wc -l
   # 7494
   # 7494 are common to both files so they the AGP files contain all the
   # scaffolds in the FASTA file and vice versa.
   # create one AGP file for the assembly
   cat Zv7ScafToChrom.agp Zv7AllRandomScafs.agp > Zv7ScafToChromAndRandom.agp

   cd /cluster/data/danRer5/Zv7release/fasta
   # make a scaffolds directory to work in 
   mkdir scaffolds
   cd scaffolds
   faSize -detailed ../Zv7_scaffolds.fa > Zv7.scaffolds.sizes
   # Check that these sizes correspond to the sizes in the scaffolds agp file
   # use script compareSizes2.pl
   cat << '_EOF_' > compareSizes2.pl
#!/usr/bin/perl -w
use strict;

my ($file, $agp);

$file = $ARGV[0];
$agp = $ARGV[1];

open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";

my ($l, @f, $name, $size, %scafsHash);
while (<FILE>)
{
$l = $_;
@f = split(/\t/, $l);

$name = $f[0]; 
$size = $f[1];
$scafsHash{$name} = $size;
}
close FILE;

while (<AGP>)
{
my ($line, @fi, $scaf, $end);
$line = $_;

if ($line =~ /Zv/)
   {
   @fi = split(/\t/, $line);
   $scaf = $fi[5];
   $end = $fi[7];

   if (exists($scafsHash{$scaf}))
      {
      if ($scafsHash{$scaf} == $end)
         {
         print OUT "$scaf - ok\n";
         }
      else
         {
         print OUT "$scaf - different size to sequence\n";
         }
      }
   else
      {
      print OUT "$scaf - does not exist in list of sizes\n";
      }
   }
}
close AGP;
close OUT;
'_EOF_'
   # << happy emacs
   chmod +x compareSizes2.pl
   perl compareSizes2.pl Zv7.scaffolds.sizes \
        ../../agps/Zv7ScafToChromAndRandom.agp 
   grep different log.txt
   grep not log.txt
   # these are all consistent with the sequence sizes
   # check that the co-ordinates in the agp files are consistent:
   # field 2 is the start position, field 3 is the end and field 8 is the size
   # so check that this is consistent.
   cd /cluster/data/danRer5
   awk '{if ($5 !~ /N/ && (($3-$2+1) != $8)) print $6;}' \
       ../../agps/Zv7AllRandomScafs.agp > Zv7.scaffolds.coordCheck 
   # this file is empty so they are ok. do the same for the contigs to
   # chromsomes .agp file
   awk '{if ($5 !~ /N/ && (($3-$2+1) != ($8-$7 +1))) print $6;}' \
       ../../Zv7_chr.agp > Zv7.contigsToChroms.coordCheck
   # this file is empty so ok
   # in order to create the scaffolds to chroms AGP file with 
   # createScaffoldsAgp, the coordinates were checked between the 
   # Zv7_scaffold.agp and Zv7_chr.agp files.
   cd ..
   rm -r scaffolds
 
#######################################################################
# MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS 
# (DONE, 2007-08-24, hartera)
# CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION 
# AS FOR DANRER4, GENE vdac2 (DONE, 2007-09-10, hartera)
# CHANGE dbDb ENTRY SO THAT THE FULL NAME OF THE SANGER INSTITUTE IS GIVEN
# IN THE SOURCENAME FIELD (DONE, 2007-09-22, hartera)
# CHANGED FRAG IN GOLD TABLE FROM GI NUMBER TO ACCESSION FOR chrM 
# (DONE, 2007-10-01, hartera)
   # Since there will be a mixture of chroms and unmapped scaffolds, the
   # chroms must be built first being inputting the assembly sequence to 
   # makeGenomeDb.pl
   ssh kkstore06
   cd /cluster/data/danRer5/Zv7release/
   cd fasta
   agpToFa -simpleMultiMixed ../agps/Zv7ScafToChrom.agp all \
         Zv7_chromosomes.fa ./Zv7_scaffolds.fa
   # check the chroms are all there
   grep '>' Zv7_chromosomes.fa | sed -e 's/>//' > headers
   # all chroms 1-25 are there
   rm headers
   # check agp and FASTA
   checkAgpAndFa ../agps/Zv7ScafToChrom.agp Zv7_chromosomes.fa
   # All AGP and FASTA entries agree - both files are valid
   # cat together chromosomes FASTA and scaffolds FASTA:
   cat Zv7_chromosomes.fa Zv7NA_randomScafs.fa Zv7Un_randomScafs.fa \
       > Zv7ChromsAndRandomScafs.fa
   # To find the gi number for chrM, go to http://www.ncbi.nih.gov/ and search
   # Nucleotide for "Danio mitochondrion genome". 
   # That shows gi number: 15079186, accession: NC_002333.2, 16596 bp
   # also GI:8576324 and accession: AC024175
   cd /cluster/data/danRer5/
   cat << 'EOF' > danRer5.config.ra
db danRer5
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Jul. 2007
assemblyLabel Sanger Centre, Danio rerio Sequencing Project Zv7
orderKey 449
dbDbSpeciesDir zebrafish
# NC_002333.2
mitoAcc 15079186
agpFiles /cluster/data/danRer5/Zv7release/agps/Zv7ScafToChromAndRandom.agp
fastaFiles /cluster/data/danRer5/Zv7release/fasta/Zv7ChromsAndRandomScafs.fa 
'EOF'    
 # << keep emacs coloring happy
   # corrected wget for mitochondron genome so:
   # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text
   # remove directories created
   rm -r TemporaryTrackDbCheckout bed html jkStuff
   rm dbDbInsert.sql 
   ssh hgwdev
   cd /cluster/data/danRer5/
   hgsql -e 'delete from dbDb where name="danRer5";' hgcentraltest
   
   ssh kkstore06
   cd /cluster/data/danRer5
   # Run with -debug to see what it would do / check for issues:
   makeGenomeDb.pl danRer5.config.ra -debug -verbose=3
   makeGenomeDb.pl danRer5.config.ra >& makeGenomeDb.log & 
   # Took about 10 minutes.
   tail -f makeGenomeDb.log
   
   # Follow the directions at the end of the log after
    #NOTES -- STUFF THAT YOU WILL HAVE TO DO --
# Search for '***' notes in each file in and make corrections (sometimes the
# files used for a previous assembly might make a better template):
#  description.html /cluster/data/danRer5/html/{trackDb.ra,gap.html,gold.html}

# Then cd ../.. (to trackDb/) and
# - edit makefile to add danRer5 to DBS.
# - (if necessary) cvs add zebrafish
# - cvs add zebrafish/danRer5
# - cvs add zebrafish/danRer5/*.{ra,html}
# - cvs ci -m "Added danRer5 to DBS." makefile
# - cvs ci -m "Initial descriptions for danRer5." zebrafish/danRer5
# - (if necessary) cvs ci zebrafish
# - Run make update DBS=danRer5 and make alpha when done.
# - (optional) Clean up /cluster/data/danRer5/TemporaryTrackDbCheckout
# - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
   # makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
   # entries and loads the gold (scaffolds) and gap tables. Creates the GC
   # percent, short match and restriction enzyme tracks. It also creates
   # a danRer5 directory in the trackDb/zebrafish directory with a trackDb.ra 
   # file and a template for the assembly descripton, and for the 
   # gold and gap track tracks 
   # Then need to load in fragment gaps manually for gap table.
   
   # (2007-09-10, hartera), Change the default Browser position in dbDb
   # so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for 
   # danRer4.
   ssh hgwdev
   hgsql -h genome-testdb -e \
      'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" \
       where name = "danRer5";' hgcentraltest

   # (2007-09-22, hartera), Change the source name for danRer5 to give 
   # Sanger's full name and use similar format to other genome assembly
   # entries in dbDb.
   hgsql -h genome-testdb -e \
      'update dbDb set sourceName = "Wellcome Trust Sanger Institute, Zv7
assembly" where name = "danRer5";' \
      hgcentraltest
   # (2007-10-01, hartera), Added the accession instead of the gi number
   # for the frag field for chrM in the gold table. gi number is gi|15079186
   # and accession is NC_002333.2. This is the same as AC024175.3 which was
   # used previously.
   ssh hgwdev
   hgsql -e \
       'update gold set frag = "NC_002333.2" where chrom = "chrM";' danRer5 

##########################################################################
# REBUILD GAP TABLE WITH FRAGMENT GAPS AS WELL AS CONTIG GAPS 
# (DONE, 2007-08-24, hartera)
# It is confusing not to display the fragment gaps in the gap track - 
# these are gaps within contigs.
    ssh kkstore06
    cd /cluster/data/danRer5/Zv7release/
    # need to use the contigs to chroms AGP
    # also copy over the contigs to scaffolds AGP for the unmapped scaffolds
    cp /cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
       /cluster/data/danRer5/Zv7release/agps/   
    mkdir /cluster/data/danRer5/gap 
    awk '{if ($5 ~ /N/) print;}' Zv7_chr.agp \
        > /cluster/data/danRer5/gap/chroms.gap
    # 34449 gaps
    awk '{if ($5 ~ /N/) print;}' ./agps/randomsScaffold.agp  \
        > /cluster/data/danRer5/gap/randomScafs.gap
    # 15278 gaps
    cat chroms.gap randomScafs.gap > danRer5.gap
    wc -l /cluster/data/danRer5/gap/*.gap
    # 34449 /cluster/data/danRer5/gap/chroms.gap
    # 15278 /cluster/data/danRer5/gap/randomScafs.gap
    # 49727 /cluster/data/danRer5/gap/danRer5.gap

    rm chroms.gap randomScafs.gap 
    # 2459 is current count in table. 
    ssh hgwdev
    cd /cluster/data/danRer5/
    hgLoadGap -unsplit danRer5 /cluster/data/danRer5/gap/danRer5.gap
    hgsql -e 'select count(*) from gap;' danRer5
    # 49727 
    # So all the gaps were loaded. 
       
###########################################################################
# REPEATMASKER RUN (DONE, 2007-08-25 - 2007-08-28, hartera)
# RE-RUN REPEATMASKER (DONE, 2007-08-30 - 2007-09-05, hartera)
# The sequence was under RepeatMasked, because the new zebrafish repeats 
# library with the unclassified repeats must be distributed to all the 
# nodes for pk.
# NESTED REPEATS TRACK RE-CREATED AFTER FIX TO extractNestedRepeats.pl
# SCRIPT (DONE, 2007-09-19, angie)

# RepeatMasker,v 1.19 2007/05/23 (May 17 2007 (open-3-1-8) version of
# RepeatMasker) with RepeatMasker library RELEASE 20061006.
# Added zebunc.ref (Zebrafish Unclassified) repeats from RepBase12.07
# from the Genetic Information Research Institute (GIRI,
# http://www.girinst.org/server/RepBase/index.php). Some repeats were 
# removed from zebunc.ref because they mask out real genes (advised by
# Zebrafish bioinformatics group at Sanger) - more details below.

    # Download the zebunc.ref unclassified repeats file for zebrafish 
    # from RepBase:
    ssh kkstore06
    cd /cluster/data/danRer5/
    gunzip repeatmaskerlibraries-20061006.tar.gz 
    # no unclassified zebrafish repeats here
    # Download zebunc.ref from
    # http://www.girinst.org/server/RepBase/RepBase12.07.fasta/zebunc.ref
    # last updated 21-Aug-2007 16:21 	451K
    # copy the one used for danRer4 and see if it is any different
    cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.txt \
       zebuncref2006.txt
    diff zebunc.ref zebuncref2006.txt
    # no difference so check format is still correct for the current 
    # version of RepeatMasker on pk /scratch/data/RepeatMasker/Libraries/
    # format still looks the same.
    ssh pk
    mkdir /tmp/danRer5
    cd /tmp/danRer5
    faOneRecord /cluster/data/danRer5/Zv7release/fasta/Zv7_contigs.fa \
                Zv7_scaffold1.1 > Zv7scaffold1.1.fa
    # Run RepeatMasker on this to get it to create the danio library
    /scratch/data/RepeatMasker/RepeatMasker -ali -s -species danio \
                  Zv7scaffold1.1.fa
    cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.format \
       /scratch/data/RepeatMasker/Libraries/
    # Add this to the specieslib created for danio
    cd /scratch/data/RepeatMasker/Libraries/
    cat zebunc.ref.format \
       >> /scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib 
    grep '>' specieslib | wc -l
    # 1184
    # then the RepeatMasker script
    # Need to re-do this and add the extra zebrafish repeats to the danio
    # libraries on /cluster/bluearc/scratch/data/RepeatMasker/
    # and get it rsynced to all the parasol hubs, nodes, workhorse machines
    # and fileservers. 
    # E-mail from Kerstin Howe about zebrafish repeats on Aug 31, 2007
    # states that some of these unknown repeats were removed because they 
    # align to real genes. Save this e-mail as README.Repeats
    # (2007-09-03, hartera)
    ssh kkstore06
    cd /cluster/data/danRer5/
    # get list of zebunc.ref sequence names
    grep '>' zebunc.ref | sed -e 's/>//' > zebunc.ref.names
    # make a list of the repeats that Sanger removed: unknownRepsRemoved.txt
    # Sanger removed some of these repeats because they masked out real genes
    # get list list and remove them from list of repeat names.
    grep -w -v -f unknownRepsRemoved.txt zebunc.ref.names \
         > zebuncFixed.ref.names
    wc -l *.names
    # 958 zebunc.ref.names
    # 943 zebuncFixed.ref.names

    # select just those remaining names from the zebunc.ref FASTA file
    faSomeRecords zebunc.ref zebuncFixed.ref.names zebuncFixed.ref.fa 
    grep '>' zebuncFixed.ref.fa | wc -l
    # 943 
    # then add these repeats to the library for zebrafish on bluearc
    ssh hgwdev
    # do a dummy run first to create the specieslib for zebrafish
    /cluster/bluearc/scratch/data/RepeatMasker/RepeatMasker \
          -species 'danio' /dev/null
    # then cat the new library to the specieslib for danio
    cat /cluster/data/danRer5/zebuncFixed.ref.fa >> \
 /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib 
    cd /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio
    grep '>' specieslib | grep "Dr | wc -l
    # 943
    # (2007-09-04, hartera)
    # also copy to /cluster/bluearc/RepeatMasker
    # do a dummy run first to create the specieslib for zebrafish
    /cluster/bluearc/RepeatMasker/RepeatMasker -species 'danio' /dev/null
    cp -p \
/cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib\
    /cluster/bluearc/RepeatMasker/Libraries/20061006/danio/
    # ask cluster-admins for an rsync from 
    # /cluster/bluearc/scratch/data/RepeatMasker/ 
    # to /scratch/data/RepeatMasker/ on all machines (parasol hubs, nodes, 
    # fileservers, workhorse machines)
  
    ssh hgwdev
    # cleanup
    rm /gbdb/danRer5/danRer5.2bit
    rm -r /san/sanvol1/scratch/danRer5/RMPart
    rm /san/sanvol1/scratch/danRer5/danRer5.2bit
    rm /san/sanvol1/scratch/danRer5/*.ooc
    hgsql -e 'drop table rmsk;' danRer5

    ssh kkstore06
    # clean up
    cd /cluster/data/danRer5
    rm danRer5.rmsk.2bit danRer5.rmskTrf.2bit danRer5.2bit
    mv RepeatMasker.2007-08-25/ OldRepeatMask
    ## use screen for this
    time nice doRepeatMasker.pl -bigClusterHub=pk -species danio \
         danRer5 >& repeatmask.log &
    tail -f repeatmask.log
    # 0.338u 0.267s 20:33:39.53 0.0%  0+0k 0+0io 5pf+0w

    # Checked part of danRer5.rmsk.2bit with twoBitToFa and it looks masked.

    ## From previous run of RepeatMasker, this is fixed now:
    # Output is in 
    # /scratch/tmp/doRepeatMasker.cluster.s15477/chr3:16000000-16500000.fa.out
    # Checked in danRer4 RM run files and some of them have "*" at the end of 
    # a line e.g. /cluster/data/danRer4/1/chr1_11/chr1_11_10.fa.out
    # Also, the IDs in the current run do not run consecutively 
    # e.g. 1,2,3,4,5,6,2,7,8 etc.
    # E-mailed Angie to ask for advice   
    # (2007-08-27, hartera)
    # liftUp is fixed so it will skip this repeat in the liftUp step when
    # it finds a "*" but no ID.
    # After the repeat DNA11TA1_DR with ID 821, there is a line with a 
    # repeat but no ID, just a "*" at the end which confuses liftUp.
    # Angie sent this problem as an example to Robert Hubley to fix the bug.
    # Here is the offending line:
# 294    9.8  0.0  0.0  chr3:16000000-16500000  372065 372105 (127895) +
# DNA11TA1_DR      DNA/Tc1           164  204    (0)      *
    # Angie fixed liftUp to just warn when it finds a "*" instead of an ID.
  
    # NOTE: the Nested Repeats track is created automatically by the
    # doRepeatMasker.pl script. When the script was run to create the 
    # RepeatMasker track it was found that the extractNestedRepeats.pl
    # needed fixing which was done by angie. 
    # The Nested Repeats track was then re-created (angie, 2007-09-19)
    cd /cluster/data/danRer5/bed/RepeatMasker.2007-09-04
    extractNestedRepeats.pl danRer5.fa.out > danRer5.nestedRepeats.bed

    hgLoadBed danRer5 nestedRepeats danRer5.nestedRepeats.bed \
       -sqlTable=\$HOME/kent/src/hg/lib/nestedRepeats.sql
 
###########################################################################
# SIMPLE REPEAT (TRF) TRACK (DONE, 2007-08-25 - 2007-08-27, hartera)
   # TRF can be run in parallel with RepeatMasker on the file server
   # since it doesn't require masked input sequence.
   
   ssh kolossus
   ## use screen for this
   mkdir /cluster/data/danRer5/bed/simpleRepeat
   cd /cluster/data/danRer5/bed/simpleRepeat
   time nice twoBitToFa ../../danRer5.unmasked.2bit stdout \
        | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
                -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp \
        >& trf.log &
   # Took about 8 hours to run
   tail -f trf.log 
   # Crashed on kolossus so re-run as before on kki:
   # Broken pipe                   twoBitToFa ../../danRer5.unmasked.2bit
   # stdout |
   #     12.260u 4.233s 8:05:02.78 0.0%  0+0k 0+0io 0pf+0w
   #    Segmentation fault            trfBig -trf=/cluster/bin/i386/trf stdin
   # /dev/null -bedAt=simpleRepeat.bed  ...
   # This time split the sequence up and run on kki
   ssh kkstore06
   cd /cluster/data/danRer5/
   mkdir splitNoMask
   cd splitNoMask
   twoBitToFa ../danRer5.unmasked.2bit danRer5.unmasked.fa
   faSplit byname danRer5.unmasked.fa .
   rm danRer5.unmasked.fa
    
   # Copy the split sequences to iscratch dir on kkr1u00 
   ssh kkr1u00
   rm -r /iscratch/i/danRer5/splitNoMask
   mkdir -p /iscratch/i/danRer5/splitNoMask
   foreach s (/cluster/data/danRer5/splitNoMask/*.fa)
      echo "Copying $s ..."
      cp $s /iscratch/i/danRer5/splitNoMask/
   end
 
   # Count files transferred
   foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
      ls $f >> seq.lst
   end
   wc -l seq.lst 
   # 5036 seq.lst
   # correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
   rm seq.lst
 
   # rsync to cluster machines
   foreach R (2 3 4 5 6 7 8)
      rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
   end
    
   ## use screen
   ssh kki
   mkdir -p /cluster/data/danRer5/bed/simpleRepeat
   cd /cluster/data/danRer5/bed/simpleRepeat
   mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
 # << keep emacs coloring happy
   chmod +x runTrf

cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
   # << keep emacs coloring happy

   foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
      ls -1S $f >> genome.lst
   end 
   gensub2 genome.lst single gsub jobList
   /parasol/bin/para create jobList
   # 5036 jobs written to batch
   /parasol/bin/para try, check, push, check etc...
   /parasol/bin/para time
# still crashing on one sequence: 
# sh: line 1: 23369 Segmentation fault      /cluster/bin/i386/trf
# /tmp/Zv7_scaffold2487.tf 2 7 7 80 10 50 2000 -m -d
# can't open /tmp/Zv7_scaffold2487.tf.2.7.7.80.10.50.2000.mask
# Completed: 5035 of 5036 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      31429s     523.82m     8.73h    0.36d  0.001 y
# IO & Wait Time:                 12949s     215.81m     3.60h    0.15d  0.000 y
# Average job time:                   9s       0.15m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3818s      63.63m     1.06h    0.04d
# Submission to last job:          5013s      83.55m     1.39h    0.06d

    # test this sequence 
   ssh kkstore06
   cd /cluster/data/danRer5/bed/simpleRepeat
   mkdir noMaskSplit
   mkdir run2
   cd noMaskSplit
   faSplit -minGapSize=100 -lift=scaf2487.lft gap \
      /cluster/data/danRer5/splitNoMask/Zv7_scaffold2487.fa \
      10000 scaf2487_
    
   ssh kkr1u00
   mkdir /iscratch/i/danRer5/splitScaf2487/
   cp /cluster/data/danRer5/bed/simpleRepeat/noMaskSplit/*.fa \
      /iscratch/i/danRer5/splitScaf2487/
   ls /iscratch/i/danRer5/splitScaf2487/*.fa | wc -l 
   # 250
   # rsync to all iServers
   foreach R (2 3 4 5 6 7 8)
      rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
   end
    
   ssh kki
   cd /cluster/data/danRer5/bed/simpleRepeat/run2
   cp ../runTrf .
   cp ../gsub .
   mkdir trf
   ls -1S /iscratch/i/danRer5/splitScaf2487/*.fa > genome.lst
   gensub2 genome.lst single gsub jobList
   /parasol/bin/para create jobList
   # 250 jobs written to batch
   /parasol/bin/para try, check, push, check etc...
   /parasol/bin/para time
# Completed: 249 of 250 jobs
# Crashed: 1 jobs
#CPU time in finished jobs:         73s       1.22m     0.02h    0.00d  0.000 y
#IO & Wait Time:                   622s      10.36m     0.17h    0.01d  0.000 y
#Average job time:                   3s       0.05m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              23s       0.38m     0.01h    0.00d
#Submission to last job:            63s       1.05m     0.02h    0.00d
    
   /parasol/bin/para problems >& problems 
# trf/scaf2487_082.bed does not exist
# stderr:
# sh: line 1:  7469 Segmentation fault      /cluster/bin/i386/trf
# /tmp/scaf2487_082.tf 2 7 7 80 10 50 2000 -m -d
# can't open /tmp/scaf2487_082.tf.2.7.7.80.10.50.2000.mask
   # Failed sequence looks fine - just A,C,G,T or N.
   # Downloaded the latest version of trf (v4.0, previous was v3.21) 
   # for both 64 bit and i386. Tried 64 bit version on the scaf2487_082.fa
   # and it works fine. So now run on kolossu.
   mv /cluster/data/danRer5/bed/simpleRepeat \
      /cluster/data/danRer5/bed/OldsimpleRepeat
   
   # run on the pk cluster, too slow on kolossus using the 2bit sequence file.
   ssh pk
   mkdir -p /san/sanvol1/scratch/danRer5/splitNoMask
   # copy the split sequence files to the san - 1 for each chorm and 1
   # for each unmapped scaffold
   foreach f (/cluster/data/danRer5/splitNoMask/*.fa)
     cp $f /san/sanvol1/scratch/danRer5/splitNoMask
   end
   mkdir /cluster/data/danRer5/bed/simpleRepeat
   cd /cluster/data/danRer5/bed/simpleRepeat
   mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/x86_64/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
 # << keep emacs coloring happy
   chmod +x runTrf

cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
   # << keep emacs coloring happy

   foreach f (/san/sanvol1/scratch/danRer5/splitNoMask/*.fa)
      ls -1S $f >> genome.lst
   end 
   /parasol/bin/gensub2 genome.lst single gsub jobList
   /parasol/bin/para create jobList
   # 5036 jobs written to batch
   /parasol/bin/para try, check, push, check etc...
   /parasol/bin/para time
# Completed: 5036 of 5036 jobs
# CPU time in finished jobs:      45132s     752.20m    12.54h    0.52d  0.001 y
# IO & Wait Time:                 16660s     277.67m     4.63h    0.19d  0.001 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5804s      96.73m     1.61h    0.07d
# Submission to last job:          6042s     100.70m     1.68h    0.07d
    
   # (2007-08-27, hartera)
   # the input to trf was the chromosomes and scaffolds so no liftUp needed.
   # cat all the files together
   cat ./trf/*.bed > simpleRepeat.bed
   # load table
   ssh hgwdev
   cd /cluster/data/danRer5/bed/simpleRepeat
   hgLoadBed danRer5 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
   # Loaded 966523 elements of size 16

###########################################################################
# PROCESS SIMPLE REPEATS INTO A MASK (DONE, 2007-08-28, hartera)
# RE-DONE AFTER RE-RUNNING REPEATMASKER (DONE, 2007-09-05, hartera) 
   # The danRer5.rmsk.2bit is already masked with RepeatMasker output.
   # After the simpleRepeats track has been built, make a filtered
   # version of the trf output for masking: keep trfs with period <= 12.
   ssh kkstore06
   cd /cluster/data/danRer5/bed/simpleRepeat
   rm -r trfMask
   rm trfMask.bed
   mkdir trfMask
   foreach f (trf/*.bed)
     awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
   end

   cat ./trfMask/*.bed > trfMask.bed
   # don't need to lift up. these are already in chrom and unmapped 
   # scaffold coordinates.
   ## Add trfMask to repeat masked sequence
   ssh kkstore06
   cd /cluster/data/danRer5
   # cleanup old sequence:
   rm danRer5.rmskTrf.2bit danRer5.2bit

   cat << '_EOF_' > addTrf.csh
#!/bin/csh -efx
#This script will fail if any of its commands fail.

set DB = danRer5
set WORK_DIR = /cluster/data/${DB}
cd ${WORK_DIR}
set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
        | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
'_EOF_'
    # << happy emacs
    chmod +x ./addTrf.csh
    time ./addTrf.csh
    # Warning: BED file stdin has >=13 fields which means it might contain
    # block coordinates, but this program uses only the first three fields
    # (the entire span -- no support for blocks). 
    # This is ok, there are no blocks, only the first 3 fields are 
    # standard BED format.
    # 44.550u 6.030s 0:50.66 99.8%    0+0k 0+0io 0pf+0w
    cat faSize.danRer5.rmskTrf.txt
# 1440582308 bases (4986636 N's 1435595672 real 724596899 upper 710998773
# lower) in 5036 sequences in 1 files
# Total size: mean 286056.9 sd 3643515.2 min 2000 (Zv7_NA5415) max 70371393
# (chr5) median 7336
# N count: mean 990.2 sd 9834.4
# U count: mean 143883.4 sd 1862761.1
# L count: mean 141183.2 sd 1772056.1
# %49.35 masked total, %49.53 masked real
    # this is similar to previous masking of zebrafish assembly sequence
    # for danRer4: %44.10 masked total, %48.94 masked real
    ln -s danRer5.rmskTrf.2bit danRer5.2bit

    # add symlink to /gbdb/danRer5 to symlink the danRer5.2bit file
    ssh hgwdev
    rm /gbdb/danRer5/danRer5.2bit
    ln -s /cluster/data/danRer5/danRer5.rmskTrf.2bit \
          /gbdb/danRer5/danRer5.2bit
    # copy over to the san
    rm /san/sanvol1/scratch/danRer5/danRer5.2bit
    cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
          /san/sanvol1/scratch/danRer5/danRer5.2bit
    # also copy to iscratch
    ssh kkr1u00
    rm /iscratch/i/danRer5/danRer5.2bit
    cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
          /iscratch/i/danRer5/danRer5.2bit
    # rsync to iServers
    foreach R (2 3 4 5 6 7 8)
       rsync -a --progress /iscratch/i/danRer5/ \
             kkr${R}u00:/iscratch/i/danRer5/
    end

###########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-RUNNING REPEATMASKER AND RE-MASKING SEQUENCE
# (DONE, 2007-09-05, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~50% of the size of the human genome
    # zebrafish is about 1.44 Gb and human is 3.1 Gb.
    ssh kolossus
    rm -r /cluster/data/danRer5/bed/ooc
    mkdir /cluster/data/danRer5/bed/ooc
    cd /cluster/data/danRer5/bed/ooc
    time blat /cluster/data/danRer5/danRer5.2bit \
       /dev/null /dev/null -tileSize=11 -makeOoc=danRer5_11.ooc -repMatch=512
    # 75.647u 3.159s 1:21.04 97.2%    0+0k 0+0io 2pf+0w
    # Wrote 40297 overused 11-mers to danRer5_11.ooc
    
    # then make 10.ooc file for overused 10mers, repMatch = 4096 
    # for humans so use 2048 for danRer5
    time blat /cluster/data/danRer5/danRer5.2bit \
       /dev/null /dev/null -tileSize=10 -makeOoc=danRer5_10.ooc -repMatch=2048
    # 72.078u 2.587s 1:14.83 99.7%    0+0k 0+0io 0pf+0w
    # Wrote 9589 overused 10-mers to danRer5_10.ooc

    # copy the *.ooc files over to the san for cluster runs and Genbank run
    rm /san/sanvol1/scratch/danRer5/*.ooc
    cp -p *.ooc /san/sanvol1/scratch/danRer5/
    # also copy to iscratch
    ssh kkr1u00
    rm /iscratch/i/danRer5/*.ooc
    cp -p /cluster/data/danRer5/bed/ooc/*.ooc /iscratch/i/danRer5/
    # rsync to iServers
    foreach R (2 3 4 5 6 7 8)
       rsync -a --progress /iscratch/i/danRer5/*.ooc \
             kkr${R}u00:/iscratch/i/danRer5/
    end
   
###########################################################################
# CREATE LIFT FILE SEPARATED BY NON-BRIDGED GAPS 
# (DONE, 2007-08-28, hartera)
# This is needed for the GenBank run. Use Hiram's gapToLift program
    ssh hgwdev
    cd /cluster/data/danRer5/jkStuff
    gapToLift danRer5 nonBridgedGap.lft
    # copy over to the san
    cp -p nonBridgedGap.lft /san/sanvol1/scratch/danRer5
    # there are 14973 rows in this file, only 370 for mouse
    # that's ok. 

###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-DOING REPEATMASKING OF GENOME SEQUENCE
# (DONE, 2007-09-05, hartera)
    ssh hgwdev  
    cd $HOME/kent/src/hg/makeDb/genbank
    cvs update -dP
    # edit etc/genbank.conf to add danRer5 and commit to CVS
    # done already for first run so no need to update.
# danRer5 (zebrafish)
danRer5.serverGenome = /cluster/data/danRer5/danRer5.2bit
danRer5.clusterGenome = /iscratch/i/danRer5/danRer5.2bit
danRer5.ooc = /iscratch/i/danRer5/danRer5_11.ooc
danRer5.lift = /cluster/data/danRer5/jkStuff/nonBridgedGap.lft
danRer5.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer5.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer5.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer5.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
danRer5.downloadDir = danRer5
danRer5.mgcTables.default = full
danRer5.mgcTables.mgc = all
danRer5.perChromTables = no
   # end of section added to etc/genbank.conf
   cvs commit -m "Added danRer5." etc/genbank.conf
   # also added the perChromTables line afterwards. This means that there
   # will not be one table per chromosome. Would be too many tables as 
   # there are 5010 scaffolds.
   # update /cluster/data/genbank/
   make etc-update

   # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
   # danRer genome information

   ssh kkstore06
   cd /cluster/data/genbank
   # for re-run, remove everything under these directories:
   rm -r /cluster/data/genbank/data/aligned/genbank.161.0/danRer5/*
   rm -r /cluster/data/genbank/data/aligned/refseq.24/danRer5/*
   time nice bin/gbAlignStep -initial danRer5 &
   # 3691.965u 996.052s 4:08:37.03 31.4%     0+0k 0+0io 32pf+0w
   # logFile: var/build/logs/2007.09.05-10:04:19.danRer5.initalign.log
   # check log file - looks ok, it has finished (last line in log file)
   # kkstore06.soe.ucsc.edu 2007.09.05-14:12:56 danRer5.initalign: finish 
  
   # then load database
   sh hgwdev
   cd /cluster/data/genbank
   # -drop parameter will ensure that old tables are dropped before loading
   time nice ./bin/gbDbLoadStep -drop -initialLoad danRer5 &
   # logFile: var/dbload/hgwdev/logs/2007.09.05-14:36:03.dbload.log
   # 1108.286u 310.389s 41:23.51 57.1%       0+0k 0+0io 16pf+0w
   # This time ran smoothly and the only problem was an NFS issue with
   # removing directories at the end - the load went ok though. 
   # On the first time loading the GenBank alignments, it failed:
   # failed because there were too many open files, trying to load a table
   # per chrom and per scaffold so add the danRer5.perChromTables = n
   # line to genbank.conf file (see above) so that it will all be loaded into
   # one table. Too many table otherwise, as there are 25 chroms, chrM and 
   # 5010 scaffolds. 
   # Solution: remove lock file: 
   # rm /cluster/data/genbank/var/dbload/hgwdev/run/dbload.lock
   # then run again

###########################################################################
# ENSEMBL GENES TRACK FOR PROTEIN-CODING GENES FROM 
# ENSEMBL VERSION 46 (AUG 2007) (DONE, 2007-09-05, markd)
# Compare peptides translated from genome to those downloaded from
# BioMart for these transcripts and LOAD ensPep TABLE WITH PEPTIDE 
# SEQUENCES FOR THE PROTEIN-CODING ENSEMBL GENES 
# (DONE, 2007-09-24 - 2007-09-25, hartera)
# ADD AUTOSQL CODE FOR ensInfo TABLE AND ADD CODE TO HGC TO PRINT
# THIS ADDITIONAL INFORMATION ON THE DETAILS PAGE FOR EACH GENE
# (DONE, 2007-09-28 and 2007-10-03, hartera)
   # Used the programs created by Robert Baertsch to download Ensembl  
   # genes from Ensembl's ftp site, load it into the danRer5 database.
  
   ssh hgwdev
   mkdir /cluster/data/danRer5/bed/ensembl46
   cd /cluster/data/danRer5/bed/ensembl46
   
   # Download Ensembl genes from ftp site and create tracks in browser:
   # current version is danio_rerio core_46_7
   hgLoadEnsembl danio_rerio core_46_7 danRer5

   # these problems were detected, add to problem.ids file:
   genePredCheck -db=danRer5 ensGene.gp
Error: ensGene.gp:2720: ENSDART00000019661 exonFrame on non-CDS exon 14
Error: ensGene.gp:3874: ENSDART00000027605 exonFrame on non-CDS exon 0
Error: ensGene.gp:9416: ENSDART00000062681 exonFrame on non-CDS exon 12
checked: 31743 failed: 3

   hgLoadEnsembl -f problem.ids danio_rerio core_46_7 danRer5

   # hgLoadEnsembl creates the ensGene, ensGeneXref, ensGtp and ensInfo 
   # tables but no ensPep peptide sequence table.

   # Compare the peptides as translated from the genomic sequence of the 
   # CDS for each transcript and the peptides for the transcripts downloaded
   # from BioMart (2007-09-23 - 2007-09-24, hartera)
   ssh kkstore06
   mkdir /cluster/data/danRer5/bed/ensembl46/peptides

   ssh hgwdev
   cd /cluster/data/danRer5/bed/ensembl46/peptides
   hgsql -N -e 'select distinct(name) from ensGene;' danRer5 | sort \
         > ensGene.txIds.sort

   # First get peptides from the genome translation of the genes defined in 
   # ensGene. This does not work for the mitochondrial genome. The ability 
   # to deal with vertebrate mitochondrial genetic code for translation 
   # of genes on chrM was added (markd).
   getRnaPred -peptides -genePredExt danRer5 ensGene all ensGeneTablePep.fa
   # Took about 10 minutes
   
   # Then download the peptide sequences from BioMart and compare 
   # differences may include selenocysteines and translationnal frameshifts

   # Get the ensembl peptide sequences from
   # http://www.ensembl.org/biomart/martview
   # Follow this sequence:
   # 1) Choose the Ensembl Genes 46 as the database and then 
   # Danio Rerio genes (ZFISH7) as the dataset.
   # 2) Click on the Attributes link in the left panel. Select sequences.
   # 3) Expand the SEQUENCES section and choose Peptide as type of sequence 
   # to export and then expand the Header Information section and select 
   # Ensembl Gene ID from Gene Attributes and Ensembl Transcript ID 
   # and Ensembl Peptide ID from Transcript Attributes 
   # 4) Click on the Filters link in the left panel and expand the GENE
   # section. Select the Gene type box and then select protein_coding as 
   # these are the only genes with an associated protein sequence.
   # 5) Click on the Results link in the top black menu bar and 
   # choose FASTA for the output and export all results to
   # Compressed file (.gz). Select unique results only.
   # save the file as ensembl46Pep.fasta.gz and move to
   # /cluster/data/danRer5/bed/ensembl46
 
   ssh kkstore06
   cd /cluster/data/danRer5/bed/ensembl46/peptides
   # unzip the Enseml peptides file downloaded from BioMart
   gunzip ensembl46Pep.fasta.gz
   grep '>' ensembl46Pep.fasta | wc -l
   # 31743
   grep '>' ensembl46Pep.fasta > headers
   awk 'BEGIN {FS="|"} {print $1;}' headers | sort | uniq \
       > pepBioMart.txIds.sort
   perl -pi.bak -e 's/>//' pepBioMart.txIds.sort
   # compare list of transcript IDs from the ensGene table to those for the
   # downloaded peptides from BioMart.
   comm -13 ensGene.txIds.sort pepBioMart.txIds.sort
   # from the BioMart peptide download there were 3 extra:
# ENSDART00000019661
# ENSDART00000027605
# ENSDART00000062681
   # These were the problem IDs that were removed from the set - see above.
   comm -23 ensGene.txIds.sort pepBioMart.txIds.sort
   # no difference
   # change headers so that they only have the Ensembl transcript ID.
   awk 'BEGIN{FS="|"} {if ($1 ~ /ENSDART/) print $1; else print;}' \
       ensembl46Pep.fasta > ensembl46PepTxIdsOnly.fasta
   # use faCmp (~/kent/src/utils/faCmp/faCmp.c) to compare the two files.
   # it requires the same number of sequences in both files. 
   # Use faFilter to remove the ones not in ensGeneTablePep.fa 
   faFilter -v -name=ENSDART00000019661 ensembl46PepTxIdsOnly.fasta tmp1.fa
   faFilter -v -name=ENSDART00000027605 tmp1.fa tmp2.fa
   faFilter -v -name=ENSDART00000062681 \
            tmp2.fa ensembl46PepTxIdsOnlyFilt.fasta
   grep '>' ensembl46PepTxIdsOnlyFilt.fasta | wc -l
   # 31740
   rm tmp*
   # faCmp assumed that DNA sequences were being compared. An option was 
   # added so that protein sequences could be compared (markd added this).
   faCmp -sortName -peptide \
         ensGeneTablePep.fa ensembl46PepTxIdsOnlyFilt.fasta \
         >& ensTablevsBioMartFasta.diff
# ENSDART00000038434 in ensGeneTablePep.fa differs from ENSDART00000038434 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 114 (X != V)
# ENSDART00000046903 in ensGeneTablePep.fa differs from ENSDART00000046903 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 71 (X != T)
# ENSDART00000047515 in ensGeneTablePep.fa differs from ENSDART00000047515 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 149 (X != L)
# ENSDART00000049170 in ensGeneTablePep.fa differs from ENSDART00000049170 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 172 (X != S)
# ENSDART00000080260 in ensGeneTablePep.fa differs from ENSDART00000080260 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 353 (X != G)
# ENSDART00000081978 in ensGeneTablePep.fa differs from ENSDART00000081978 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 21 (X != R)
# ENSDART00000082954 in ensGeneTablePep.fa differs from ENSDART00000082954 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 747 (X != A)
# ENSDART00000083022 in ensGeneTablePep.fa differs from ENSDART00000083022 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 344 (X != L)
# ENSDART00000105255 in ensGeneTablePep.fa differs from ENSDART00000105255 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 427 (X != A)

# There are differences in just 9 of the protein sequences where we have an 
# "X" because there is an "N" in the genome sequence in the codon for the
# amino acid whereas Ensembl used transcript evidence to predict the 
# missing base. Since there are so few differences, then the predicted 
# translation from the genome is good enough without loading an ensPep table.
   # Load anyway for completeness since there are some differences. Use 
   # "generic" as the type for loading using hgPepPred, not "ensembl" since
   # all the IDs were removed from the header line but the transcript ID.
   # If type "ensembl" is defined, it expects at least 3 IDs separated by "|".

   ssh hgwdev
   cd /cluster/data/danRer5/bed/ensembl46/peptides
   # load the BioMart downloaded Ensembl 46 peptide sequences into database:
   hgPepPred danRer5 generic ensPep ensembl46PepTxIdsOnlyFilt.fasta
   hgsql -e 'select count(*) from ensPep;' danRer5
   # 31740
   # All the sequences were loaded. 
   # Added a zebrafish/danRer5/trackDb.ra entry for ensGene with the
   # archive setting, aug2007, so that URLs for Ensembl Genes point to the 
   # correct archive. Added the Esnembl version number to the longlabel and
   # as a dataVersion setting (Ensembl Relese 46).

   # clean up
   ssh kkstore06
   cd /cluster/data/danRer5/bed/ensembl46/peptides
   rm tmp*.fa headers ensPep.tab ensembl46PepTxIdsOnly.fasta *.bak
   rm pepBioMart.txIds.sort ensGene.txIds.sort 

   # (2007-09-28, hartera)
   # Add code to use ensInfo table, use autoSql. There is already a 
   # ensInfo.sql table specified in ~/kent/src/hg/lib
   ssh hgwdev
cat << 'EOF' > $HOME/kent/src/hg/lib/ensInfo.as
table ensInfo
"ensembl Genes track additional information"
    (
    string name;	"Ensembl transcript ID"
    string otherId;	"other ID"
    string geneId; 	"Ensembl gene ID"
    string class;	"Ensembl gene type"
    string geneDesc;	"Ensembl gene description"
    string status;	"Ensembl gene confidence"
    )
'EOF'
    cd $HOME/kent/src/hg/lib/ensInfo.as
    # move the current ensInfo.sql out of the way
    mv ensInfo.sql tmp.sql
    autoSql ensInfo.as ensInfo
    # renmae autogenerated file and reinstate original file as ensInfo.sql
    # Difference is that this file has a longblob for description whereas
    # the description field in the autoSql generated code just has a 
    # varchar[255] which may not be big enough in all cases.
    mv ensInfo.sql ensInfoTemp.sql
    mv tmp.sql ensInfo.sql
    mv ensInfo.h ../inc/
    # commit ../inc/ensInfo.h, ensInfo.c and ensInfo.as to CVS
    # add ensInfo.o to makefile and commmit to CVS
    cd $HOME/kent/src/hg/
    # add code to hgc.c to doEnsemblGene function to write information 
    # from the ensInfo table on the details page for each gene.
    # (2007-10-03, hartera). Check whether the name2 field in ensGene is
    # always the same as otherId in the ensInfo table.
    ssh hgwdev
    cd /cluster/data/danRer5/bed/ensembl46
    hgsql -N -e 'select name, name2 from ensGene;' danRer5 \
          | sort | uniq > names.ensGene.sort
    hgsql -N -e 'select name, otherId from ensInfo;' danRer5 \
          | sort | uniq > names.ensInfo.sort
    wc -l names*.sort
    # 31740 names.ensGene.sort
    # 31991 names.ensInfo.sort
    comm -12 names.ensGene.sort names.ensInfo.sort | wc -l
    # 31740
    # So the name, name2 pair in ensGene is the same as the name, otherId
    # pair in ensInfo. ensInfo has more entries than ensGene because
    # it includes pseudogene and non-coding RNAs. hgc already prints the 
    # Alternate Name using name2 from ensGene so do not need to print out
    # the otherId field from ensInfo.

###########################################################################
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer5 (DONE, 2006-09-11, hartera)
   ssh hgwdev
   # DNA port is "0", trans prot port is "1"  
   echo 'insert into blatServers values("danRer5", "blat8", "17784", "1",
"0");  insert into blatServers values("danRer5", "blat8", "17785", "0",
"1");' \
    | hgsql hgcentraltest
   # this enables blat and isPcr, isPcr is enabled by loading blat server
   # with tilesize=5 (ask for this when request blat servers from
   # cluster admin).
   # if you need to delete those entries
   echo 'delete from blatServers where db="danRer5";' | hgsql hgcentraltest

###########################################################################
# VEGA GENES (DONE, 2007-09-05 - 2007-09-12, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# GTF file sent on 2007-08-30, Vegav27
    ssh kkstore06
    mkdir -p /cluster/data/danRer5/bed/vega27/data/
    cd /cluster/data/danRer5/bed/vega27/data
    # 2 files: vegav27.gtf.zip  vegav27_information.txt.zip
    unzip vegav27.gtf.zip
    mv ./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27.gtf .
    rm -r lustre
    unzip vegav27_information.txt.zip     
    mv \
       ./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27_information.txt
\    .
    rm -r lustre
    # asked for the vegav27_information.txt file to be in the format 
    # required to load the table directory. 
    # Check the format then prepare data files to load database tables.
    # Can grep for "otter" to get only gene description lines from the GTF 
    # file - the rest is output and header from program that generated it.
    # 2007-09-06, hartera
    ssh kkstore06
    cd /cluster/data/danRer5/bed/vega27
    # get just lines describing genes and remove "chr" prefix from scaffolds
    # NOTE: there are no NA scaffolds. 
  grep "otter" ./data/vegav27.gtf | sed -e 's/chrZv7_/Zv7_/' > vegav27Fixed.gtf   
    # vegaInfo is transcriptId, otterId, geneId, method and geneDesc
    # Get otter transcript ID and otter gene ID:
    awk 'BEGIN{FS=";"} {OFS="\t"} \
        {if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
         print $5, $6;}' vegav27Fixed.gtf | sort | uniq > vega27OtterIds.txt
    # remove the otter_gene_id and otter_transcript_id and 
    # extra spaces and quotation marks.
    perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega27OtterIds.txt
    perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
         vega27OtterIds.txt
    wc -l vega27OtterIds.txt
    # 12819 vega27OtterIds.txt
    # get list of the otter gene IDs only
    awk '{print $1}' vega27OtterIds.txt | sort | uniq > otterGeneIds.only
    # then get list of otter gene IDs from information file
    awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav27_information.txt \
        | sort | uniq > infoOtterGeneIds.only
    comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
    # 8649
    wc -l *.only
    # 10374 infoOtterGeneIds.only
    # 8649 otterGeneIds.only
    # all the gene IDs from the GTF file but the information file contains 
    # 1725 are in the information file but not the GTF file
    comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;

my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $idsFile: $!\n";

# Read in the otter gene IDs and transcript IDs
while (<IDS>)
{
my (@f, $line);
chomp;
$line = $_;
# split line on tab
@f = split(/\t/, $line);
# hash keyed by gene ID, stores transcript ID
# more than one ID so store as an array
if ($f[0] =~ /OTTDARG/)
   {
   # add to array in hash if it is an otter gene ID
   push @{$idsHash{$f[0]}}, $f[1];
   }
}
close IDS;

while (<INFO>)
{
my ($line, @info, $geneId, @transIds);
# read in information file and get the otter gene ID
$line = $_;
# split line on tab
@info = split(/\t/, $line);
if ($info[0] =~ /OTTDARG/)
   {
   $geneId = $info[0];
   # look up transcript ID in hash 
   
   # if gene ID exists in the hash then print out the information
   # file line with the correct transcript ID
   if (exists($idsHash{$geneId}))
      {
      @transIds = @{$idsHash{$geneId}};
      # rewrite line with transcript ID
      foreach my $t (@transIds)
         {
         print "$t\t$line";
         }
      }
   }
}
close INFO;
'EOF'
   # << happy emacs
    chmod +x mergeTransIdAndInfo.pl
    mergeTransIdAndInfo.pl vega27OtterIds.txt ./data/vegav27_information.txt \
         > vegav27InfoWithTransIds.txt 
    wc -l vega27OtterIds.txt
    # 12819 vega27OtterIds.txt
    wc -l vegav27InfoWithTransIds.txt
    # 12820 vegav27InfoWithTransIds.txt
    awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
        | sort | uniq > infoWithTrans.Ids
    sort vega27OtterIds.txt | uniq > otterIds.sort
    comm -13 otterIds.sort infoWithTrans.Ids
    comm -23 otterIds.sort infoWithTrans.Ids
    # No difference, there must be an extra line duplicated
    wc -l infoWithTrans.Ids
    # 12819 infoWithTrans.Ids
    awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
        | sort | uniq -c | sort -nr > count
    # 2 OTTDARG00000014371      OTTDART00000027056
    # this appears twice in info file
    grep OTTDARG00000014371 ./data/vegav27_information.txt
# OTTDARG00000014371      DKEY-26M21.1    ZDB-GENE-041210-211
# si:ch211-162i19.2       protein_coding  novel protein   NOVEL
# OTTDARG00000014371      DKEY-26M21.1    ZDB-GENE-041210-211
# si:ch211-162i19.2       protein_coding  novel protein   NOVEL
    # remove one copy from the vegav27_information.txt file.
    # so just sort and uniq the resulting information file with transcript IDs
    sort vegav27InfoWithTransIds.txt | uniq > vegav27InfoWithTransIdsUniq.txt
    wc -l vegav27InfoWithTransIdsUniq.txt
    # 12819 vegav27InfoWithTransIdsUniq.txt
    # There are extra IDs in the information file as it includes all Vega
    # genes (including clones not in Ensembl). Those actually in the Vega 
    # v27 GTF are those that are found on clones that are in Ensembl.
    # Vega includes only clone sequence and Zv7_NA scaffolds are WGS so 
    # these are not annotated in Vega. This information is from 
    # an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.

    # then use the info file to grab those genes that are pseudogenes, get the
    # transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
    # to a separate file. 

    # Next step is to find which transcripts are pseudogenes.
    grep pseudogene vegav27InfoWithTransIdsUniq.txt | sort | uniq | wc -l
    # found 58 pseudogenes so need to create the pseudogene track 
    # Get transcript IDs for pseudogenes.
    grep pseudogene vegav27InfoWithTransIdsUniq.txt | awk '{print $1}' \
         > pseudogenes.ids 
    grep -w -f pseudogenes.ids vegav27Fixed.gtf > vegav27Pseudogene.gtf 
    awk '{print $20}' vegav27Pseudogene.gtf | sort | uniq | wc -l
    # 58
    # Need to make the GTF file vegGene table by subtracting pseudogenes:
    grep -vw -f pseudogenes.ids vegav27Fixed.gtf > vegaGenev27.gtf
    wc -l vega*gtf
    # 168381 vegaGenev27.gtf
    # 168602 vegav27Fixed.gtf
    # 221 vegav27Pseudogene.gtf

    # Need to relabel IDs to get the name to be the otter transcript ID
    # and name 2 to be the transcript_id (needs to be labeled as gene_id)
    # Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
    # groups the rows by this ID.  
    sed -e 's/gene_id/tmp_id/' vegaGenev27.gtf \
        | sed -e 's/transcript_id/gene_id/' \
        | sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev27Format.gtf
    # remove an extra line
    grep -v "rachels_data" vegaGenev27Format.gtf > tmp
    mv tmp vegaGenev27Format.gtf

    # Do the same for the pseudogene GTF file:
    sed -e 's/gene_id/tmp_id/' vegav27Pseudogene.gtf \
        | sed -e 's/transcript_id/gene_id/' \
        | sed -e 's/otter_transcript_id/transcript_id/' \
        > vegaPseudogenev27Format.gtf

    gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
        vegaGenev27Format.gtf vegaGenev27.gp
    genePredCheck vegaGenev27.gp
    # genepred ok checked: 12761 failed: 0
    gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
        vegaPseudogenev27Format.gtf vegaPseudoGenev27.gp
    genePredCheck vegaPseudoGenev27.gp
    # genepred ok checked: 58 failed: 0
    
    # load GTF files for Vega genes and pseudogenes:
    ssh hgwdev
    cd /cluster/data/danRer5/bed/vega27
    # load vegaGene table
    hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev27.gp
    # load vegaPseudoGene table
    hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudoGenev27.gp

    hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5 
# There are 36, chr1-25 and 11 scaffolds are annotated:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650

    hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 12 chroms and 1 scaffolds have pseudogenes annotated
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr24,
# Zv7_scaffold2509 
    # Only finished sequence is annotated by Vega 
    # Vega information tables:
    # mySQL table definition and autosql-generated files created previously 
    # for zebrafish-specific information (vegaInfoZfish).
    # Add clone_id to a separate table instead of this one. A tab-separated 
    # file of transcript ID and clone ID was provided by Tina Eyre 
    # (te3@sanger.ac.uk) at Sanger.  
    # Need to grep for the transcript IDs     
    # created a second table for the cloneId accessions since there
    # are multiple ids for some VEGA genes. Otherwise, there would be 
    # a comma separated list in this field or many rows repeated but just
    # different in the cloneId field. Associate transcript ID to clone IDs.  
    # Table definitions are in danRer4.txt.
    
    # first check that the clone IDs file is complete.
    # this is from Tina Eyre sent on 2007-09-07 and the file is
    # vegav27_transcript_to_clone.txt.zip
    ssh kkstore06
    cd /cluster/data/danRer5/bed/vega27
    unzip vegav27_transcript_to_clone.txt.zip
    awk '{print $1}' vegav27_transcript_to_clone.txt | sort | uniq \
        > transWithCloneId.txt
    wc -l transWithCloneId.txt
    # 12819 transWithCloneId.txt
    awk '{print $1}' vegav27InfoWithTransIdsUniq.txt | sort | uniq \
        > vegaInfoTransIds.txt
    wc -l vegaInfoTransIds.txt
    # 12819 vegaInfoTransIds.txt
    # compare to vega27OtterIds.txt
    comm -12 transWithCloneId.txt vegaInfoTransIds.txt | wc -l
    # 12819
    # so all the transcript IDs are in this list.
    # sort this table:
    sort vegav27_transcript_to_clone.txt | uniq > vegav27TransToClone.txt
    
    # load these tables: 
    ssh hgwdev
    cd /cluster/data/danRer5/bed/vega27
    hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
                 vegav27InfoWithTransIdsUniq.txt
    hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
                 vegav27TransToClone.txt

# Add trackDb.ra entry. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add version number to description. 

###########################################################################
# BLASTZ FOR MEDAKA (oryLat1) (DONE, 2007-09-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
    # No lineage-specific repeats for this species pair. 
    # Medaka also has no species-specific repeats in the RepeatMasker
    # library so run this using dynamic masking.
    ssh kkstore06
    mkdir /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
    cd /cluster/data/danRer5/bed
    ln -s blastz.oryLat1.2007-09-10 blastz.oryLat1
    cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
# Use Blastz parameters as for tetNig1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q 
# matrix as medaka is quite distant from zebrafish. Blastz uses 
# lineage-specfic repeats but there are none for these two species.
# Use soft-masked scaffolds for oryLat1 (and dynamic masking) and also use 
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for medaka. The random scaffolds were used in Blastz
# separately (oryLat1UnScaffolds.2bit) rather than as an artificial unordered
# chromosome to avoid getting Blastz alignments across non-contiguous
# scaffolds. 
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Medaka (oryLat1)
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - zebrafish (danRer5) 
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Medaka (oryLat1)
# (40M chunks covers the largest chroms in one # gulp)
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
TMPDIR=/scratch/tmp
'_EOF_'
   # << this line keeps emacs coloring happy
   chmod +x DEF
   ## use screen
   time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -qRepeats=windowmaskerSdust \
        -blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \
        `pwd`/DEF >& doBlastz.log &
   # 0.154u 0.096s 6:13:59.67 0.0%   0+0k 0+0io 4pf+0w

# TEST PARAMETERS:
# RUN 1 - parameters above used for fugu queried against zebrafish before.
# BLASTZ_H=2500
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# 
# TRY WITH PARAMETERS USED FOR DANRER4 - Run 2:
# use parameters for oryLat1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q 
# matrix as medaka is quite distant from zebrafish. Blastz uses 
# lineage-specfic repeats but there are none for these two species.
# Use soft-masked scaffolds and dynamic masking.
# zebrafish (danRer5) vs. Medaka (oryLat1)
# BLASTZ=blastz.v7.x86_64
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q

   # Run 1:
   featureBits danRer5 refGene:cds chainOryLat1Run1Link -enrichment
# refGene:cds 1.019%, chainOryLat1Run1Link 10.478%, both 0.895%, cover 87.79%,
# enrich 8.38x
   # Run 2:
   featureBits danRer5 refGene:cds chainOryLat1Link -enrichment
# refGene:cds 1.019%, chainOryLat1Link 11.164%, both 0.894%, cover 87.77%,
# enrich 7.86x
   # Chains danRer4, parameters like Run 2:
   featureBits danRer4 refGene:cds chainOryLat1Link -enrichment
# refGene:cds 0.952%, chainOryLat1Link 12.366%, both 0.813%, cover 85.43%,
# enrich 6.91x
   # Run 1: 
   hgsql -e 'select count(*) from chainOryLat1Run1Link;' danRer5
+----------+
| count(*) |
+----------+
| 32786056 |
+----------+
    # Run 2:
    hgsql -e 'select count(*) from chainOryLat1Link;' danRer5
+----------+
| count(*) |
+----------+
| 46341892 |
+----------+
    # Run 1:
    hgsql -e 'select count(*) from chainOryLat1Run1;' danRer5
+----------+
| count(*) |
+----------+
|  2197571 |
+----------+
    # Run 2:
    hgsql -e 'select count(*) from chainOryLat1;' danRer5
+----------+
| count(*) |
+----------+
|  2961333 |
+----------+
   # Nets Run 1:
   featureBits danRer5 refGene:cds netOryLat1Run1 -enrichment
# refGene:cds 1.019%, netOryLat1Run1 65.343%, both 0.977%, cover 95.85%,
# enrich 1.47x
   # Nets Run 2:
   featureBits danRer5 refGene:cds netOryLat1 -enrichment   
# refGene:cds 1.019%, netOryLat1 66.238%, both 0.974%, cover 95.63%, enrich
# 1.44x
   # Nets danRer4, parameters like Run 2:
   featureBits danRer4 refGene:cds netOryLat1 -enrichment   
# refGene:cds 0.952%, netOryLat1 64.174%, both 0.888%, cover 93.29%, enrich
# 1.45x
   # [1]    Exit 25
   # /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk
   # 0.188u 0.132s 7:33:05.44 0.0%   0+0k 0+0io 3pf+0w
   # Command failed:
# ssh -x hgwdev nice
# /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10/axtChain/installDownloads.csh
   ssh hgwdev
   # remove downloads and run install downloads again
   cd /usr/local/apache/htdocs/goldenPath/danRer5/
   rm -r vsOryLat1
   rm -r liftOver
   ssh kkstore06
   cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
   # Run script again starting at download step
   time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -qRepeats=windowmaskerSdust -continue download \
        -blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \
        `pwd`/DEF >& doBlastz2.log &
   # 0.095u 0.047s 17:35.63 0.0%     0+0k 0+0io 0pf+0w

 ## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
  # ALIGNMENT DOWNLOADS FOR DANRER5 ON ORYLAT1 - documented in oryLat1.txt

###########################################################################
# SPLIT MASKED SEQUENCE INTO CHROM AND SCAFFOLD FASTA FILES
# (DONE, 2007-09-10, hartera)
   ssh kkstore06
   cd /cluster/data/danRer5/
   mkdir splitSoftMask
   cd splitSoftMask
   twoBitToFa ../danRer5.2bit danRer5.softMasked.fa
   faSplit byname danRer5.softMasked.fa .
   rm danRer5.softMasked.fa
    
   # Copy the chrom and scaffold sequences to /san/sanvol1/scratch/danRer5
   mkdir -p /san/sanvol1/scratch/danRer5/splitSoftMask
   foreach f (/cluster/data/danRer5/splitSoftMask/*.fa)
      cp -p $f /san/sanvol1/scratch/danRer5/splitSoftMask/
   end

   foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
      ls $f >> seq.lst
   end
   wc -l seq.lst
   # 5036 seq.lst 
   # correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
   rm seq.lst

   # Copy the split sequences to iscratch dir on kkr1u00 
   ssh kkr1u00
   rm -r /iscratch/i/danRer5/splitSoftMask
   mkdir -p /iscratch/i/danRer5/splitSoftMask
   foreach s (/cluster/data/danRer5/splitSoftMask/*.fa)
      echo "Copying $s ..."
      cp $s /iscratch/i/danRer5/splitSoftMask/
   end
 
   # Count files transferred
   foreach f (/iscratch/i/danRer5/splitSoftMask/*.fa)
      ls $f >> seq.lst  
   end
   wc -l seq.lst
   # 5036 seq.lst
   # correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
   rm seq.lst

   # rsync to cluster machines
   foreach R (2 3 4 5 6 7 8)
      rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
   end
    
###########################################################################
# AFFY ZEBRAFISH GENOME ARRAY CHIP TRACK (DONE, 2007-09-10 - 2007-09-12) 
# With the first version of this track on danRer4, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since as the
# higher minIdentity is causing alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
# Load both consensus and target sequences to form a composite track. Users
# have asked for Affy target sequences in the past.
    # Array chip consensus sequences already downloaded for danRer1
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyZebrafish
    mkdir -p /san/sanvol1/scratch/affy
    # Affy Zebrafish consensus sequences already in this directory:
   cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /san/sanvol1/scratch/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer5
    mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
    ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
          /cluster/data/danRer5/bed/affyZebrafish

    ssh pk
    cd /cluster/data/danRer5/bed/affyZebrafish
    ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
    foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
       ls -1 $f >> genome.lst
    end
    wc -l genome.lst 
    # 5036 genome.lst
    # for output:
    mkdir -p /san/sanvol1/scratch/danRer5/affy/psl
    # use -repeats option to report matches to repeat bases separately
    # to other matches in the PSL output.
    echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer5/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP'
> template.sub

    /parasol/bin/gensub2 genome.lst affy.lst template.sub para.spec
    /parasol/bin/para create para.spec
    # 5036 jobs written to batch
    /parasol/bin/para try, check, push ... etc.
    /parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs:      22117s     368.61m     6.14h    0.26d  0.001 y
#IO & Wait Time:                 14290s     238.17m     3.97h    0.17d  0.000 y
#Average job time:                   7s       0.12m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             840s      14.00m     0.23h    0.01d
#Submission to last job:           988s      16.47m     0.27h    0.01d

    # need to do pslSort
    ssh pk
    cd /san/sanvol1/scratch/danRer5/affy
    # Do sort and then best in genome filter.
    # only use alignments that have at least
    # 95% identity in aligned region.
    # Previously did not use minCover since a lot of sequence is in
    # Un and NA so genes may be split up so good to see all alignments.
    # However, found a number of short alignments of <= 50 bp. These are
    # not meaningful so maybe need to use minCover. If increased too much,
    # then hits on poor parts of the assembly will be missed.
    # use pslCDnaFilter with the same parameters as used for zebrafish
    # Genbank EST alignments.
    pslSort dirs raw.psl tmp psl
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
       -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl \
       affyZebrafishConsensus.psl
#                         seqs    aligns
#             total:     15295   459469
# drop minNonRepSize:     2791    391655
#     drop minIdent:     2590    30197
#     drop minCover:     1712    7112
#        weird over:     237     1038
#        kept weird:     176     228
#    drop localBest:     1393    13532
#              kept:     14971   16973
# 96.6% of sequences aligned. There are 15502 Affy consensus sequences.

# FOR DANRER4:
#                         seqs    aligns
#             total:     15272   828202
#drop minNonRepSize:     2763    741674
#     drop minIdent:     2656    39188
#     drop minCover:     2550    10784
#        weird over:     359     1439
#        kept weird:     277     347
#    drop localBest:     2830    17737
#              kept:     14952   18819
# Kept 97.9% of alignments. There are 15502 Affy sequences originally
# aligned so there are now 96.5% remaining.
     
    # rsync these psl files
    rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*.psl \
         /cluster/data/danRer5/bed/affyZebrafish/
    
    ssh kkstore06
    cd /cluster/data/danRer5/bed/affyZebrafish
    # shorten names in psl file
    sed -e 's/Zebrafish://' affyZebrafishConsensus.psl > tmp.psl
    mv tmp.psl affyZebrafishConsensus.psl
    pslCheck affyZebrafishConsensus.psl
    # psl is good

    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer5/bed/affyZebrafish
    hgLoadPsl danRer5 affyZebrafishConsensus.psl
    # Add consensus sequences for Zebrafish chip
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
    /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
      /gbdb/hgFixed/affyProbes
    # these sequences were loaded previously so no need to reload.
    hgLoadSeq -abbr=Zebrafish: danRer5 \
              /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
    # Clean up
    rm batch.bak raw.psl
    # check number of short alignments:
    hgsql -e \
     'select count(*) from affyZebrafishConsensus where (qEnd - qStart) <= 50;'\
     danRer5
    # 6
    # There were 7 for danRer4 - this is ok.
    
    ### TARGET SEQUENCES SUBTRACK: (2007-09-11 - 2007-09-12)
    ### Create subtrack of target sequences, this is just the region of the 
    # consensus sequence that is used for designing the probesets.
    
    # Array chip target sequences downloaded from 
 # http://www.affymetrix.com/support/technical/byproduct.affx?product=zebrafish
    ssh hgwdev
    # copy sequences to /projects/compbio/data/microarray/affyZebrafish
    cd /projects/compbio/data/microarray/affyZebrafish
    unzip Zebrafish_target.zip
    grep '>' Zebrafish_target > target.headers
    wc -l target.headers
    # 15617 target.headers
    # includes the control sequences
    # remove these
    grep -v "AFFX" target.headers > targetNoControls.headers
    wc -l targetNoControls.headers
    # 15502 targetNoControls.headers
    mv Zebrafish_target Zebrafish_target.fa 
    grep '>' Zebrafish_target.fa | wc -l
    # 15617 sequences
    awk 'BEGIN {FS=";"} {print $1;}' targetNoControls.headers \
        | sed -e 's/>//' > targetNoControls.ids
    wc -l *.ids
    # 15502 targetNoControls.ids
    # remove semi-colon after name in the target FASTA file
    sed -e 's/;//' Zebrafish_target > Zebrafish_target.fa
    faSomeRecords Zebrafish_target.fa targetNoControls.ids \
          Zebrafish_targetNoControls.fa
    grep '>' Zebrafish_targetNoControls.fa | wc -l
    # 15502
    # Remove "Zebrafish:" from name, leave in "tg:" to differentiate 
    # sequences from the consensus ones.
    perl -pi.bak -e 's/>target:Zebrafish:/>tg:/' Zebrafish_targetNoControls.fa
    
    # make affy directory on the san if it does not exist already:
    mkdir -p /san/sanvol1/scratch/affy
   cp -p \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
       /san/sanvol1/scratch/affy/
    # Set up cluster job to align Zebrafish consensus sequences to danRer5
    # Directories below were set up above
   
    # mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2006-09-10
    # ln -s /cluster/data/danRer5/bed/affyZebrafish.2006-09-10 \
    #      /cluster/data/danRer5/bed/affyZebrafish
    mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2006-09-10/target

    ssh pk
    cd /cluster/data/danRer5/bed/affyZebrafish/target
    ls -1 /san/sanvol1/scratch/affy/Zebrafish_targetNoControls.fa \
        > affyTarget.lst
    foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
       ls -1 $f >> genome.lst
    end
    wc -l genome.lst 
    # 5036 genome.lst
    # for output:
    mkdir -p /san/sanvol1/scratch/danRer5/affy/pslTarget
    # use -repeats option to report matches to repeat bases separately
    # to other matches in the PSL output.
    echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer5/affy/pslTarget/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub

    /parasol/bin/gensub2 genome.lst affyTarget.lst template.sub para.spec
    /parasol/bin/para create para.spec
    # 5036 jobs written to batch
    /parasol/bin/para try, check, push ... etc.
# Keeps crashing - can't write to directory - permissions look ok.
# Tried again (2007-09-12, hartera) and it worked fine.
    /parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs:      17604s     293.40m     4.89h    0.20d  0.001y
#IO & Wait Time:                 14325s     238.75m     3.98h    0.17d  0.000 y
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             767s      12.78m     0.21h    0.01d
#Submission to last job:           842s      14.03m     0.23h    0.01d

    # need to do pslSort
    ssh pk
    cd /san/sanvol1/scratch/danRer5/affy
    # Do sort and then best in genome filter.
    pslSort dirs rawTarget.psl tmp pslTarget
    # only use alignments that have at least 95% identity in aligned region.
    pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
       -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 rawTarget.psl \
       affyZebrafishTarget.psl
#                         seqs    aligns
#             total:     15216   308151
#      drop invalid:     1       1
# drop minNonRepSize:     2051    256676
#     drop minIdent:     1917    20458
#     drop minCover:     741     2951
#        weird over:     143     607
#        kept weird:     92      116
#    drop localBest:     1306    11562
#              kept:     14820   16503
# 95.6% of sequences aligned. There are 15502 Affy consensus sequences.

    # rsync these psl files
    rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*Target.psl \
         /cluster/data/danRer5/bed/affyZebrafish/target/

    ssh kkstore06
    cd /cluster/data/danRer5/bed/affyZebrafish/target
    # shorten names in psl file
    #sed -e 's/Zebrafish://' affyZebrafishTarget.psl > tmp.psl
    #mv tmp.psl affyZebrafishConsensus.psl
    pslCheck affyZebrafishTarget.psl
    # psl is good

    # load track into database
    ssh hgwdev
    cd /cluster/data/danRer5/bed/affyZebrafish/target
    hgLoadPsl danRer5 affyZebrafishTarget.psl
    # Add consensus target for Zebrafish chip
    # Copy sequences to gbdb if they are not there already
    mkdir -p /gbdb/hgFixed/affyProbes
    ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
      /gbdb/hgFixed/affyProbes
    # these sequences were loaded previously so no need to reload.
    hgLoadSeq danRer5 \
              /gbdb/hgFixed/affyProbes/Zebrafish_targetNoControls.fa
    # Clean up
    rm batch.bak rawTarget.psl
    # check number of short alignments:
    hgsql -e \
     'select count(*) from affyZebrafishTarget where (qEnd - qStart) <= 50;'\
     danRer5
    # 33
    # This is ok, these are shorter regions anyway.
    # Added trackDb.ra entry to zebrafish/danRer5/trackDb.ra. This is
    # a composite track consisting of 2 subtracks for the consensus
    # sequence Blat alignments and the target sequence Blat alignments.
    # The composite track is called affyZebrafish. Add searches for both
    # tracks.

###########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE, 2007-09-11, hartera)
# ADDED A CHROMOSOMES DOWNLOAD DIRECOTRY (DONE, 2007-09-14, hartera)
    ssh hgwdev
    cd /cluster/data/danRer5
    ln -s /cluster/data/danRer5/bed/RepeatMasker.2007-09-04/danRer5.fa.out \
          /cluster/data/danRer5
    /cluster/bin/scripts/makeDownloads.pl danRer5 -verbose=2 \
          >& jkStuff/downloads.log &
    tail -f jkStuff/downloads.log
    # Edit README.txt files when done:
# *** All done!
# *** Please take a look at the downloads for danRer5 using a web browser.
# *** Edit each README.txt to resolve any notes marked with "***":
#     /cluster/data/danRer5/goldenPath/database/README.txt
#     /cluster/data/danRer5/goldenPath/bigZips/README.txt
#    (The htdocs/goldenPath/danRer5/*/README.txt "files" are just links to
#     those.)
    # ADD CHROMOSOMES DOWNLOADS DIRECTORY
    # Add chromosome and scaffolds downloads, these are already in 
    # /cluster/data/danRer5/splitSoftMask (2007-09-14, hartera)
    # compress files:
    ssh kkstore06
    mkdir /cluster/data/danRer5/goldenPath/chromosomes
    cd /cluster/data/danRer5/splitSoftMask
    foreach c (chr*.fa)
      gzip $c
    end
    # now scaffolds, compress together into one file
    foreach s (Zv7_*.fa)
      gzip -c $s >> unmappedScaffolds.fa.gz
    end 
    mv *.gz /cluster/data/danRer5/goldenPath/chromosomes/

    ssh hgwdev
    # make symbolic links to the goldenPath downloads directory for danRer5
    mkdir /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes
    ln -s /cluster/data/danRer5/goldenPath/chromosomes/*.gz \
       /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/
    cd /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes
    md5sum *.gz > md5sum.txt
    cp /usr/local/apache/htdocs/goldenPath/danRer4/chromosomes/README.txt \ 
       /cluster/data/danRer5/goldenPath/chromosomes/
    # edit README.txt to make it specific for the danRer5 (Sanger Zv7) 
    # assembly and then link it to the downloads directory.
    ln -s /cluster/data/danRer5/goldenPath/chromosomes/README.txt \
       /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/

########################################################################
# BLASTZ Swap Mouse mm9 (DONE - 2007-09-12 - Hiram)
    ssh kkstore06
    # the original
    cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
    cat fb.mm9.chainDanRer5Link.txt
    #	48497464 bases of 2620346127 (1.851%) in intersection

    #	the swap
    mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
    cd /cluster/data/danRer5/bed/blastz.mm9.swap
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> swap.log 2>&1 &
    #	real    9m47.163s
    cat fb.danRer5.chainMm9Link.txt
    #	34017483 bases of 1435609608 (2.370%) in intersection

########################################################################
# ADD CONTIGS TRACK (DONE, 2007-09-14, hartera)
    # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from 
    # chunks (contigs) agp files.
    ssh kkstore06
    mkdir -p /cluster/data/danRer5/bed/ctgPos2
    # get chrM information from danRer4:
    ssh hgwdev 
    cd /cluster/data/danRer5/bed/ctgPos2
    hgsql -N -e 'select * from ctgPos2 where chrom = "chrM";' danRer4 \
         > chrM.ctgPos2.txt

    ssh kkstore06
    cd /cluster/data/danRer5/bed/ctgPos2
    # update chrM accession to NC_002333.2
    perl -pi.bak -e 's/AC024175\.3/NC_002333\.2/' chrM.ctgPos2.txt
    
    # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
    # get the contigs for the chroms, chr1-25:
    awk 'BEGIN {OFS="\t"} \
        {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
         /cluster/data/danRer5/Zv7release/Zv7_chr.agp >> ctgPos2.tab
    # Add chrM to ctgPos2 file:
    cat chrM.ctgPos2.txt >> ctgPos2.tab
 
    # then do the same for the unmapped scaffolds:
    awk 'BEGIN {OFS="\t"} \
        {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
        /cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
        >> ctgPos2.tab

    # load the ctgPos2 table
    ssh hgwdev
    cd /cluster/data/danRer5/bed/ctgPos2
    # use hgLoadSqlTab as it gives more error messages than using 
    # "load data local infile ...".
    hgLoadSqlTab danRer5 ctgPos2 \
            ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
    # check that all chroms and scaffolds are there
    hgsql -N -e 'select distinct(chrom) from ctgPos2;' danRer5 \
           | sort > chromsAndScafsCtgPos2.txt
    hgsql -N -e 'select distinct(chrom) from chromInfo;' danRer5 \
           | sort > chromsAndScafsChromInfo.txt
    wc -l chroms*
    # 5036 chromsAndScafsChromInfo.txt
    # 5036 chromsAndScafsCtgPos2.txt
    comm -12 chromsAndScafsCtgPos2.txt chromsAndScafsChromInfo.txt | wc -l
    # 5036
    # So all chroms and unmapped scaffolds are represented in the table.

    # cleanup
    ssh kkstore06
    cd /cluster/data/danRer5/bed/ctgPos2
    rm *.bak chromsAndScafs*

# create trackDb.ra entry and html page for ctgPos2 track.
# add search for the track and make sure the termRegex will handle
# contigs named "Zv7_scaffoldN.N" where N is an integer and all the 
# accessions in the *.agp files.

###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO 
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2007-09-16, hartera)
# Array data is for whole embryos of five wild type zebrafish strains. 
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's 
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
# see danRer4.txt and hgFixed.txt for the history of this track and the 
# expression data.
    ssh hgwdev
    mkdir -p /cluster/data/danRer5/bed/ZonLab/wtArray
    cd /cluster/data/danRer5/bed/ZonLab/wtArray
    
    # Map the data to the consensus sequence alignments first. 
    # use AllRatio table for mapping. There are not many arrays in this 
    # dataset so using AllRatio will allow the selection of All Arrays
    # from the track controls on the track description page. Also set up the
    # Zebrafish microarrayGroups.ra so that the Medians of replicates or
    # Means of replicates can also be selected for display.
    # Create mapped data in zebrafishZonWT.bed.
    hgMapMicroarray zebrafishZonWTAffyConsensus.bed \
         hgFixed.zebrafishZonWTAllRatio \
         /cluster/data/danRer5/bed/affyZebrafish/affyZebrafishConsensus.psl
    # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
    # Mapped 14971,  multiply-mapped 2002, missed 0, unmapped 646

    hgLoadBed danRer5 affyZonWildType zebrafishZonWTAffyConsensus.bed
    # Loaded 16973 elements of size 15

    # then for the Affy Zebrafish chip target sequences, map these to the 
    # Wild Type expression data
    hgMapMicroarray -pslPrefix="tg:" zebrafishZonWTAffyTarget.bed \
         hgFixed.zebrafishZonWTAllRatio \
         /cluster/data/danRer5/bed/affyZebrafish/target/affyZebrafishTarget.psl
    # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
    # Mapped 14820,  multiply-mapped 1683, missed 0, unmapped 797

    hgLoadBed danRer5 affyTargetZonWildType zebrafishZonWTAffyTarget.bed
    # Loaded 16503 elements of size 15

    # add trackDb.ra entry at trackDb/zebrafish/danRer5 level to create
    # a composite affyZonWildType with alignments and array data for both
    # Affy Zebrafish chip consensus sequences and target sequences - see
    # danRer4.txt for more details of history of trackDb.ra and 
    # microarrayGroups.ra entries for this data.

###########################################################################
# BLASTZ FOR TETRAODON (tetNig1) (DONE, 2007-09-16, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
    # No lineage-specific repeats for this species pair. 
    # Tetraodon also has no species-specific repeats in the RepeatMasker
    # library so run this using dynamic masking.
    # The tetraodon 2bit file of scaffolds was used 
    # (tetNig1.randomContigs.sdTrf.2bit) - this contains sequences for 
    # scaffolds from randoms. These were used for Blastz rather than the 
    # random chroms so that alignments do not occur across non-contiguous 
    # scaffolds.
    ## use screen to run
    ssh kkstore06
    mkdir /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
    cd /cluster/data/danRer5/bed
    ln -s blastz.tetNig1.2007-09-16 blastz.tetNig1
    cd /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16

# Use Blastz parameters for tetNig1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q 
# matrix as tetraodon is quite distant from zebrafish. Blastz uses 
# lineage-specfic repeats but there are none for these two species.
# Use soft-masked scaffolds for tetNig1 (and dynamic masking) and also use 
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for tetraodon.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Tetraodon (tetNig1)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=1000000000
SEQ2_LAP=0

BASE=/cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
TMPDIR=/scratch/tmp
'_EOF_'
   # << this line keeps emacs coloring happy
   chmod +x DEF
   mkdir /san/sanvol1/scratch/danRer5TetNig1
   time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -qRepeats=windowmaskerSdust \
        -blastzOutRoot /san/sanvol1/scratch/danRer5TetNig1 \
        `pwd`/DEF >& doBlastz.log &
   # Took about 5 hours 7 minutes to run.

   featureBits danRer5 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 1.019%, chainTetNig1Link 6.359%, both 0.866%, cover 84.94%,
# enrich 13.36x  
   featureBits danRer4 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.953%, chainTetNig1Link 7.345%, both 0.836%, cover 87.72%,
# enrich 11.94x
   featureBits danRer5 refGene:cds netTetNig1 -enrichment
# refGene:cds 1.019%, netTetNig1 63.685%, both 0.961%, cover 94.30%, enrich
# 1.48x
   featureBits danRer4 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.953%, netTetNig1 63.389%, both 0.909%, cover 95.41%, enrich
# 1.51x

   ## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
    # ALIGNMENT DOWNLOAD FOR DANRER5 ON TETNIG1 - documented in tetNig1.txt

###########################################################################
# BLASTZ FOR FUGU (fr2) (DONE, 2007-09-18 - 2007-09-19, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
    # No lineage-specific repeats for this species pair. 
    # Fugu also has no species-specific repeats in the RepeatMasker
    # library so run this using dynamic masking.
    # Usee the Fugu 2bit file of scaffolds (fr2.scaffolds.2bit) - this 
    # contains sequences for all the scaffolds of chrUn. The whole assembly
    # is unmapped, unordered scaffolds. The individual scaffolds were    
    # used for Blastz rather than the random chroms so that alignments do not 
    # occur across non-contiguous scaffolds.

    ## use screen to run
    ssh kkstore06
    mkdir /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
    cd /cluster/data/danRer5/bed
    ln -s blastz.fr2.2007-09-18 blastz.fr2
    cd /cluster/data/danRer5/bed/blastz.fr2.2007-09-18

# Use Blastz parameters for fr1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q 
# matrix as Fugu is quite distant from zebrafish. Blastz uses 
# lineage-specfic repeats but there are none for these two species.
# Use soft-masked scaffolds for fr2 (and dynamic masking) and also use 
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for Fugu. Fugu is about 400.5 Mb of scaffolds plus
# chrM which is about 16.4 kb.

cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Fugu (fr2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Fugu (fr2)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/danRer5/bed/blastz.fr2.2007-09-18
TMPDIR=/scratch/tmp
'_EOF_'
   # << this line keeps emacs coloring happy
   chmod +x DEF
   mkdir /san/sanvol1/scratch/danRer5Fr2
   time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -qRepeats=windowmaskerSdust \
        -blastzOutRoot /san/sanvol1/scratch/danRer5Fr2 \
        `pwd`/DEF >& doBlastz.log &
   # Took about 6 hours and 50 minutes to run.
   cat fb.danRer5.chainFr2Link.txt
   # 104476108 bases of 1435609608 (7.277%) in intersection
   # Compare coverage to refGene CDS: 
   featureBits danRer5 refGene:cds chainFr2Link -enrichment
# refGene:cds 1.019%, chainFr2Link 7.277%, both 0.886%, cover 86.93%, 
# enrich 11.95x   
   featureBits danRer4 refGene:cds chainFr2Link -enrichment
# refGene:cds 0.955%, chainFr2Link 8.543%, both 0.810%, cover 84.73%, 
# enrich 9.92x   
   featureBits danRer5 refGene:cds netFr2 -enrichment
# refGene:cds 1.019%, netFr2 64.210%, both 0.968%, cover 94.95%, enrich 1.48x
   featureBits danRer4 refGene:cds netFr2 -enrichment
# refGene:cds 0.955%, netFr2 62.980%, both 0.885%, cover 92.68%, enrich 1.47x 

   ## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
    # ALIGNMENT DOWNLOAD FOR DANRER5 ON FR2 -document in fr2.txt

###########################################################################
# CREATE LIFTOVER CHAINS FROM danRer5 TO danRer4 
# (DONE 2007-09-19 AND 2007-09-21, hartera)
    ssh kkstore06
    mkdir /cluster/data/danRer5/bed/blat.danRer4
    cd /cluster/data/danRer5/bed/blat.danRer4

    time nice doSameSpeciesLiftOver.pl danRer5 danRer4 \
        -bigClusterHub pk \
        -ooc /san/sanvol1/scratch/danRer5/danRer5_11.ooc \
	-buildDir /cluster/data/danRer5/bed/blat.danRer4 >& do.log &
    # 0.401u 0.253s 8:09:03.62 0.0%   0+0k 0+0io 11pf+0w
    # Remove symbolic link to liftOver chains and copy over the file
    rm ../liftOver/danRer5ToDanRer4.over.chain.gz
    cp -p danRer5ToDanRer4.over.chain.gz ../liftOver

    # a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has 
    # already been made to this file and md5sum.txt needs to be updated
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/danRer5/liftOver
    md5sum *.gz > md5sum.txt
 
###########################################################################
# VEGA GENES UPDATE TO v28 (DONE, 2007-09-28 - 2007-10-01, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# UPDATE SENT ON 2007-09-27, Vega v28. RE-LOAD TABLES TO UPDATE THE 
    ssh kkstore06
    mkdir -p /cluster/data/danRer5/bed/vega28/data/
    cd /cluster/data/danRer5/bed/vega28/data
    # 3 files: vegav28.gtf.zip  vegav28_information.txt.zip
    # vegav28_transcript_to_clone.txt.zip
    unzip vegav28.gtf.zip
    unzip vegav28_information.txt.zip  
    unzip vegav28_transcript_to_clone.txt.zip   
    # asked for the vegav28_information.txt file to be in the format 
    # required to load the table directory. 
    # Check the format then prepare data files to load database tables.
    cd /cluster/data/danRer5/bed/vega28
    # this time all the lines are describing genes and there is no extra 
    # header lines and output from the program that generated the file. 
    # NOTE: there are no NA scaffolds. 
    # Remove "chr" prefix from scaffolds
    sed -e 's/chrZv7_/Zv7_/' ./data/vegav28.gtf > vegav28Fixed.gtf   
    # vegaInfo is transcriptId, otterId, geneId, method and geneDesc
    # Get otter transcript ID and otter gene ID:
    awk 'BEGIN{FS=";"} {OFS="\t"} \
        {if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
         print $5, $6;}' vegav28Fixed.gtf | sort | uniq > vega28OtterIds.txt
    # remove the otter_gene_id and otter_transcript_id and 
    # extra spaces and quotation marks.
    perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega28OtterIds.txt
    perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
         vega28OtterIds.txt
    wc -l vega28OtterIds.txt
    # 14001 vega28OtterIds.txt
    # there were 12819 in Vega v27
    # get list of the otter gene IDs only
    awk '{print $1}' vega28OtterIds.txt | sort | uniq > otterGeneIds.only
    # then get list of otter gene IDs from information file
    awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav28_information.txt \
        | sort | uniq > infoOtterGeneIds.only
    comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
    # 8489
    wc -l *.only
    # 10374 infoOtterGeneIds.only
    # 9318 otterGeneIds.only
    comm -23 otterGeneIds.only infoOtterGeneIds.only > geneIdsGtfNotInfo.txt
    # also check the transcripts in transcript to clone are all there.
    awk 'BEGIN {FS="\t"} {print $2;}' vega28OtterIds.txt \
        | sort | uniq > otterTxIds.only
    wc -l otterTxIds.only
    # 14001 otterTxIds.only
    awk 'BEGIN {FS="\t"} {print $1;}' ./data/vegav28_transcript_to_clone.txt \
        | sort | uniq > vegaCloneTxIds.sort
    wc -l vegaCloneTxIds.sort
    # 14001 vegaCloneTxIds.sort
    comm -12 otterTxIds.only vegaCloneTxIds.sort | wc -l
    # 14001
    # All otter transcript IDs in the GTF vegav28 gene file are in the
    # vegav28_transcript_to_clone.txt file as expected.
    wc -l geneIdsGtfNotInfo.txt
    # 829 geneIdsGtfNotInfo.txt
    # E-mailed Tina Eyre at Sanger - on 2007-09-28 - to ask about 
    # these otter gene Ids that are in the GTF file but not in the 
    # vegav28_information.txt file.
    # Received correct file: 2007-10-01
    # Check again
    ssh kkstore06
    cd /cluster/data/danRer5/bed/vega28/data
    unzip vegav28_information.txt.zip
    cd ..    
    # then get list of otter gene IDs from information file
    awk '{if ($1 ~ /OTTDARG/) print $1}' ./data/vegav28_information.txt \
        | sort | uniq > infoOtterGeneIds.only
    comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
    # 9318
    wc -l *.only
    # 11115 infoOtterGeneIds.only
    # 9318 otterGeneIds.only
    # All IDs from the GTF file are contained in the information file.
    # all the gene IDs from the GTF file but the information file contains 
    # 1797 are in the information file but not the GTF file
    # Extra genes are included that are found on clones that do not map 
    # to Ensembl.
    comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;

my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $idsFile: $!\n";

# Read in the otter gene IDs and transcript IDs
while (<IDS>)
{
my (@f, $line);
chomp;
$line = $_;
# split line on tab
@f = split(/\t/, $line);
# hash keyed by gene ID, stores transcript ID
# more than one ID so store as an array
if ($f[0] =~ /OTTDARG/)
   {
   # add to array in hash if it is an otter gene ID
   push @{$idsHash{$f[0]}}, $f[1];
   }
}
close IDS;

while (<INFO>)
{
my ($line, @info, $geneId, @transIds);
# read in information file and get the otter gene ID
$line = $_;
# split line on tab
@info = split(/\t/, $line);
if ($info[0] =~ /OTTDARG/)
   {
   $geneId = $info[0];
   # look up transcript ID in hash 
   
   # if gene ID exists in the hash then print out the information
   # file line with the correct transcript ID
   if (exists($idsHash{$geneId}))
      {
      @transIds = @{$idsHash{$geneId}};
      # rewrite line with transcript ID
      foreach my $t (@transIds)
         {
         print "$t\t$line";
         }
      }
   }
}
close INFO;
'EOF'
   # << happy emacs
    chmod +x mergeTransIdAndInfo.pl
    mergeTransIdAndInfo.pl vega28OtterIds.txt ./data/vegav28_information.txt \
         > vegav28InfoWithTransIds.txt 
    wc -l vega28OtterIds.txt
    # 14001 vega28OtterIds.txt
    wc -l vegav28InfoWithTransIds.txt
    # 14001 vegav28InfoWithTransIds.txt
    # so just sort and uniq the resulting information file with transcript IDs
    sort vegav28InfoWithTransIds.txt | uniq > vegav28InfoWithTransIdsUniq.txt
    wc -l vegav28InfoWithTransIdsUniq.txt
    # 14001 vegav28InfoWithTransIdsUniq.txt
    # So no duplicated lines.
    # There are extra IDs in the information file as it includes all Vega
    # genes (including clones not in Ensembl). Those actually in the Vega 
    # v27 GTF are those that are found on clones that are in Ensembl.
    # Vega includes only clone sequence and Zv7_NA scaffolds are WGS so 
    # these are not annotated in Vega. This information is from 
    # an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.

    # then use the info file to grab those genes that are pseudogenes, get the
    # transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
    # to a separate file. 

    # Next step is to find which transcripts are pseudogenes.
    grep pseudogene vegav28InfoWithTransIdsUniq.txt | sort | uniq | wc -l
    # found 68 pseudogenes so need to create the pseudogene track 
    # Get transcript IDs for pseudogenes.
    grep pseudogene vegav28InfoWithTransIdsUniq.txt | awk '{print $1}' \
         > pseudogenes.ids 
    grep -w -f pseudogenes.ids vegav28Fixed.gtf > vegav28Pseudogene.gtf 
    awk '{print $20}' vegav28Pseudogene.gtf | sort | uniq | wc -l
    # 68
    # Need to make the GTF file vegGene table by subtracting pseudogenes:
    grep -vw -f pseudogenes.ids vegav28Fixed.gtf > vegaGenev28.gtf
    wc -l vega*gtf
    # 178987 vegaGenev28.gtf
    # 179249 vegav28Fixed.gtf
    # 262 vegav28Pseudogene.gtf
    # Need to relabel IDs to get the name to be the otter transcript ID
    # and name 2 to be the transcript_id (needs to be labeled as gene_id)
    # Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
    # groups the rows by this ID.  
    sed -e 's/gene_id/tmp_id/' vegaGenev28.gtf \
        | sed -e 's/transcript_id/gene_id/' \
        | sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev28Format.gtf

    # Do the same for the pseudogene GTF file:
    sed -e 's/gene_id/tmp_id/' vegav28Pseudogene.gtf \
        | sed -e 's/transcript_id/gene_id/' \
        | sed -e 's/otter_transcript_id/transcript_id/' \
        > vegaPseudogenev28Format.gtf
    
    gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
        vegaGenev28Format.gtf vegaGenev28.gp
    genePredCheck vegaGenev28.gp
    # checked: 13933 failed: 0
    gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
        vegaPseudogenev28Format.gtf vegaPseudogenev28.gp
    genePredCheck vegaPseudogenev28.gp
    # checked: 68 failed: 0
    
    # load GTF files for Vega genes and pseudogenes:
    ssh hgwdev
    cd /cluster/data/danRer5/bed/vega28
    hgsql -e 'drop table vegaGene;' danRer5
    # load vegaGene table
    hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev28.gp
    hgsql -e 'drop table vegaPseudoGene;' danRer5
    # load vegaPseudoGene table
    hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudogenev28.gp

    hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5 
# For vegav28 there are 29, chr1-25 plue 4 scaffolds:
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2572

# For Vega v27, there were more scaffolds annotated so e-mailed Tina Eyre
# to ask why this is the case.
# There are 36, chr1-25 and 11 scaffolds are annotated:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650

    hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 13 chroms and 1 scaffolds have pseudogenes annotated
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr22, chr24,
# Zv7_scaffold2509 
    # Only finished sequence is annotated by Vega 
    # Vega information tables:
    # mySQL table definition and autosql-generated files created previously 
    # for zebrafish-specific information (vegaInfoZfish).
    # Add clone_id to a separate table instead of this one. A tab-separated 
    # file of transcript ID and clone ID was provided by Tina Eyre 
    # (te3@sanger.ac.uk) at Sanger.  
    # Need to grep for the transcript IDs     
    # created a second table for the cloneId accessions since there
    # are multiple ids for some VEGA genes. Otherwise, there would be 
    # a comma separated list in this field or many rows repeated but just
    # different in the cloneId field. Associate transcript ID to clone IDs.  
    # Table definitions are in danRer4.txt.
    
    # Load the vega to clone ID information:
    ssh kkstore06
    cd /cluster/data/danRer5/bed/vega28
    sort ./data/vegav28_transcript_to_clone.txt | uniq \
         > vegav28TransToClone.txt
    
    # load the vegaInfo and vegaToCloneId tables: 
    ssh hgwdev
    cd /cluster/data/danRer5/bed/vega28
    hgsql -e 'drop table vegaInfoZfish;' danRer5
    hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
                 vegav28InfoWithTransIdsUniq.txt
    hgsql -e 'drop table vegaToCloneId;' danRer5
    hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
                 vegav28TransToClone.txt
    
    # A peptide file was requested this time (vegav28_protein.fa) so load
    # it into vegaPep:
    ssh kkstore06
    cd /cluster/data/danRer5/bed/vega28
    # There are more peptides in the FASTA file than there are transcript IDs
    # in the GTF file so get those peptides sequences for transcripts in GTF.
    # otterTxIds.only is the list of transcript IDs (14,001 IDs).
    faSomeRecords ./data/vegav28_protein.fa otterTxIds.only vega28Pep.fa 
    grep '>' vega28Pep.fa | wc -l 
    # 10110 

    grep '>' vega28Pep.fa | sed -e 's/>//' | sort | uniq > vega28PepTxIds.txt
    comm -12 vega28PepTxIds.txt otterTxIds.only | wc -l
    # 10110
    comm -13 vega28PepTxIds.txt otterTxIds.only > otterTxNotInPeptides.txt
    wc -l otterTxNotInPeptides.txt
    # 3891 otterTxNotInPeptides.txt
    # Checked a few and these appear to be novel processed transcripts or novel 
    # protein-coding so probably the novel ones do not have an associated 
    # protein - check with Tina Eyre.
    ssh hgwdev
    cd /cluster/data/danRer5/bed/vega28
    hgPepPred danRer5 generic vegaPep vega28Pep.fa
    # hgsql -e 'select count(*) from vegaPep;' danRer5
    # 10110
    # Check which type of transcripts do not have a peptide translation.
    hgsql -N -e 'select distinct(name) from vegaPep;' danRer5 | \
         sort > vegaPep.names.sort
    hgsql -N -e 'select distinct(transcriptId) from vegaInfoZfish;' danRer5 \
         | sort > vegaInfo.txId.sort
    comm -12 vegaPep.names.sort vegaInfo.txId.sort | wc -l
    # 10110
    wc -l *.sort
    # 14001 vegaInfo.txId.sort
    # 10110 vegaPep.names.sort
    comm -13 vegaPep.names.sort vegaInfo.txId.sort > vegaTx.inInfoNotPep.txt
    wc -l vegaTx.inInfoNotPep.txt
    # 3891 vegaTx.inInfoNotPep.txt
    foreach t (`cat vegaTx.inInfoNotPep.txt`)
       hgsql -N -e "select transcriptId, method, confidence from vegaInfoZfish
where transcriptId = '${t}';" danRer5 >> vegaTx.inInfoNotPep.type
    end

    awk 'BEGIN {OFS="\t"} {print $2, $3}' vegaTx.inInfoNotPep.type \
        | sort | uniq -c | sort -nr > vegaTx.inInfoNotPep.counts
 
# Add trackDb.ra entry for trackDb/zebrafish/danRer5/trackDb.ra if it is not
# there already. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add version number to description. 

###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-10-18)
    ssh kkstore06
    # bash  if not using bash shell already

    mkdir /cluster/data/danRer5/blastDb
    cd /cluster/data/danRer5
    awk '{if ($2 > 1000000) print $1}' chrom.sizes > great1M.lst
    twoBitToFa danRer5.2bit stdout | faSomeRecords stdin great1M.lst temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa
    twoBitToFa danRer5.2bit stdout | faSomeRecords -exclude stdin great1M.lst temp.fa
    faSplit sequence temp.fa 156 blastDb/y
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    mkdir -p /cluster/bluearc//danRer5/blastDb
    cd /cluster/data/danRer5/blastDb
    for i in nhr nin nsq; 
    do 
	echo $i
	cp *.$i /cluster/bluearc//danRer5/blastDb
    done

    mkdir -p /cluster/data/danRer5/bed/tblastn.hg18KG
    cd /cluster/data/danRer5/bed/tblastn.hg18KG
    echo /cluster/bluearc//danRer5/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 1456 query.lst

   # we want around 200000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)

# 36727/(200000/1456) = 267.372560

   mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa
   split -l 267 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa/kg
   ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa kgfa
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
   ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/danRer5/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer5/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp
        then                   
            mv $3.tmp $3      
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0 
    fi        
fi                                                                                
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit 
    
    ssh kk
    cd /cluster/data/danRer5/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time

# Completed: 75786 of 75786 jobs
# CPU time in finished jobs:   13289094s  221484.91m  3691.42h  153.81d  0.421 y
# IO & Wait Time:               6534562s  108909.36m  1815.16h   75.63d  0.207 y
# Average job time:                 262s       4.36m     0.07h    0.00d
# Longest finished job:            1158s      19.30m     0.32h    0.01d
# Submission to last job:         37516s     625.27m    10.42h    0.43d

    ssh kkstore06
    cd /cluster/data/danRer5/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    ssh pk
    cd /cluster/data/danRer5/bed/tblastn.hg18KG/chainRun
    para create chainSpec
    para maxNode 30
    para try, check, push, check etc.

# Completed: 138 of 138 jobs
# CPU time in finished jobs:      83388s    1389.80m    23.16h    0.97d  0.003 y
# IO & Wait Time:                 43355s     722.58m    12.04h    0.50d  0.001 y
# Average job time:                 918s      15.31m     0.26h    0.01d
# Longest finished job:           21693s     361.55m     6.03h    0.25d
# Submission to last job:         21713s     361.88m     6.03h    0.25d

    ssh kkstore06
    cd /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/danRer5/bed/tblastn.hg18KG/blastHg18KG.psl 
    cd ..
    pslCheck blastHg18KG.psl 

    # load table 
    ssh hgwdev
    cd /cluster/data/danRer5/bed/tblastn.hg18KG
    hgLoadPsl danRer5 blastHg18KG.psl

    # check coverage
    featureBits danRer5 blastHg18KG 
# 21056644 bases of 1435609608 (1.467%) in intersection

    featureBits danRer4 blastHg18KG 
# 21159392 bases of 1626093931 (1.301%) in intersection

    featureBits danRer5 ensGene:cds blastHg18KG  -enrichment
# ensGene:cds 2.114%, blastHg18KG 1.467%, both 1.077%, cover 50.92%, enrich 34.72x
    featureBits danRer4 ensGene:cds blastHg18KG  -enrichment
# ensGene:cds 2.230%, blastHg18KG 1.301%, both 1.058%, cover 47.46%, enrich 36.47x

    ssh kkstore06
    rm -rf /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
    rm -rf /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.

############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-28 - Hiram)
    cd /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
    cat fb.oryLat2.chainDanRer5Link.txt 
    #	138070427 bases of 700386597 (19.713%) in intersection

    #	and for the swap
    mkdir /cluster/data/danRer5/bed/blastz.oryLat2.swap
    cd /cluster/data/danRer5/bed/blastz.oryLat2.swap
    time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
	-swap -tRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    58m15.303s
    #	had to finish the load nets manually, mistakenly included
    #	-qRepeats=windowmaskerSdust when that is not valid for danRer5
    cat fb.danRer5.chainOryLat2Link.txt
    #	158709399 bases of 1435609608 (11.055%) in intersection

    #	Then, continuing:
    doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
	/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
	-continue=download -swap -tRepeats=windowmaskerSdust \
	-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > download.log 2>&1 &

#########################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
danRer5.upstreamGeneTbl = refGene
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ FOR DOG (canFam2) (DONE, 2009-08-08, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# See canFam2.txt for creation of zebrafish chain and net tracks and downloads
# on the dog genome assembly.
    mkdir /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
    cd /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
    
    time nice doBlastzChainNet.pl -chainMinScore=5000 -chainLinearGap=loose \
	-swap /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07/DEF \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
        -chainMinScore=5000 -chainLinearGap=loose \
	>& doBlastz.swap.log &
    cat fb.danRer5.chainCanFam2Link.txt 
    # 32053647 bases of 1435609608 (2.233%) in intersection
   
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.soe.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.

############################################################################
# CREATE LIFTOVER CHAINS FROM danRer5 TO danRer6 (DONE 10/27/09 angie)
    doSameSpeciesLiftOver.pl -debug danRer5 danRer6
    cd /hive/data/genomes/danRer5/bed/blat.danRer6.2009-10-27
    doSameSpeciesLiftOver.pl danRer5 danRer6 >& do.log & tail -f do.log

############################################################################
# BUILD U MASS ChiP-Seq TRACKS (DONE 3/28/11, Fan)

ssh hgwdev
mkdir -p /hive/data/genomes/danRer5/bed/H3K4me
cd /hive/data/genomes/danRer5/bed/H3K4me

# Download the following files from http://lawsonlab.umassmed.edu/ChIP-SeqSupp.html:

   Supplementary File 1 REV 24h_input_signal.wig
   Supplementary File 2 REV 24h_ME1_signal.wig
   Supplementary File 3 REV 24h_ME3_signal.wig
   Supplementary File 4 REV 24h_ME1_hotspots.bed
   Supplementary File 5 REV 24h_ME1_peaks.WIG
   Supplementary File 6 REV 24h_ME3_hotspots.bed
   Supplementary File 7 REV 24h_ME3_peaks.WIG

# build UMassInput track

cat 'Supplementary File 1 REV 24h_input_signal.wig'|sed -e 's/chrMT/chrM/g' >UMassInput.wig.input
wigEncode UMassInput.wig.input UMassInput.wig UMassInput.wib
# Converted UMassInput.wig.input, upper limit 500.00, lower limit 10.00

ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassInput.wib /gbdb/danRer5/wib/UMassInput.wib

hgLoadWiggle danRer5 UMassInput UMassInput.wig
#WARNING: Exceeded Zv7_NA1024 size 99420 > 99329. dropping 10 data point(s)
#WARNING: Exceeded Zv7_NA134 size 97140 > 97109. dropping 4 data point(s)
#WARNING: Exceeded Zv7_NA2952 size 14610 > 14546. dropping 7 data point(s)
#WARNING: Exceeded Zv7_NA3149 size 24520 > 24469. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA3766 size 23600 > 23589. dropping 2 data point(s)
#WARNING: Exceeded Zv7_NA7305 size 3820 > 3819. dropping 1 data point(s)
#WARNING: Exceeded chrM size 16690 > 16596. dropping 10 data point(s)
#WARNING: Exceeded chrM size 16720 > 16596. dropping 13 data point(s)

# Build UMassME1 track

cat 'Supplementary File 2 REV 24h_ME1_signal.wig'|sed -e 's/chrMT/chrM/g' >UMassME1.wig.input
wigEncode UMassME1.wig.input UMassME1.wig UMassME1.wib
# Converted UMassME1.wig.input, upper limit 404.00, lower limit 10.00

ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME1.wib /gbdb/danRer5/wib/UMassME1.wib

hgLoadWiggle danRer5 UMassME1 UMassME1.wig
#WARNING: Exceeded Zv7_NA1266 size 52350 > 52297. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA1308 size 18360 > 18317. dropping 5 data point(s)
#WARNING: Exceeded Zv7_NA1320 size 40410 > 40361. dropping 5 data point(s)
#WARNING: Exceeded Zv7_NA1723 size 44750 > 44727. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA1741 size 44070 > 44004. dropping 7 data point(s)
#WARNING: Exceeded Zv7_NA1868 size 34760 > 34733. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA2098 size 13950 > 13871. dropping 8 data point(s)
#WARNING: Exceeded Zv7_NA2952 size 14640 > 14546. dropping 10 data point(s)
#WARNING: Exceeded Zv7_NA3149 size 24520 > 24469. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA3282 size 15540 > 15514. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA3685 size 5030 > 5011. dropping 2 data point(s)
#WARNING: Exceeded Zv7_NA3793 size 3860 > 3857. dropping 1 data point(s)
#WARNING: Exceeded Zv7_NA4436 size 7570 > 7547. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA4449 size 2260 > 2210. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA5160 size 4560 > 4535. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA6710 size 11900 > 11826. dropping 8 data point(s)
#WARNING: Exceeded Zv7_NA6758 size 4590 > 4582. dropping 1 data point(s)
#WARNING: Exceeded Zv7_NA7355 size 2680 > 2667. dropping 2 data point(s)
#WARNING: Exceeded Zv7_NA7403 size 4870 > 4815. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA7408 size 4180 > 4175. dropping 1 data point(s)

# Build UMassME1Peak track

cat 'Supplementary File 5 REV 24h_ME1_peaks.WIG'|sed -e 's/chrMT/chrM/g' >UMassME1Peak.wig.input
wigEncode UMassME1Peak.wig.input UMassME1Peak.wig UMassME1Peak.wib
# Converted UMassME1Peak.wig.input, upper limit 2510.00, lower limit 20.00

ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME1Peak.wib /gbdb/danRer5/wib/UMassME1Peak.wib

hgLoadWiggle danRer5 UMassME1Peak UMassME1Peak.wig




# Build UMassME3 track

cat 'Supplementary File 3 REV 24h_ME3_signal.wig'|sed -e 's/chrMT/chrM/g' >UMassME3.wig.input
wigEncode UMassME3.wig.input UMassME3.wig UMassME3.wib
# Converted UMassME3.wig.input, upper limit 477.00, lower limit 10.00

ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME3.wib /gbdb/danRer5/wib/UMassME3.wib

hgLoadWiggle danRer5 UMassME3 UMassME3.wig
#WARNING: Exceeded Zv7_NA1024 size 99410 > 99329. dropping 9 data point(s)
#WARNING: Exceeded Zv7_NA1266 size 52370 > 52297. dropping 8 data point(s)
#WARNING: Exceeded Zv7_NA1308 size 18380 > 18317. dropping 7 data point(s)
#WARNING: Exceeded Zv7_NA1417 size 8250 > 8219. dropping 4 data point(s)
#WARNING: Exceeded Zv7_NA153 size 43460 > 43438. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA1741 size 44020 > 44004. dropping 2 data point(s)
#WARNING: Exceeded Zv7_NA1913 size 14430 > 14424. dropping 1 data point(s)
#WARNING: Exceeded Zv7_NA2378 size 10240 > 10147. dropping 10 data point(s)
#WARNING: Exceeded Zv7_NA2552 size 2110 > 2041. dropping 7 data point(s)
#WARNING: Exceeded Zv7_NA2952 size 14640 > 14546. dropping 10 data point(s)
#WARNING: Exceeded Zv7_NA3149 size 24580 > 24469. dropping 12 data point(s)
#WARNING: Exceeded Zv7_NA3282 size 15630 > 15514. dropping 12 data point(s)
#WARNING: Exceeded Zv7_NA4966 size 4270 > 4220. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA5160 size 4560 > 4535. dropping 3 data point(s)
#WARNING: Exceeded Zv7_NA6710 size 11900 > 11826. dropping 8 data point(s)
#WARNING: Exceeded Zv7_NA7305 size 3850 > 3819. dropping 4 data point(s)
#WARNING: Exceeded Zv7_NA7355 size 2760 > 2667. dropping 10 data point(s)
#WARNING: Exceeded Zv7_NA7403 size 4870 > 4815. dropping 6 data point(s)
#WARNING: Exceeded Zv7_NA7450 size 5410 > 5333. dropping 8 data point(s)
#WARNING: Exceeded Zv7_NA7545 size 12030 > 11945. dropping 9 data point(s)
#WARNING: Exceeded Zv7_NA78 size 146010 > 145973. dropping 4 data point(s)

# Build UMassME3Peak track

cat 'Supplementary File 7 REV 24h_ME3_peaks.WIG'|sed -e 's/chrMT/chrM/g' >UMassME3Peak.wig.input
wigEncode UMassME3Peak.wig.input UMassME3Peak.wig UMassME3Peak.wib
# Converted UMassME3Peak.wig.input, upper limit 6835.00, lower limit 16.00

ln -s /hive/data/genomes/danRer5/bed/H3K4me/UMassME3Peak.wib /gbdb/danRer5/wib/UMassME3Peak.wib

hgLoadWiggle danRer5 UMassME3Peak UMassME3Peak.wig

# Build UMassME1Hotspot track

cat 'Supplementary File 4 REV 24h_ME1_hotspots.bed' |grep -v 'hotspot' > UMassME1Hotspot.bed
hgLoadBed danRer5 UMassME1Hotspot UMassME1Hotspot.bed
# Loaded 41478 elements of size 5

# Build UMassME3Hotspot track

cat 'Supplementary File 6 REV 24h_ME3_hotspots.bed' |grep -v 'hotspot' > UMassME3Hotspot.bed
hgLoadBed danRer5 UMassME3Hotspot UMassME3Hotspot.bed
# Loaded 29033 elements of size 5




