# for emacs: -*- mode: sh; -*-

# This file describes browser build for the oxyTri2
# Oxytricha trifallax - ciliate

#############################################################################
# fetch sequence from refseq (DONE - 2015-12-08 - Hiram)
    mkdir -p /hive/data/genomes/oxyTri2/genbank
    cd /hive/data/genomes/oxyTri2/genbank

for D in GCA_000711775.1_Oxytricha_MIC_v2.0 GCA_001297925.1_ASM129792v1 \
   GCA_000295675.1_oxytricha_asm_v1.1
do
  rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/protozoa/Oxytricha_trifallax/all_assembly_versions/${D}/ ./${D}/
done

    mkdir chrM
for mitoAcc in JN383842 JN383843
do
  wget -O chrM/${mitoAcc}.fa \
 "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$mitoAcc"
done

    # measure the sequences:
for F in */*genomic.fna.gz
do
 echo "==== $F ===="
 faSize $F
 echo "============="
done

==== GCA_000295675.1_oxytricha_asm_v1.1/GCA_000295675.1_oxytricha_asm_v1.1_genomic.fna.gz ====
67158112 bases (1383 N's 67156729 real 67156729 upper 0 lower) in 22363 sequences in 1 files
Total size: mean 3003.1 sd 2386.5 min 200 (AMCR01020018.1) max 66022 (AMCR01009294.1) median 2420
%0.00 masked total, %0.00 masked real
=============
==== GCA_000711775.1_Oxytricha_MIC_v2.0/GCA_000711775.1_Oxytricha_MIC_v2.0_genomic.fna.gz ====
496291018 bases (0 N's 496291018 real 253388527 upper 242902491 lower) in 25720 sequences in 1 files
Total size: mean 19295.9 sd 23164.0 min 1505 (ARYC01024697.1) max 380827 (ARYC01000045.1) median 12381
%48.94 masked total, %48.94 masked real
=============
==== GCA_001297925.1_ASM129792v1/GCA_001297925.1_ASM129792v1_genomic.fna.gz ====
57457034 bases (0 N's 57457034 real 48353099 upper 9103935 lower) in 22458 sequences in 1 files
Total size: mean 2558.4 sd 2034.5 min 201 (LAEC01022424.1) max 28998 (LAEC01020391.1) median 2072
%15.84 masked total, %15.84 masked real
=============

    faCount chrM/*.fa
#seq    len     A       C       G       T       N       cpg
gi|359817564|gb|JN383842.1|     5280    1883    613     1092    1692    0      112
gi|359817572|gb|JN383843.1|     69800   27602   7995    8639    25291   273    1626


    faSize chrM/*.fa */*genomic.fna.gz
620981244 bases (1656 N's 620979588 real 368973162 upper 252006426 lower) in 70543 sequences in 5 files
Total size: mean 8802.9 sd 16187.3 min 200 (AMCR01020018.1) max 380827 (ARYC01000045.1) median 3577
N count: mean 0.0 sd 1.1
U count: mean 5230.5 sd 9150.9
L count: mean 3572.4 sd 7821.9
%40.58 masked total, %40.58 masked real

#############################################################################
# fixup to UCSC naming scheme (DONE - 2015-12-10 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/ucsc
    cd /hive/data/genomes/oxyTri2/ucsc
    # going to construct artificial AGP files from the faCount measurements
    # of the sequences

    printf "%s\n" '#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 2) {
  printf STDERR "usage: faCountToAgp.pl chrName file.faCount.txt > chrName.file.agp\n";
  exit 255;
}

my $chrName = shift;
my $faCountFile = shift;
my $ix = 1;
my $chrStart = 1;
my $chrEnd = 1;
my $gapSize = 50;
open (FH, "<$faCountFile") or die "can not read $faCountFile";
while (my $line = <FH>) {
  next if ($line =~ m/^#/);
  next if ($line =~ m/^total/);
  chomp $line;
  my ($contigName, $size, $rest) = split('\s+', $line, 3);
#  printf STDERR "# %s\t%d\n", $contigName, $size;
  if ($ix > 1) {
    $chrEnd = $chrStart + $gapSize - 1;
    printf "%s\t%d\t%d\t%d\tN\t%d\tcontig\tno\tna\n",
       $chrName, $chrStart, $chrEnd, $ix++, $gapSize;
    $chrStart = $chrEnd + 1;
  }
  $chrEnd = $chrStart + $size - 1;
  printf "%s\t%d\t%d\t%d\tF\t%s\t1\t%d\t+\n",
    $chrName, $chrStart, $chrEnd, $ix++, $contigName, $size;
  $chrStart = $chrEnd + 1;
}
close (FH);
' > faCountToAgp.pl
    chmod +x faCountToAgp.pl

    # Al Zahler's existing chr1 MAC sequence, constructed in the same order
    # strain SB310
    zcat ../genbank/GCA_000295675.1_oxytricha_asm_v1.1/GCA_000295675.1_oxytricha_asm_v1.1_genomic.fna.gz \
      | sed -e 's/^>.*strain SB310 />/; s/ macronuclear.*//;' \
         | gzip -c > chrMACsb310.contigs.fa.gz
    faCount chrMACsb310.contigs.fa.gz > chrMACsb310.contigs.faCount.txt
    ./faCountToAgp.pl chrMACsb310 chrMACsb310.contigs.faCount.txt \
        > chrMACsb310.contigs.agp

    # MAC sequence strain JRB510
    zcat ../genbank/GCA_001297925.1_ASM129792v1/GCA_001297925.1_ASM129792v1_genomic.fna.gz \
       | sed -e 's/.1 Oxytricha trifallax.*/v1/;' | gzip -c \
           > chrMACjrb510.contigs.fa.gz
    faCount chrMACjrb510.contigs.fa.gz | sort -k2nr \
       > length.sorted.ASM129792v1.faCount.txt
    ./faCountToAgp.pl chrMACjrb510 length.sorted.ASM129792v1.faCount.txt \
       > chrMACjrb510.contigs.agp

    # the MIC sequence strain JRB310, equivalent to Al Zahler's chr2:
    zcat ../genbank/GCA_000711775.1_Oxytricha_MIC_v2.0/*.fna.gz \
       | sed -e 's/.1 Oxytricha trifa.*//' | gzip -c \
         > chrMICjrb310.contigs.fa.gz
    faCount chrMICjrb310.contigs.fa.gz > chrMICjrb310.contigs.faCount.txt
    ./faCountToAgp.pl chrMICjrb310 chrMICjrb310.contigs.faCount.txt \
       > chrMICjrb310.contigs.agp

    # need to construct an artifical AGP file for the chrM sequence to
    #  annotate the gaps
    echo ">JN383843.1" > JN383843.1.fa
    grep -v "^>" ../genbank/chrM/JN383843.fa >> JN383843.1.fa
    sed -e 's/JN383843.1/chrM/;' JN383843.1.fa > chrM.fa
    hgFakeAgp -minContigGap=1 JN383843.1.fa stdout \
       | sed -e 's/^JN383843.1/chrM/;' > JN383843.1.agp

    # and then an equivalent fa.gz file with the contig sequences
    faToTwoBit chrM.fa chrM.2bit
    grep -v contig JN383843.1.agp | while read L
do
  start=`echo $L | awk '{printf "%d", $2-1}'`
  end=`echo $L | awk '{printf "%d", $3}'`
  name=`echo $L | awk '{printf "%s", $6}'`
  echo chrM:$start-$end $name 1>&2
  twoBitToFa chrM.2bit:chrM:$start-$end stdout | sed -e "s/^>.*/>$name/;"
done | faToTwoBit stdin JN383843.1.2bit
    twoBitToFa JN383843.1.2bit stdout | gzip -c > JN383843.1.fa.gz

    echo ">chrmO" > chrmO.fa
    grep -v "^>" ../genbank/chrM/JN383842.fa >> chrmO.fa
    export mSize=`faCount chrmO.fa | grep total | awk '{print $2}'`
    printf "chrmO\t1\t%d\t1\tF\t%s\t1\t%d\t+\n" $mSize "JN383842.1" $mSize \
       > JN383842.1.agp

    # verify the fa.gz and agp files are compatible
    # this is different than usual since all of the fa.gz files are the
    # 'contig' sequences that are going to make up the 'chrom' sequences
    # as defined by the agp files
    cat chrMACjrb510.contigs.agp chrMACsb310.contigs.agp \
      chrMICjrb310.contigs.agp JN383843.1.agp JN383842.1.agp > agp.test

    zcat JN383842.1.fa.gz JN383843.1.contigs.fa.gz chrMACjrb510.contigs.fa.gz \
        chrMACsb310.contigs.fa.gz chrMICjrb310.contigs.fa.gz \
          | agpToFa -simpleMultiMixed agp.test all stdout stdin \
             | faToTwoBit stdin test.2bit

    cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail
    # All AGP and FASTA entries agree - both files are valid

    # fetch photo:
    mkdir /hive/data/genomes/oxyTri2/photo
    cd /hive/data/genomes/oxyTri2/photo
    # manually download the photo from the WEB page:
    https://www.genome.gov/dmd/img.cfm?node=Photos/Microorganisms&id=79122
    identify NHGRI-79122.jpg 
    # NHGRI-79122.jpg JPEG 4267x3200 4267x3200+0+0 8-bit DirectClass 5.909mb 

    convert -quality 80 -geometry "400x300" NHGRI-79122.jpg \
       Oxytricha_trifallax.jpg

    # check that into the source tree hg/htdocs/images/
    cp Oxytricha_trifallax.jpg \
      /cluster/home/ceisenhart/kent/src/hg/htdocs/images/
    cd /cluster/home/ceisenhart/kent/src/hg/htdocs/images/
    git add Oxytricha_trifallax.jpg
    git commit -m "photo from NHGRI 79122 public rights refs no redmine"

    # and copy over to /usr/local/apache/htdocs/images/Oxytricha_trifallax.jpg

#############################################################################
#  Initial database build (DONE - 2015-12-10 - Hiram)

    cd /hive/data/genomes/oxyTri2
    printf "%s" '# Config parameters for makeGenomeDb.pl:
db oxyTri2
clade ciliate
genomeCladePriority 1200
scientificName Oxytricha trifallax
commonName Oxytricha trifallax
assemblyDate Sep. 2015
assemblyLabel Washington University Genome Sequencing Center
assemblyShortLabel WashU ASM129792v1
orderKey 40100
# mito sequence included in the UCSC files
mitoAcc none
fastaFiles /hive/data/genomes/oxyTri2/ucsc/*.fa.gz
agpFiles /hive/data/genomes/oxyTri2/ucsc/*.agp
# qualFiles none
dbDbSpeciesDir ciliate
photoCreditURL https://www.genome.gov/dmd/img.cfm?node=Photos/Microorganisms&id=79122
photoCreditName NHGRI 79122
ncbiGenomeId 13412
ncbiAssemblyId 2342058
ncbiAssemblyName ASM129792v1
ncbiBioProject 74629
genBankAccessionID GCA_001297925.1
taxId 1172189
' > oxyTri2.config.ra

    # verify sequence and AGP are OK:  (forgot to include the -stop=agp)
    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
         oxyTri2.config.ra) > agp.log 2>&1
    # *** All done!  (through the 'agp' step)
    # about 6 minutes

    # check in the trackDb files created and add to trackDb/makefile

    # need to add a new clade category:
    hgsql hgcentraltest \
-e 'INSERT INTO clade (name, label, priority) VALUES ("ciliate", "Ciliates", 1200);'


##############################################################################
# cpgIslands on UNMASKED sequence (DONE - 2015-12-10 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/cpgIslandsUnmasked
    cd /hive/data/genomes/oxyTri2/bed/cpgIslandsUnmasked

    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
       -tableName=cpgIslandExtUnmasked \
          -maskedSeq=/hive/data/genomes/oxyTri2/oxyTri2.unmasked.2bit \
             -workhorse=hgwdev -smallClusterHub=ku oxyTri2) > do.log 2>&1

    cat fb.oxyTri2.cpgIslandExtUnmasked.txt
    # 482406 bases of 620980971 (0.078%) in intersection

#############################################################################
# cytoBandIdeo - (DONE - 2015-12-10 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/cytoBand
    cd /hive/data/genomes/oxyTri2/bed/cytoBand
    makeCytoBandIdeo.csh oxyTri2

#########################################################################
# ucscToINSDC table/track (TBD - 2015-03-20 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/ucscToINSDC
    cd /hive/data/genomes/oxyTri2/bed/ucscToINSDC

    ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
       ../../genbank/GCA_*assembly_structure/Primary_Assembly

    awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
         | sort > name.coordinate.tab
    join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
         > ucscToINSDC.bed
    # verify all names are coming through, should be same line count:
    wc -l *
    # 25187 name.coordinate.tab
    # 25187 ucscToINSDC.bed
    # 25187 ucscToINSDC.txt

    cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
    # 14
    # use the 14 in this sed
    sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
         | hgLoadSqlTab oxyTri2 ucscToINSDC stdin ucscToINSDC.bed
    checkTableCoords oxyTri2
    # should cover %100 entirely:
    featureBits -countGaps oxyTri2 ucscToINSDC
    # 2053849526 bases of 2053849526 (100.000%) in intersection

#########################################################################
# fixup search rule for assembly track/gold table (DONE - 2015-12-15 - Hiram)
    hgsql -N -e "select frag from gold;" oxyTri2 | sort -u \
        > oxyTri2.frag.gold.txt

    export maxLen=`awk '{print length($0)}' oxyTri2.frag.gold.txt | sort -rn | head -1`
    echo "scan to column: $maxLen"

export C=1
while [ $C -le $maxLen ];
do
echo -n " $C: "
awk '{ print substr($0,'$C',1) }' oxyTri2.frag.gold.txt | sort -u | xargs echo | sed -e 's/ //g'
C=`echo $C | awk '{print $1+1}'`
done
 1: ACJL
 2: ANRo
 3: 3EYn
 4: 8Ct
 5: 03i
 6: 18g
 7: 0123456789
 8: .0123456789
 9: .0123456789
 10: .0123456789
 11: .0123456789_
 12: .0123456789
 13: .0123456789v
 14: .0123456789
 15: 012345

    # implying a regexp of:
    [ACJL][ANRo][EYn3][Ct8][i03]([g18][0-9][v0-9_.]*)?

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select frag from gold;" oxyTri2 | wc -l
    # 70660

    hgsql -N -e "select frag from gold;" oxyTri2 \
       | egrep -e '[ACJL][ANRo][EYn3][Ct8][i03][g18][0-9]([v0-9_.]*)?' | wc -l
    # 147512

    hgsql -N -e "select frag from gold;" oxyTri2 \
      | egrep -v -e '[ACJL][ANRo][EYn3][Ct8][i03][g18][0-9]([v0-9_.]*)?' | wc -l
    # 0

    # hence, add to trackDb/tarsier/oxyTri2/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [ACJL][ANRo][EYn3][Ct8][i03][g18][0-9]([v0-9_.]*)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

#########################################################################
# sequences track, for the other names of the 'contigs'
#    (DONE - 2015-12-15 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/sequences
    cd /hive/data/genomes/oxyTri2/bed/sequences

    # construct the required 'bed' files from the names in the original
    #   genbank fasta, and the contigs.lift file that specifies where the
    #   'contigs' were placed to construct the artificial chroms

    zcat ../../genbank/GCA_001297925.1_ASM129792v1/GCA_001297925.1_ASM129792v1_genomic.fna.gz \
    | grep "^>" \
      | sed -e 's/^>//; s/.1 Oxytricha trifallax strain JRB510 /v1\t/; s/, whole genome shotgun sequence//;' \
       | sort > chrMACjrb510.contig.names.txt
   grep LAEC01 ../../jkStuff/oxyTri2.contigs.lift | sort -k2 > chrMACjrb510.lift
    join -2 2 chrMACjrb510.contig.names.txt chrMACjrb510.lift \
       | awk '{printf "%d\t%s\t%d\t%s\t%d\n", $3,$2,$4,$5,$6}' \
         | sort -k1,1n > chrMACjrb510.contigs.lift
    join -2 2 chrMACjrb510.contig.names.txt chrMACjrb510.lift \
      | awk '{printf "%s\t%d\t%d\t%s\n", $5,$3,$3+$4,$2}' \
        | sort -k2,2n > chrMACjrb510.contigs.bed


    zgrep "^>" ../../genbank/GCA_000295675.1_oxytricha_asm_v1.1/GCA_000295675.1_oxytricha_asm_v1.1_genomic.fna.gz \
   | sed -e 's/^>//; s/ Oxytricha trifallax strain SB310 /\t/; s/ macronuclear, whole genome shotgun sequence//;' \
     | sort -k2,2 > chrMACsb310.contig.names.txt

    grep chrMACsb310 ../../jkStuff/oxyTri2.contigs.lift | sort -k2 \
       > chrMACsb310.lift

    join -1 2 -2 2 chrMACsb310.contig.names.txt chrMACsb310.lift \
       | awk '{printf "%d\t%s\t%d\t%s\t%d\n", $3,$2,$4,$5,$6}' \
         | sort -k1,1n > chrMACsb310.contigs.lift

    join -1 2 -2 2 chrMACsb310.contig.names.txt chrMACsb310.lift \
       | awk '{printf "%s\t%d\t%d\t%s\n", $5,$3,$3+$4,$2}' \
         | sort -k2,2n > chrMACsb310.contigs.bed

    zgrep "^>" ../../genbank/GCA_000711775.1_Oxytricha_MIC_v2.0/GCA_000711775.1_Oxytricha_MIC_v2.0_genomic.fna.gz \
       | sed -e 's/^>//; s/.1 Oxytricha trifallax strain JRB310 /\t/; s/, whole genome shotgun sequence//;' \
         | sort -k1,1 > chrMICjrb310.contig.names.txt

    grep chrMICjrb310 ../../jkStuff/oxyTri2.contigs.lift | sort -k2 \
       > chrMICjrb310.lift

    join -2 2 chrMICjrb310.contig.names.txt chrMICjrb310.lift \
       | awk '{printf "%d\t%s\t%d\t%s\t%d\n", $3,$2,$4,$5,$6}' \
         | sort -k1,1n > chrMICjrb310.contigs.lift

    join -2 2 chrMICjrb310.contig.names.txt chrMICjrb310.lift \
       | awk '{printf "%s\t%d\t%d\t%s\n", $5,$3,$3+$4,$2}' \
         | sort -k2,2n > chrMICjrb310.contigs.bed

    grep -w chrM ../../chrom.sizes \
        | awk '{printf "%s\t0\t%d\tJN383842.1\n", $1, $2}' > chrM.bed


    grep chrmO ../../chrom.sizes \
        | awk '{printf "%s\t0\t%d\tJN383843.1\n", $1, $2}' > chrmO.bed

    hgLoadBed -type=bed4 oxyTri2 sequences *.bed

    hgsql -N -e "select name from sequences;" oxyTri2 | sort -u \
        > oxyTri2.name.sequences.txt

    export maxLen=`awk '{print length($0)}' oxyTri2.name.sequences.txt | sort -rn | head -1`
    echo "scan to column: $maxLen"

export C=1
while [ $C -le $maxLen ];
do
echo -n " $C: "
awk '{ print substr($0,'$C',1) }' oxyTri2.name.sequences.txt | sort -u | xargs echo | sed -e 's/ //g'
C=`echo $C | awk '{print $1+1}'`
done
 1: ACJcd
 2: MNeot
 3: 3Cgn
 4: 12345678Rt
 5: 0123456789i
 6: 0123456789g
 7: 0123456789
 8: .0123456789
 9: .0123456789
 10: .0123456789_
 11: .0123456789C_
 12: .0123456789C_o
...

    # implying a regexp of:
    [ACJcd][MNeot][Cgn3][Rt0-9][i0-9][g0-9][0-9]([a-z0-9_.]*)?

    # verify this rule will find them all or eliminate them all:
    hgsql -N -e "select name from sequences;" oxyTri2 | wc -l
    # 70453

    hgsql -N -e "select name from sequences;" oxyTri2 \
       | egrep -e '[ACJcd][MNeot][Cgn3][Rt0-9][i0-9][g0-9][0-9]([a-z0-9_.]*)?' | wc -l
    # 70453

    hgsql -N -e "select frag from gold;" oxyTri2 \
       | egrep -v -e '[ACJcd][MNeot][Cgn3][Rt0-9][i0-9][g0-9][0-9]([a-z0-9_.]*)?' | wc -l
    # 0

    # hence, add to trackDb/tarsier/oxyTri2/trackDb.ra
searchTable gold
shortCircuit 1
termRegex [ACJL][ANRo][EYn3][Ct8][i03][g18][0-9]([v0-9_.]*)?
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
searchPriority 8

##########################################################################
# running repeat masker (DONE - 2015-12-10 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/repeatMasker
    cd /hive/data/genomes/oxyTri2/bed/repeatMasker
    time (doRepeatMasker.pl -buildDir=`pwd` \
        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=ku oxyTri2) > do.log 2>&1
    # real    26m32.442s

    cat faSize.rmsk.txt
#    624508144 bases (3528556 N's 620979588 real 604835905 upper 16143683 lower)
#      in 5 sequences in 1 files
#    Total size: mean 124901628.8 sd 210756971.1 min 5280 (chrmO)
#      max 497576968 (chrMICjrb310) median 58579884
#    %2.59 masked total, %2.60 masked real

    egrep -i "versi|relea" do.log
#    RepeatMasker version open-4.0.5
#    grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker
#    #    January 31 2015 (open-4-0-5) version of RepeatMasker
#    grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
#    CC   RELEASE 20140131;  

    time featureBits -countGaps oxyTri2 rmsk
#    16143693 bases of 624508144 (2.585%) in intersection
#    real    0m1.832s

##########################################################################
# running simple repeat (DONE - 2015-12-10 - Hiram)

    mkdir /hive/data/genomes/oxyTri2/bed/simpleRepeat
    cd /hive/data/genomes/oxyTri2/bed/simpleRepeat
    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
        oxyTri2) > do.log 2>&1
    # real    47m51.931s

    cat fb.simpleRepeat
    # 46901593 bases of 620980971 (7.553%) in intersection

    # using the Window Masker result:
    cd /hive/data/genomes/oxyTri2
    twoBitMask bed/windowMasker/oxyTri2.cleanWMSdust.2bit \
       -add bed/simpleRepeat/trfMask.bed  oxyTri2.2bit
    #   you can safely ignore the warning about fields >= 13
    twoBitToFa oxyTri2.2bit stdout | faSize stdin > faSize.oxyTri2.2bit.txt
    cat faSize.oxyTri2.2bit.txt
#   624508144 bases (3528556 N's 620979588 real 339311116 upper 281668472 lower)
#      in 5 sequences in 1 files
#   Total size: mean 124901628.8 sd 210756971.1 min 5280 (chrmO)
#     max 497576968 (chrMICjrb310) median 58579884
#   %45.10 masked total, %45.36 masked real


    rm /gbdb/oxyTri2/oxyTri2.2bit
    ln -s `pwd`/oxyTri2.2bit /gbdb/oxyTri2/oxyTri2.2bit

##########################################################################
# CREATE MICROSAT TRACK (DONE - 2015-12-10 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/oxyTri2/bed/microsat
     cd /cluster/data/oxyTri2/bed/microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
	../simpleRepeat/simpleRepeat.bed > microsat.bed
    hgLoadBed oxyTri2 microsat microsat.bed
    #	Read 755 elements of size 4 from microsat.bed

##########################################################################
## WINDOWMASKER (DONE - 2015-12-10 - Hiram)

    mkdir /hive/data/genomes/oxyTri2/bed/windowMasker
    cd /hive/data/genomes/oxyTri2/bed/windowMasker
    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
        -dbHost=hgwdev oxyTri2) > do.log 2>&1
    # real    23m33.281s

    # Masking statistics
    cat faSize.oxyTri2.cleanWMSdust.txt
#    624508144 bases (3528556 N's 620979588 real 339382064 upper
#      281597524 lower) in 5 sequences in 1 files
#    Total size: mean 124901628.8 sd 210756971.1 min 5280 (chrmO)
#      max 497576968 (chrMICjrb310) median 58579884
#    %45.09 masked total, %45.35 masked real

    cat fb.oxyTri2.rmsk.windowmaskerSdust.txt
    #   14493217 bases of 624508144 (2.321%) in intersection

##########################################################################
# cpgIslands - (DONE - 2015-12-10 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/cpgIslands
    cd /hive/data/genomes/oxyTri2/bed/cpgIslands
    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
      -workhorse=hgwdev -smallClusterHub=ku oxyTri2) > do.log 2>&1 &
    # real    2m2.556s

    cat fb.oxyTri2.cpgIslandExt.txt
    # 310453 bases of 620980971 (0.050%) in intersection

#########################################################################
# genscan - (DONE - 2015-12-10 - Hiram)
    mkdir /hive/data/genomes/oxyTri2/bed/genscan
    cd /hive/data/genomes/oxyTri2/bed/genscan
    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
      -bigClusterHub=ku oxyTri2) > do.log 2>&1
    # real    57m21.101s

    cat fb.oxyTri2.genscan.txt
    #   55630505 bases of 1977771384 (2.813%) in intersection

    cat fb.oxyTri2.genscanSubopt.txt
    #   58340692 bases of 1977771384 (2.950%) in intersection

#############################################################################
# augustus genes (DONE - 2015-12-11 - Hiram)

    mkdir /hive/data/genomes/oxyTri2/bed/augustus
    cd /hive/data/genomes/oxyTri2/bed/augustus
    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
        -species=saccharomyces -dbHost=hgwdev \
           -workhorse=hgwdev oxyTri2) > do.log 2>&1

    cat fb.oxyTri2.augustusGene.txt
    # 25248650 bases of 100286401 (25.177%) in intersection

########################################################################
# Create kluster run files (TBD - 2015-03-24 - Hiram)

    cd /hive/data/genomes/oxyTri2
    # numerator is oxyTri2 gapless bases "real" as reported by:
    head -1 faSize.oxyTri2.2bit.txt
# 2053849526 bases (76078142 N's 1977771384 real 1208345365 upper
#    76942601 lower) in 25187 sequences in 1 files

    # numerator is 'real' base count
    # denominator is hg19 gapless bases as reported by:
    #   featureBits -noRandom -noHap hg19 gap
    #     234344806 bases of 2861349177 (8.190%) in intersection
    # 1024 is threshold used for human -repMatch:
    calc \( 620979588 / 2861349177 \) \* 1024
    # ( 620979588 / 2861349177 ) * 1024 = 222.231912

    # ==> use -repMatch=200 according to size scaled down from 1024 for human.
    # repMatch=200 produces 88.795, which is quite a bit, using repMatch=400
    cd /hive/data/genomes/oxyTri2
    time blat oxyTri2.2bit \
         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/oxyTri2.11.ooc \
        -repMatch=400
    # Wrote 28353 overused 11-mers to jkStuff/oxyTri2.11.ooc
    #  real    0m11.314s

    # there are many non-bridged gaps since these are artifical chromosomes
    #   definately need to tell genbank
    hgsql -N -e 'select * from gap where bridge="no" order by size;' oxyTri2 \
      | ave -tableOut -col=7 stdin
# # min Q1 median Q3 max mean N sum stddev
# 50076 58368.8 70128 100495 1.07816e+07 178173 670 1.19376e+08 672006

    # note the minimum non-bridged gap size is 50

    gapToLift -verbose=2 -minGap=50 oxyTri2 jkStuff/oxyTri2.nonBridged.lft \
        -bedFile=jkStuff/oxyTri2.nonBridged.bed


    # survey sizes of the individual sequences:
    n50.pl chrom.sizes.contigs
#       reading: chrom.sizes.contigs
#       contig count: 70543, total size: 620981244, one half size: 310490622
# cumulative    N50 count       contig  contig size
# 310471381       7008    ARYC01007568    19742
# 310490622 one half size
# 310491123       7009    ARYC01013959    19742

#############################################################################
# GENBANK AUTO UPDATE (DONE - 2015-12-11 - Hiram)
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # /cluster/data/genbank/data/organism.lst shows:
    # #organism       mrnaCnt estCnt  refSeqCnt
    # Oxytricha fallax        4       0       0
    # Oxytricha sp.   1       0       0
    # Oxytricha sp. LPJ-2005  2       0       0
    # Oxytricha trifallax     1       0       0

    # edit etc/genbank.conf to add oxyTri2 just before droAna1

# oxyTri2 (tibetan frog)
oxyTri2.serverGenome = /hive/data/genomes/oxyTri2/oxyTri2.2bit
oxyTri2.clusterGenome = /hive/data/genomes/oxyTri2/oxyTri2.2bit
oxyTri2.ooc = /hive/data/genomes/oxyTri2/jkStuff/oxyTri2.11.ooc
oxyTri2.lift = /hive/data/genomes/oxyTri2/jkStuff/oxyTri2.nonBridged.lft
oxyTri2.perChromTables = no
oxyTri2.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
oxyTri2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
oxyTri2.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
oxyTri2.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
oxyTri2.genbank.est.xeno.pslCDnaFilter    = ${finished.genbank.est.xeno.pslCDnaFilter}
oxyTri2.downloadDir = oxyTri2
# defaults are OK: genbank and refseq mrna.native.load yes
#                  genbank est.native.load yes refseq.mrna.xeno.load yes
#                  genbank mrna and est xeno.load no

    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "Added oxyTri2 Oxytricha trifallax refs no redmine" \
         etc/genbank.conf src/lib/gbGenome.c
    git push
    # update /cluster/data/genbank/etc/:
    make etc-update
    # update /cluster/data/genbank/bin/:
    make install-server

    screen      #  control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time ./bin/gbAlignStep -initial oxyTri2
    # logFile: var/build/logs/2015.12.10-23:33:17.oxyTri2.initalign.log
    #    real    1049m54.083s

    #   To re-do, rm the dir first:
    #     /cluster/data/genbank/work/initial.oxyTri2

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time ./bin/gbDbLoadStep -drop -initialLoad oxyTri2
    # logFile: var/dbload/hgwdev/logs/2015.12.11-17:56:41.oxyTri2.dbload.log
    # real    18m14.340s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add oxyTri2 to:
    #   vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added oxyTri2 - Oxytricha trifallax refs no redmine" \
      etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#############################################################################
## blat server turned on (DONE - 2015-12-11 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("oxyTri2", "blat1a", "17852", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("oxyTri2", "blat1a", "17853", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
## reset default position to ABO gene (DONE - 2014-01-13 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrMACsb310:13679183-13682923"
	where name="oxyTri2";' hgcentraltest

#########################################################################

#########################################################################
# all.joiner update, downloads and in pushQ - (TBD - 2015-06-22 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=oxyTri2 -tableCoverage all.joiner
    joinerCheck -database=oxyTri2 -times all.joiner
    joinerCheck -database=oxyTri2 -keys all.joiner

    cd /hive/data/genomes/oxyTri2
    time makeDownloads.pl oxyTri2 > downloads.log 2>&1
    # real    13m42.027s

    #   now ready for pushQ entry
    mkdir /hive/data/genomes/oxyTri2/pushQ
    cd /hive/data/genomes/oxyTri2/pushQ
    makePushQSql.pl oxyTri2 > oxyTri2.pushQ.sql 2> stderr.out
    #   check for errors in stderr.out, some are OK, e.g.:
    # WARNING: hgwdev does not have /gbdb/oxyTri2/wib/gc5Base.wib
    # WARNING: hgwdev does not have /gbdb/oxyTri2/wib/quality.wib
    # WARNING: hgwdev does not have /gbdb/oxyTri2/bbi/qualityBw/quality.bw
    # WARNING: oxyTri2 does not have seq
    # WARNING: oxyTri2 does not have extFile
    # WARNING: oxyTri2 does not have estOrientInfo
    # WARNING: oxyTri2 does not have mrnaOrientInfo

    #   copy it to hgwbeta
    scp -p oxyTri2.pushQ.sql qateam@hgwbeta:/tmp
    ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/oxyTri2.pushQ.sql"

    #   in that pushQ entry walk through each entry and see if the
    #   sizes will set properly

#########################################################################
