#############################################################################
## 369-Way Multiz (TBD - 2018-12-10 - Hiram)
    ssh hgwdev

    # the phylo tree was done by kmer counting
    ssh ku
    mkdir /hive/data/genomes/staAur2/bed/kmers
    cd /hive/data/genomes/staAur2/bed/kmers
    # working on this set of sequences:
    ls -d ../sequences/GC* | sed -e 's#.*sequences/##;' > accession.list

    # with this template
#LOOP
kmerOne $(file1) {check out exists+ $(file1)/$(file1).31mers.profile.txt.gz}
#ENDLOOP

    # this script does the kmer preparation, specifically set to 31 here:
#!/bin/bash

set -beEu -o pipefail

export SORT="$HOME/bin/x86_64/gnusort -S32G --parallel=4 -T /dev/shm"

db=$1
fa=`ls ../sequences/$db/*_genomic.fna.gz | grep -v "_from_"`

if [ ! -s "${db}/${db}.31mers.profile.txt.gz" ]; then
  mkdir -p "${db}"
  ./kmerPrint.pl 31 ${fa} | gzip -c > ${db}/${db}.31mers.txt.gz
  zcat ${db}/${db}.31mers.txt.gz | cut -f1 | ${SORT} | uniq -c \
     | awk '{printf "%s\t%d\n", $2, $1}' | ${SORT} | gzip -c > ${db}/${db}.31mers.profile.txt.gz
  rm -f ${db}/${db}.31mers.txt.gz
fi

    # first batch:
    gensub2 accession.list single template kmer31.jobList
    para -ram-32g create kmer31.jobList
    para try ... check ... push ... etc... time
# Completed: 369 of 369 jobs
# CPU time in finished jobs:      42269s     704.48m    11.74h    0.49d  0.001 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                  65s       1.09m     0.02h    0.00d
# Longest finished job:             153s       2.55m     0.04h    0.00d
# Submission to last job:           853s      14.22m     0.24h    0.01d

    # in this same directory, on ku, clear this previous batch:
    cd /hive/data/genomes/staAur2/bed/kmers
    para clearSickNodes; para flushResults; para resetCounts; para freeBatch

    # the next job list is made by this perl script.  It creates a comparison
    # between each kmer result.  The number of jobs is going to be:
    #   N * ( N / 2 )  since it is the sum of: (N-1)+(N-2)+(N-3)... + 1

#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: ./allByAllPairs.pl N\n";
  printf STDERR "A_albimanus     68.19   A_aquasalis     68.846969\n";
  printf STDERR "A_albimanus     70.85   A_arabiensis    54.805332\n";
  printf STDERR "A_aquasalis     71.54   A_arabiensis    54.812861\n";

  exit 255;
}

my $N = shift;

my @workList;
my $listCount = 0;

open (FH, "ls -drt */*${N}mers.profile.txt.gz | sed -e 's/.gz//;' | sort |") or die "can not read ls -drt */*.profile.txt";
while (my $line = <FH>) {
  chomp $line;
  $line =~ s#/.*##;
  $workList[$listCount++] = $line;
}
close (FH);
foreach my $db (@workList) {
  printf STDERR "%s\n", $db;
}

my @remainingList = @workList;

for (my $i = 0; $i < $listCount; ++$i) {
  my $s1 = $workList[$i];
  for (my $j = $i+1; $j < $listCount; ++$j) {
    my $s2 = $workList[$j];
    printf "runOne %s %s %d {check out exists+ compare.%d/%s/%s.txt}\n",
      $s1, $s2, $N, $N, $s1, $s2;
  }
}

printf STDERR "# listCount: %d, job count = %d * (%d / 2) = %d\n", $listCount, $listCount, $listCount, $listCount*($listCount/2);
###############################################################################

    ./mkJobList.pl > jobList.kmer31.compare
    para -ram=32g create jobList.kmer31.compare
    para try ... check ... push ... etc... time
# Completed: 67896 of 67896 jobs
# CPU time in finished jobs:    2277675s   37961.26m   632.69h   26.36d  0.072 y
# IO & Wait Time:                260509s    4341.81m    72.36h    3.02d  0.008 y
# Average job time:                  37s       0.62m     0.01h    0.00d
# Longest finished job:              70s       1.17m     0.02h    0.00d
# Submission to last job:          2649s      44.15m     0.74h    0.03d

    # collect all the results in one file:
    find ./compare.31 -type f | xargs cat > allByAll.kmers31.compared.txt

    # this matrixUp.pl script constructs an N by N 'similarity' matrix

    ~/kent/src/hg/makeDb/doc/staAur2/matrixUp.pl > data31mer.csv
    # AND it writes the file: matrixKey.txt which is a naming index
    # of the rows and columns in the matrix in data31mer.csv, rename it:
    mv matrixKey.txt matrixKey.31mer.txt

    # prepare for running the phylip command 'neighbor'
    # construct a 'distance' matrix input for phylip from the 'similarity'
    # matrix.  This perl script reads the data31mer.csv and matrixKey
    # to construct the infile for 'neighbor'.  It needs to mangle the
    # sequence names down to a length of 10 characters since that is
    # all that 'neighbor' can handle.  It writes to STDERR the translation
    # two column file to restore the names from the result

#!/usr/bin/env perl

use strict;
use warnings;

my %orderKey;   # key is number, value is name
my $keyCount = 0;
my %shortName;  # key is long name, value is short name
my %shortKey;   # key is short name, value is number

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: ./mkTestData.pl N\n";
  printf STDERR "where N is Nmer, will be reading:\n";
  printf STDERR " ../matrixKey.Nmer.txt and ../dataNmer.csv\n";
  exit 255;
}

my $N = shift;

# -rw-rw-r--   1     7270 Dec 23 13:36 matrixKey.31mer.txt
# -rw-rw-r--   1  1049999 Dec 23 13:36 data31mer.csv

# 24    A_stephensi
# 25    Aedes_aegypti
# 26    Aedes_albopictus

open (FH, "<../matrixKey.${N}mer.txt") or die "can not read ../matrixKey.${N}mer.txt";
while (my $line = <FH>) {
  chomp $line;
  my ($key, $name) = split('\s+', $line);
  $orderKey{$key} = $name;
  if ($name =~ m/_/) {
     my @nameParts = split('_', $name);
     my $shortName = $nameParts[1];
     $shortName =~ s/0//;
     $shortName =~ s/v//g;
     for (my $i = 1; $i < 4; ++$i) {
        my $tryName = sprintf("%s%d", $shortName, $i);
        if (! defined($shortKey{$tryName})) {
           $shortName = $tryName;
           last;
        }
     }
     printf STDERR "short name is longer than 10 chars ? $shortName\n" if (length($shortName) > 10);
     die "name conflict: $name" if (defined($shortKey{$name}));
     $shortName{$name} = $shortName;
     $shortKey{$keyCount} = $shortName;
  } else {
     printf STDERR "short name is longer than 10 chars ? $name\n" if (length($name) > 10);
     die "name conflict: $name" if (defined($shortKey{$name}));
     $shortName{$name} = $name;
     $shortKey{$keyCount} = $name;
  }
  ++$keyCount;
}
close (FH);

for (my $i = 0; $i < $keyCount; ++$i) {
   printf STDERR "%s\t%s\n",  $shortName{$orderKey{$i}}, $orderKey{$i};
}

my @rows;
my $rowCount = 0;

open (FH, "<../data${N}mer.csv") or die "can not read ../data${N}mer.csv";
while (my $line = <FH>) {
  chomp $line;
  my @a = split(',', $line);
  $rows[$rowCount++] = \@a
}
close (FH);

for (my $i = 0; $i < $keyCount; ++$i) {
   for (my $j = 0; $j < $keyCount; ++$j) {
      next if ($i == $j);
      my $ave = ($rows[$i]->[$j] + $rows[$j]->[$i]) / 2;
      $rows[$i]->[$j] = $ave;
      $rows[$j]->[$i] = $ave;
   }
}

printf "%5d\n", $keyCount;
for (my $i = 0; $i < $keyCount; ++$i) {
   printf "%-10.10s", $shortKey{$i};
   for (my $j = 0; $j < $keyCount; ++$j) {
      my $value = $rows[$i]->[$j];
      printf "%8.4f", (100.0 - $value) / 100.0;
   }
   printf "\n";
}
###########################################################################

    ./mkTestData.pl > infile 2> name.31mer.translate.txt

    # running 'neighbor' select the UPGMA option, results
    # end up in outfile and outtree.  There doesn't seem to be
    # any command line options to this command, it starts with
    # an interactive menu where option selections are made.

    ./neighbor
    # rename the results:
    mv outfile outfile.31mer
    mv outtree outtree.31mer

    # restore the full sequence names:
    cat outtree.31mer \
  | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
  | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
    -nameTranslate=name.31mer.translate.txt -noInternal -lineOutput /dev/stdin \
      > upgma.31mer.nh

    # then from GCF names to the sequence names

    sed -e 's/\./v/;' ../../nameCorrespond/gcToShortName.tab > gc.shortName.tab

    cat upgma.31mer.nh \
  | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
  | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
    -nameTranslate=gc.shortName.tab -noInternal -lineOutput /dev/stdin \
      > upgma.31mer.shortName.nh

    # take that result to TreeGraph2 to reorganize to get the staAur2
    # reference at the top: Sa_USA300_FPR3757
    
    # the result is the staAur2.369way.nh used to run the multiz

    mkdir /hive/data/genomes/staAur2/bed/multiz369way
    cd /hive/data/genomes/staAur2/bed/multiz369way

    # the resulting staAur2.369way.nh tree from the above kmer procedure starts:
    head staAur2.369way.nh | sed -e 's/^/# /;'
# ((((((((((((((((((((((((((staAur2:0.00475,
#                          (Sa_USA300_SUR12:0.00381,
#                           (Sa_USA300_TCH1516:0.00335,
#                            ((Sa_JE2:0.00055,
#                              Sa_C2406:0.00055):0.00086,
#                             (Sa_USA300_NRS384:0.00129,
#                              (Sa_UA_S391_USA300:0.00081,
#                               ((Sa_CIT:0.000001,
#                                 Sa_OXLIM:0.000001):0.00068,
#                                (Sa_33b:0.00029,

    # and ends, note the Bacillus_subtilis and E_coli here for outgroups

#        Sa_NRS271:0.16326):0.00391,
#       Sa_MOZ66:0.16717):0.01913):0.02373):0.0805,
#    Sa_K17:0.29052):0.18744):0.02154,
#  Bacillus_subtilis_168:0.4995):0.00045,
# (((((E_coli_K_12_MG1655:0.22665,
#      E_coli_O157_H7_Sakai_RIMD_0509952:0.22665):0.0382,
#     E_coli_UMN026:0.26485):0.02508,
#    E_coli_O83_H1_NRG_857C:0.28993):0.0484,
#   E_coli_IAI39:0.33834):0.14118,
#  E_coli_O104_H4_2011C_3493:0.47952):0.02043):0.1:0.1;

    # convert it to other naming schemes:
    cut -f1,4 \
      ../../nameCorrespond/taxId.acc.asmId.shortName.sciName.noDot.tab \
         | awk '{printf "%s\t%s\n", $2,$1}' > shortName.to.taxId.tab

    sed -e 's/\./v/;' ../../nameCorrespond/gcToShortName.tab \
        | awk '{printf "%s\t%s\n", $2, $1}' > shortName.to.GCFname.tab
    awk '{printf "%s\t%s\n", $2, $1}' ../../nameCorrespond/gcToShortName.tab \
       > shortName.to.GCFaccession.tab

    cat staAur2.369way.nh \
  | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
  | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
    -nameTranslate=shortName.to.taxId.tab -noInternal -lineOutput /dev/stdin \
      > staAur2.369way.taxId.nh

    cat staAur2.369way.nh \
  | sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
  | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
    -nameTranslate=shortName.to.GCFaccession.tab -noInternal \
      -lineOutput /dev/stdin \ > staAur2.369way.accession.nh

    #########################################################################
    # extract species list from that .nh file
    # this sed only works on this type of ascii printout that has one
    # name per line.
    sed -e 's/:.*//; s/(//g; s/ //g;' staAur2.369way.nh \
      | xargs echo > species.list.txt

#	Use this specification in the phyloGif tool:
#	http://genome.ucsc.edu/cgi-bin/phyloGif
#	to obtain a png image for src/hg/htdocs/images/phylo/staAur2_369way.png

    # all distances for sizeStats.pl below:

    /cluster/bin/phast/all_dists staAur2.369way.nh | grep staAur2 \
       | sed -e "s/staAur2.//" | sort -k2n > 369way.distances.txt

    # this is a specialized sizeStats.pl from the usual kind

    cat > sizeStats.pl << '_EOF_'
#!/usr/bin/env perl

use strict;
use warnings;

my %dbToName;  # key is name from 369way.distances.txt, value is sciName

open (FH , "<dbName.to.sciName.txt") or die "can not read dbName.to.sciName.txt";
while (my $line = <FH>) {
  chomp $line;
  my ($db, $name) = split('\s+',$line);
  $name =~ s/__/. /;
  $dbToName{$db} = $name;
}
close (FH);

open (FH, "<369way.distances.txt") or
        die "can not read 369way.distances.txt";

my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($D, $dist) = split('\s+', $line);
    my $Db = ucfirst($D);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/staAur2/bed/lastz.$D/fb.staAur2." .
        $chain . "Link.txt";
    if ( $D =~ m/^G/) {
        $B=`ls /hive/data/genomes/staAur2/bed/awsMultiz/lastzRun/lastz_$D/fb.staAur2.chain.${D}*`;
        chomp $B;
    }
    my $chainLinkMeasure =
        `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\%//;
    my $chainSynMeasure = "";
    if ( $D =~ m/^G/) {
        $B=`ls /hive/data/genomes/staAur2/bed/awsMultiz/lastzRun/lastz_$D/fb.staAur2.chainSyn.${D}*`;
        chomp $B;
    } else {
        $B="/hive/data/genomes/staAur2/bed/lastz.${D}/fb.staAur2.chainSyn${Db}Link.txt";
    }
    $chainSynMeasure =
      `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainSynMeasure;
    $chainSynMeasure = 0.0 if (length($chainSynMeasure) < 1);
    $chainSynMeasure =~ s/\%//;
    my $chainRBestMeasure = "";
    if ( $D =~ m/^G/) {
        $B=`ls /hive/data/genomes/staAur2/bed/awsMultiz/lastzRun/lastz_$D/fb.staAur2.chainRBest.${D}*`;
        chomp $B;
    } else {
        $B="/hive/data/genomes/staAur2/bed/lastz.${D}/fb.staAur2.chainRBest.${Db}.txt";
    }
    $chainRBestMeasure =
      `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainRBestMeasure;
    $chainRBestMeasure = 0.0 if (length($chainRBestMeasure) < 1);
    $chainRBestMeasure =~ s/\%//;
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.staAur2/fb.${D}.chainGalGal6Link.txt";
    my $swapMeasure = "0";
    if ( -s $swapFile ) {
	$swapMeasure =
	    `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
	chomp $swapMeasure;
	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
	$swapMeasure =~ s/\%//;
    }
    my $orgName=
    `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        if (defined($dbToName{$D})) {
          $orgName = $dbToName{$D};
        } else {
          $orgName="N/A";
        }
    }
    ++$count;
    my $percentAlike = 100.0 * ($chainLinkMeasure - $chainRBestMeasure) / $chainLinkMeasure;
    printf "# %03d %.4f (%% %06.3f) (%% %06.3f) (%% %06.3f) %5.2f - %s %s\n", $count, $dist,
        $chainLinkMeasure, $chainSynMeasure, $chainRBestMeasure, $percentAlike, $orgName, $D;
}
close (FH);
'_EOF_'
# << happy emacs

    chmod +x ./sizeStats.pl
    ./sizeStats.pl

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure.

# the unlabeled column between rBestLink and sciName is the % ratio
# of recipBest to chainLink:
#    100.0 * (chainLink - rBestLink) / chainLink

#   N  dist  chainLink    synLink   rBestLink         sciName - accession/db
#
# 001 0.4253 (% 43.814) (% 12.216) (% 32.785) 25.17 - Caenorhabditis_sp._38_MB-2015 C_sp38_MB_2015
# 002 0.4357 (% 59.069) (% 37.574) (% 48.812) 17.36 - C. brenneri caePb3
# 003 0.4425 (% 40.332) (% 25.464) (% 36.514)  9.47 - C. sp. 5 ju800 caeSp51
# 004 0.4434 (% 39.417) (% 06.460) (% 27.675) 29.79 - C. angaria caeAng2
# 005 0.4466 (% 57.873) (% 39.138) (% 47.416) 18.07 - Caenorhabditis_latens C_latens
# 006 0.4480 (% 61.020) (% 40.374) (% 50.049) 17.98 - C. remanei caeRem4
# 007 0.4569 (% 15.004) (% 00.079) (% 12.189) 18.76 - Ancylostoma_duodenale
# 008 0.4576 (% 16.527) (% 00.580) (% 13.743) 16.85 - Ancylostoma_caninum
# 009 0.4606 (% 08.936) (% 00.180) (% 07.705) 13.78 - Barber pole worm haeCon2
# 010 0.4607 (% 14.484) (% 00.219) (% 11.872) 18.03 - Teladorsagia_circumcincta
# 011 0.4615 (% 55.203) (% 30.009) (% 43.133) 21.86 - C. tropicalis caeSp111
# 012 0.4615 (% 41.026) (% 18.314) (% 31.414) 23.43 - Caenorhabditis_sp._31_LS-2015 C_sp31_LS_2015
# 013 0.4630 (% 03.444) (% 00.018) (% 02.414) 29.91 - Opisthorchis_viverrini
# 014 0.4634 (% 14.994) (% 00.164) (% 12.019) 19.84 - Oesophagostomum_dentatum
# 015 0.4638 (% 03.450) (% 00.007) (% 02.410) 30.14 - Clonorchis_sinensis
# 016 0.4645 (% 11.502) (% 00.228) (% 09.476) 17.61 - Toxocara_canis
# 017 0.4647 (% 14.406) (% 00.306) (% 11.734) 18.55 - Haemonchus_contortus
# 018 0.4651 (% 05.722) (% 00.132) (% 04.869) 14.91 - Pig roundworm ascSuu1
# 019 0.4660 (% 00.832) (% 00.000) (% 00.495) 40.50 - Dicrocoelium_dendriticum
# 020 0.4672 (% 13.175) (% 00.264) (% 10.946) 16.92 - Ascaris_suum
# 021 0.4681 (% 05.601) (% 00.019) (% 04.196) 25.08 - Hymenolepis_microstoma
# 022 0.4686 (% 53.670) (% 38.079) (% 44.232) 17.59 - Caenorhabditis_sp._34_TK-2017 C_sp34_TK_2017
# 023 0.4689 (% 11.720) (% 00.230) (% 09.718) 17.08 - Parascaris_univalens
# 024 0.4715 (% 59.618) (% 37.200) (% 48.546) 18.57 - Caenorhabditis_sp._26_LS-2015 C_sp26_LS_2015
# 025 0.4715 (% 15.792) (% 00.540) (% 13.351) 15.46 - Nippostrongylus_brasiliensis
# 026 0.4721 (% 15.964) (% 00.578) (% 13.279) 16.82 - A. ceylanicum ancCey1
# 027 0.4726 (% 24.970) (% 01.150) (% 20.772) 16.81 - Diploscapter_coronatus
# 028 0.4730 (% 14.696) (% 00.369) (% 12.240) 16.71 - Heligmosomoides_polygyrus_bakeri
# 029 0.4740 (% 03.502) (% 00.000) (% 02.468) 29.53 - Fasciola_gigantica
# 030 0.4740 (% 62.531) (% 41.703) (% 51.026) 18.40 - Caenorhabditis_nigoni C_nigoni
# 031 0.4742 (% 23.687) (% 00.814) (% 19.743) 16.65 - Diploscapter_pachys
# 032 0.4747 (% 13.074) (% 00.292) (% 10.610) 18.85 - Pristionchus_arcanus
# 033 0.4747 (% 60.930) (% 40.599) (% 49.316) 19.06 - Caenorhabditis_briggsae C_briggsae
# 034 0.4747 (% 39.384) (% 31.995) (% 35.681)  9.40 - C. briggsae cb4
# 035 0.4752 (% 52.955) (% 36.731) (% 44.297) 16.35 - Caenorhabditis_sp._40_LS-2015 C_sp40_LS_2015
# 036 0.4758 (% 15.754) (% 00.618) (% 12.952) 17.79 - Necator_americanus
# 037 0.4758 (% 08.902) (% 00.270) (% 07.682) 13.70 - N. americanus necAme1
# 038 0.4762 (% 02.894) (% 00.000) (% 02.011) 30.51 - Spirometra_erinaceieuropaei
# 039 0.4786 (% 04.266) (% 00.019) (% 03.130) 26.63 - Fasciola_hepatica
# 040 0.4788 (% 17.480) (% 00.631) (% 14.549) 16.77 - Dictyocaulus_viviparus
# 041 0.4830 (% 08.805) (% 00.022) (% 06.772) 23.09 - Romanomermis_culicivorax
# 042 0.4859 (% 06.222) (% 00.068) (% 05.303) 14.77 - P. exspectatus priExs1
# 043 0.4859 (% 12.342) (% 00.199) (% 10.056) 18.52 - Pristionchus_exspectatus
# 044 0.4863 (% 15.278) (% 00.137) (% 11.986) 21.55 - Oscheius_sp._TEL-2014 Oscheius_TEL_2014
# 045 0.4889 (% 16.319) (% 00.084) (% 12.626) 22.63 - Setaria_digitata
# 046 0.4902 (% 11.406) (% 00.118) (% 09.141) 19.86 - Plectus_sambesii
# 047 0.4959 (% 12.624) (% 00.190) (% 10.610) 15.95 - Angiostrongylus_cantonensis
# 048 0.4969 (% 14.600) (% 00.179) (% 11.254) 22.92 - Setaria_equina
# 049 0.4980 (% 16.698) (% 00.276) (% 12.631) 24.36 - Loa_loa
# 050 0.4980 (% 05.056) (% 00.107) (% 04.254) 15.86 - Eye worm loaLoa1
# 051 0.4984 (% 14.413) (% 00.233) (% 11.306) 21.56 - Ditylenchus_destructor
# 052 0.4988 (% 12.602) (% 00.312) (% 10.270) 18.50 - Pristionchus_entomophagus
# 053 0.5006 (% 45.308) (% 20.405) (% 36.074) 20.38 - C. japonica caeJap4
# 054 0.5007 (% 15.520) (% 00.269) (% 11.600) 25.26 - Onchocerca_flexuosa
# 055 0.5010 (% 06.308) (% 00.187) (% 05.354) 15.12 - P. pacificus priPac3
# 056 0.5013 (% 03.234) (% 00.019) (% 02.304) 28.76 - Echinococcus_granulosus
# 057 0.5028 (% 19.777) (% 00.117) (% 14.787) 25.23 - Onchocerca_ochengi
# 058 0.5038 (% 03.457) (% 00.018) (% 02.482) 28.20 - Taenia_saginata
# 059 0.5038 (% 03.268) (% 00.012) (% 02.310) 29.31 - Taenia_solium
# 060 0.5040 (% 03.383) (% 00.019) (% 02.395) 29.20 - Echinococcus_canadensis
# 061 0.5042 (% 03.396) (% 00.018) (% 02.434) 28.33 - Taenia_asiatica
# 062 0.5042 (% 12.880) (% 00.310) (% 10.474) 18.68 - Pristionchus_maxplancki
# 063 0.5042 (% 23.081) (% 01.821) (% 18.534) 19.70 - Caenorhabditis_sp._21_LS-2015 C_sp21_LS_2015
# 064 0.5067 (% 05.193) (% 00.151) (% 04.375) 15.75 - O. volvulus oncVol1
# 065 0.5071 (% 25.170) (% 00.609) (% 18.668) 25.83 - Onchocerca_volvulus
# 066 0.5074 (% 12.987) (% 00.445) (% 10.449) 19.54 - Pristionchus_pacificus
# 067 0.5088 (% 03.403) (% 00.019) (% 02.440) 28.30 - Echinococcus_multilocularis
# 068 0.5098 (% 44.315) (% 17.195) (% 35.657) 19.54 - Caenorhabditis_sp._39_LS-2015 C_sp39_LS_2015
# 069 0.5136 (% 20.673) (% 00.156) (% 15.260) 26.18 - Wuchereria_bancrofti
# 070 0.5143 (% 10.486) (% 00.600) (% 09.313) 11.19 - H. bacteriophora/m31e hetBac1
# 071 0.5156 (% 19.944) (% 00.441) (% 14.559) 27.00 - Brugia_pahangi
# 072 0.5165 (% 12.825) (% 00.184) (% 09.853) 23.17 - Steinernema_monticolum
# 073 0.5170 (% 13.379) (% 00.020) (% 10.156) 24.09 - Macrostomum_lignano
# 074 0.5183 (% 07.825) (% 00.000) (% 06.305) 19.42 - Schistosoma_japonicum
# 075 0.5185 (% 15.865) (% 00.344) (% 12.506) 21.17 - Parapristionchus_giblindavisi
# 076 0.5190 (% 20.305) (% 00.186) (% 15.022) 26.02 - Dirofilaria_immitis
# 077 0.5190 (% 05.005) (% 00.129) (% 04.188) 16.32 - Dog heartworm dirImm1
# 078 0.5205 (% 09.723) (% 00.007) (% 07.927) 18.47 - Schistosoma_haematobium
# 079 0.5217 (% 18.703) (% 00.294) (% 13.109) 29.91 - Rhabditophanes_sp._KR3021 Rhabditophanes_KR3021
# 080 0.5257 (% 11.415) (% 00.065) (% 07.962) 30.25 - Trichinella_papuae
# 081 0.5262 (% 10.699) (% 00.051) (% 07.442) 30.44 - Trichinella_nativa
# 082 0.5263 (% 10.754) (% 00.051) (% 07.465) 30.58 - Trichinella_nelsoni
# 083 0.5272 (% 37.989) (% 00.013) (% 30.954) 18.52 - Girardia_tigrina
# 084 0.5277 (% 16.345) (% 00.160) (% 11.694) 28.46 - Rotylenchulus_reniformis
# 085 0.5287 (% 10.902) (% 00.051) (% 07.665) 29.69 - Trichinella_sp._T8 Trichinella_T8
# 086 0.5292 (% 05.065) (% 00.102) (% 04.259) 15.91 - Filarial worm bruMal2
# 087 0.5295 (% 10.328) (% 00.018) (% 08.457) 18.12 - Schistosoma_mansoni
# 088 0.5300 (% 10.885) (% 00.063) (% 07.626) 29.94 - Trichinella_sp._T9 Trichinella_T9
# 089 0.5304 (% 06.686) (% 00.020) (% 04.946) 26.02 - Gyrodactylus_salaris
# 090 0.5314 (% 11.462) (% 00.066) (% 07.980) 30.38 - Trichinella_pseudospiralis
# 091 0.5324 (% 10.836) (% 00.052) (% 07.561) 30.22 - Trichinella_patagoniensis
# 092 0.5331 (% 02.006) (% 00.000) (% 01.536) 23.43 - C. intestinalis ci3
# 093 0.5335 (% 11.008) (% 00.066) (% 07.671) 30.31 - Trichinella_sp._T6 Trichinella_T6
# 094 0.5345 (% 02.848) (% 00.007) (% 02.239) 21.38 - Trichinella triSpi1
# 095 0.5346 (% 10.697) (% 00.052) (% 07.591) 29.04 - Trichinella_spiralis
# 096 0.5350 (% 11.705) (% 00.054) (% 08.203) 29.92 - Trichinella_zimbabwensis
# 097 0.5355 (% 12.640) (% 00.334) (% 09.947) 21.31 - Steinernema_carpocapsae
# 098 0.5361 (% 10.944) (% 00.063) (% 07.639) 30.20 - Trichinella_britovi
# 099 0.5393 (% 16.763) (% 00.185) (% 12.386) 26.11 - Brugia_malayi
# 100 0.5453 (% 06.137) (% 00.064) (% 04.468) 27.20 - Trichuris_trichiura
# 101 0.5466 (% 06.291) (% 00.145) (% 05.125) 18.53 - Pine wood nematode burXyl1
# 102 0.5466 (% 13.470) (% 00.169) (% 09.909) 26.44 - Bursaphelenchus_xylophilus
# 103 0.5474 (% 16.438) (% 01.183) (% 13.268) 19.28 - Oscheius_tipulae
# 104 0.5477 (% 03.712) (% 00.018) (% 02.756) 25.75 - Taenia_multiceps
# 105 0.5478 (% 10.278) (% 00.197) (% 08.144) 20.76 - Steinernema_glaseri
# 106 0.5480 (% 06.585) (% 00.136) (% 05.461) 17.07 - Microworm panRed1
# 107 0.5494 (% 39.093) (% 00.095) (% 31.402) 19.67 - Schmidtea_mediterranea
# 108 0.5507 (% 10.269) (% 00.081) (% 07.827) 23.78 - Globodera_pallida
# 109 0.5516 (% 11.885) (% 00.259) (% 09.311) 21.66 - Steinernema_feltiae
# 110 0.5526 (% 17.785) (% 00.000) (% 14.029) 21.12 - Dugesia_japonica
# 111 0.5532 (% 11.549) (% 00.085) (% 08.872) 23.18 - Subanguina_moxae
# 112 0.5547 (% 11.318) (% 00.138) (% 08.695) 23.18 - Globodera_ellingtonae
# 113 0.5551 (% 11.987) (% 00.319) (% 09.412) 21.48 - Steinernema_scapterisci
# 114 0.5561 (% 10.968) (% 00.132) (% 08.386) 23.54 - Globodera_rostochiensis
# 115 0.5565 (% 12.683) (% 00.000) (% 08.999) 29.05 - Meloidogyne_floridensis
# 116 0.5593 (% 38.533) (% 24.831) (% 31.789) 17.50 - Caenorhabditis_sp._32_LS-2015 C_sp32_LS_2015
# 117 0.5613 (% 02.917) (% 00.036) (% 02.285) 21.67 - Whipworm triSui1
# 118 0.5615 (% 26.293) (% 00.097) (% 19.830) 24.58 - Meloidogyne_javanica
# 119 0.5633 (% 03.516) (% 00.090) (% 02.816) 19.91 - M. incognita melInc2
# 120 0.5634 (% 26.760) (% 00.186) (% 19.868) 25.75 - Meloidogyne_incognita
# 121 0.5653 (% 31.839) (% 00.889) (% 19.485) 38.80 - Strongyloides_venezuelensis
# 122 0.5653 (% 31.051) (% 00.685) (% 19.457) 37.34 - Strongyloides_papillosus
# 123 0.5662 (% 28.310) (% 00.661) (% 17.250) 39.07 - Parastrongyloides_trichosuri
# 124 0.5689 (% 06.427) (% 00.000) (% 04.011) 37.59 - Oscheius_sp._MCB Oscheius_MCB
# 125 0.5690 (% 05.651) (% 00.066) (% 04.144) 26.67 - Trichuris_muris
# 126 0.5694 (% 11.930) (% 00.052) (% 08.389) 29.68 - Trichinella_murrelli
# 127 0.5714 (% 03.916) (% 00.074) (% 03.186) 18.64 - M. hapla melHap1
# 128 0.5744 (% 31.303) (% 00.285) (% 23.952) 23.48 - Meloidogyne_arenaria
# 129 0.5789 (% 07.254) (% 00.013) (% 05.201) 28.30 - Heterodera_glycines
# 130 0.5932 (% 17.004) (% 00.081) (% 10.827) 36.33 - Meloidogyne_graminicola
# 131 0.5953 (% 35.984) (% 01.356) (% 20.257) 43.71 - Strongyloides_stercoralis
# 132 0.5981 (% 36.164) (% 01.850) (% 20.838) 42.38 - Threadworm strRat2
# 133 0.6420 (% 23.468) (% 00.240) (% 18.204) 22.43 - Acrobeloides_nanus
# 134 0.7045 (% 00.158) (% 00.000) (% 00.090) 43.04 - Elaeophora_elaphi

# None of this concern for distances matters in building the first step, the
# maf files.  The distances will be better calibrated later.

    # create species list and stripped down tree for autoMZ
    sed -e 's/,//g; s/:[0-9]\+.[0-9]\+//g; s/;//g;' staAur2.369way.nh \
       | xargs echo > tree.nh

    cat tree.nh | fold -s -w 78 | sed -e 's/^/# /;'

# ((((((((((((((((((((((((((staAur2 (Sa_USA300_SUR12 (Sa_USA300_TCH1516 
# ((Sa_JE2 Sa_C2406) (Sa_USA300_NRS384 (Sa_UA_S391_USA300 ((Sa_CIT Sa_OXLIM) 
# (Sa_33b (Sa_26b_MRSA (Sa_27b_MRSA (Sa_29b_MRSA (Sa_31b_MRSA 
# Sa_25b_MRSA)))))))))))) Sa_CAR) Sa_AR_0216) Sa_USA300_ISMMS1) 
# ((((((Sa_USA300_SUR9 Sa_USA300_SUR10) Sa_USA300_SUR4) Sa_USA300_SUR8) 
# Sa_USA300_SUR11) (((((Sa_USA300_SUR13 Sa_USA300_SUR21) Sa_USA300_SUR3) 
# Sa_USA300_SUR6) (((((Sa_USA300_SUR14 Sa_USA300_SUR19) Sa_USA300_SUR16) 
# Sa_USA300_SUR17) Sa_USA300_SUR18) Sa_USA300_SUR7)) ((Sa_USA300_SUR20 
# Sa_USA300_SUR22) Sa_USA300_SUR23))) (Sa_USA300_SUR15 Sa_USA300_SUR24))) 
# (Sa_USA300_2014C01 (Sa_USA300_SUR1_a (Sa_USA300_SUR1_b Sa_USA300_SUR2)))) 
# Sa_UTSW_MRSA_55) (((Sa_TCH_959 Sa_CA15) Sa_HUV05) (Sa_M121 Sa_CA12))) 
# Sa_V2200) ((((((Sa_Newman Sa_Newman_NYU_Newman) Sa_Newman_D2C) 
# Sa_Sa_Newman_UoM) ((Sa_COL ((((Sa_NCTC8325_2006 Sa_HG001_RN1) 
# Sa_NCTC8325_2018) ((((Sa_VC40 Sa_NRS146) (Sa_FDAARGOS_6 Sa_NRS107)) 
# Sa_FDAARGOS_15) Sa_DSM_20231)) ((Sa_FDAARGOS_412 ((Sa_54 Sa_MSSA54) Sa_164)) 
# Sa_EDCC5458))) (Sa_28 Sa_82))) Sa_JH4899) (((Sa_M1 Sa_Clinical) Sa_NCTC13395) 
# Sa_NCTC13394))) (Sa_199 Sa_64)) (Sa_2395_USA500 Sa_SVH7513)) ((Sa_165 Sa_546) 
# Sa_545)) (((((((Sa_TW20 Sa_CMRSA_6) Sa_V521) Sa_M92) Sa_Z172) Sa_CMRSA_3) 
# (((Sa_JKD6008 (Sa_T0131 (Sa_WCH_SK2 Sa_NCTC11940))) Sa_NCTC9944) ((Sa_Gv51 
# ((Sa_Be62 Sa_Gv88) Sa_HC1335)) Sa_HC1340))) Sa_NMR08)) (((((((((Sa_MW2_2004 
# Sa_USA400_0051) Sa_JMUB3031) (((Sa_MSSA476 Sa_FORC_026) (Sa_No10 
# Sa_CFSAN007851)) Sa_FORC_045)) (Sa_FORC59 Sa_JMUB1273)) (((((Sa_ST20130940 
# Sa_ST20130941) (Sa_45 Sa_135)) (Sa_ST20130938 Sa_ST20130939)) 
# Sa_S_aureus_171) Sa_121560027_1)) (Sa_11819_97 (Sa_GR2 Sa_NCTC13435))) 
# ((((((((Sa_CN1 Sa_FORC_040) (Sa_TMUS2126 Sa_TMUS2134)) ((Sa_FORC_012 
# Sa_E16SA093) Sa_F17SA003)) Sa_FORC_061) Sa_08_02300) Sa_NCTC13137) (((Sa_RKI4 
# Sa_CFSAN018750) Sa_NCTC6136) Sa_BA01611)) (Sa_ST20130942 Sa_ST20130943))) 
# Sa_AUS0325) (((((Sa_FDA209P Sa_ATCC_6538) Sa_NCTC10344) Sa_NCTC9752) 
# Sa_NCTC3761) Sa_MOK042))) (((((((((((((Sa_N315 (Sa_ECT_R_2 Sa_CC5)) 
# (((Sa_04_02981 Sa_AR_0220) Sa_CFBR_105) Sa_UCI_28_ST5)) (((Sa_FCFHV36 
# Sa_ZJ5499) (Sa_AR461 Sa_AR_0468)) Sa_MI)) Sa_UCI62) ((Sa_JH9 Sa_JH1) 
# Sa_AR_474)) (Sa_SA112 Sa_CR14_035)) (((((((Sa_ST228_16035 (Sa_ST228_10388 
# Sa_ST228_10497)) Sa_ST228_18341) Sa_ST228_15532) (Sa_ST228_18583 
# Sa_ST228_18412)) Sa_ST228_16125) (((((Sa_NRS149 Sa_CFSAN007894) (Sa_SA564 
# Sa_CFSAN007850)) (Sa_RIVM6519 (Sa_AR462 Sa_CHU15_056))) Sa_FDAARGOS_159) 
# Sa_NZAK3)) (Sa_FDAARGOS_48 Sa_FDAARGOS_1))) ((((Sa_Mu50 Sa_AR_0219) Sa_Mu3) 
# (Sa_FORC_027 Sa_FORC_062)) (Sa_NCCP14558 Sa_NCCP14562))) Sa_HPV107) 
# Sa_ISU935_ST5) Sa_HOU1444_VR) ((Sa_ED98 Sa_ch21) Sa_ch22)) 
# Sa_ST772_MRSA_V_DAR4145)) (((Sa_HO_5096_0412 Sa_H_EMRSA_15) Sa_EDCC5464) 
# (Sa_IT1_S Sa_IT4_R))) (((((Sa_RF122 Sa_NCTC7485) ((((Sa_O46 Sa_O326) 
# Sa_CFSAN064037) ((Sa_ED133 Sa_O17) Sa_LGA251)) (((Sa_NRS153 (Sa_422 Sa_128)) 
# (Sa_27 Sa_78)) Sa_XQ))) Sa_NCTC5663) (((((((Sa_M013 Sa_SA957) Sa_HZW450) 
# Sa_SA268) (Sa_SA40 Sa_SA40TW)) Sa_CFSAN007883) Sa_MS4) (Sa_6850 (Sa_GN3 
# Sa_GN1)))) Sa_JKD6159)) (((((((((Sa_ST398_S0385 (Sa_08BA02176 ((Sa_RIVM1295 
# ((Sa_RIVM1607 Sa_08S00974) Sa_10_5235)) (Sa_LA_MRSA_ST398_E154 
# Sa_ISU926_ST398)))) (Sa_16_LA_309 Sa_12_LA_293)) (Sa_8_LA_272 (Sa_2_LA_86 
# Sa_19_LA_388))) ((((Sa_71193 Sa_293G) Sa_GD5) (Sa_GD705 Sa_GD1539)) 
# Sa_GD1677)) Sa_PTDrAP2) Sa_17_LA_343) Sa_RIVM3897) ((((((((Sa_MRSA252 
# Sa_NCTC13277) (((Sa_FORC_001 Sa_0201753_2) (Sa_CFSAN007847 (((Sa_468 Sa_143) 
# Sa_466) (Sa_277 Sa_85)))) Sa_CFSAN007896)) ((((Sa_ATCC_25923 
# Sa_Seattle_1945_G477) Sa_Seattle_1945_G478) Sa_80wphwpl_v1) Sa_NRS484)) 
# (Sa_55_2053 Sa_13420)) Sa_FDAARGOS_504) (Sa_SJTUF_J27 Sa_CFSAN018749)) 
# Sa_AR_0470) Sa_ILRI_Eymole1_1)) (((Sa_CA_347 ((((Sa_MCRF184 (Sa_1549_WT 
# (Sa_1549_REV Sa_1549_SCV))) Sa_CFSAN007835) Sa_AR464) Sa_AR466)) Sa_NCTC8726) 
# Sa_JS395))) (Sa_55_99_44 Sa_BB155)) (((Sa_Bmb9393 ((Sa_FDAARGOS_10 
# Sa_NCTC13140) Sa_OC8)) (((Sa_NRS1 Sa_AR_0228) ((Sa_61 Sa_191) Sa_628)) 
# Sa_FORC_039)) ((Sa_Gv69 Sa_MRSA107) Sa_MOK063))) ((Sa_FDAARGOS_40 (Sa_187 
# Sa_629)) (Sa_K18 Sa_K5))) Sa_5_3949) (((((Sa_TCH60 (Sa_MN8 Sa_NRS143)) 
# (((Sa_CFSAN064038 (((((Sa_3_LA_115 Sa_21_LA_436) (Sa_13_LA_301 Sa_14_5418)) 
# Sa_7_4623) Sa_20_LA_415) Sa_9_LA_281)) (Sa_1_1439 ((Sa_6_LA_232 Sa_4_LA_208) 
# Sa_15_LA_305))) Sa_22_LA_562)) Sa_AR_0471) ((Sa_Tager_104 Sa_AR_475) 
# ((((((((Sa_502A_RN6607 Sa_AR_0222) Sa_080880048_3) Sa_NRS70) (((Sa_AR465 
# Sa_AR_0467) Sa_AR_0215) Sa_AR_0469)) ((((((Sa_FDAARGOS_5 (Sa_NRS137 
# Sa_NRS133)) (((((((Sa_USA300_2014C02 Sa_1969N) Sa_3020C01) (Sa_1971C01 
# (Sa_1625CO1 Sa_2148C01))) Sa_FDAARGOS_140) Sa_AR_0225) Sa_AR_0226) (Sa_5118N 
# Sa_AR_0223))) Sa_NRS120) ((Sa_FDAARGOS_2 (Sa_FDAARGOS_43 Sa_Mw2_2018)) 
# (Sa_08_02119 (Sa_24117_WT (Sa_24117_REV Sa_24117_SCV))))) ((Sa_2148N 
# (Sa_AR_0472 Sa_AR_0473)) (Sa_101110051_1 Sa_101110041_3))) Sa_SR434)) 
# (Sa_NX_T55 Sa_QD_CD9)) Sa_NRS271) Sa_MOZ66))) Sa_K17)) Bacillus_subtilis_168) 
# (((((E_coli_K_12_MG1655 E_coli_O157_H7_Sakai_RIMD_0509952) E_coli_UMN026) 
# E_coli_O83_H1_NRG_857C) E_coli_IAI39) E_coli_O104_H4_2011C_3493))

    sed -e 's/:.*//; s/(//g; s/ *//;' staAur2.369way.nh \
       | xargs echo > species.list

    cat species.list | fold -s -w 78 | sed -e 's/^/# /;'

# staAur2 Sa_USA300_SUR12 Sa_USA300_TCH1516 Sa_JE2 Sa_C2406 Sa_USA300_NRS384 
# Sa_UA_S391_USA300 Sa_CIT Sa_OXLIM Sa_33b Sa_26b_MRSA Sa_27b_MRSA Sa_29b_MRSA 
# Sa_31b_MRSA Sa_25b_MRSA Sa_CAR Sa_AR_0216 Sa_USA300_ISMMS1 Sa_USA300_SUR9 
# Sa_USA300_SUR10 Sa_USA300_SUR4 Sa_USA300_SUR8 Sa_USA300_SUR11 Sa_USA300_SUR13 
# Sa_USA300_SUR21 Sa_USA300_SUR3 Sa_USA300_SUR6 Sa_USA300_SUR14 Sa_USA300_SUR19 
# Sa_USA300_SUR16 Sa_USA300_SUR17 Sa_USA300_SUR18 Sa_USA300_SUR7 
# Sa_USA300_SUR20 Sa_USA300_SUR22 Sa_USA300_SUR23 Sa_USA300_SUR15 
# Sa_USA300_SUR24 Sa_USA300_2014C01 Sa_USA300_SUR1_a Sa_USA300_SUR1_b 
# Sa_USA300_SUR2 Sa_UTSW_MRSA_55 Sa_TCH_959 Sa_CA15 Sa_HUV05 Sa_M121 Sa_CA12 
# Sa_V2200 Sa_Newman Sa_Newman_NYU_Newman Sa_Newman_D2C Sa_Sa_Newman_UoM Sa_COL 
# Sa_NCTC8325_2006 Sa_HG001_RN1 Sa_NCTC8325_2018 Sa_VC40 Sa_NRS146 
# Sa_FDAARGOS_6 Sa_NRS107 Sa_FDAARGOS_15 Sa_DSM_20231 Sa_FDAARGOS_412 Sa_54 
# Sa_MSSA54 Sa_164 Sa_EDCC5458 Sa_28 Sa_82 Sa_JH4899 Sa_M1 Sa_Clinical 
# Sa_NCTC13395 Sa_NCTC13394 Sa_199 Sa_64 Sa_2395_USA500 Sa_SVH7513 Sa_165 
# Sa_546 Sa_545 Sa_TW20 Sa_CMRSA_6 Sa_V521 Sa_M92 Sa_Z172 Sa_CMRSA_3 Sa_JKD6008 
# Sa_T0131 Sa_WCH_SK2 Sa_NCTC11940 Sa_NCTC9944 Sa_Gv51 Sa_Be62 Sa_Gv88 
# Sa_HC1335 Sa_HC1340 Sa_NMR08 Sa_MW2_2004 Sa_USA400_0051 Sa_JMUB3031 
# Sa_MSSA476 Sa_FORC_026 Sa_No10 Sa_CFSAN007851 Sa_FORC_045 Sa_FORC59 
# Sa_JMUB1273 Sa_ST20130940 Sa_ST20130941 Sa_45 Sa_135 Sa_ST20130938 
# Sa_ST20130939 Sa_S_aureus_171 Sa_121560027_1 Sa_11819_97 Sa_GR2 Sa_NCTC13435 
# Sa_CN1 Sa_FORC_040 Sa_TMUS2126 Sa_TMUS2134 Sa_FORC_012 Sa_E16SA093 
# Sa_F17SA003 Sa_FORC_061 Sa_08_02300 Sa_NCTC13137 Sa_RKI4 Sa_CFSAN018750 
# Sa_NCTC6136 Sa_BA01611 Sa_ST20130942 Sa_ST20130943 Sa_AUS0325 Sa_FDA209P 
# Sa_ATCC_6538 Sa_NCTC10344 Sa_NCTC9752 Sa_NCTC3761 Sa_MOK042 Sa_N315 
# Sa_ECT_R_2 Sa_CC5 Sa_04_02981 Sa_AR_0220 Sa_CFBR_105 Sa_UCI_28_ST5 Sa_FCFHV36 
# Sa_ZJ5499 Sa_AR461 Sa_AR_0468 Sa_MI Sa_UCI62 Sa_JH9 Sa_JH1 Sa_AR_474 Sa_SA112 
# Sa_CR14_035 Sa_ST228_16035 Sa_ST228_10388 Sa_ST228_10497 Sa_ST228_18341 
# Sa_ST228_15532 Sa_ST228_18583 Sa_ST228_18412 Sa_ST228_16125 Sa_NRS149 
# Sa_CFSAN007894 Sa_SA564 Sa_CFSAN007850 Sa_RIVM6519 Sa_AR462 Sa_CHU15_056 
# Sa_FDAARGOS_159 Sa_NZAK3 Sa_FDAARGOS_48 Sa_FDAARGOS_1 Sa_Mu50 Sa_AR_0219 
# Sa_Mu3 Sa_FORC_027 Sa_FORC_062 Sa_NCCP14558 Sa_NCCP14562 Sa_HPV107 
# Sa_ISU935_ST5 Sa_HOU1444_VR Sa_ED98 Sa_ch21 Sa_ch22 Sa_ST772_MRSA_V_DAR4145 
# Sa_HO_5096_0412 Sa_H_EMRSA_15 Sa_EDCC5464 Sa_IT1_S Sa_IT4_R Sa_RF122 
# Sa_NCTC7485 Sa_O46 Sa_O326 Sa_CFSAN064037 Sa_ED133 Sa_O17 Sa_LGA251 Sa_NRS153 
# Sa_422 Sa_128 Sa_27 Sa_78 Sa_XQ Sa_NCTC5663 Sa_M013 Sa_SA957 Sa_HZW450 
# Sa_SA268 Sa_SA40 Sa_SA40TW Sa_CFSAN007883 Sa_MS4 Sa_6850 Sa_GN3 Sa_GN1 
# Sa_JKD6159 Sa_ST398_S0385 Sa_08BA02176 Sa_RIVM1295 Sa_RIVM1607 Sa_08S00974 
# Sa_10_5235 Sa_LA_MRSA_ST398_E154 Sa_ISU926_ST398 Sa_16_LA_309 Sa_12_LA_293 
# Sa_8_LA_272 Sa_2_LA_86 Sa_19_LA_388 Sa_71193 Sa_293G Sa_GD5 Sa_GD705 
# Sa_GD1539 Sa_GD1677 Sa_PTDrAP2 Sa_17_LA_343 Sa_RIVM3897 Sa_MRSA252 
# Sa_NCTC13277 Sa_FORC_001 Sa_0201753_2 Sa_CFSAN007847 Sa_468 Sa_143 Sa_466 
# Sa_277 Sa_85 Sa_CFSAN007896 Sa_ATCC_25923 Sa_Seattle_1945_G477 
# Sa_Seattle_1945_G478 Sa_80wphwpl_v1 Sa_NRS484 Sa_55_2053 Sa_13420 
# Sa_FDAARGOS_504 Sa_SJTUF_J27 Sa_CFSAN018749 Sa_AR_0470 Sa_ILRI_Eymole1_1 
# Sa_CA_347 Sa_MCRF184 Sa_1549_WT Sa_1549_REV Sa_1549_SCV Sa_CFSAN007835 
# Sa_AR464 Sa_AR466 Sa_NCTC8726 Sa_JS395 Sa_55_99_44 Sa_BB155 Sa_Bmb9393 
# Sa_FDAARGOS_10 Sa_NCTC13140 Sa_OC8 Sa_NRS1 Sa_AR_0228 Sa_61 Sa_191 Sa_628 
# Sa_FORC_039 Sa_Gv69 Sa_MRSA107 Sa_MOK063 Sa_FDAARGOS_40 Sa_187 Sa_629 Sa_K18 
# Sa_K5 Sa_5_3949 Sa_TCH60 Sa_MN8 Sa_NRS143 Sa_CFSAN064038 Sa_3_LA_115 
# Sa_21_LA_436 Sa_13_LA_301 Sa_14_5418 Sa_7_4623 Sa_20_LA_415 Sa_9_LA_281 
# Sa_1_1439 Sa_6_LA_232 Sa_4_LA_208 Sa_15_LA_305 Sa_22_LA_562 Sa_AR_0471 
# Sa_Tager_104 Sa_AR_475 Sa_502A_RN6607 Sa_AR_0222 Sa_080880048_3 Sa_NRS70 
# Sa_AR465 Sa_AR_0467 Sa_AR_0215 Sa_AR_0469 Sa_FDAARGOS_5 Sa_NRS137 Sa_NRS133 
# Sa_USA300_2014C02 Sa_1969N Sa_3020C01 Sa_1971C01 Sa_1625CO1 Sa_2148C01 
# Sa_FDAARGOS_140 Sa_AR_0225 Sa_AR_0226 Sa_5118N Sa_AR_0223 Sa_NRS120 
# Sa_FDAARGOS_2 Sa_FDAARGOS_43 Sa_Mw2_2018 Sa_08_02119 Sa_24117_WT Sa_24117_REV 
# Sa_24117_SCV Sa_2148N Sa_AR_0472 Sa_AR_0473 Sa_101110051_1 Sa_101110041_3 
# Sa_SR434 Sa_NX_T55 Sa_QD_CD9 Sa_NRS271 Sa_MOZ66 Sa_K17 Bacillus_subtilis_168 
# E_coli_K_12_MG1655 E_coli_O157_H7_Sakai_RIMD_0509952 E_coli_UMN026 
# E_coli_O83_H1_NRG_857C E_coli_IAI39 E_coli_O104_H4_2011C_3493

    # do not need to take any kind of N50 survey, going to use 'mafNet'
    # alignment for everything

    # the usual rules for selection:
    # 1. when n50Size over 500,000 and percentN under 5 -> syntenic net
    # 2. otherwise, use chainNet

    # are woefully inadequate for this purpose.  All the alignments
    # are similar.  The 'mafNet' alignment has the highest numbers

    awk -F$'\t' '$4 > 500000' allSummary.tab | awk '$5 < 5' \
	| cut -f7 | awk -F'.' '{print $1}' > syntenicList.txt

    awk -F$'\t' '$4 <= 500000 || $5 >= 5' allSummary.tab \
        | cut -f7 | awk -F'.' '{print $1}' > chainNetList.txt

    awk '{printf "%s\t%s\t%s\n", $NF,$5,$7}' ../sizeStats.txt \
          | sed -e 's/)//g;' | sort -k3,3nr | sort \
             | join -t$'\t' - <(sort syntenicList.txt) | sed -e 's/^/# /;'

    #	bash shell syntax here ...
    cd /hive/data/genomes/staAur2/bed/multiz369way
    mkdir mafLinks

export H="/hive/data/genomes/staAur2/bed"

export TOP="/hive/data/genomes/staAur2/bed/multiz369way"

# ls ../lastzRuns/lastzGCF_003194185v1/mafNet
# NC_007790v1.maf.gz  NC_007791v1.maf.gz  NC_007792v1.maf.gz  NC_007793v1.maf.gz

sed -e 's/staAur2 //;' species.list | sed -e 's/ /\n/g' | while read specName
do
  cd "${TOP}"
  GC=`grep -w "${specName}" shortName.to.GCFname.tab | cut -f2`
# Sa_0201753_2   GCF_003194185v1
  mafNet="../lastzRuns/lastz${GC}/mafNet"
  echo "$specName" 1>&2
  mkdir -p "mafLinks/${specName}"
  cd mafLinks/${specName}
    for M in ../../$mafNet/*.maf.gz
    do
      B=`basename $M`
      zcat "${M}" | sed -e "s/$GC/$specName/;" | gzip -c > ${B}
    done
done

    # verify the symLinks are good:
    find ./mafLinks -type f | sed -e 's#.*/##;' | sort | uniq -c
    # 260 NC_007790v1.maf.gz
    # 265 NC_007791v1.maf.gz
    # 365 NC_007792v1.maf.gz
    # 368 NC_007793v1.maf.gz

    # not all of them match in the plastids

    # the fullChromRun is going to take forever, split them up at
    #  approximately 100,000 boundaries in the largest gap between
    #  genes

    #	run a kluster job to split them all
    ssh ku
    cd /hive/data/genomes/staAur2/bed/multiz369way/mafSplit

    printf '
#!/bin/csh -ef
set G = $1
set M = $2
mkdir -p $G
pushd $G > /dev/null
if ( -s staAur2_${M}.00.maf ) then
    /bin/rm -f staAur2_${M}.*.maf
endif
/cluster/bin/x86_64/mafSplit ../mafSplit.bed staAur2_ ../../mafLinks/${G}/${M}.maf.gz
/bin/gzip staAur2_${M}.*.maf
popd > /dev/null
' > runOne

    # << happy emacs
    chmod +x runOne

    printf '#LOOP
runOne $(dir1) $(file1) {check out exists+ $(dir1)/staAur2_$(file1).00.maf.gz}
#ENDLOOP
' > template

    find ../mafLinks -type l | awk -F'/' '{printf "%s/%s\n", $3,$4}' \
      | sed -e 's/.maf.gz//;' > maf.list

    gensub2 maf.list single template jobList
    para -ram=16g create jobList
    para try ... check ... push ... etc...
# Completed: 1258 of 1258 jobs
# CPU time in finished jobs:        729s      12.15m     0.20h    0.01d  0.000 y
# IO & Wait Time:                  5160s      86.00m     1.43h    0.06d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest finished job:               8s       0.13m     0.00h    0.00d
# Submission to last job:           487s       8.12m     0.14h    0.01d

    # construct a list of all possible maf file names.
    # they do not all exist in each of the species directories
    find . -type f | grep "maf.gz" | wc -l
    # 11577

    find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u \
        > run.maf.list
    wc -l run.maf.list
    # 33 run.maf.list

    # number of chroms with data:
    awk -F'.' '{print $1}' run.maf.list  | sed -e 's/staAur2_//;' \
      | sort | uniq -c | sort -n | wc -l
    #  4

    mkdir /hive/data/genomes/staAur2/bed/multiz369way/splitRun
    cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = staAur2
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/staAur2/bed/multiz369way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty /dev/shm/$db
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    printf '#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/staAur2/bed/multiz369way/splitRun/maf/$(root1)}
#ENDLOOP
' > template

    ln -s  ../../mafSplit/run.maf.list maf.list

    ssh ku
    cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun/run
    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
# Completed: 33 of 33 jobs
# CPU time in finished jobs:     665856s   11097.59m   184.96h    7.71d  0.021 y
# IO & Wait Time:                   214s       3.57m     0.06h    0.00d  0.000 y
# Average job time:               20184s     336.40m     5.61h    0.23d
# Longest finished job:           29328s     488.80m     8.15h    0.34d
# Submission to last job:         29328s     488.80m     8.15h    0.34d

    # put the split maf results back together into a single per-chrom maf file
    #	eliminate duplicate comments
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun
    mkdir ../maf
    #	no need to save the comments since they are lost with mafAddIRows

    cat << '_EOF_' >> runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
    rm -f ../maf/${C}.maf.gz
endif
if ( -s maf/staAur2_${C}.00.maf ) then
  head -q -n 1 maf/staAur2_${C}.00.maf | sort -u > ../maf/${C}.maf
  grep -h -v "^#" `ls maf/staAur2_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
  tail -q -n 1 maf/staAur2_${C}.00.maf | sort -u >> ../maf/${C}.maf
else
  touch ../maf/${C}.maf
endif
'_EOF_'
    # << happy emacs
    chmod +x runOne

    cat << '_EOF_' >> template
#LOOP
runOne $(root1) {check out exists ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 ../../../chrom.sizes > chr.list
    ssh ku
    cd /hive/data/genomes/staAur2/bed/multiz369way/splitRun
    gensub2 chr.list single template jobList
    para -ram=16g create jobList
    # only 4 jobs, they run with the 'para try'
    para try ... check ... push ... etc ...
    para -maxJob=32 push
# Completed: 4 of 4 jobs
# CPU time in finished jobs:         11s       0.18m     0.00h    0.00d  0.000 y
# IO & Wait Time:                     9s       0.16m     0.00h    0.00d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest finished job:              14s       0.23m     0.00h    0.00d
# Submission to last job:            21s       0.35m     0.01h    0.00d

    cd /hive/data/genomes/staAur2/bed/multiz369way/maf
    # the four results:
# -rw-rw-r-- 1    1039254 Dec 29 10:20 NC_007791v1.maf
# -rw-rw-r-- 1    8427476 Dec 29 10:20 NC_007792v1.maf
# -rw-rw-r-- 1    2682382 Dec 29 10:20 NC_007790v1.maf
# -rw-rw-r-- 1 1716754011 Dec 29 10:20 NC_007793v1.maf

    # Load into database
    mkdir -p /gbdb/staAur2/multiz369way/maf
    cd /hive/data/genomes/staAur2/bed/multiz369way/maf
    ln -s `pwd`/*.maf /gbdb/staAur2/multiz369way/maf/

    # this generates an immense multiz369way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    #   This is going to take all day.
    cd /dev/shm
    time hgLoadMaf staAur2 multiz369way
    # Loaded 25064 mafs in 4 files from /gbdb/staAur2/multiz369way
    # real    0m8.868s

    time (cat /gbdb/staAur2/multiz369way/maf/*.maf \
	| hgLoadMafSummary staAur2 -minSize=10000 -verbose=2 \
		-mergeGap=500 -maxSize=50000  multiz369waySummary stdin
# Created 110274 summary blocks from 7621971 components and 25064 mafs from stdin
# real    0m22.061s

# -rw-rw-r-- 1    5919198 Dec 29 10:26 multiz369waySummary.tab
# -rw-rw-r-- 1    1306535 Dec 29 10:26 multiz369way.tab

    wc -l multiz369way*
#    25064 multiz369way.tab
#   110274 multiz369waySummary.tab

    rm multiz369way*.tab

#######################################################################

    # this fullChromRun is on-going 2018-12-29 to see if it will finish
    # at some point
    # only 4 of these, run up as full chromosomes without the splitting
    # procedure

    mkdir /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun
    cd /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    find ../../mafLinks -type f | grep ".maf.gz$" | xargs -L 1 basename \
       | sed -e 's/.gz//;' | sort -u > maf.list

    wc -l maf.list
    # 4 maf.list
    cat maf.list | sed -e 's/^/# /;'
    # NC_007790v1.maf
    # NC_007791v1.maf
    # NC_007792v1.maf
    # NC_007793v1.maf

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = staAur2
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/staAur2/bed/multiz369way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    printf '#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun/maf/$(root1).maf}
#ENDLOOP
' > template

    ssh ku
    cd /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun/run
    gensub2 maf.list single template jobList
    para -ram=64g create jobList
    # there are only 4 jobs, 'para try' runs them all
XXX - running - Sun Dec 23 22:38:44 PST 2018
# the tiny plasmids run rather rapidly, both the small ones under 4 minutes
# the large chromosome might run for some time . . .
# Completed: 7 of 7 jobs
# CPU time in finished jobs:     295337s    4922.28m    82.04h    3.42d  0.009 y
# IO & Wait Time:                   226s       3.77m     0.06h    0.00d  0.000 y
# Average job time:               42223s     703.72m    11.73h    0.49d
# Longest finished job:           58035s     967.25m    16.12h    0.67d
# Submission to last job:         58046s     967.43m    16.12h    0.67d

    cd /hive/data/genomes/staAur2/bed/multiz369way
    du -hsc fullChromRun/maf
    # 20G     fullChromRun/maf

    # Load into database
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/fullChromRun/maf
    mkdir /gbdb/staAur2/multiz369way
    ln -s `pwd`/*.maf /gbdb/staAur2/multiz369way
    cd /dev/shm
    time hgLoadMaf staAur2 multiz369way
    # Loaded 6396612 mafs in 7 files from /gbdb/staAur2/multiz369way
    # real    2m35.511s

    time cat /gbdb/staAur2/multiz369way/*.maf \
	| hgLoadMafSummary staAur2 -minSize=10000 -verbose=2 \
		-mergeGap=500 -maxSize=50000  multiz369waySummary stdin
    # Created 1897571 summary blocks from 214521234 components and 6396612 mafs from stdin
    # real    5m27.726s

# -rw-rw-r-- 1 312973610 Dec 10 09:06 multiz369way.tab
# -rw-rw-r-- 1 103769870 Dec 10 09:15 multiz369waySummary.tab

    wc -l multiz369way*
    # 6396612 multiz369way.tab
    # 1897571 multiz369waySummary.tab

    rm multiz369way*.tab

##############################################################################
# GAP ANNOTATE MULTIZ369WAY MAF AND LOAD TABLES (TBD - 2018-12-10 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.

    # this new one needs to be created
    cd /hive/data/genomes/staAur2
    twoBitInfo -nBed staAur2.unmasked.2bit staAur2.N.bed
    # there are no gaps:
#   -rw-rw-r--   1      0 Dec 29 11:28 staAur2.N.bed

    mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/anno
    cd /hive/data/genomes/staAur2/bed/multiz369way/anno
    ln -s ../../../nameCorrespond/gcToShortName.tab .

rm -fr nBedsDir sizesDir nBeds sizes
mkdir nBedsDir sizesDir

    # the UCSC local databases can be done normally
cat ../species.list | tr ' ' '\n' | grep "^[a-z]" | while read D
do
  printf "%s\n" "${D}"
  ls -og /hive/data/genomes/${D}/${D}.N.bed /hive/data/genomes/${D}/chrom.sizes
  ln -s /hive/data/genomes/${D}/${D}.N.bed nBedsDir/${D}.bed
  echo "${D}.bed" >> nBeds
  ln -s /hive/data/genomes/${D}/chrom.sizes sizesDir/${D}.len
  echo "${D}.len" >> sizes
done

ls ../../../twoBits/GCF_003193745v1
GCF_003193745v1.2bit         GCF_003193745v1.idKeys.txt
GCF_003193745v1.chrom.sizes  GCF_003193745v1.keySignature.txt


cat ../species.list | tr ' ' '\n' | grep -v "^[a-z]" | while read S
do
  gcName=`grep -w "${S}"  gcToShortName.tab | cut -f1 | sed -e 's/\./v/;'`
  printf "%s\t%s\n" "${S}" "${gcName}"
  twoBitInfo -nBed ../../../twoBits/$gcName/$gcName.2bit nBedsDir/${S}.bed
  echo ${S}.bed >> nBeds
  ln -s /hive/data/genomes/staAur2/twoBits/$gcName/$gcName.chrom.sizes sizesDir/${S}.len
  echo ${S}.len >> sizes
done

    # verify count:
    wc -l nBeds sizes
#   369 nBeds
#   369 sizes

    # make sure they all are successful symLinks:
    ls -ogL nBedsDir/*.bed | wc -l
    # 369

    screen -S staAur2      # use a screen to control this longish job
    ssh ku
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet
    cd /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet
    mkdir result
    ln -s ../nBeds .
    ln -s ../sizes .
    ln -s ../sizesDir/*.len .
    ln -s ../nBedsDir/*.bed .

    printf '#LOOP
mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/staAur2/staAur2.2bit {check out line+ result/$(file1)}
#ENDLOOP
' > template

    ls ../../maf/*.maf > maf.list
    gensub2 maf.list single template jobList
    # these jobs require a lot of memory
    para -ram=48g create jobList
    # only 4 jobs run with the 'para try'
    para try ... check ...
    para push
# Completed: 4 of 4 jobs
# CPU time in finished jobs:        111s       1.85m     0.03h    0.00d  0.000 y
# IO & Wait Time:                    11s       0.19m     0.00h    0.00d  0.000 y
# Average job time:                  31s       0.51m     0.01h    0.00d
# Longest finished job:             113s       1.88m     0.03h    0.00d
# Submission to last job:           120s       2.00m     0.03h    0.00d

    du -hsc result ../../maf
    # 2.1G    result
    # 1.7G    ../../maf

    # Load into database
    rm -f /gbdb/staAur2/multiz369way/NC*.maf
    cd /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet/result

    ln -s `pwd`/*.maf /gbdb/staAur2/multiz369way/

    # this generates an immense multiz369way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /dev/shm
    time hgLoadMaf staAur2 multiz369way
# Loading multiz369way into database
# Loaded 25064 mafs in 4 files from /gbdb/staAur2/multiz369way
# real    0m12.367s

    time (cat /gbdb/staAur2/multiz369way/*.maf \
        | hgLoadMafSummary -verbose=2 -minSize=10000 \
	-mergeGap=500 -maxSize=50000 staAur2 multiz369waySummary stdin)
# Created 110274 summary blocks from 7621971 components and 25064 mafs from stdin
# real    0m26.114s

    wc -l multiz369*.tab
    #   25064 multiz369way.tab
    #  110274 multiz369waySummary.tab

# -rw-rw-r-- 1    1310640 Dec 29 11:44 multiz369way.tab
# -rw-rw-r-- 1    6139746 Dec 29 11:45 multiz369waySummary.tab

    rm multiz369way*.tab

##############################################################################
# MULTIZ7WAY MAF FRAMES (TBD - 2018-12-13 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/frames
    cd /hive/data/genomes/staAur2/bed/multiz369way/frames
#   survey all the genomes to find out what kinds of gene tracks they have

    printf '#!/bin/csh -fe
foreach db (`cat ../species.list | tr " " "\\n" | grep -v "^[A-Z]"`)
    echo -n "# ${db}: "
    set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
           $table == "ncbiRefSeq" || $table == "mgcGenes" || \
           $table == "ws245Genes" || \
           $table == "knownGene" || $table == "xenoRefGene" ) then
           set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    echo
end
' > showGenes.csh

    chmod +x ./showGenes.csh
    time ./showGenes.csh
# staAur2: ensGene: 61109, ncbiRefSeq: 52984, refGene: 53937, ws245Genes: 57827, xenoRefGene: 32625,
# caeAng2: ws245Genes: 33934,
# caeJap4: ws245Genes: 38144, xenoRefGene: 158453,
# cb4: ws245Genes: 23050, xenoRefGene: 130721,
# caeSp51: ws245Genes: 46280,
# caeRem4: ws245Genes: 32899, xenoRefGene: 155137,
# caePb3: ws245Genes: 33210, xenoRefGene: 189584,
# caeSp111: ws245Genes: 27721,
# priExs1: ws245Genes: 24642, xenoRefGene: 153824,
# priPac3: ws245Genes: 24243, xenoRefGene: 97503,
# burXyl1: ws245Genes: 18074,
# necAme1: ws245Genes: 19643,
# ancCey1: ws245Genes: 65583, xenoRefGene: 149848,
# ascSuu1: ws245Genes: 3082, xenoRefGene: 148202,
# haeCon2: ws245Genes: 26359,
# triSui1: ws245Genes: 14435,
# panRed1: ws245Genes: 26372,
# bruMal2: ws245Genes: 18709,
# dirImm1: ws245Genes: 12857,
# oncVol1: ws245Genes: 12385,
# loaLoa1: ws245Genes: 15577,
# melInc2: ws245Genes: 20365,
# melHap1: ws245Genes: 14420, xenoRefGene: 88300,
# ci3: ensGene: 17784, refGene: 1306, xenoRefGene: 86960,
# hetBac1: ws245Genes: 20928,
# triSpi1: ws245Genes: 16380,

    # from that summary, use these gene sets:
    # ensGene - staAur2 ci3
    # ws245Genes - caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1
    # priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2
    # dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1

    mkdir genes

    # 1. ensGene: staAur2 ci3
    for DB in staAur2 ci3
do
hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# staAur2: checked: 20195 failed: 0
# ci3: checked: 16035 failed: 0

    # 2. ws245Genes caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1
    # priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2
    # dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1

    for DB in caeAng2 caeJap4 cb4 caeSp51 caeRem4 caePb3 caeSp111 priExs1 priPac3 burXyl1 necAme1 ancCey1 ascSuu1 haeCon2 triSui1 panRed1 bruMal2 dirImm1 oncVol1 loaLoa1 melInc2 melHap1 hetBac1 triSpi1
do
    hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ws245Genes" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# caeAng2: checked: 27919 failed: 0
# caeJap4: checked: 29913 failed: 0
# cb4: checked: 21737 failed: 0
# caeSp51: checked: 34679 failed: 0
# caeRem4: checked: 31433 failed: 0
# caePb3: checked: 30626 failed: 0
# caeSp111: checked: 22306 failed: 0
# priExs1: checked: 23767 failed: 0
# priPac3: checked: 22872 failed: 0
# burXyl1: checked: 18074 failed: 0
# necAme1: checked: 18984 failed: 0
# ancCey1: checked: 35775 failed: 0
# ascSuu1: checked: 3065 failed: 0
# haeCon2: checked: 21837 failed: 0
# triSui1: checked: 13600 failed: 0
# panRed1: checked: 24071 failed: 0
# bruMal2: checked: 13471 failed: 0
# dirImm1: checked: 12857 failed: 0
# oncVol1: checked: 12133 failed: 0
# loaLoa1: checked: 14903 failed: 0
# melInc2: checked: 19189 failed: 0
# melHap1: checked: 14103 failed: 0
# hetBac1: checked: 20928 failed: 0
# triSpi1: checked: 16370 failed: 0

    # verify counts for genes are reasonable:
    for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done
# genes/ancCey1.gp.gz: 35775
# genes/ascSuu1.gp.gz: 3065
# genes/bruMal2.gp.gz: 13471
# genes/burXyl1.gp.gz: 18074
# genes/caeAng2.gp.gz: 27919
# genes/caeJap4.gp.gz: 29913
# genes/caePb3.gp.gz: 30626
# genes/caeRem4.gp.gz: 31433
# genes/caeSp111.gp.gz: 22306
# genes/caeSp51.gp.gz: 34679
# genes/cb4.gp.gz: 21737
# genes/staAur2.gp.gz: 20195
# genes/ci3.gp.gz: 16035
# genes/dirImm1.gp.gz: 12857
# genes/haeCon2.gp.gz: 21837
# genes/hetBac1.gp.gz: 20928
# genes/loaLoa1.gp.gz: 14903
# genes/melHap1.gp.gz: 14103
# genes/melInc2.gp.gz: 19189
# genes/necAme1.gp.gz: 18984
# genes/oncVol1.gp.gz: 12133
# genes/panRed1.gp.gz: 24071
# genes/priExs1.gp.gz: 23767
# genes/priPac3.gp.gz: 22872
# genes/triSpi1.gp.gz: 16370
# genes/triSui1.gp.gz: 13600

    # kluster job to annotate each maf file
    screen -S staAur2      # manage long running procedure with screen
    ssh ku
    cd /hive/data/genomes/staAur2/bed/multiz369way/frames

    printf '#!/bin/csh -fe

set C = $1
set G = $2

cat ../anno/mafNet/result/${C}.maf | genePredToMafFrames staAur2 stdin stdout \
        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
' > runOne

    chmod +x runOne

    ls ../anno/mafNet/result > chr.list
    ls genes | sed -e "s/.gp.gz//" > gene.list

    printf '#LOOP
runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
' > template

    mkdir parts
    gensub2 chr.list gene.list template jobList
    para -ram=64g create jobList
    para try ... check ... push
# Completed: 182 of 182 jobs
# CPU time in finished jobs:      16011s     266.86m     4.45h    0.19d  0.001 y
# IO & Wait Time:                  7072s     117.86m     1.96h    0.08d  0.000 y
# Average job time:                 127s       2.11m     0.04h    0.00d
# Longest finished job:             196s       3.27m     0.05h    0.00d
# Submission to last job:           401s       6.68m     0.11h    0.00d

    # collect all results into one file:
    cd /hive/data/genomes/staAur2/bed/multiz369way/frames
    time find ./parts -type f | while read F
do
    echo "${F}" 1>&2
    zcat ${F}
done | sort -k1,1 -k2,2n > multiz369wayFrames.bed
    # real    0m24.365s

    # -rw-rw-r-- 1 287750919 Dec 13 11:50 multiz369wayFrames.bed

    gzip multiz369wayFrames.bed

    # verify there are frames on everything, should be 26 species:
    # (count from: ls genes | wc)
    zcat multiz369wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
        | sed -e 's/^/# /;' > species.check.list
    wc -l species.check.list
    # 26
#  278211 ancCey1
#   12740 ascSuu1
#  120258 bruMal2
#  147066 burXyl1
#  314609 caeAng2
#  268169 caeJap4
#  372623 caePb3
#  450001 caeRem4
#  384830 caeSp111
#  271790 caeSp51
#  251521 cb4
#  123527 staAur2
#   49580 ci3
#  119879 dirImm1
#  154460 haeCon2
#  209501 hetBac1
#  119983 loaLoa1
#  100365 melHap1
#   87175 melInc2
#  144268 necAme1
#  124476 oncVol1
#  157050 panRed1
#   86331 priExs1
#   89170 priPac3
#   94395 triSpi1
#   60978 triSui1

    #   load the resulting file
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/frames
    time hgLoadMafFrames staAur2 multiz369wayFrames multiz369wayFrames.bed.gz
    #   real    0m29.214s


    hgsql -e 'select count(*) from multiz369wayFrames;' staAur2
    # +----------+
    # | count(*) |
    # +----------+
    # |  4592956 |
    # +----------+

    time featureBits -countGaps staAur2 multiz369wayFrames
    # 34457786 bases of 100286401 (34.359%) in intersection
    # real    0m22.416s

    #   enable the trackDb entries:
# frames multiz369wayFrames
# irows on
    #   zoom to base level in an exon to see codon displays
    #	appears to work OK

#########################################################################
# Phylogenetic tree from 369-way (TBD - 2018-12-13 - Hiram)
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/4d
    cd /hive/data/genomes/staAur2/bed/multiz369way/4d

    # the annotated maf's are in:
    ../anno/mafNet/result

    # using knownGene for staAur2, only transcribed genes and nothing
    #	from the randoms and other misc.
    hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene where cdsEnd > cdsStart;" staAur2 \
      | egrep -E -v "chrM|chrUn|random|_alt" > ensGene.gp
    wc -l *.gp
    #     33379 ensGene.gp

    # verify it is only on the chroms:
    cut -f2 ensGene.gp | sort | uniq -c | sort -rn | sed -e 's/^/    # /;'
    #    7509 chrV
    #    6211 chrIV
    #    5588 chrII
    #    4997 chrI
    #    4646 chrIII
    #    4428 chrX

    genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp
    wc -l ensGeneNR.gp
    #	20184 ensGeneNR.gp

    ssh ku
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/4d/run
    cd /hive/data/genomes/staAur2/bed/multiz369way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set r = "/hive/data/genomes/staAur2/bed/multiz369way"
set c = $1
set infile = $r/anno/mafNet/result/$2
set outfile = $3
cd /dev/shm
# 'clean' maf, removes all chrom names, leaves only the db name
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tstaAur2.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
else
    echo "" > $r/4d/run/$outfile
endif
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    ../anno/mafNet/result

    ls -1S /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet/result/*.maf \
	| sed -e "s#.*multiz369way/anno/mafNet/result/##" \
        | egrep -E -v "chrM|chrUn|random|_alt" > maf.list

    printf '#LOOP
4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    para -ram=64g create jobList
    para try ... check ... push ... etc...
    para time
# Completed: 6 of 6 jobs
# CPU time in finished jobs:       1829s      30.48m     0.51h    0.02d  0.000 y
# IO & Wait Time:                    20s       0.34m     0.01h    0.00d  0.000 y
# Average job time:                 308s       5.14m     0.09h    0.00d
# Longest finished job:             341s       5.68m     0.09h    0.00d
# Submission to last job:           352s       5.87m     0.10h    0.00d

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/4d
    # verify no tiny files:
    ls -og mfa | sort -k3nr
# -rw-rw-r-- 1 4660039 Dec 13 12:08 chrII.mfa
# -rw-rw-r-- 1 3973699 Dec 13 12:09 chrV.mfa
# -rw-rw-r-- 1 3416554 Dec 13 12:08 chrX.mfa
# -rw-rw-r-- 1 2920429 Dec 13 12:08 chrI.mfa
# -rw-rw-r-- 1 2589544 Dec 13 12:08 chrIII.mfa
# -rw-rw-r-- 1 2183464 Dec 13 12:09 chrIV.mfa

    #want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    0m1.255s

    # check they are all in there:
    grep "^>" 4d.all.mfa | wc -l
    #   369

    # remove the :0.1234 distances from the nh tree:
    sed -e 's/:[0-9.]\+//g;' ../staAur2.369way.nh > tree-commas.nh

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree-commas.nh 4d.all.mfa
    #   real    50m55.630s

    mv phyloFit.mod all.mod

    # after rearranging the tree to be more display friendly:

# (((((((((((((Caenorhabditis_elegans:0.425799,
#             (Caenorhabditis_sp._38_MB-2015:0.448904,
#              Caenorhabditis_angaria:0.438865):0.38477):0.0632893,
#            Caenorhabditis_japonica:0.631705):0.00332863,
#           Caenorhabditis_sp._34_TK-2017:0.670052):0.00266019,
#          (((((((Caenorhabditis_briggsae:0.00761568,
#                 Caenorhabditis_briggsae:0.00617763):0.0901744,
#                Caenorhabditis_nigoni:0.0878926):0.37284,
#               (Caenorhabditis_sp._26_LS-2015:0.266258,
#                Caenorhabditis_sp5_ju800:0.250871):0.0573557):0.00272983,
#              Caenorhabditis_sp._40_LS-2015:0.311033):0.160657,
#             ((Caenorhabditis_latens:0.0739274,
#               Caenorhabditis_remanei:0.0771516):0.343608,
#              Caenorhabditis_brenneri:0.439498):0.0439004):0.0374774,
#            Caenorhabditis_tropicalis:0.479027):0.0929836,
#           Caenorhabditis_sp._31_LS-2015:0.844993):0.0466798):0.0886724,
#         Caenorhabditis_sp._21_LS-2015:0.81046):0.159439,
#        (((Caenorhabditis_sp._32_LS-2015:0.461375,
#           Caenorhabditis_sp._39_LS-2015:0.42763):0.0699299,
#          (((Pristionchus_arcanus:0.12529,
#             ((Pristionchus_exspectatus:0.0115759,
#               Pristionchus_exspectatus:0.00987795):0.092637,
#              (Pristionchus_pacificus:0.0369936,
#               Pristionchus_pacificus:0.0151984):0.0917107):0.0582013):0.146807,
#            Pristionchus_maxplancki:0.231402):0.108203,
#           Pristionchus_entomophagus:0.314155):0.615931):0.00264449,
#         (((Bursaphelenchus_xylophilus:0.0169712,
#            Bursaphelenchus_xylophilus:0.0121836):0.664516,
#           Subanguina_moxae:1.19812):0.279427,
#          ((Oscheius_sp._MCB:0.310932,
#            Oscheius_sp._TEL-2014:0.314067):0.387814,
#           (((Angiostrongylus_cantonensis:0.76614,
#              (Necator_americanus:0.0069094,
#               Necator_americanus:0.00709169):0.62038):0.22255,
#             Nippostrongylus_brasiliensis:0.585445):0.128625,
#            ((((((((Ancylostoma_caninum:0.109552,
#                    Ancylostoma_duodenale:0.128167):0.0645197,
#                   Ancylostoma_ceylanicum:0.138249):0.302453,
#                  ((((Ascaris_suum:0.0264911,
#                      Ascaris_suum:0.0266353):0.0718425,
#                     Parascaris_univalens:0.111167):0.412171,
#                    Toxocara_canis:0.435243):1.03133,
#                   Oesophagostomum_dentatum:0.444431):0.133431):0.231588,
#                 ((Haemonchus_contortus:0.0237971,
#                   Haemonchus_contortus:0.0104187):0.40375,
#                  Teladorsagia_circumcincta:0.313931):0.415593):0.132113,
#                (((Clonorchis_sinensis:0.102068,
#                   Opisthorchis_viverrini:0.0948009):0.672677,
#                  Dicrocoelium_dendriticum:0.641121):0.366763,
#                 ((Elaeophora_elaphi:0.229754,
#                   Macrostomum_lignano:0.315971):0.260862,
#                  ((Fasciola_gigantica:0.0728112,
#                    Fasciola_hepatica:0.0784224):0.849193,
#                   Spirometra_erinaceieuropaei:0.717335):0.238022):0.175856):0.306152):0.0247175,
#               (Heligmosomoides_polygyrus_bakeri:0.639113,
#                (Oscheius_tipulae:0.772979,
#                 (((Steinernema_carpocapsae:0.288945,
#                    Steinernema_scapterisci:0.285651):0.224527,
#                   Steinernema_feltiae:0.606142):0.0853374,
#                  Steinernema_glaseri:0.525076):0.152498):0.168142):0.00279645):0.00267557,
#              ((((Echinococcus_canadensis:0.0305467,
#                  Echinococcus_granulosus:0.0307877):0.00949128,
#                 Echinococcus_multilocularis:0.039056):0.157973,
#                (((Taenia_asiatica:0.00962609,
#                   Taenia_saginata:0.0164412):0.0108728,
#                  Taenia_multiceps:0.0270195):0.034031,
#                 Taenia_solium:0.0458656):0.187618):0.687903,
#               (Trichuris_muris:0.289869,
#                (Trichuris_trichiura:0.184836,
#                 Trichuris_suis:0.125543):0.291242):0.610672):0.454711):0.00261495,
#             ((Plectus_sambesii:0.668591,
#               Steinernema_monticolum:0.874638):0.0801328,
#              Panagrellus_redivivus:0.667253):0.218884):0.00718934):0.198615):0.0754372):0.16779):0.00370935):0.149757,
#       (Diploscapter_coronatus:0.0396115,
#        Diploscapter_pachys:0.0410107):0.965863):0.0835229,
#      (((((((((((((Brugia_malayi:0.0147423,
#                   Brugia_malayi:0.0184123):0.0136278,
#                  Brugia_pahangi:0.0347893):0.0496926,
#                 Wuchereria_bancrofti:0.0607276):0.203732,
#                ((Dirofilaria_immitis:0.021486,
#                  Dirofilaria_immitis:0.00721013):0.206272,
#                 (Onchocerca_ochengi:0.0198343,
#                  (Onchocerca_volvulus:0.00726612,
#                   Onchocerca_volvulus:0.0153111):0.0105149):0.0986467):0.0222077):0.00274305,
#               (Loa_loa:0.0109839,
#                Loa_loa:0.00898028):0.213308):0.00261442,
#              (Setaria_digitata:0.149493,
#               Setaria_equina:0.120979):0.317084):0.0425831,
#             Onchocerca_flexuosa:0.0978734):0.742036,
#            ((((Dugesia_japonica:0.514157,
#                Girardia_tigrina:1.22545):0.240555,
#               Schmidtea_mediterranea:0.619031):0.356539,
#              (((((Meloidogyne_arenaria:0.0340878,
#                   Meloidogyne_javanica:0.0426825):0.0272648,
#                  Meloidogyne_incognita:0.0308494):0.0157159,
#                 (Meloidogyne_floridensis:0.0840962,
#                  Meloidogyne_incognita:0.0530444):0.00989274):0.18009,
#                (Meloidogyne_graminicola:0.50525,
#                 Meloidogyne_hapla:0.171223):0.0509786):0.654631,
#               (Parastrongyloides_trichosuri:0.389745,
#                ((Strongyloides_papillosus:0.0783108,
#                  Strongyloides_venezuelensis:0.0916494):0.198375,
#                 (Strongyloides_stercoralis:0.177646,
#                  Strongyloides_ratti:0.150624):0.0896129):0.165222):0.220136):0.131527):0.137744,
#             ((Schistosoma_haematobium:0.0960504,
#               Schistosoma_mansoni:0.10575):0.224188,
#              Schistosoma_japonicum:0.29067):0.484258):0.152071):0.239138,
#           Dictyocaulus_viviparus:0.814104):0.248629,
#          Hymenolepis_microstoma:0.705812):0.234011,
#         (Rhabditophanes_sp._KR3021:0.753609,
#          (Ciona_intestinalis:0.864953,
#           Heterorhabditis_bacteriophora:0.889764):0.341081):0.233997):0.163448,
#        Gyrodactylus_salaris:0.912435):0.232379,
#       Parapristionchus_giblindavisi:0.738653):0.156608):0.28746,
#     Ditylenchus_destructor:0.885526):0.238847,
#    (((((Trichinella_sp._T6:0.0168947,
#         Trichinella_patagoniensis:0.0187164):0.00398067,
#        ((Trichinella_sp._T8:0.0102628,
#          (Trichinella_sp._T9:0.0118704,
#           Trichinella_britovi:0.00645468):0.00273378):0.00273053,
#         Trichinella_murrelli:0.0111672):0.00482982):0.00272028,
#       Trichinella_nativa:0.0154206):0.0136128,
#      (Trichinella_nelsoni:0.0210349,
#       (Trichinella_spiralis:0.00697283,
#        Trichinella_spiralis:0.0168217):0.0169346):0.00352233):0.0510198,
#     ((Trichinella_papuae:0.0217595,
#       Trichinella_zimbabwensis:0.0207942):0.0233717,
#      Trichinella_pseudospiralis:0.0610335):0.0574885):0.947802):0.183355,
#   Romanomermis_culicivorax:0.680971):0.164498,
#  ((((Globodera_ellingtonae:0.035005,
#      Globodera_rostochiensis:0.0362764):0.0267624,
#     Globodera_pallida:0.0659637):0.259339,
#    Heterodera_glycines:0.354234):0.225215,
#   Rotylenchulus_reniformis:0.468661):0.395303):0.491,
# Acrobeloides_nanus:0.491):0.1:0.1;

    grep TREE all.mod | sed -e 's/TREE: //;' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep staAur2 \
          | sed -e "s/staAur2.//;"  | sort > staAur2.dists

    /cluster/bin/phast/all_dists ../staAur2.369way.nh | grep staAur2 \
          | sed -e "s/staAur2.//;"  | sort > original.dists

    # printing out the 'original', the 'new' the 'difference' and
    #    percent difference/delta
    join original.dists staAur2.dists | awk '{
  printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }'       | sort -k4n

#########################################################################
# phastCons 369-way (TBD - 2018-12-14 - Hiram)
    # split 369way mafs into 1M chunks and generate sufficient statistics
    # files for phastCons
    ssh ku
    mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/cons/ss
    mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/cons/msa.split
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/msa.split

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/staAur2/bed/multiz369way/anno/mafNet/result/$c.maf
set WINDOWS = /hive/data/genomes/staAur2/bed/multiz369way/cons/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    printf '#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
' > template

#	do the easy ones first to see some immediate results
    ls -1S -r ../../anno/mafNet/result | sed -e "s/.maf//;" > maf.list
    # all can finish OK at a 64Gb memory limit
    gensub2 maf.list single template jobList
    para -ram=64g create jobList
    para try ... check ... etc
    para push
# Completed: 7 of 7 jobs
# CPU time in finished jobs:       2431s      40.52m     0.68h    0.03d  0.000 y
# IO & Wait Time:                   278s       4.63m     0.08h    0.00d  0.000 y
# Average job time:                 387s       6.45m     0.11h    0.00d
# Longest finished job:             540s       9.00m     0.15h    0.01d
# Submission to last job:           548s       9.13m     0.15h    0.01d

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/staAur2/bed/multiz369way/cons/run.cons
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/run.cons

    #	This is setup for multiple runs based on subsets, but only running
    #   the 'all' subset here.
    #   It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  Running:
    #	all and vertebrates

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/staAur2/bed/multiz369way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons/ss
set useGrp = "$grp.mod"
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/$c/$f.ss $tmp
else
  ln -s $ssSrc/$c/$f.ss $tmp
  ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    printf '#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
' > template

    ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
    wc -l ss.list
    #	104 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    # beware overwhelming the cluster with these fast running high I/O jobs
    para -ram=32g create jobList
    para try ... check ...
    para -maxJob=16 push
# Completed: 104 of 104 jobs
# CPU time in finished jobs:       6198s     103.30m     1.72h    0.07d  0.000 y
# IO & Wait Time:                   718s      11.97m     0.20h    0.01d  0.000 y
# Average job time:                  67s       1.11m     0.02h    0.00d
# Longest finished job:              84s       1.40m     0.02h    0.00d
# Submission to last job:           216s       3.60m     0.06h    0.00d

    # create Most Conserved track
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all
    time cut -f1 ../../../../chrom.sizes | while read C
do
    echo $C 1>&2
    ls -d bed/${C} 2> /dev/null | while read D
    do
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
    # real    0m39.720s

    # -rw-rw-r-- 1 74615007 Dec 14 14:15 tmpMostConserved.bed

    time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \
        > mostConserved.bed
    # real    0m10.119s

    # -rw-rw-r-- 1 76283801 Dec 14 14:16 mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all
    time hgLoadBed staAur2 phastConsElements369way mostConserved.bed
    #  Read 2275482 elements of size 5 from mostConserved.bed
    #  real    0m10.328s

    # most interesting high measurement for this coverage
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    time featureBits staAur2 -enrichment ncbiRefSeq:cds phastConsElements369way
    # ncbiRefSeq:cds 25.009%, phastConsElements369way 43.266%, both 16.084%, cover 64.31%, enrich 1.49x
    # real    0m12.804s

    # Try for 5% overall cov, and 70% CDS cov
    time featureBits staAur2 -enrichment refGene:cds phastConsElements369way
    # refGene:cds 25.013%, phastConsElements369way 43.266%, both 16.087%, cover 64.31%, enrich 1.49x
    # real    0m13.519s

    time featureBits staAur2 -enrichment ensGene:cds phastConsElements369way
    # ensGene:cds 25.144%, phastConsElements369way 43.266%, both 16.129%, cover 64.15%, enrich 1.48x
    # real    0m10.267s

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all
    mkdir downloads

    time for D in `ls -d pp/chr* | sed -e 's#pp/##'`
do
    echo "working: $D" 1>&2
    find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phastCons369way.wigFix.gz
done
    # real    0m51.373s

    #	encode those files into wiggle data
    time (zcat downloads/*.wigFix.gz \
	| wigEncode stdin phastCons369way.wig phastCons369way.wib)
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    #   real    0m26.755s

    du -hsc *.wi?
    #	91M     phastCons369way.wib
    #  9.1M    phastCons369way.wig

    # -rw-rw-r-- 1 95051167 Dec 14 14:24 phastCons369way.wib
    # -rw-rw-r-- 1  9487394 Dec 14 14:24 phastCons369way.wig

    #	encode into a bigWig file:
    #	(warning wigToBigWig process may be too large for memory limits
    #	in bash, to avoid the 32 Gb memory limit, set 180 Gb here:
export sizeG=188743680
ulimit -d $sizeG
ulimit -v $sizeG
    time (zcat downloads/*.wigFix.gz \
      | wigToBigWig -verbose=2 stdin \
	../../../../chrom.sizes phastCons369way.bw) > bigWig.log 2>&1
    egrep "VmPeak|real" bigWig.log
    # pid=74267: VmPeak:     1108636 kB
    # real    0m51.112s

    # -rw-rw-r-- 1 244041045 Dec 14 14:27 phastCons369way.bw

    bigWigInfo phastCons369way.bw
version: 4
isCompressed: yes
isSwapped: 0
primaryDataSize: 182,564,643
primaryIndexSize: 3,214,556
zoomLevels: 10
chromCount: 7
basesCovered: 95,051,167
mean: 0.467683
min: 0.000000
max: 1.000000
std: 0.439150

    #	if you wanted to use the bigWig file, loading bigWig table:
    #   but we don't use the bigWig file
    ln -s `pwd`/phastCons369way.bw /gbdb/staAur2/multiz369way
    hgsql staAur2 -e 'drop table if exists phastCons369way; \
            create table phastCons369way (fileName varchar(255) not null); \
            insert into phastCons369way values
	("/gbdb/staAur2/multiz369way/phastCons369way.bw");'

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all
    ln -s `pwd`/phastCons369way.wib /gbdb/staAur2/multiz369way/phastCons369way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/staAur2/multiz369way staAur2 \
	phastCons369way phastCons369way.wig
    #   real    0m0.722s

    time wigTableStats.sh staAur2 phastCons369way
# db.table            min max   mean   count     sumData
# staAur2.phastCons369way  0 1 0.467683 95051167 4.44538e+07
#       stdDev viewLimits
#      0.43915 viewLimits=0:1
#  real    0m0.505s

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /hive/data/genomes/staAur2/bed/multiz369way/cons/all
    time hgWiggle -doHistogram -db=staAur2 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons369way > staAur2.phastCons369way.histogram.data 2>&1
    #	real    0m4.020s
    # the pid VmPeak measurements to stderr are getting into the middle of
    # a line in this data output.  Either do *not* use 2>&1 to get all
    # into the output file, or edit it to fix it after done

    #	create plot of histogram:
    # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change)
    printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb"
set output "staAur2.phastCons369.histo.png"
set size 1.0, 1.0
set style line 1 lt 2 lc rgb "#ff88ff" lw 2
set style line 2 lt 2 lc rgb "#66ff66" lw 2
set style line 3 lt 2 lc rgb "#ffff00" lw 2
set style line 4 lt 2 lc rgb "#ffffff" lw 2
set border lc rgb "#ffff00"
set key left box ls 3
set key tc variable
set grid noxtics
set grid y2tics ls 4
set grid ytics ls 4
set title " C. elegans/staAur2 Histogram phastCons369way track" \
    tc rgb "#ffffff"
set xlabel " phastCons369way score" tc rgb "#ffffff"
set ylabel " Relative Frequency" tc rgb "#ff88ff"
set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66"
set y2range [0:1]
set yrange [0:0.02]

plot "staAur2.phastCons369way.histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \
        "staAur2.phastCons369way.histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2
' | gnuplot

    # take a look to see if it is sane:

    display histo.png &

#########################################################################
# phyloP for 369-way (TBD - 2018-12-14 - Hiram)
#
    # can use the same split 1M chunk SS files into that were used
    # for phastCons

    # run phyloP with score=LRT
    ssh ku
    mkdir /cluster/data/staAur2/bed/multiz369way/consPhyloP
    cd /cluster/data/staAur2/bed/multiz369way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACK ../../4d/all.mod
    #   BACKGROUND: 0.338800 0.205656 0.164894 0.290649

    grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.371
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../4d/all.mod 0.371 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.314500 0.185500 0.185500 0.314500

    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set d = $f:h
set file1 = $f:t
set out = $2
set cName = $f:t:r
set grp = $cwd:t
set cons = /hive/data/genomes/staAur2/bed/multiz369way/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "/hive/data/genomes/staAur2/bed/multiz369way/cons/ss/$f"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $file1.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$file1.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../../cons/ss -type f | grep ".ss$" \
	| sed -e "s/.ss$//; s#^../../cons/ss/##" > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	104 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
    printf '#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
' > template

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/consPhyloP/all
    cd /hive/data/genomes/staAur2/bed/multiz369way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    # beware overloading the cluster with these quick and high I/O jobs
    para -ram=32g create jobList
    para try ... check ...
    para -maxJob=16 push
    para time > run.time
# Completed: 104 of 104 jobs
# CPU time in finished jobs:     420481s    7008.02m   116.80h    4.87d  0.013 y
# IO & Wait Time:                  1037s      17.28m     0.29h    0.01d  0.000 y
# Average job time:                4053s      67.55m     1.13h    0.05d
# Longest finished job:            5455s      90.92m     1.52h    0.06d
# Submission to last job:          5629s      93.82m     1.56h    0.07d

    mkdir downloads
    time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'`
do
    echo "working: $D" 1>&2
    find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phyloP369way.wigFix.gz
done
    #   real    1m24.231s

    du -hsc downloads
    #   195M    downloads

    # check integrity of data with wigToBigWig
    time (zcat downloads/*.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/staAur2/chrom.sizes \
	phyloP369way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=180522: VmPeak:    1108636 kB
    # real    0m51.720s

    # -rw-rw-r-- 1 356259120 Dec 14 17:17 phyloP369way.bw

    bigWigInfo phyloP369way.bw  | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 281,654,520
# primaryIndexSize: 3,214,556
# zoomLevels: 10
# chromCount: 7
# basesCovered: 95,051,167
# mean: 1.130380
# min: -20.000000
# max: 20.000000
# std: 2.927330

    #	encode those files into wiggle data
    time (zcat downloads/*.wigFix.gz \
	| wigEncode stdin phyloP369way.wig phyloP369way.wib)
# Converted stdin, upper limit 20.00, lower limit -20.00
# real    0m25.705s

    # -rw-rw-r-- 1  95051167 Dec 14 17:19 phyloP369way.wib
    # -rw-rw-r-- 1  10006648 Dec 14 17:19 phyloP369way.wig

    du -hsc *.wi?
    #  91M     phyloP369way.wib
    # 9.6M    phyloP369way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP369way.wib /gbdb/staAur2/multiz369way/phyloP369way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/staAur2/multiz369way staAur2 \
	phyloP369way phyloP369way.wig
    # real    0m0.761s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh staAur2 phyloP369way
# db.table          min   max     mean       count     sumData
# staAur2.phyloP369way -20 20 1.13038 95051167 1.07444e+08
#       stdDev viewLimits
#      2.92733 viewLimits=-13.5063:15.767

    #	that range is: 20+20= 40 for hBinSize=0.04

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.04 -hBinCount=1000 -hMinVal=-20 -verbose=2 \
	    -db=staAur2 phyloP369way > staAur2.phyloP369way.histo.data 2>&1
    #   real    0m4.117s

    # xaxis range:
    grep -v chrom staAur2.phyloP369way.histo.data | grep "^[0-9]" \
	| ave -col=2 stdin | sed -e 's/^/# /;'
# Q1 -7.500000
# median 0.320000
# Q3 8.140000
# average 0.309272
# min -20.000000
# max 20.000000
# count 783
# total 242.159998
# standard deviation 9.085841

    # find out the range for the 2:5 graph
    grep -v chrom staAur2.phyloP369way.histo.data | grep "^[0-9]" \
	| ave -col=5 stdin | sed -e 's/^/# /;'
# Q1 0.000002
# median 0.000108
# Q3 0.000374
# average 0.001277
# min 0.000000
# max 0.038921
# count 783
# total 0.999993
# standard deviation 0.003638

    #	create plot of histogram:
    # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change)
    printf 'set terminal pngcairo size 1000,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb"
set output "staAur2.phyloP369.histo.png"
set size 1.0, 1.0
set style line 1 lt 2 lc rgb "#ff88ff" lw 2
set style line 2 lt 2 lc rgb "#66ff66" lw 2
set style line 3 lt 2 lc rgb "#ffff00" lw 2
set style line 4 lt 2 lc rgb "#ffffff" lw 2
set border lc rgb "#ffff00"
set key left box ls 3
set key tc variable
set grid noxtics
set grid y2tics ls 4
set grid ytics ls 4
set title " C. elegans/staAur2 Histogram phyloP369way track" \
    tc rgb "#ffffff"
set xlabel " phyloP369way score" tc rgb "#ffffff"
set ylabel " Relative Frequency" tc rgb "#ff88ff"
set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66"
set xrange [-1:1.5]
set yrange [0:0.04]

plot "staAur2.phyloP369way.histo.data" using 2:5 title " RelFreq" with impulses ls 1, \
        "staAur2.phyloP369way.histo.data" using 2:7 axes x1y2 title " CRF" with lines ls 2
' | gnuplot

    # verify it looks sane
    display staAur2.phyloP369.histo.png &

#############################################################################
# determine taxonomy for everything (TBD - 2018-12-14 - Hiram)
# to set up group divisions

    # use this perl script to extra the taxonomy strings from NCBI entrez

    mkdir /hive/data/genomes/staAur2/bed/multiz369way/taxonomy
    cd /hive/data/genomes/staAur2/bed/multiz369way/taxonomy

    cat > taxon.pl << '_EOF_'
#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);
if ($argc < 1) {
  printf STDERR "usage: taxon.pl <taxId> [more taxIds]\n";
  exit 255;
}

while (my $taxId = shift) {
  my $taxonomyString = "";
  open (TX, "wget -O /dev/stdout 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=$taxId&retmode=xml' 2> /dev/null | grep -i ScientificName | headRest 1 stdin|") or die "can not wget entrez/eutils";
while (my $sciName = <TX>) {
    chomp $sciName;
    $sciName =~ s/.*<ScientificName>//;
    $sciName =~ s#</ScientificName>##;
    $taxonomyString .= sprintf("%s;", $sciName);
  }
  close (TX);
  printf "%d\t%s\n", $taxId, $taxonomyString;
}
'_EOF_'
# << happy emacs

    chmod +x taxon.pl

    ./taxon.pl `sed -e 's/:.*//; s/(//g; s/ //g;' ../staAur2.369way.taxId.nh  | sort -u | xargs echo` > taxonomy.txt

    # survey the names in those
    sed -e 's/.*organisms;//;' taxonomy.txt | tr ';' '\n' | sort | uniq -c | sort -rn | head -20
    122 Opisthokonta
    122 Metazoa
    122 Eumetazoa
    122 Eukaryota
    122 Bilateria
    122 
     99 Protostomia
     99 Nematoda
     99 Ecdysozoa
     83 Chromadorea
     76 Rhabditida
     35 Rhabditina
     29 Rhabditomorpha
     27 Tylenchina
     23 Rhabditoidea
     23 Rhabditidae
     22 Platyhelminthes
     18 Peloderinae
     18 Caenorhabditis
     16 Enoplea

     sed -e 's/.*;Bilateria;//;' taxonomy.txt | cut -d';' -f1-4 \
         | sort | uniq -c | sort -n
      1 Deuterostomia;Chordata;Tunicata;Ascidiacea
      1 Platyhelminthes;Cestoda;Eucestoda;Diphyllobothriidea
      1 Platyhelminthes;Monogenea;Monopisthocotylea;Gyrodactylidea
      1 Platyhelminthes;Rhabditophora;Macrostomorpha;Macrostomida
      2 Platyhelminthes;Trematoda;Digenea;Opisthorchiida
      3 Platyhelminthes;Rhabditophora;Seriata;Tricladida
      3 Platyhelminthes;Trematoda;Digenea;Plagiorchiida
      3 Platyhelminthes;Trematoda;Digenea;Strigeidida
      8 Platyhelminthes;Cestoda;Eucestoda;Cyclophyllidea
     16 Protostomia;Ecdysozoa;Nematoda;Enoplea
     83 Protostomia;Ecdysozoa;Nematoda;Chromadorea

     sed -e 's/.*;Bilateria;//;' taxonomy.txt | grep Protostomia \
          | cut -d';' -f1-6 | sort | uniq -c | sort -n
      1 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Plectida;Plectina
      1 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Strongylida;Metastrongyloidea
      1 Protostomia;Ecdysozoa;Nematoda;Enoplea;Dorylaimia;Mermithida
      5 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Strongylida;Trichostrongyloidea
     14 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Spirurina
     15 Protostomia;Ecdysozoa;Nematoda;Enoplea;Dorylaimia;Trichinellida
     27 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Tylenchina
     35 Protostomia;Ecdysozoa;Nematoda;Chromadorea;Rhabditida;Rhabditina

     # with those numbers in hand, break them up like this
     # the 'orderByChain.db.txt' list is from the output of sizeStats.pl
     # as done earlier, just the sequence name/db names from that output
     # in their featureBits chainLink order

for N in Deuterostomia Platyhelminthes Plectida Rhabditina Spirurina Tylenchina Strongylida Dorylaimia
do
   grep ";$N;" taxonomy.txt | cut -f1 > $N.taxId.list
   join -t$'\t' <(sort $N.taxId.list) <(sort taxId.GC.sequenceName.accAsmId.sciName.name.tab) | cut -f3 | sort -u > $N.db.list
done

cat orderByChain.db.txt | while read n
do
  grep -w "${n}" *.list
done | sed -e 's/.db.list:/\t/;' > division.dbName.ordered.tab

for N in Deuterostomia Platyhelminthes Plectida Rhabditina Spirurina Tylenchina Strongylida Dorylaimia
do
  grep "$N" division.dbName.ordered.tab | cut -f2 > ${N}.ordered
done

    # those *.ordered listings are used in trackDb to establish
    # the chainNet composite:

    cd ~/kent/src/hg/makeDb/trackDb/worm/staAur2

    ~/kent/src/hg/utils/phyloTrees/chainNetCompositeTrackDb.pl net nematodes \
      staAur2 Rhabditina.ordered Spirurina.ordered Tylenchina.ordered \
        Strongylida.ordered Plectida.ordered Dorylaimia.ordered \
          Platyhelminthes.ordered Deuterostomia.ordered \
     >  chainNetNematodes.ra

    # edit that .ra file to remove the .ordered from each name in this list:
    subGroup3 clade Clade c00=Rhabditina c01=Spirurina c02=Tylenchina c03=Strongylida c04=Plectida c05=Dorylaimia c06=Platyhelminthes c07=Deuterostomia

    # can avoid this edit by calling those files *.list instead

#############################################################################
# construct download files for 369-way (TBD - 2018-11-27 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phastCons369way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phyloP369way
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/downloads
    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads
    mkdir multiz369way phastCons369way phyloP369way

    #########################################################################
    ## create upstream refGene maf files
    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way
    # bash script

#!/bin/bash
export geneTbl="ensGene"
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits staAur2 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags staAur2 multiz369way \
                stdin stdout \
                -orgs=/hive/data/genomes/staAur2/bed/multiz369way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    # real    107m51.664s

# -rw-rw-r-- 1  289630077 Dec 14 18:00 upstream1000.ensGene.maf.gz
# -rw-rw-r-- 1  545877544 Dec 14 18:30 upstream2000.ensGene.maf.gz
# -rw-rw-r-- 1 1334065100 Dec 14 19:16 upstream5000.ensGene.maf.gz

    ######################################################################
    ## compress the maf files
    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way
    time rsync -a -P ../../anno/mafNet/result/ ./maf/
    # real    3m21.807s


    du -hsc maf/
    # 49G     maf/

    cd maf
    time gzip *.maf &
    # real 21m44.263s

    du -hscL maf ../../anno/maf/result/
    # 5.4G    maf
    #  63G     ../../anno/maf/result/

    cd maf
    md5sum *.maf.gz > md5sum.txt

    # collect the other sequences here:
    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way
    mkdir sequences
grep -v "^[a-z]" nameCrossReference.tsv | while read L
do
   seqName=`printf "%s" "${L}" | awk -F$'\t' '{print $1}'`
   acc=`printf "%s" "${L}" | awk -F$'\t' '{print $2}'`
   asmId=`printf "%s" "${L}" | awk -F$'\t' '{print $4}'`
   src="../../../awsMultiz/sequences/${acc}_${asmId}"
   twoBit="../../../awsMultiz/twoBit/${acc}_${asmId}.2bit"
   destName=`printf "%s.%s.%s" "${seqName}" "${acc}" "${asmId}"`
   printf "sequences/%s\n" "${destName}"
   mkdir -p "sequences/${destName}"
   cp -p "${twoBit}" "sequences/${destName}/"
   cp -p ${src}/*_assembly_report.txt "sequences/${destName}/"
done

   du -hsc sequences
   #  7.2G    sequences/

   cd sequences
   time md5sum */*.2bit */*.txt > md5sum.txt
   # real    0m30.828s

   mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/maf
    cd maf
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/maf/
   mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/sequences
    cd ../sequences
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/sequences/

    cd ..

    # obtain the README.txt from the multiz26way and fix it up with appropriate
    # listings

    ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/

    ###########################################################################

    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/multiz369way
    grep TREE ../../4d/all.mod | awk '{print $NF}' \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > staAur2.369way.nh
    cat staAur2.369way.nh \
      |  sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
        | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
          -nameTranslate=../../sequenceName.sciName.tab -noInternal \
             -lineOutput /dev/stdin > staAur2.369way.scientificName.nh
    cat staAur2.369way.nh \
      |  sed -e 's/(/(\n/g; s/,/,\n/g; s/)/\n)/g; s/ //g;' \
        | grep -v "^$" | ~/kent/src/hg/utils/phyloTrees/binaryTree.pl \
          -nameTranslate=../../sequenceName.taxId.tab -noInternal \
             -lineOutput /dev/stdin > staAur2.369way.taxId.nh


    ~/kent/src/hg/utils/phyloTrees/commonNames.sh staAur2.369way.nh \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > staAur2.369way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh staAur2.369way.nh \
	| $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
	    > staAur2.369way.scientificNames.nh
    time md5sum *.nh *.maf.gz > md5sum.txt
    #   real    0m3.147s

    ln -s `pwd`/*.maf.gz `pwd`/*.nh \
        /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way

    du -hsc ./maf ../../anno/result
    #  18G     ./maf
    # 156G    ../../anno/result

    # obtain the README.txt from staAur2/multiz20way and update for this
    #   situation
    ln -s `pwd`/*.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/multiz369way/

    #####################################################################
    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/phastCons369way

    ln -s ../../cons/all/downloads/*.wigFix.gz .

    ln -s ../../cons/all/phastCons369way.bw ./staAur2.phastCons369way.bw
    ln -s ../../cons/all/all.mod ./staAur2.phastCons369way.mod

    time md5sum *.mod *.bw *.gz > md5sum.txt
    #   real    0m20.354s

    # obtain the README.txt from staAur2/phastCons26way and update for this
    #   situation, and edit accordingly.

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phastCons369way
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phastCons369way

    #####################################################################
    cd /hive/data/genomes/staAur2/bed/multiz369way/downloads/phyloP369way

    ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz .
    md5sum *.wigFix.gz > md5sum.txt

    ln -s ../../consPhyloP/run.phyloP/all.mod staAur2.phyloP369way.mod
    ln -s ../../consPhyloP/all/phyloP369way.bw staAur2.phyloP369way.bw

    md5sum *.mod *.bw > md5sum.txt

    # obtain the README.txt from staAur2/phyloP20way and update for this
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phyloP369way
    ln -s `pwd`/* \
/usr/local/apache/htdocs-hgdownload/goldenPath/staAur2/phyloP369way/

#############################################################################
# hgPal downloads (TBD - 2018-12-17 - Hiram)
#   FASTA from 369-way for knownGene, refGene and knownCanonical

    ssh hgwdev
    screen -S staAur2HgPal
    mkdir /hive/data/genomes/staAur2/bed/multiz369way/pal
    cd /hive/data/genomes/staAur2/bed/multiz369way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # this for loop can take hours on a high contig count assembly
    # it is just fine on human/staAur2, just a few seconds
    export mz=multiz369way
    export gp=ensGene
    export db=staAur2
    export I=0
    export D=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
        D=`echo $D | awk '{print $1+1}'`
        dNum=`echo $D | awk '{printf "%03d", int($1/3690)}'`
        mkdir -p exonNuc/${dNum} > /dev/null
        mkdir -p exonAA/${dNum} > /dev/null
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ $I -gt 16 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1
    # real    41m45.902s

    export mz=multiz369way
    export gp=ensGene
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonAA.fa.gz
    # real    1m26.225s

    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    5m55.586s

    # -rw-rw-r-- 1  546125992 Dec 17 10:24 ensGene.multiz369way.exonAA.fa.gz
    # -rw-rw-r-- 1 1049129671 Dec 17 10:35 ensGene.multiz369way.exonNuc.fa.gz

    export mz=multiz369way
    export gp=ensGene
    export db=staAur2
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 369-way (TBD - 2017-11-06 - Hiram)
    mkdir /hive/users/hiram/bigWays/staAur2.369way
    cd /hive/users/hiram/bigWays

    sed -e 's/ /\n/g;' /hive/data/genomes/staAur2/bed/multiz369way/species.list \
       > staAur2.369way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh staAur2.369way/ordered.list
    # dbDb.sh constructs staAur2.369way/XenTro9_369-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    ./dbDb.sh staAur2 369way
    # sizeStats.pl constructs staAur2.369way/XenTro9_369-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl staAur2 369way

    # defCheck.pl constructs XenTro9_369-way_conservation_lastz_parameters.html
    ./defCheck.pl staAur2 369way

    # this constructs the html pages in staAur2.369way/:
# -rw-rw-r-- 1 6247 May  2 17:07 XenTro9_369-way_conservation_alignment.html
# -rw-rw-r-- 1 84369 May  2 17:09 XenTro9_369-way_Genome_size_statistics.html
# -rw-rw-r-- 1 5033 May  2 17:10 XenTro9_369-way_conservation_lastz_parameters.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
#  XenTro9_369-way_conservation_alignment
#  XenTro9_369-way_Genome_size_statistics
#  XenTro9_369-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

############################################################################
# pushQ readmine (TBD - 2017-11-07 - Hiram)

  cd /usr/local/apache/htdocs-hgdownload/goldenPath/staAur2
  find -L `pwd`/multiz369way `pwd`/phastCons369way `pwd`/phyloP369way \
	/gbdb/staAur2/multiz369way -type f \
    > /hive/data/genomes/staAur2/bed/multiz369way/downloads/redmine.20216.fileList
  wc -l /hive/data/genomes/staAur2/bed/multiz369way/downloads/redmine.20216.fileList
# 1450 /hive/data/genomes/staAur2/bed/multiz369way/downloads/redmine.20216.fileList

  cd /hive/data/genomes/staAur2/bed/multiz369way/downloads
  hgsql -e 'show tables;' staAur2 | grep 369way \
	| sed -e 's/^/staAur2./;' > redmine.20216.table.list

############################################################################
