#############################################################################
## 35-Way Multiz (TBD - 2020-12-16 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/mm39/bed/multiz35way
    cd /hive/data/genomes/mm39/bed/multiz35way

    # from the 220-way in the source tree, select out the 35 used here:
    /cluster/bin/phast/tree_doctor \
        --prune-all-but bosTau9,calJac4,canFam4,criGri1,casCan1,cavPor3,danRer11,echTel2,equCab3,eriEur2,galGal6,galVar1,gorGor6,hg38,loxAfr3,manPen1,mm39,monDom5,neoSch1,ochPri3,oryCun2,otoGar3,oviAri4,panPan3,panTro6,petMar3,rheMac10,rn6,sorAra2,speTri2,susScr3,tarSyr2,tupBel1,turTru2,xenTro9 \
        /cluster/home/hiram/kent/src/hg/utils/phyloTrees/220way.nh \
   | sed -e 's/criGri1/GCF_003668045.3/;' \
     > t0.nh

    # using TreeGraph2 tree editor on the Mac, rearrange to get mm39
    # at the top, and attempt to get the others in phylo order:
    /cluster/bin/phast/all_dists t.nh | grep mm39 \
        | sed -e "s/mm39.//" | sort -k2n | sed -e 's/^/#\t/;'
#       rn6     0.186098
#       GCF_003668045.3 0.332282
#       casCan1 0.412432
#       speTri2 0.417900
#       manPen1 0.485701
#       cavPor3 0.491203
#       galVar1 0.493420
#       calJac4 0.494237
#       gorGor6 0.500585
#       hg38    0.502391
#       panPan3 0.502461
#       panTro6 0.502681
#       tarSyr2 0.503897
#       rheMac10        0.510018
#       otoGar3 0.511897
#       equCab3 0.520098
#       turTru2 0.540150
#       canFam4 0.543004
#       tupBel1 0.549623
#       susScr3 0.549974
#       loxAfr3 0.556386
#       oryCun2 0.556860
#       oviAri4 0.574054
#       neoSch1 0.585546
#       bosTau9 0.599054
#       ochPri3 0.643702
#       eriEur2 0.676481
#       echTel2 0.703393
#       sorAra2 0.724258
#       monDom5 0.921254
#       xenTro9 1.214580
#       galGal6 1.326078
#       petMar3 2.253737
#       danRer11        2.279062

    #	what that looks like:
~/kent/src/hg/utils/phyloTrees/asciiTree.pl t.nh > mm39.35way.nh
~/kent/src/hg/utils/phyloTrees/asciiTree.pl mm39.35way.nh | sed -e 's/^/# /;'

# ((((((((((((((mm39:0.089509,
#              rn6:0.096589):0.072773,
#             GCF_003668045.3:0.17):0.08015,
#            casCan1:0.17):0.05,
#           speTri2:0.125468):0.022992,
#          cavPor3:0.175779):0.025746,
#         (oryCun2:0.114227,
#         ochPri3:0.201069):0.101463):0.015313,
#        ((((((((hg38:0.00655,
#               panTro6:0.00684):0.00122,
#              panPan3:0.00784):0.003,
#             gorGor6:0.008964):0.025204,
#            rheMac10:0.043601):0.02183,
#           calJac4:0.04965):0.05209,
#          tarSyr2:0.1114):0.02052,
#         otoGar3:0.13992):0.013494,
#        (tupBel1:0.136203,
#        galVar1:0.08):0.054937):0.002):0.020593,
#       (((susScr3:0.12,
#         (turTru2:0.064688,
#         (bosTau9:0.11,
#         oviAri4:0.085):0.013592):0.045488):0.02,
#        ((equCab3:0.084397,
#         manPen1:0.05):0.017,
#        (canFam4:0.054458,
#        neoSch1:0.097):0.069845):0.008727):0.011671,
#       (eriEur2:0.221785,
#       sorAra2:0.269562):0.056393):0.021227):0.023664,
#      (loxAfr3:0.098929,
#      echTel2:0.245936):0.056717):0.234728,
#     monDom5:0.285786):0.181168,
#    galGal6:0.509442):0.05,
#   xenTro9:0.347944):0.211354,
#  danRer11:1.201072):0.2,
# petMar3:0.975747);

    # extract species list from that .nh file
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        mm39.35way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
        | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt

    # construct db to name sed translation list:

    cat species.list.txt | while read DB
do
hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
done | sed -e 's#^#s/#;' | sed -e "s#\t#/#; s/ /_/g;" | sed -e 's#$#/;#' | sed -e 's/\./_/g' \
    | sed -e "s#'#_x_#g;" > db.to.name.sed

    printf "s/GCF_003668045.3/Chinese_hamster/;\n" >> db.to.name.sed

    sed -f db.to.name.sed mm39.35way.nh | sed -e "s#_x_#'#g; s#X__#X._#;"\
       > mm39.35way.commonNames.nh

    cat mm39.35way.commonNames.nh | sed -e 's/^/# /;'
# ((((((((((((((Mouse:0.089509,
#              Rat:0.096589):0.072773,
#             Chinese_hamster:0.17):0.08015,
#            Beaver:0.17):0.05,
#           Squirrel:0.125468):0.022992,
#          Guinea_pig:0.175779):0.025746,
#         (Rabbit:0.114227,
#         Pika:0.201069):0.101463):0.015313,
#        ((((((((Human:0.00655,
#               Chimp:0.00684):0.00122,
#              Bonobo:0.00784):0.003,
#             Gorilla:0.008964):0.025204,
#            Rhesus:0.043601):0.02183,
#           Marmoset:0.04965):0.05209,
#          Tarsier:0.1114):0.02052,
#         Bushbaby:0.13992):0.013494,
#        (Tree_shrew:0.136203,
#        Malayan_flying_lemur:0.08):0.054937):0.002):0.020593,
#       (((Pig:0.12,
#         (Dolphin:0.064688,
#         (Cow:0.11,
#         Sheep:0.085):0.013592):0.045488):0.02,
#        ((Horse:0.084397,
#         Chinese_pangolin:0.05):0.017,
#        (Dog:0.054458,
#        Hawaiian_monk_seal:0.097):0.069845):0.008727):0.011671,
#       (Hedgehog:0.221785,
#       Shrew:0.269562):0.056393):0.021227):0.023664,
#      (Elephant:0.098929,
#      Tenrec:0.245936):0.056717):0.234728,
#     Opossum:0.285786):0.181168,
#    Chicken:0.509442):0.05,
#   X._tropicalis:0.347944):0.211354,
#  Zebrafish:1.201072):0.2,
# Lamprey:0.975747);

#	Use this specification in the phyloGif tool:
#	http://genome.ucsc.edu/cgi-bin/phyloGif
#	to obtain a png image for src/hg/htdocs/images/phylo/mm39_35way.png

    cat species.list.txt | while read DB
do
hgsql -N -e "select name,scientificName from dbDb where name=\"${DB}\";" hgcentraltest
done | sed -e 's#^#s/#;' | sed -e "s#\t#/#; s/ /_/g;" | sed -e 's#$#/;#' | sed -e 's/\./_/g' \
    | sed -e "s#'#_x_#g;" > db.to.sciName.sed

    printf "s/GCF_003668045.3/Cricetulus_griseus/;\n" >> db.to.sciName.sed

    sed -f db.to.sciName.sed mm39.35way.nh > mm39.35way.scientificNames.nh

       > mm39.35way.commonNames.nh

       | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > mm39.35way.scientificNames.nh
    cat mm39.35way.scientificNames.nh | sed -e 's/^/# /;'
# ((((((((((((((Mus_musculus:0.089509,
#              Rattus_norvegicus:0.096589):0.072773,
#             Cricetulus_griseus:0.17):0.08015,
#            Castor_canadensis:0.17):0.05,
#           Spermophilus_tridecemlineatus:0.125468):0.022992,
#          Cavia_porcellus:0.175779):0.025746,
#         (Oryctolagus_cuniculus:0.114227,
#         Ochotona_princeps:0.201069):0.101463):0.015313,
#        ((((((((Homo_sapiens:0.00655,
#               Pan_troglodytes:0.00684):0.00122,
#              Pan_paniscus:0.00784):0.003,
#             Gorilla_gorilla_gorilla:0.008964):0.025204,
#            Macaca_mulatta:0.043601):0.02183,
#           Callithrix_jacchus:0.04965):0.05209,
#          Tarsius_syrichta:0.1114):0.02052,
#         Otolemur_garnettii:0.13992):0.013494,
#        (Tupaia_belangeri:0.136203,
#        Galeopterus_variegatus:0.08):0.054937):0.002):0.020593,
#       (((Sus_scrofa:0.12,
#         (Tursiops_truncatus:0.064688,
#         (Bos_taurus:0.11,
#         Ovis_aries:0.085):0.013592):0.045488):0.02,
#        ((Equus_caballus:0.084397,
#         Manis_pentadactyla:0.05):0.017,
#        (Canis_lupus_familiaris:0.054458,
#        Neomonachus_schauinslandi:0.097):0.069845):0.008727):0.011671,
#       (Erinaceus_europaeus:0.221785,
#       Sorex_araneus:0.269562):0.056393):0.021227):0.023664,
#      (Loxodonta_africana:0.098929,
#      Echinops_telfairi:0.245936):0.056717):0.234728,
#     Monodelphis_domestica:0.285786):0.181168,
#    Gallus_gallus:0.509442):0.05,
#   Xenopus_tropicalis:0.347944):0.211354,
#  Danio_rerio:1.201072):0.2,
# Petromyzon_marinus:0.975747);

    /cluster/bin/phast/all_dists mm39.35way.nh | grep mm39 \
        | sed -e "s/mm39.//" | sort -k2n > 35way.distances.txt
    #	Use this output to create the table below
    cat 35way.distances.txt | sed -e 's/^/# /;'
# rn6   0.186098
# GCF_003668045.3       0.332282
# casCan1       0.412432
# speTri2       0.417900
# manPen1       0.485701
# cavPor3       0.491203
# galVar1       0.493420
# calJac4       0.494237
# gorGor6       0.500585
# hg38  0.502391
# panPan3       0.502461
# panTro6       0.502681
# tarSyr2       0.503897
# rheMac10      0.510018
# otoGar3       0.511897
# equCab3       0.520098
# turTru2       0.540150
# canFam4       0.543004
# tupBel1       0.549623
# susScr3       0.549974
# loxAfr3       0.556386
# oryCun2       0.556860
# oviAri4       0.574054
# neoSch1       0.585546
# bosTau9       0.599054
# ochPri3       0.643702
# eriEur2       0.676481
# echTel2       0.703393
# sorAra2       0.724258
# monDom5       0.921254
# xenTro9       1.214580
# galGal6       1.326078
# petMar3       2.253737
# danRer11      2.279062

    ~/kent/src/hg/makeDb/doc/mm39/sizeStats.pl

#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
# count phylo    chain      synNet    rBest  synNet v.  query
#       dist      link                         chain
# 001 0.1861 (% 70.901) (% 66.256) (% 65.494)  6.55 - Rat rn6
# 002 0.3323 (% 58.000) (% 54.406) (% 00.000)  6.20 - Cricetulus griseus GCF_003668045.3
# 003 0.4124 (% 36.600) (% 32.404) (% 34.407) 11.46 - Beaver casCan1
# 004 0.4179 (% 34.147) (% 31.953) (% 32.373)  6.43 - Squirrel speTri2
# 005 0.4857 (% 27.322) (% 22.622) (% 25.847) 17.20 - Chinese pangolin manPen1
# 006 0.4912 (% 28.378) (% 26.311) (% 26.932)  7.28 - Guinea pig cavPor3
# 007 0.4934 (% 35.714) (% 31.386) (% 33.751) 12.12 - Malayan flying lemur galVar1
# 008 0.4942 (% 33.090) (% 31.297) (% 31.492)  5.42 - Marmoset calJac4
# 009 0.5006 (% 35.064) (% 33.225) (% 33.360)  5.24 - Gorilla gorGor6
# 010 0.5024 (% 35.372) (% 33.566) (% 33.646)  5.11 - Human hg38
# 011 0.5025 (% 35.282) (% 33.492) (% 33.579)  5.07 - Bonobo panPan3
# 012 0.5027 (% 35.291) (% 33.511) (% 33.608)  5.04 - Chimp panTro6
# 013 0.5039 (% 32.278) (% 28.908) (% 30.629) 10.44 - Tarsier tarSyr2
# 014 0.5100 (% 34.838) (% 33.095) (% 33.167)  5.00 - Rhesus rheMac10
# 015 0.5119 (% 29.729) (% 27.878) (% 28.263)  6.23 - Bushbaby otoGar3
# 016 0.5201 (% 34.831) (% 33.041) (% 33.077)  5.14 - Horse equCab3
# 017 0.5402 (% 30.188) (% 24.901) (% 28.769) 17.51 - Dolphin turTru2
# 018 0.5430 (% 29.376) (% 27.756) (% 27.951)  5.51 - Dog canFam4
# 019 0.5496 (% 19.691) (% 12.280) (% 18.365) 37.64 - Tree shrew tupBel1
# 020 0.5500 (% 25.611) (% 23.252) (% 24.292)  9.21 - Pig susScr3
# 021 0.5564 (% 25.756) (% 23.827) (% 24.416)  7.49 - Elephant loxAfr3
# 022 0.5569 (% 25.182) (% 23.079) (% 23.777)  8.35 - Rabbit oryCun2
# 023 0.5741 (% 26.200) (% 24.360) (% 24.783)  7.02 - Sheep oviAri4
# 024 0.5855 (% 31.257) (% 29.478) (% 29.730)  5.69 - Hawaiian monk seal neoSch1
# 025 0.5991 (% 26.588) (% 24.846) (% 25.188)  6.55 - Cow bosTau9
# 026 0.6437 (% 18.559) (% 16.419) (% 17.355) 11.53 - Pika ochPri3
# 027 0.6765 (% 13.450) (% 09.501) (% 12.323) 29.36 - Hedgehog eriEur2
# 028 0.7034 (% 14.456) (% 11.161) (% 13.329) 22.79 - Tenrec echTel2
# 029 0.7243 (% 13.370) (% 10.430) (% 12.301) 21.99 - Shrew sorAra2
# 030 0.9213 (% 05.382) (% 04.390) (% 04.627) 18.43 - Opossum monDom5
# 031 1.2146 (% 01.936) (% 00.783) (% 01.354) 59.56 - X. tropicalis xenTro9
# 032 1.3261 (% 02.537) (% 01.835) (% 02.015) 27.67 - Chicken galGal6
# 033 2.2537 (% 00.905) (% 00.027) (% 00.517) 97.02 - Lamprey petMar3
# 034 2.2791 (% 01.416) (% 00.185) (% 00.947) 86.94 - Zebrafish danRer11
# count phylo    chain      synNet    rBest  synNet v.  query
#       dist      link                         chain

# None of this concern for distances matters in building the first step, the
# maf files.  The distances will be better calibrated later.

    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	mm39.35way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh

    sed 's/[()]//g; s/,/ /g' tree.nh > species.list
    cat species.list | fold -s -w 76 | sed -e 's/^/# /;'
# mm39 rn6 GCF_003668045.3 casCan1 speTri2 cavPor3 oryCun2 ochPri3 hg38 
# panTro6 panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 tupBel1 galVar1 
# susScr3 turTru2 bosTau9 oviAri4 equCab3 manPen1 canFam4 neoSch1 eriEur2 
# sorAra2 loxAfr3 echTel2 monDom5 galGal6 xenTro9 danRer11 petMar3

    #	bash shell syntax here ...
    cd /hive/data/genomes/mm39/bed/multiz35way

    export H=/hive/data/genomes/mm39/bed
    mkdir mafLinks
    # good, phylo close  assemblies can use syntenic net:
    for G in rn6 cavPor3 calJac4 gorGor6 hg38 panPan3 panTro6 rheMac10 \
	equCab3 canFam4 oviAri4 neoSch1
    do
      cd $H/multiz35way
      rm -fr mafLinks/$G
      mkdir mafLinks/$G
      cd mafLinks/$G
      mafSplit -outDirDepth=0 -byTarget -useFullSequenceName \
        /dev/null . ${H}/lastz.$G/axtChain/mm39.${G}.synNet.maf.gz
      gzip *.maf
      echo lastz.$G/axtChain/mm39.${G}.synNet.maf.gz
    done

    # GCF_003668045.3 is special and was done with a name translation step
    # to eliminate dots in the assembly name and in the sequence names
    # maf files expect only one dot on the s lines: assembly.sequence
    mkdir mafLinks/GCF_003668045v3
    cd mafLinks/GCF_003668045v3
    mafSplit -outDirDepth=0 -byTarget -useFullSequenceName \
	/dev/null . ../../translated/GCF_003668045v3.maf.gz
    gzip *.maf

    # assemblies using recip best net:
    for G in casCan1  speTri2 manPen1 tarSyr2 otoGar3 turTru2 tupBel1 \
	susScr3 loxAfr3 oryCun2 bosTau9 ochPri3 eriEur2 echTel2 sorAra2
    do
      cd $H/multiz35way
      rm -fr mafLinks/$G
      mkdir mafLinks/$G
      cd mafLinks/$G
      echo ln -s "lastz.$G/mafRBestNet/"'*' ./mafLinks/$G
      ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./
    done

    # and finally, assemblies using the base mafNet
    for G in galVar1 monDom5 xenTro9 galGal6 danRer11 petMar3
    do
      cd $H/multiz35way
      rm -fr mafLinks/$G
      mkdir mafLinks/$G
      cd mafLinks/$G
      echo ln -s "lastz.$G/mafNet/"'*' ./mafLinks/$G
      ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./
    done
    # verify the symLinks are good, should be 22 primary chromosomes for all:
    for D in `ls -d mafLinks/*`
    do
       printf "%02d\t%s\n" "`ls $D | egrep -v "chrUn|random" | wc -l`" "${D}"
    done | sed -e 's/^/#\t/;'
#       22      mafLinks/GCF_003668045.3
#       22      mafLinks/bosTau9
#       22      mafLinks/calJac4
...
#       22      mafLinks/tupBel1
#       22      mafLinks/turTru2
#       22      mafLinks/xenTro9

    # verify the symLinks are good, various other numbers with all chromosomes:
    for D in `ls -d mafLinks/*`
    do
       printf "%02d\t%s\n" "`ls $D | wc -l`" "${D}"
    done | sed -e 's/^/#\t/;' | sort -k2,2n
#       25      mafLinks/cavPor3
#       25      mafLinks/oviAri4
#       27      mafLinks/canFam4
...
#       47      mafLinks/speTri2
#       49      mafLinks/rn6
#       54      mafLinks/galVar1

    # Interesting that there are no matches to all the chromosomes by
    # any organism.  Let's see how many if all are placed together
    for D in `ls -d mafLinks/*`
    do
       ls $D | grep chr | sed -e 's#.*/##;'
    done | sort -u | wc -l
# 54
    # seems to be the limit, wonder what is missing:
    find ./mafLinks -type f | sed -e 's#.*/##;' \
       | sed -e 's/.maf.gz//;' | sort -u > maf.list.here
    comm -23 <(cut -f1 ../../chrom.sizes | sort ) maf.list.here | sed -e 's/^/#\t/;'
#       chr1_GL456239v1_random
#       chr4_JH584295v1_random
#       chrUn_GL456368v1
#       chrUn_GL456370v1
#       chrUn_GL456383v1
#       chrUn_GL456389v1
#       chrUn_GL456390v1
#       chrUn_GL456392v1
#       chrUn_GL456396v1
#       chrUn_MU069435v1

    # and the sizes of those missed chroms:
    comm -23 <(cut -f1 ../../chrom.sizes | sort ) maf.list.here \
| while read S; do grep "${S}" ../../chrom.sizes; done | sed -e 's/^/#\t/;'
#       chr1_GL456239v1_random  40056
#       chr4_JH584295v1_random  1976
#       chrUn_GL456368v1        20208
#       chrUn_GL456370v1        26764
#       chrUn_GL456383v1        38659
#       chrUn_GL456389v1        28772
#       chrUn_GL456390v1        24668
#       chrUn_GL456392v1        23629
#       chrUn_GL456396v1        21240
#       chrUn_MU069435v1        31129

    # try to do these as full chromosomes without the splitting procedure
    # they will work, they just take a really long time for the big chroms

    mkdir /hive/data/genomes/mm39/bed/multiz35way/fullChromRun
    cd /hive/data/genomes/mm39/bed/multiz35way/fullChromRun
    mkdir maf run
    cd run
    mkdir penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
    cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

    find ../../mafLinks -type l | grep ".maf.gz$" | xargs -L 1 basename \
       | sed -e 's/.gz//;' | sort -u > maf.list

    wc -l maf.list
    # 54 maf.list

    #	set the db and pairs directories here
    cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = mm39
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/mm39/bed/multiz35way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/usr/bin/sed -e 's/GCF_003668045.3/GCF_003668045v3/;' ../../tree.nh > $tmp/tree.nh
/usr/bin/sed -e 's/GCF_003668045.3/GCF_003668045v3/;' ../../species.list > $tmp/species.list
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

    printf '#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/mm39/bed/multiz35way/fullChromRun/maf/$(root1).maf}
#ENDLOOP
' > template

    ssh ku
    cd /hive/data/genomes/mm39/bed/multiz35way/fullChromRun/run
    gensub2 maf.list single template jobList
    para -ram=64g create jobList
# Completed: 54 of 54 jobs
# CPU time in finished jobs:    1548419s   25806.98m   430.12h   17.92d  0.049 y
# IO & Wait Time:                   772s      12.87m     0.21h    0.01d  0.000 y
# Average job time:               28689s     478.15m     7.97h    0.33d
# Longest finished job:          125726s    2095.43m    34.92h    1.46d
# Submission to last job:        125739s    2095.65m    34.93h    1.46d

#######################################################################
# GAP ANNOTATE MULTIZ30WAY MAF AND LOAD TABLES (TBD - 2017-11-03 - Hiram)
    # mafAddIRows has to be run on single chromosome maf files, it does not
    #	function correctly when more than one reference sequence
    #	are in a single file.
    mkdir -p /hive/data/genomes/mm39/bed/multiz35way/anno
    cd /hive/data/genomes/mm39/bed/multiz35way/anno

    # check for N.bed files everywhere:
    for DB in `sed -e 's/ GCF_003668045.3//;' ../species.list`
do
    if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
        echo "MISS: ${DB}"
#        cd /hive/data/genomes/${DB}
#        twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
    else
        echo "  OK: ${DB}"
    fi
    cd /hive/data/genomes/mm39/bed/multiz35way/anno
done
    twoBitInfo -nBed \
/hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.2bit \
stdout | sed -e 's/\./v/g;' > GCF_003668045v3.bed
    sed -e 's/\./v/g;' /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes \
      > GCF_003668045v3.len

    cd /hive/data/genomes/mm39/bed/multiz35way/anno
    for DB in `sed -e 's/ GCF_003668045.3//;' ../species.list`
do
    echo "${DB} "
    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
done
    ls *.bed > nBeds
    ls *.len > sizes
    # make sure they all are successful symLinks:
    ls -ogrtL *.bed | wc -l
    # 35
    ls -ogrtL *.len | wc -l
    # 35
    wc -l nBeds sizes
    # 35 nBeds
    # 35 sizes

    screen -S mm39      # use a screen to control this longish job
    # do not need to go to ku for this, can run on hgwdev parasol
    cd /hive/data/genomes/mm39/bed/multiz35way/anno
    mkdir result

    printf '#LOOP
mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/mm39/mm39.2bit {check out line+ result/$(file1)}
#ENDLOOP
' > template

    # tac to get some of the smaller ones to run first so it is easy to verify
    # it is running OK
    ls ../fullChromRun/maf/*.maf | tac > maf.list

    gensub2 maf.list single template jobList
    # good to allow plenty of memory, slows them down and allows the large ones
    #  to finish
    para -ram=64g create jobList
    para try ... check ...
    para push
# Completed: 54 of 54 jobs
# CPU time in finished jobs:       4649s      77.48m     1.29h    0.05d  0.000 y
# IO & Wait Time:                   346s       5.77m     0.10h    0.00d  0.000 y
# Average job time:                  93s       1.54m     0.03h    0.00d
# Longest finished job:             382s       6.37m     0.11h    0.00d
# Submission to last job:          1432s      23.87m     0.40h    0.02d

    du -hsc result
    #  141G    result

    # translate those results back to the GCF hub names:
    cd /hive/data/genomes/mm39/bed/multiz35way
    mkdir maf

    printf '#LOOP
reverseTrans.sh $(path1) {check out exists+ maf/$(path1).maf}
#ENDLOOP
' > template

    printf '#!/bin/bash

set -beEu -o pipefail

export C=$1
export R=$2

~/kent/src/hg/makeDb/mm39/reverseTranslate.pl /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes GCF_003668045.3 anno/result/${C}.maf > ${R}
' > reverseTrans.sh
    chmod +x reverseTrans.sh

    ls anno/result | sed -e 's/.maf//;' > chr.result.list

    gensub2 chr.result.list single template reverseTrans.jobList
    para create reverseTrans.jobList
    para push
# Completed: 54 of 54 jobs
# CPU time in finished jobs:       1784s      29.74m     0.50h    0.02d  0.000 y
# IO & Wait Time:                  1880s      31.33m     0.52h    0.02d  0.000 y
# Average job time:                  68s       1.13m     0.02h    0.00d
# Longest finished job:             245s       4.08m     0.07h    0.00d
# Submission to last job:           285s       4.75m     0.08h    0.00d

    # Load into database
    rm -f /gbdb/mm39/multiz35way/maf/*
    cd /hive/data/genomes/mm39/bed/multiz35way/maf

    ln -s `pwd`/*.maf /gbdb/mm39/multiz35way/maf/

    # this generates an immense multiz35way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /dev/shm
    time hgLoadMaf -pathPrefix=/gbdb/mm39/multiz35way/maf mm39 multiz35way
    # Loaded 40655883 mafs in 358 files from /gbdb/mm39/multiz35way/maf
    # real    37m27.075s

    # -rw-rw-r-- 1 2177747201 Nov  2 18:27 multiz35way.tab

    time (cat /gbdb/mm39/multiz35way/maf/*.maf \
        | hgLoadMafSummary -verbose=2 -minSize=30000 \
	-mergeGap=1500 -maxSize=200000 mm39 multiz35waySummary stdin)
# Created 4568973 summary blocks from 850984320 components and 40655883 mafs from stdin
# real    59m27.383s

# -rw-rw-r--   1 2177747201 Nov  2 18:27 multiz35way.tab
# -rw-rw-r--   1  224894681 Nov  3 08:12 multiz35waySummary.tab

    wc -l multiz35way*.tab
    # 40655883 multiz35way.tab
    #  4568973 multiz35waySummary.tab

    rm multiz35way*.tab

##############################################################################
# MULTIZ7WAY MAF FRAMES (DONE - 2020-12-22 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/mm39/bed/multiz35way/frames
    cd /hive/data/genomes/mm39/bed/multiz35way/frames
#   survey all the genomes to find out what kinds of gene tracks they have

    printf '#!/bin/csh -fe
foreach db (`sed -e "s/GCF_003668045.3 //;" ../species.list`)
    echo -n "# ${db}: "
    set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
           $table == "ncbiRefSeq" || $table == "mgcGenes" || \
           $table == "knownGene" || $table == "xenoRefGene" ) then
           set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    echo
end
' > showGenes.csh

    chmod +x ./showGenes.csh
    time ./showGenes.csh
# mm39: ncbiRefSeq: 130343, refGene: 47741, xenoRefGene: 197343, 
# rn6: ensGene: 41078, mgcGenes: 7013, ncbiRefSeq: 69456, refGene: 19160, xenoRefGene: 223156, 
# casCan1: ensGene: 38514, ncbiRefSeq: 40013, xenoRefGene: 362113, 
# speTri2: ensGene: 33336, ncbiRefSeq: 41886, xenoRefGene: 446239, 
# cavPor3: ensGene: 34846, ncbiRefSeq: 44775, refGene: 491, xenoRefGene: 337367
# oryCun2: ensGene: 51853, ncbiRefSeq: 43655, refGene: 1750, xenoRefGene: 345704
# ochPri3: ncbiRefSeq: 26253, xenoRefGene: 313971, 
# hg38: ensGene: 208239, knownGene: 229647, mgcGenes: 36638, ncbiRefSeq: 170769, refGene: 88819, xenoRefGene: 200365, 
# panTro6: ncbiRefSeq: 102471, refGene: 2877, xenoRefGene: 245176, 
# panPan3: ncbiRefSeq: 85047, refGene: 218, xenoRefGene: 246746, 
# gorGor6: ncbiRefSeq: 61950, refGene: 449, xenoRefGene: 333640, 
# rheMac10: ensGene: 64228, ncbiRefSeq: 86732, refGene: 6482, xenoRefGene: 243139, 
# calJac4: ncbiRefSeq: 107273, refGene: 237, xenoRefGene: 256337, 
# tarSyr2: ensGene: 38314, ncbiRefSeq: 35799, xenoRefGene: 375108, 
# otoGar3: ensGene: 28565, ncbiRefSeq: 38532, xenoRefGene: 530019, 
# tupBel1: ensGene: 34727, xenoRefGene: 845862, 
# galVar1: ncbiRefSeq: 34747, xenoRefGene: 568133, 
# susScr3: ensGene: 30585, ncbiRefSeq: 66084, refGene: 5345, xenoRefGene: 477106, 
# turTru2: xenoRefGene: 583163, 
# bosTau9: ensGene: 43984, ncbiRefSeq: 76369, refGene: 14599, xenoRefGene: 221423, 
# oviAri4: ncbiRefSeq: 49730, refGene: 1026, xenoRefGene: 230226, 
# equCab3: ensGene: 59087, ncbiRefSeq: 76580, refGene: 1966, xenoRefGene: 311382, 
# manPen1: xenoRefGene: 548155, 
# canFam4: refGene: 2382, xenoRefGene: 238226, 
# neoSch1: ncbiRefSeq: 29897, xenoRefGene: 446526, 
# eriEur2: ncbiRefSeq: 29936, xenoRefGene: 315055, 
# sorAra2: ncbiRefSeq: 23525, xenoRefGene: 469604, 
# loxAfr3: ensGene: 28847, ncbiRefSeq: 46056, refGene: 23, xenoRefGene: 359460, 
# echTel2: ncbiRefSeq: 23075, xenoRefGene: 470748, 
# monDom5: ensGene: 32358, refGene: 1239, xenoRefGene: 256483, 
# galGal6: ensGene: 39288, ncbiRefSeq: 62170, refGene: 7493, xenoRefGene: 151778, 
# xenTro9: ensGene: 56323, ncbiRefSeq: 43724, refGene: 8806, xenoRefGene: 164095, 
# danRer11: ensGene: 59876, ncbiRefSeq: 65219, refGene: 18968, xenoRefGene: 164771, 
# petMar3: xenoRefGene: 200903, 

    # and for the hub assembly: GCF_003668045.3_CriGri-PICRH-1.0
    # wc -l trackData/ncbiRefSeq/process/GCF_003668045.3_CriGri-PICRH-1.0.gp
    # 61048 trackData/ncbiRefSeq/process/GCF_003668045.3_CriGri-PICRH-1.0.gp

    # from that summary, use these gene sets:
    # knownGene - hg38
    # ensGene - tupBel1 monDom5
    # ncbiRefSeq - mm39 rn6 casCan1 speTri2 cavPor3 oryCun2 ochPri3 panTro6
panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 galVar1 susScr3 bosTau9
oviAri4 equCab3 neoSch1 eriEur2 sorAra2 loxAfr3 echTel2 galGal6 xenTro9
danRer11
    # xenoRefGene - turTru2 manPen1 canFam4 petMar3


    mkdir genes

    #   1. knownGene: hg38
    for DB in hg38
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > genes/${DB}.gp.gz
    genePredCheck -db=${DB} genes/${DB}.gp.gz 2>&1 | sed -e 's/^/    # /;'
done
    # checked: 19327 failed: 0

    #   2. ensGene: tupBel1 monDom5
    for DB in tupBel1 monDom5
do
hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# tupBel1: checked: 29256 failed: 0
# monDom5: checked: 21033 failed: 0

    # ncbiRefSeq for the hub:
    cut -f1 /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes | while read C
do
   printf "s/%s/%s/g;\n" "${C}" "`echo $C | sed -e 's/\./v/;'`"
done > GCF.name.trans.sed

    genePredSingleCover /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/trackData/ncbiRefSeq/process/GCF_003668045.3_CriGri-PICRH-1.0.gp stdout | sed -f GCF.name.trans.sed | gzip -2c > genes/GCF_003668045v3.gp.gz

    sed -e 's/\./v/;' /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes \
    | genePredCheck -chromSizes=stdin genes/GCF_003668045v3.gp.gz
    # checked: 22354 failed: 0

    # 3. ncbiRefSeq - mm39 rn6 casCan1 speTri2 cavPor3 oryCun2 ochPri3 panTro6
panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 galVar1 susScr3 bosTau9
oviAri4 equCab3 neoSch1 eriEur2 sorAra2 loxAfr3 echTel2 galGal6 xenTro9
danRer11
    for DB in mm39 rn6 casCan1 speTri2 cavPor3 oryCun2 ochPri3 panTro6 \
panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 galVar1 susScr3 bosTau9 \
oviAri4 equCab3 neoSch1 eriEur2 sorAra2 loxAfr3 echTel2 galGal6 xenTro9 \
danRer11
do
hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ncbiRefSeq" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# mm39: checked: 22051 failed: 0
# rn6: checked: 22857 failed: 0
# casCan1: checked: 21652 failed: 0
# speTri2: checked: 19892 failed: 0
# cavPor3: checked: 19940 failed: 0
# oryCun2: checked: 20276 failed: 0
# ochPri3: checked: 18520 failed: 0
# panTro6: checked: 21380 failed: 0
# panPan3: checked: 22261 failed: 0
# gorGor6: checked: 20581 failed: 0
# rheMac10: checked: 21021 failed: 0
# calJac4: checked: 22183 failed: 0
# tarSyr2: checked: 19968 failed: 0
# otoGar3: checked: 19536 failed: 0
# galVar1: checked: 22794 failed: 0
# susScr3: checked: 24180 failed: 0
# bosTau9: checked: 21001 failed: 0
# oviAri4: checked: 20513 failed: 0
# equCab3: checked: 21079 failed: 0
# neoSch1: checked: 18783 failed: 0
# eriEur2: checked: 19258 failed: 0
# sorAra2: checked: 19160 failed: 0
# loxAfr3: checked: 21061 failed: 0
# echTel2: checked: 18790 failed: 0
# galGal6: checked: 17412 failed: 0
# xenTro9: checked: 21265 failed: 0
# danRer11: checked: 32676 failed: 0

    #   3. xenoRefGene - turTru2 manPen1 canFam4 petMar3

    for DB in turTru2 manPen1 canFam4 petMar3
do
    hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from xenoRefGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${DB}.tmp.gz
    mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz
    echo -n "# ${DB}: "
    genePredCheck -db=${DB} genes/${DB}.gp.gz
done
# turTru2: checked: 36500 failed: 0
# manPen1: checked: 30879 failed: 0
# canFam4: checked: 20127 failed: 0
# petMar3: checked: 10819 failed: 0


    # verify counts for genes are reasonable:
    for T in genes/*.gz
do
    echo -n "# $T: "
    zcat $T | cut -f1 | sort | uniq -c | wc -l
done
# genes/GCF_003668045v3.gp.gz: 22354
# genes/bosTau9.gp.gz: 20992
# genes/calJac4.gp.gz: 22183
# genes/canFam4.gp.gz: 19154
# genes/casCan1.gp.gz: 21652
# genes/cavPor3.gp.gz: 19938
# genes/danRer11.gp.gz: 29321
# genes/echTel2.gp.gz: 18790
# genes/equCab3.gp.gz: 21079
# genes/eriEur2.gp.gz: 19258
# genes/galGal6.gp.gz: 17404
# genes/galVar1.gp.gz: 22794
# genes/gorGor6.gp.gz: 20579
# genes/hg38.gp.gz: 19310
# genes/loxAfr3.gp.gz: 21061
# genes/manPen1.gp.gz: 26152
# genes/mm39.gp.gz: 22051
# genes/monDom5.gp.gz: 21033
# genes/neoSch1.gp.gz: 18783
# genes/ochPri3.gp.gz: 18520
# genes/oryCun2.gp.gz: 20271
# genes/otoGar3.gp.gz: 19536
# genes/oviAri4.gp.gz: 20508
# genes/panPan3.gp.gz: 22261
# genes/panTro6.gp.gz: 21376
# genes/petMar3.gp.gz: 9982
# genes/rheMac10.gp.gz: 21020
# genes/rn6.gp.gz: 22854
# genes/sorAra2.gp.gz: 19160
# genes/speTri2.gp.gz: 19892
# genes/susScr3.gp.gz: 24043
# genes/tarSyr2.gp.gz: 19968
# genes/tupBel1.gp.gz: 15407
# genes/turTru2.gp.gz: 29711
# genes/xenTro9.gp.gz: 21212

    # kluster job to annotate each maf file
    screen -S mm39      # manage long running procedure with screen
    ssh ku
    cd /hive/data/genomes/mm39/bed/multiz35way/frames

    printf '#!/bin/csh -fe

set C = $1
set G = $2

cat ../anno/result/${C}.maf | genePredToMafFrames mm39 stdin stdout \
        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
' > runOne

    chmod +x runOne

    ls ../anno/result | sed -e "s/.maf//" > chr.list
    ls genes | sed -e "s/.gp.gz//" > gene.list

    printf '#LOOP
runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
' > template

    mkdir parts
    gensub2 chr.list gene.list template jobList
    para -ram=64g create jobList
    para try ... check ... push
# Completed: 1890 of 1890 jobs
# CPU time in finished jobs:      68240s    1137.33m    18.96h    0.79d  0.002 y
# IO & Wait Time:                 62137s    1035.62m    17.26h    0.72d  0.002 y
# Average job time:                  69s       1.15m     0.02h    0.00d
# Longest finished job:             372s       6.20m     0.10h    0.00d
# Submission to last job:          1098s      18.30m     0.30h    0.01d

    # collect all results into one file:
    cd /hive/data/genomes/mm39/bed/multiz35way/frames
    time find ./parts -type f | grep -v GCF | while read F
do
    echo "${F}" 1>&2
    zcat ${F}
done | sort -k1,1 -k2,2n > multiz35wayFrames.bed
    # real    2m4.953s
    time find ./parts -type f | grep GCF | while read F
do
    echo "${F}" 1>&2
    zcat ${F} | sed -e 's/GCF_003668045v3/GCF_003668045.3/;'
done | sort -k1,1 -k2,2n >> multiz35wayFrames.bed

    # -rw-rw-r-- 1 592159010 Dec 22 09:24 multiz35wayFrames.bed

    gzip multiz35wayFrames.bed

    # verify there are frames on everything, should be 35 species:
    # (count from: ls genes | wc)
    zcat multiz35wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
        | sed -e 's/^/# /;' > species.check.list
    wc -l species.check.list
    # 35
#  240536 GCF_003668045.3
#  261904 bosTau9
#  266491 calJac4
#  230600 canFam4
#  232036 casCan1
#  240423 cavPor3
#  323912 danRer11
#  229208 echTel2
#  274281 equCab3
#  246825 eriEur2
#  346334 galGal6
#  371538 galVar1
#  245105 gorGor6
#  237059 hg38
#  263806 loxAfr3
#  224191 manPen1
#  200489 mm39
#  348971 monDom5
#  242155 neoSch1
#  226440 ochPri3
#  236903 oryCun2
#  234706 otoGar3
#  296794 oviAri4
#  264590 panPan3
#  263990 panTro6
#  146858 petMar3
#  261855 rheMac10
#  235340 rn6
#  239030 sorAra2
#  230420 speTri2
#  257781 susScr3
#  231443 tarSyr2
#  196941 tupBel1
#  258704 turTru2
#  322154 xenTro9

    #   load the resulting file
    ssh hgwdev
    cd /hive/data/genomes/mm39/bed/multiz35way/frames
    time hgLoadMafFrames mm39 multiz35wayFrames multiz35wayFrames.bed.gz
    #   real    1m0.040s

    hgsql -e 'select count(*) from multiz35wayFrames;' mm39
    #	+----------+
    #	| count(*) |
    #	+----------+
    #	|  8929813 |
    #	+----------+

    time featureBits -countGaps mm39 multiz35wayFrames
    # 60020650 bases of 2728222451 (2.200%) in intersection
    # real    0m43.619s

    #   enable the trackDb entries:
# frames multiz35wayFrames
# irows on
    #   zoom to base level in an exon to see codon displays
    #	appears to work OK

#########################################################################
# Phylogenetic tree from 35-way (TBD - 2013-09-13 - Hiram)
    mkdir /hive/data/genomes/mm39/bed/multiz35way/4d
    cd /hive/data/genomes/mm39/bed/multiz35way/4d

    # the annotated maf's are in:
    ../anno/result/*.maf

    # using knownGene for mm39, only transcribed genes and nothing
    #	from the randoms and other misc.
    hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" mm39 \
      | egrep -E -v "chrM|chrUn|random|_alt" > ncbiRefSeq.gp
    wc -l *.gp
    #     95199 ncbiRefSeq.gp
    genePredCheck -db=mm39 ncbiRefSeq.gp 
    #	checked: 92405 failed: 0

    # verify it is only on the chroms:
    cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/    # /;'
    #    7748 chr2
    #    7617 chr7
    #    6852 chr11
    #    5889 chr4
    #    5843 chr1
    #    5810 chr5
    #    5514 chr9
    #    4700 chr6
    #    4477 chr8
    #    4357 chr10
    #    4310 chr17
    #    4158 chr3
    #    3615 chr14
    #    3530 chr15
    #    3479 chrX
    #    3254 chr12
    #    2968 chr13
    #    2952 chr16
    #    2718 chr19
    #    2301 chr18
    #     313 chrY

    genePredSingleCover ncbiRefSeq.gp stdout | sort > ncbiRefSeqNP.gp
    wc -l ncbiRefSeqNP.gp
    #	21990 ncbiRefSeqNP.gp
    genePredCheck -db=mm39 ncbiRefSeqNP.gp 
    #	checked: 21990 failed: 0

    ssh ku
    mkdir /hive/data/genomes/mm39/bed/multiz35way/4d/run
    cd /hive/data/genomes/mm39/bed/multiz35way/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set r = "/hive/data/genomes/mm39/bed/multiz35way"
set c = $1
set infile = $r/anno/result/$2
set outfile = $3
cd /dev/shm
# 'clean' maf, removes all chrom names, leaves only the db name
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' $r/4d/ncbiRefSeqNP.gp | sed -e "s/\t$c\t/\tmm39.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
else
    echo "" > $r/4d/run/$outfile
endif
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    ls -1S /hive/data/genomes/mm39/bed/multiz35way/anno/result/*.maf \
	| sed -e "s#.*multiz35way/anno/result/##" \
        | egrep -E -v "chrM|chrUn|random|_alt" > maf.list

    printf '#LOOP
4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    para -ram=64g create jobList
    para try ... check ... push ... etc...
    para time
# Completed: 21 of 21 jobs
# CPU time in finished jobs:       5711s      95.18m     1.59h    0.07d  0.000 y
# IO & Wait Time:                    62s       1.03m     0.02h    0.00d  0.000 y
# Average job time:                 275s       4.58m     0.08h    0.00d
# Longest finished job:             470s       7.83m     0.13h    0.01d
# Submission to last job:           471s       7.85m     0.13h    0.01d

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/mm39/bed/multiz35way/4d
    # verify no tiny files:
    ls -og mfa | sort -k3nr | tail -2
    #  -rw-rw-r-- 1  293090 Dec 22 11:46 chrY.mfa

    #want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
	--aggregate "`sed -e 's/GCF_003668045.3/GCF_003668045v3/;' \
           ../species.list`" mfa/*.mfa | sed s/"> "/">"/ > 4d.all.mfa
    # real    0m3.182s

    # check they are all in there:
    grep "^>" 4d.all.mfa | wc -l
    #   35

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        mm39.35way.nh

    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../mm39.35way.nh | sed -e 's/GCF_003668045.3/GCF_003668045v3/;' \
           > tree-commas.nh

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree-commas.nh 4d.all.mfa
    #   real    67m7.237s

    mv phyloFit.mod all.mod

    grep TREE all.mod
TREE: ((((((((((((((mm39:0.0843874,rn6:0.0899985):0.0657921,
GCF_003668045v3:0.106248):0.151018,casCan1:0.135335):0.0230996,
speTri2:0.144923):0.00521622,cavPor3:0.232945):0.026955,
(oryCun2:0.11389,ochPri3:0.195179):0.105379):0.0128213,
((((((((hg38:0.0109653,panTro6:0.00222548):0.00206435,
panPan3:0.00221569):0.00629185,gorGor6:0.00877389):0.0221025,
rheMac10:0.0361164):0.0217292,calJac4:0.0658044):0.0586954,
tarSyr2:0.140238):0.0112568,otoGar3:0.155854):0.0148223,
(tupBel1:0.182689,galVar1:0.106299):0.00938657):0.00641):0.0206,
(((susScr3:0.119945,(turTru2:0.0637542,(bosTau9:0.0311355,
oviAri4:0.0369141):0.0938814):0.0195077):0.0461252,
((equCab3:0.105047,manPen1:0.137597):0.00716862,
(canFam4:0.0869943,neoSch1:0.0695671):0.0650575):0.00523737):0.0112043,
(eriEur2:0.221845,sorAra2:0.280481):0.0694824):0.0210386):0.0285152,
(loxAfr3:0.103064,echTel2:0.240863):0.0478413):0.237009,
monDom5:0.34975):0.214304,galGal6:0.444509):0.287968,
xenTro9:0.766181):0.454886,danRer11:0.852917):0.44051,petMar3:0.44051);

    # compare these calculated lengths to what we started with

    /cluster/bin/phast/all_dists ../mm39.35way.nh  | grep mm39 \
	| sed -e "s/mm39.//;" | sort > original.dists

    grep TREE all.mod | sed -e 's/TREE: //;' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep mm39 \
          | sed -e "s/mm39.//; s/GCF_003668045v3/GCF_003668045.3/;" | sort > mm39.dists

    # printing out the 'original', the 'new' the 'difference' and
    #    percent difference/delta
echo "#   sequence     old dist       new dist       difference   per-cent diff"
    join original.dists mm39.dists | awk '{
  printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%5.2f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }'       | sort -k4n
#   sequence     old dist       new dist       difference   per-cent diff

#       rn6     0.186098        0.174386         0.011712        6.72
#       GCF_003668045.3 0.332282 0.256428        0.075854       29.58
#       casCan1 0.412432        0.436532        -0.024100       -5.52
#       speTri2 0.417900        0.469220        -0.051320      -10.94
#       galVar1 0.493420        0.491385         0.002035        0.41
#       panPan3 0.502461        0.512813        -0.010352       -2.02
#       gorGor6 0.500585        0.513080        -0.012495       -2.44
#       panTro6 0.502681        0.514887        -0.012206       -2.37
#       rheMac10 0.510018       0.518320        -0.008302       -1.60
#       hg38    0.502391        0.523627        -0.021236       -4.06
#       calJac4 0.494237        0.526279        -0.032042       -6.09
#       equCab3 0.520098        0.539586        -0.019488       -3.61
#       tarSyr2 0.503897        0.542017        -0.038120       -7.03
#       otoGar3 0.511897        0.546376        -0.034479       -6.31
#       turTru2 0.540150        0.551520        -0.011370       -2.06
#       neoSch1 0.585546        0.561994        0.023552         4.19
#       cavPor3 0.491203        0.562458        -0.071255      -12.67
#       tupBel1 0.549623        0.567775        -0.018152       -3.20
#       loxAfr3 0.556386        0.569310        -0.012924       -2.27
#       manPen1 0.485701        0.572136        -0.086435      -15.11
#       oryCun2 0.556860        0.575737        -0.018877       -3.28
#       canFam4 0.543004        0.579422        -0.036418       -6.29
#       susScr3 0.549974        0.588203        -0.038229       -6.50
#       bosTau9 0.599054        0.612782        -0.013728       -2.24
#       oviAri4 0.574054        0.618561        -0.044507       -7.20
#       ochPri3 0.643702        0.657026        -0.013324       -2.03
#       eriEur2 0.676481        0.702256        -0.025775       -3.67
#       echTel2 0.703393        0.707109        -0.003716       -0.53
#       sorAra2 0.724258        0.760892        -0.036634       -4.81
#       monDom5 0.921254        1.005164        -0.083910       -8.35
#       galGal6 1.326078        1.314227         0.011851        0.90
#       xenTro9 1.214580        1.923867        -0.709287      -36.87
#       danRer11 2.279062       2.465489        -0.186427       -7.56
#       petMar3 2.253737        2.493592        -0.239855       -9.62

#########################################################################
# phastCons 35-way (DONE - 2020-12-22 - Hiram)
    # split 35way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh ku
    mkdir -p /hive/data/genomes/mm39/bed/multiz35way/cons/ss
    mkdir -p /hive/data/genomes/mm39/bed/multiz35way/cons/msa.split
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/msa.split

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/mm39/bed/multiz35way/anno/result/$c.maf
set WINDOWS = /hive/data/genomes/mm39/bed/multiz35way/cons/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 3000000,0 -I 300 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    printf '#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
' > template

#	do the easy ones first to see some immediate results
    ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list
    # all can finish OK at a 64Gb memory limit
    gensub2 maf.list single template jobList
    para -ram=64g create jobList
    para try ... check ... etc
    para push
# Completed: 54 of 54 jobs
# CPU time in finished jobs:      24295s     404.91m     6.75h    0.28d  0.001 y
# IO & Wait Time:                  1111s      18.52m     0.31h    0.01d  0.000 y
# Average job time:                 470s       7.84m     0.13h    0.01d
# Longest finished job:            2724s      45.40m     0.76h    0.03d
# Submission to last job:          2765s      46.08m     0.77h    0.03d

    # Run phastCons
    #	This job is I/O intensive in its output files, beware where this
    #	takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/mm39/bed/multiz35way/cons/run.cons
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/run.cons

    #	This is setup for multiple runs based on subsets, but only running
    #   the 'all' subset here.
    #   It triggers off of the current working directory
    #	$cwd:t which is the "grp" in this script.  Running:
    #	all and vertebrates

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/mm39/bed/multiz35way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons/ss
set useGrp = "$grp.mod"
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.mod $tmp
  ln -s $cons/$grp/$grp.non-inf $tmp
  ln -s $ssSrc/$c/$f.ss $tmp
else
  ln -s $ssSrc/$c/$f.ss $tmp
  ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhast.csh

    #	this template will serve for all runs
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    printf '#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
' > template

    ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
    wc -l ss.list
    #	930 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/mm39/bed/multiz35way/cons
    mkdir -p all
    cd all
    #	Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    # beware overwhelming the cluster with these fast running high I/O jobs
    para -ram=32g create jobList
    para try ... check ...
    para -maxJob=16 push
# Completed: 930 of 930 jobs
# CPU time in finished jobs:      22874s     381.23m     6.35h    0.26d  0.001 y
# IO & Wait Time:                  6698s     111.64m     1.86h    0.08d  0.000 y
# Average job time:                  32s       0.53m     0.01h    0.00d
# Longest finished job:              49s       0.82m     0.01h    0.00d
# Submission to last job:           200s       3.33m     0.06h    0.00d

    # create Most Conserved track
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/all
    time cut -f1 ../../../../chrom.sizes | while read C
do
    echo $C 1>&2
    ls -d bed/${C} 2> /dev/null | while read D
    do
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
    # real    0m43.805s

    # -rw-rw-r--  1 164925477 Dec 22 14:02 tmpMostConserved.bed

    time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \
        > mostConserved.bed
    # real    0m26.890s

    # -rw-rw-r--  1 169120947 Dec 22 14:04 mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/all
    time hgLoadBed mm39 phastConsElements35way mostConserved.bed
    #  Read 4829247 elements of size 5 from mostConserved.bed
    #  real    0m36.697s

    # Try for 5% overall cov, and 70% CDS cov
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    time featureBits mm39 -enrichment ncbiRefSeq:cds phastConsElements35way
# ncbiRefSeq:cds 1.408%, phastConsElements35way 5.679%, both 0.993%, cover 70.51%, enrich 12.42x
# real    0m31.631s


    time featureBits mm39 -enrichment refGene:cds phastConsElements35way
# refGene:cds 1.333%, phastConsElements35way 5.679%, both 0.972%, cover 72.87%, enrich 12.83x
# real    0m24.183s

    # Create merged posterier probability file and wiggle track data files
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/all
    mkdir downloads

    time for D in `ls -d pp/chr* | sed -e 's#pp/##'`
do
    echo "working: $D" 1>&2
    find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phastCons35way.wigFix.gz
done
    # real    15m34.013s

    #	encode those files into wiggle data
    time (zcat downloads/*.wigFix.gz \
	| wigEncode stdin phastCons35way.wig phastCons35way.wib)
    #   Converted stdin, upper limit 1.00, lower limit 0.00
    #   real    6m57.150s

    du -hsc *.wi?
    #	1.9G    phastCons35way.wib
    #	223M    phastCons35way.wig

    #	encode into a bigWig file:
    #	(warning wigToBigWig process may be too large for memory limits
    #	in bash, to avoid the 32 Gb memory limit, set 180 Gb here:
export sizeG=188743680
ulimit -d $sizeG
ulimit -v $sizeG
    time (zcat downloads/*.wigFix.gz \
      | wigToBigWig -verbose=2 stdin \
	../../../../chrom.sizes phastCons35way.bw) > bigWig.log 2>&1
    egrep "VmPeak|real" bigWig.log
    # pid=29661: VmPeak:    22610628 kB
    # real    21m44.557s

    # -rw-rw-r--  1 4567105688 Dec 22 16:47 phastCons35way.bw

    bigWigInfo phastCons35way.bw
version: 4
isCompressed: yes
isSwapped: 0
primaryDataSize: 3,226,958,317
primaryIndexSize: 75,088,076
zoomLevels: 10
chromCount: 54
basesCovered: 2,035,330,949
mean: 0.124503
min: 0.000000
max: 1.000000


    #	if you wanted to use the bigWig file, loading bigWig table:
    #   but we don't use the bigWig file
    mkdir /gbdb/mm39/bbi
    ln -s `pwd`/phastCons35way.bw /gbdb/mm39/bbi
    hgsql mm39 -e 'drop table if exists phastCons35way; \
            create table phastCons35way (fileName varchar(255) not null); \
            insert into phastCons35way values
	("/gbdb/mm39/bbi/phastCons35way.bw");'

    # Load gbdb and database with wiggle.
    ssh hgwdev
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/all
    ln -s `pwd`/phastCons35way.wib /gbdb/mm39/multiz35way/phastCons35way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/mm39/multiz35way mm39 \
	phastCons35way phastCons35way.wig
    #   real    0m18.870s

    time wigTableStats.sh mm39 phastCons35way
# db.table            min max   mean       count     sumData
# mm39.phastCons35way  0 1 0.124503 2035330949 2.53404e+08
#       stdDev viewLimits
#      0.262545 viewLimits=0:1
    # real    0m8.782s

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /hive/data/genomes/mm39/bed/multiz35way/cons/all
    time hgWiggle -doHistogram -db=mm39 \
	-hBinSize=0.001 -hBinCount=300 -hMinVal=0.0 -verbose=2 \
	    phastCons35way > histogram.data 2>&1
    #	real    1m30.048s

    #	create plot of histogram:

XXX - need to update this gnuplot definition:

    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Mm39 Histogram phastCons35way track"
set xlabel " phastCons35way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    # take a look to see if it is sane:

    display histo.png &

#########################################################################
# phyloP for 35-way (DONE - 2020-12-22 - Hiram)
#
    # split SS files into 1M chunks, this business needs smaller files
    #   to complete

    ssh ku
    mkdir /hive/data/genomes/mm39/bed/multiz35way/consPhyloP
    cd /hive/data/genomes/mm39/bed/multiz35way/consPhyloP
    mkdir ss run.split
    cd run.split

    printf '#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/mm39/bed/multiz35way/anno/result/$c.maf
set WINDOWS = /hive/data/genomes/mm39/bed/multiz35way/consPhyloP/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
' > doSplit.csh

    chmod +x doSplit.csh

    #	do the easy ones first to see some immediate results
    ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list

    # this needs a {check out line+ $(root1.done)} test for verification:
    printf '#LOOP
./doSplit.csh $(root1) $(root1).done
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    # all can complete successfully at the 64Gb memory limit
    para -ram=64g create jobList
    para try ... check ... push ... etc...
# Completed: 54 of 54 jobs
# CPU time in finished jobs:      25321s     422.01m     7.03h    0.29d  0.001 y
# IO & Wait Time:                   843s      14.06m     0.23h    0.01d  0.000 y
# Average job time:                 485s       8.08m     0.13h    0.01d
# Longest finished job:            2873s      47.88m     0.80h    0.03d
# Submission to last job:          2931s      48.85m     0.81h    0.03d

    # run phyloP with score=LRT
    ssh ku
    mkdir /cluster/data/mm39/bed/multiz35way/consPhyloP
    cd /cluster/data/mm39/bed/multiz35way/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACK ../../4d/all.mod
    #   BACKGROUND: 0.219062 0.338786 0.207231 0.234921 

    grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.546
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../../4d/all.mod 0.546 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.217500 0.282500 0.282500 0.217500

    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set ssFile = $1:t
set out = $2
set cName = $f:h
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/mm39/bed/multiz35way/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "$cons/ss/$cName/$ssFile"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
echo source: $ssSrc.ss
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $ssFile.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$ssFile.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
' > doPhyloP.csh

    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	3308 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    printf '#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
' > template

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/mm39/bed/multiz35way/consPhyloP/all
    cd /hive/data/genomes/mm39/bed/multiz35way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    # beware overloading the cluster with these quick and high I/O jobs
    para -ram=32g create jobList
    para try ... check ...
    para -maxJob=16 push
    para time > run.time
# Completed: 2699 of 2699 jobs
# CPU time in finished jobs:     839140s   13985.67m   233.09h    9.71d  0.027 y
# IO & Wait Time:                 17961s     299.35m     4.99h    0.21d  0.001 y
# Average job time:                 318s       5.29m     0.09h    0.00d
# Longest finished job:             657s      10.95m     0.18h    0.01d
# Submission to last job:          3669s      61.15m     1.02h    0.04d

    mkdir downloads
    time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'`
do
    echo "working: $D" 1>&2
    find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phyloP35way.wigFix.gz
done
    # real    30m48.598s

    du -hsc downloads
    #   3.4G    downloads

    # check integrity of data with wigToBigWig
    time (zcat downloads/*.wigFix.gz \
	| wigToBigWig -verbose=2 stdin /hive/data/genomes/mm39/chrom.sizes \
	phyloP35way.bw) > bigWig.log 2>&1

    egrep "real|VmPeak" bigWig.log
    # pid=123418: VmPeak:   22609988 kB
    # real    22m26.776s

    bigWigInfo phyloP35way.bw  | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 4,660,484,132
# primaryIndexSize: 75,089,444
# zoomLevels: 10
# chromCount: 53
# basesCovered: 2,035,330,611
# mean: 0.110677
# min: -13.709000
# max: 4.643000
# std: 0.833332

    #	encode those files into wiggle data
    time (zcat downloads/*.wigFix.gz \
	| wigEncode stdin phyloP35way.wig phyloP35way.wib)
# Converted stdin, upper limit 4.64, lower limit -13.71
# real    8m41.899s

# -rw-rw-r--  1 2035330611 Dec 22 18:40 phyloP35way.wib
# -rw-rw-r--  1  239980863 Dec 22 18:40 phyloP35way.wig

    du -hsc *.wi?
    #	1.9G    phyloP35way.wib
    #	229M    phyloP35way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP35way.wib /gbdb/mm39/multiz35way/phyloP35way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/mm39/multiz35way mm39 \
	phyloP35way phyloP35way.wig
    # real    0m15.232s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh mm39 phyloP35way
# db.table          min   max     mean       count     sumData
# mm39.phyloP35way -13.709 4.643 0.110677 2035330611 2.25264e+08
#       stdDev viewLimits
#    0.833332 viewLimits=-4.05598:4.27734

    #	that range is: 13.709+4.643 = 18.352 for hBinSize=0.018352

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.018352 -hBinCount=1000 -hMinVal=-20 -verbose=2 \
	    -db=mm39 phyloP35way > histogram.data 2>&1
    #   real    1m14.295s


XXX - ready to graph - Tue Dec 22 21:28:59 PST 2020
    # xaxis range:
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \
	| sed -e 's/^/# /;'
# Q1 -10.953050
# median -6.861155
# Q3 -2.769245
# average -6.875971
# min -20.000000
# max 1.312000
# count 768
# total -5280.745380
# standard deviation 4.757034

    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \
      | sed -e 's/^/# /;'
# Q1 0.000000
# median 0.000001
# Q3 0.000140
# average 0.001302
# min 0.000000
# max 0.023556
# count 768
# total 0.999975
# standard deviation 0.003490

    #	create plot of histogram:
    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human mm39 Histogram phyloP35way track"
set xlabel " phyloP35way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set xrange [-5:1.5]
set yrange [0:0.04]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    # verify it looks sane
    display histo.png &

#############################################################################
# construct download files for 35-way (TBD - 2015-04-15 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way
    mkdir /hive/data/genomes/mm39/bed/multiz35way/downloads
    cd /hive/data/genomes/mm39/bed/multiz35way/downloads
    mkdir multiz35way phastCons35way phyloP35way

    #########################################################################
    ## create upstream refGene maf files
    cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way

    # Brian had to fix mafFrags so it could work with the assembly GCF
    # identifier

    # bash script

#!/bin/sh
export geneTbl="ncbiRefSeq"
for S in 300 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits mm39 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags mm39 multiz35way \
                stdin stdout \
                -orgs=/hive/data/genomes/mm39/bed/multiz35way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    #   real    117m33.236s

-rw-rw-r-- 1   96310644 Jan  4 17:04 upstream300.ncbiRefSeq.maf.gz
-rw-rw-r-- 1  948130770 Jan  4 17:41 upstream2000.ncbiRefSeq.maf.gz
-rw-rw-r-- 1 2032740393 Jan  4 18:48 upstream5000.ncbiRefSeq.maf.gz

    # verify sanity:
    zgrep "^s " upstream300.ncbiRefSeq.maf.gz | awk '{print $2}' \
      | sort | uniq -c | sort -rn | less
    
    # they all have the same counts, followed by single gene names:
  89641 xenTro9
  89641 turTru2
  89641 tupBel1
  89641 tarSyr2
  89641 susScr3
...
  89641 casCan1
  89641 canFam4
  89641 calJac4
  89641 bosTau9
  89641 GCF_003668045.3
      1 XM_982184.5
      1 XM_981747.8
      1 XM_981711.5
      1 XM_981599.4
      1 XM_977914.8
... etc ...

    # same pattern seen with the 2000 and 5000 upstreams
    zgrep "^s " upstream2000.ncbiRefSeq.maf.gz | awk '{print $2}' \
      | sort | uniq -c | sort -rn | less

    zgrep "^s " upstream5000.ncbiRefSeq.maf.gz | awk '{print $2}' \
      | sort | uniq -c | sort -rn | less

    ######################################################################
    ## compress the maf files
    cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way
    mkdir maf
    time rsync -a -P ../../maf/ ./maf/
    # real    12m9.290s

    du -hscL maf/ ../../maf/
    #	141G    maf/
    #	141G    ../../maf/

    cd maf
    time gzip *.maf &
    # about an hour

    du -hscL maf ../../maf/
    #	16G     maf
    #	141G    ../../maf/

    cd maf
    md5sum *.maf.gz > md5sum.txt

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf
    cd maf
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf
    cd --
    ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/

    ###########################################################################

    cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way
    grep TREE ../../4d/all.mod | awk '{print $NF}' \
      | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
         > mm39.35way.nh

    sed -f ../../db.to.name.sed mm39.35way.nh \
       | sed -e "s#_x_#'#g; s#X__#X._#;" > mm39.35way.commonNames.nh

    sed -f ../../db.to.sciName.sed mm39.35way.nh \
           > mm39.35way.scientificNames.nh

    time md5sum *.nh *.maf.gz > md5sum.txt
    #   real    0m3.147s

    ln -s `pwd`/*.maf.gz `pwd`/*.nh \
        /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way

    du -hscL ./maf ../../maf
    #  18G     ./maf
    # 156G    ../../maf

    # obtain the README.txt from danRer10/multiz12way and update for this
    #   situation
    ln -s `pwd`/*.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/

    #####################################################################
    cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way

    mkdir mm39.35way.phastCons
    cd mm39.35way.phastCons
    ln -s ../../../cons/all/downloads/*.wigFix.gz .
    md5sum *.gz > md5sum.txt

    cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way
    ln -s ../../cons/all/phastCons35way.bw ./mm39.phastCons35way.bw
    ln -s ../../cons/all/all.mod ./mm39.phastCons35way.mod
    time md5sum *.mod *.bw > md5sum.txt
    #   real    0m20.354s

    # obtain the README.txt from danRer10/phastCons12way and update for this
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons
    cd mm39.35way.phastCons
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons

    cd ..
    #   situation
    ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
      /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way

    #####################################################################
    cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phyloP35way

    mkdir mm39.35way.phyloP
    cd mm39.35way.phyloP

    ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz .
    md5sum *.wigFix.gz > md5sum.txt

    cd ..

    ln -s ../../consPhyloP/run.phyloP/all.mod mm39.phyloP35way.mod
    ln -s ../../consPhyloP/all/phyloP35way.bw mm39.phyloP35way.bw

    md5sum *.mod *.bw > md5sum.txt

    # obtain the README.txt from danRer10/phyloP12way and update for this
    #   situation
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP
    cd mm39.35way.phyloP
    ln -s `pwd`/* \
/usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP

    cd ..

    ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
      /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way

#############################################################################
# hgPal downloads (DONE - 2020-12-23 - Hiram)
#   FASTA from 35-way for ncbiRefSeq, refGene and knownCanonical

    ssh hgwdev
    screen -S mm39HgPal
    mkdir /hive/data/genomes/mm39/bed/multiz35way/pal
    cd /hive/data/genomes/mm39/bed/multiz35way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    # ncbiRefSeq
    export mz=multiz35way
    export gp=ncbiRefSeq
    export db=mm39
    export I=0
    export D=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
    do
        I=`echo $I | awk '{print $1+1}'`
        D=`echo $D | awk '{print $1+1}'`
        dNum=`echo $D | awk '{printf "%03d", int($1/300)}'`
        mkdir -p exonNuc/${dNum} > /dev/null
        mkdir -p exonAA/${dNum} > /dev/null
	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
        if [ $I -gt 16 ]; then
            echo "date"
            echo "wait"
            I=0
        fi
    done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1
    # real    18m43.962s

    export mz=multiz35way
    export gp=ncbiRefSeq
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonAA.fa.gz
    # real    2m0.962s

    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
     | gzip -c > $gp.$mz.exonNuc.fa.gz
    #   real    10m12.351s

    # -rw-rw-r-- 1  906052407 Dec 23 16:34 ncbiRefSeq.multiz35way.exonAA.fa.gz
    # -rw-rw-r-- 1 1596566489 Dec 23 16:53 ncbiRefSeq.multiz35way.exonNuc.fa.gz

    export mz=multiz35way
    export gp=ncbiRefSeq
    export db=mm39
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    md5sum *.fa.gz > md5sum.txt
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/

    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 35-way (DONE - 2021-01-11 - Hiram)
    mkdir /hive/users/hiram/bigWays/mm39.35way
    cd /hive/users/hiram/bigWays
    echo "mm39" > mm39.35way/ordered.list
awk '{print $1}' /hive/data/genomes/mm39/bed/multiz35way/35way.distances.txt \
       >> mm39.35way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh mm39.35way/ordered.list
    # dbDb.sh constructs mm39.35way/XenTro9_35-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    # Updated this script to work with the GCF accession and finding relevant
    # data from the assembly hub files
    ./dbDb.sh mm39 35way

    # sizeStats.pl constructs mm39.35way/XenTro9_35-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl mm39 35way

    # defCheck.pl constructs XenTro9_35-way_conservation_lastz_parameters.html
    ./defCheck.pl mm39 35way

    # this constructs the html pages in mm39.35way/:
# -rw-rw-r-- 1 17957 Jan  4 17:56 Mm39_35-way_conservation_alignment.html
# -rw-rw-r-- 1 11991 Jan  4 17:58 Mm39_35-way_conservation_lastz_parameters.html
# -rw-rw-r-- 1 23297 Jan 11 14:58 Mm39_35-way_Genome_size_statistics.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
# Mm39_35-way_Genome_size_statistics.html
# Mm39_35-way_conservation_alignment.html
# Mm39_35-way_conservation_lastz_parameters.html

    # when you view the first one you enter, it will have links to the
    # missing two.

############################################################################
# pushQ readmine (DONE - 2021-01-15 - Hiram)

  cd /usr/local/apache/htdocs-hgdownload/goldenPath/mm39
  find -L `pwd`/multiz35way `pwd`/phastCons35way `pwd`/phyloP35way \
	/gbdb/mm39/multiz35way -type f \
    > /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.26584.file.list
  wc -l /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.26584.file.list
# 1450 /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.26584.file.list

  cd /hive/data/genomes/mm39/bed/multiz35way/downloads
  hgsql -e 'show tables;' mm39 | grep 35way \
	| sed -e 's/^/mm39./;' > redmine.26584.table.list

############################################################################
