###############################################################################
### Constructing a small 7-way alignment with just human CoV plus MERS
### Working - Hiram - 2020-05-13
###############################################################################

mkdir -p /hive/data/genomes/wuhCor1/bed/multiz7way/lastz
cd /hive/data/genomes/wuhCor1/bed/multiz7way/lastz

# setup ordinary virus names for this sample:
# CoV229E instead of 229E, the 229E doesn't work in the MAF file processing
# Also can't use a name with minus sign such as SARS-CoV-1

sed -e 's/^>.*/>MERS/;' ../../lastzStrains/sequences/safeNameFa/NC_019843v3.fa > MERS.fa
sed -e 's/^>.*/>SARS_CoV_1/;' ../../lastzStrains/sequences/safeNameFa/NC_004718v3.fa > SARS_CoV_1.fa
sed -e 's/^>.*/>CoV229E/;' ../../lastzStrains/sequences/safeNameFa/NC_002645v1.fa > CoV229E.fa
sed -e 's/^>.*/>NL63/;' ../../lastzStrains/sequences/safeNameFa/NC_005831v2.fa > NL63.fa
sed -e 's/^>.*/>OC43/;' ../../lastzStrains/sequences/safeNameFa/NC_006213v1.fa > OC43.fa
sed -e 's/^>.*/>HKU1/;'  ../../lastzStrains/sequences/safeNameFa/NC_006577v2.fa > HKU1.fa
twoBitToFa ../../../wuhCor1.2bit wuhCor1.fa

faCount *.fa

# #seq    len     A       C       G       T       N       cpg
# CoV229E 27317   7420    4549    5903    9445    0       488
# HKU1    29926   8331    3895    5699    12001   0       340
# MERS    30119   7900    6116    6304    9799    0       711
# NL63    27553   7253    3979    5516    10805   0       332
# OC43    30741   8502    4660    6649    10930   0       485
# SARS_CoV_1      29751   8481    5940    6187    9143    0       568
# NC_045512v2     29903   8954    5492    5863    9594    0       439

export TOP="/hive/data/genomes/wuhCor1/bed/multiz7way/lastz"
export target="wuhCor1"

time for query in CoV229E HKU1 MERS NL63 OC43 SARS_CoV_1
do
cd  ${TOP}
rm -fr ${TOP}/$query
mkdir -p ${TOP}/$query
cd  ${TOP}/$query
time /cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz \
   ../${target}.fa ../${query}.fa \
   --transition=2 --format=axt+ --rdotplot=$target.$query.dotPlot \
     --noytrim > $target.$query.axt
faSize -detailed ../$target.fa > $target.sizes
faSize -detailed ../$query.fa > $query.sizes
faToTwoBit ../$query.fa $query.2bit
axtToPsl $target.$query.axt $target.sizes $query.sizes $target.$query.psl
axtToMaf $target.$query.axt $target.sizes $query.sizes $target.$query.maf
axtChain -psl -verbose=0 -linearGap=loose $target.$query.psl \
   ../../../../$target.2bit $query.2bit stdout \
    | chainAntiRepeat ../../../../$target.2bit $query.2bit stdin $target.$query.chain
chainPreNet $target.$query.chain $target.sizes $query.sizes stdout \
   | chainNet  stdin -minSpace=1 $target.sizes $query.sizes stdout /dev/null \
     | netSyntenic stdin $target.$query.noClass.net
netSplit $target.$query.noClass.net net
netToAxt net/*.net $target.$query.chain ../../../../$target.2bit $query.2bit stdout \
   | axtSort stdin $target.$query.axtNet
axtToMaf -tPrefix=NC_045512v2. -qPrefix=$query.  $target.$query.axtNet \
   $target.sizes $query.sizes $target.$query.mafNet
cd  ${TOP}
done

# real    0m2.402s

# verify names are correct in mafNet files
for F in */*.mafNet
do
  grep "^s " ${F} | awk '{print $2}' | sort | uniq -c | sed -e 's/^/# /'
done
#       9 CoV229E.CoV229E
#       9 NC_045512v2.NC_045512v2
#       6 HKU1.HKU1
#       6 NC_045512v2.NC_045512v2
#       9 MERS.MERS
#       9 NC_045512v2.NC_045512v2
#       8 NC_045512v2.NC_045512v2
#       8 NL63.NL63
#       5 NC_045512v2.NC_045512v2
#       5 OC43.OC43
#       5 NC_045512v2.NC_045512v2
#       5 SARS_CoV_1.SARS_CoV_1

cd /hive/data/genomes/wuhCor1/bed/multiz7way
# extract 7-way tree from 119way.nh, and reset names:

/cluster/bin/phast/tree_doctor \
 --prune-all-but NC_045512v2,NC_019843v3,NC_004718v3,NC_002645v1,NC_005831v2,NC_006213v1,NC_006577v2 \
  ../../goldenPath/multiz119way/sequenceNames.119way.nh \
  | sed -e 's/NC_019843v3/MERS/; s/NC_004718v3/SARS_CoV_1/; s/NC_002645v1/CoV229E/; s/NC_005831v2/NL63/; s/NC_006213v1/OC43/; s/NC_006577v2/HKU1/;' > wuhCor1.7way.nh

/cluster/home/hiram/kent/src/hg/utils/phyloTrees/asciiTree.pl \
   wuhCor1.7way.nh | sed -e 's/^/# /;'

# ((((NC_045512v2:0.48935,
#    SARS_CoV_1:0.48934):0.01064,
#   MERS:0.49999):0.000012,
#  (OC43:0.49947,
#  HKU1:0.499460):0.000540):0.000001,
# (CoV229E:0.49967,
# NL63:0.49967):0.000321);

sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
       wuhCor1.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh

sed 's/[()]//g; s/,/ /g' tree.nh > species.list

cat species.list | tr ' ' '\n' | sort > 7.grep.list

mkdir mafLinks

for S in `sed -e 's/NC_045512v2 //;' species.list`
do
  echo $S
  mafFile="mafLinks/NC_045512v2.${S}.maf.gz"
  cat ./lastz/${S}/wuhCor1.${S}.mafNet | gzip -c > "${mafFile}"
done

zcat mafLinks/*.maf.gz | grep "^s " | awk '{print $2}' | sort \
    | uniq -c | sort -rn | sed -e 's/^/# /;'
#      42 NC_045512v2.NC_045512v2
#       9 MERS.MERS
#       9 CoV229E.CoV229E
#       8 NL63.NL63
#       6 HKU1.HKU1
#       5 SARS_CoV_1.SARS_CoV_1
#       5 OC43.OC43

mkdir maf run
cd run
mkdir penn
cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

printf '#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/wuhCor1/bed/multiz7way/maf/$(root1).maf}
#ENDLOOP
' > template

ls ../mafLinks | sed -e 's/NC_045512v2.//; s/.maf.gz//;' > maf.list

printf '#!/bin/csh -ef
set db = NC_045512v2
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/wuhCor1/bed/multiz7way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../tree.nh ../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //;" species.list`)
    set in = $pairs/$db.$s.maf
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
ls -ogrtL
echo "$run/penn/autoMZ + T=$tmp E=$db ""`cat tree.nh`"" $db.*.sing.maf $c"
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
echo "# working: $tmp"
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty  /dev/shm/$db
' > autoMultiz.csh

chmod +x autoMultiz.csh

gensub2 maf.list single template jobList

para create jobList
para try
para time > run.time

cat run.time

Completed: 6 of 6 jobs
CPU time in finished jobs:         11s       0.19m     0.00h    0.00d  0.000 y
IO & Wait Time:                    25s       0.41m     0.01h    0.00d  0.000 y
Average job time:                   6s       0.10m     0.00h    0.00d
Longest finished job:               7s       0.12m     0.00h    0.00d
Submission to last job:            13s       0.22m     0.00h    0.00d

## put the results together into a single

head -1 maf/SARS_CoV_1.maf > multiz7way.maf
for F in maf/*.maf
do
    echo "${F}" 1>&2
    egrep -v "^#" ${F} | sed -e 's#^s \([A-Z0-9a-z_]*\)#s \1.\1#;'
done | sed -e 's#^s NC_045512v2.NC_045512v2#s wuhCor1.NC_045512v2#;' \
>> multiz7way.maf
tail -1 maf/SARS_CoV_1.maf >> multiz7way.maf

grep "^s " multiz7way.maf | awk '{print $2}' | sort | uniq -c \
  | sort -rn | sed -e 's/^/# /;'
#     450 wuhCor1.NC_045512v2
#     450 SARS_CoV_1.SARS_CoV_1
#     330 MERS.MERS
#     288 OC43.OC43
#     276 HKU1.HKU1
#     252 NL63.NL63
#     162 CoV229E.CoV229E

mkdir /gbdb/wuhCor1/multiz7way
rm -f /gbdb/wuhCor1/multiz7way/multiz7way.maf
ln -s `pwd`/multiz7way.maf /gbdb/wuhCor1/multiz7way
hgLoadMaf wuhCor1 multiz7way
# Loaded 450 mafs in 1 files from /gbdb/wuhCor1/multiz7way

mafFrag wuhCor1 multiz7way NC_045512v2 0 29903 + mafFrag.multiz7way.maf

./dotToDash.pl mafFrag.multiz7way.maf > defraged.multiz7way.maf

rm /gbdb/wuhCor1/multiz7way/multiz7way.maf
ln -s `pwd`/defraged.multiz7way.maf /gbdb/wuhCor1/multiz7way/multiz7way.maf

hgLoadMaf wuhCor1 multiz7way
# Loaded 1 mafs in 1 files from /gbdb/wuhCor1/multiz7way

##############################################################################
## frames (DONE - 2020-05-13 - Hiram)

# build single coverage ORF annotation for MAF codon display and for mafSnp display
mkdir  /hive/data/genomes/wuhCor1/bed/multiz7way/mafFrames
cd  /hive/data/genomes/wuhCor1/bed/multiz7way/mafFrames

cp -p "/hive/data/outside/ncbi/genomes/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_protein.faa.gz" .

zgrep "^>" GCF_009858895.2_ASM985889v3_protein.faa.gz
>YP_009724389.1 orf1ab polyprotein [Wuhan seafood market pneumonia virus]
>YP_009724390.1 surface glycoprotein [Wuhan seafood market pneumonia virus]
>YP_009724391.1 ORF3a protein [Wuhan seafood market pneumonia virus]
>YP_009724392.1 envelope protein [Wuhan seafood market pneumonia virus]
>YP_009724393.1 membrane glycoprotein [Wuhan seafood market pneumonia virus]
>YP_009724394.1 ORF6 protein [Wuhan seafood market pneumonia virus]
>YP_009724395.1 ORF7a protein [Wuhan seafood market pneumonia virus]
>YP_009724396.1 ORF8 protein [Wuhan seafood market pneumonia virus]
>YP_009724397.2 nucleocapsid phosphoprotein [Wuhan seafood market pneumonia virus]
>YP_009725255.1 ORF10 protein [Wuhan seafood market pneumonia virus]
>YP_009725295.1 orf1a polyprotein [Wuhan seafood market pneumonia virus]
>YP_009725296.1 ORF7b [Wuhan seafood market pneumonia virus]

blat -noHead -t=dnax -q=prot /hive/data/genomes/wuhCor1/wuhCor1.2bit \
   GCF_009858895.2_ASM985889v3_protein.faa.gz stdout | pslToBed stdin full.bed

sed -e 's/27758/27755/g; s/129/126/g;' full.bed > hack.bed

#edit hack.bed to remove overlap of YP_009725296.1 with YP_009724395.1
# change the 27758 to 27755 and the 129 to 126
diff full.bed hack.bed
12c12
< NC_045512v2   27755   27884   YP_009725296.1  1000    +       27755   27884  0
        1       129,    0,
---
> NC_045512v2   27758   27884   YP_009725296.1  1000    +       27758   27884  0
        1       126,    0,

bedToGenePred hack.bed stdout | genePredSingleCover stdin singleCover7way.gp
hgLoadGenePred wuhCor1 singleCover7way singleCover7way.gp

genePredToMafFrames wuhCor1 ../mafFrag.multiz7way.maf frames.tab wuhCor1 singleCover7way.gp
hgLoadMafFrames wuhCor1 multiz7wayFrames frames.tab

##############################################################################
## SNP view
mkdir -p  /hive/data/genomes/wuhCor1/bed/snpView7Way
cd /hive/data/genomes/wuhCor1/bed/snpView7Way

awk '/^s/ {print $2}' ../multiz7way/defraged.multiz7way.maf \
  | sed 's/\..*//' > species.lst
for i in `cat species.lst`; do f=`echo $i \
  | tr '_' '-'`; echo "s/$f/$i/g"; done > backSedScript.txt
for i in `cat species.lst`; do f=`echo $i \
  | tr '_' '-'`; echo "s/$i/$f/g"; done > foreSedScript.txt
sed -f foreSedScript.txt species.lst > editSpecies.lst

mafGene -exons wuhCor1 multiz7way singleCover7way species.lst stdout \
   | sed -f foreSedScript.txt >  nonsyn.faa

# see #26766
#paSNP -outScore editSpecies.lst nonsyn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \
#   | awk '{if ($5 < -1) term=1583; else if ($5 < 1) term=1581; else term=1579; print $1, $2-1, $3, $4, term, "+", $2-1, $3, "255,0,0", 1, $3-($2 - 1), 0}' \
#   | sed -f backSedScript.txt > nonsyn.bed

paSNP editSpecies.lst nonsyn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \
   | awk '{print $1, $2-1, $3, $4, 1583, "+", $2-1, $3, "255,0,0", 1, $3-($2 - 1), 0}' \
   | sed -f backSedScript.txt > nonsyn.bed

awk '{print $4}' nonsyn.bed | sort | uniq -c | sed -e 's/^/# /;'
#    1696 CoV229E
#    2170 HKU1
#    2371 MERS
#    2330 NL63
#    2257 OC43
#    1385 SARS_CoV_1

mafGene -uniqAA -exons wuhCor1 multiz7way singleCover7way species.lst stdout \
  | sed -f foreSedScript.txt > syn.faa

paSNP editSpecies.lst syn.faa stdout | sed 's/:/ /' | sed 's/-/ /' \
   | awk '{print $1, $2-1, $3, $4, 1819, "+", $2-1, $3, "0,255,0", 1, $3 - ($2 - 1), 0}' \
     | sed -f backSedScript.txt   > syn.bed

mafToSnpBed wuhCor1 ../multiz7way/defraged.multiz7way.maf \
   ../multiz7way/mafFrames/singleCover7way.gp stdout \
      | sed 's/wuhCor1.//' > single.bed

#these should all disappear on the merge
grep "1580$" single.bed \
  | awk '{print $1, $2, $3, $4, $5, "+", $2, $3, "255,255,0", 1, $3 -$2, 0}' \
    > codingVariant.bed

grep "1623$" single.bed \
  | awk '{print $1, $2, $3, $4, $5, "+", $2, $3, "255,255,0", 1, $3 -$2, 0}' \
    > utrVariant.bed
grep "1624$" single.bed \
  | awk '{print $1, $2, $3, $4, $5, "+", $2, $3, "255,255,0", 1, $3 -$2, 0}' \
    >> utrVariant.bed

grep " 0$" single.bed \
 | awk '{print $1, $2, $3, $4, $5, "+", $2, $3, "240,240,180", 1, $3 -$2, 0}' \
   > missing.bed

grep "1628$" single.bed \
  | awk '{print $1, $2, $3, $4, $5, "+", $2, $3, "0,0,0", 1, $3 -$2, 0}' \
    > intergenic.bed

grep "1627$" single.bed \
   | awk '{print $1, $2, $3, $4, $5, "+", $2, $3, "0,0,0", 1, $3 -$2, 0}' \
     > intron.bed

hgsql -N -e "select * from chromInfo" wuhCor1 > wuhCor1.chrom.sizes

rm output.bed
for i in `cat species.lst`
do
echo $i 1>&2
grep -wh "$i" nonsyn.bed syn.bed codingVariant.bed utrVariant.bed \
    intron.bed intergenic.bed missing.bed \
      | bedSmash stdin wuhCor1.chrom.sizes stdout >> output.bed
done

# make codingVariants into missing data instead of showing blue
awk '{print $1,$2,$3,$4,$5}' output.bed | sed 's/ 1580$/ 0/' > load.bed

hgLoadBed wuhCor1 mafSnp7way load.bed
# Read 22910 elements of size 5 from load.bed

#########################################################################
# Phylogenetic tree from 7-way (DONE - 2020-01-06 - Hiram)
    mkdir /hive/data/genomes/wuhCor1/bed/multiz7way/4d
    cd /hive/data/genomes/wuhCor1/bed/multiz7way/4d

    # tried using the 'defraged' maf:
    ../defraged.multiz7way.maf
    # that did not work
    # Skipping 4d.all.mfa; insufficient informative sites ...

    # using the full maf:
    ../multiz7way.maf

    # using ncbiGene for wuhCor1
    hgsql -N -e 'select * from ncbiGene;' wuhCor1 \
       | cut -f2- > wuhCor1.ncbiGene.gp

    genePredSingleCover wuhCor1.ncbiGene.gp stdout \
       | sort > wuhCor1.ncbiGeneNR.gp
    wc -l *.gp
    #  13 wuhCor1.ncbiGene.gp
    #  10 wuhCor1.ncbiGeneNR.gp

    sed -e 's/wuhCor1.NC_045512v2/NC_045512v2.NC_045512v2/' \
       ../multiz7way.maf > multiz7way.NC_045512v2.maf

    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
       --4d --features wuhCor1.ncbiGeneNR.gp \
         -i MAF multiz7way.NC_045512v2.maf -o SS > multiz7way.ss
    # real    0m0.140s

    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
        -i SS --tuple-size 1 multiz7way.ss > multiz7way.mfa
    # real    0m0.015s

    #want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
       --aggregate "`cat ../species.list`" multiz7way.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    0m0.020s

    # check they are all in there:
    grep "^>" 4d.all.mfa | sort -u | wc -l
    #  7

    sed -e 's/ /,/g;' ../tree.nh > tree_commas.nh

    # tree_commas.nh looks like:
# ((((NC_045512v2,SARS_CoV_1),MERS),(OC43,HKU1)),(CoV229E,NL63))

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree_commas.nh 4d.all.mfa
    #   real    0m0.458s

    mv phyloFit.mod all.mod

    grep "TREE:" all.mod | sed -e 's/TREE: //;' > 7way.nh

    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/all_dists \
      7way.nh | grep NC_045512v2 | sed -e 's/NC_045512v2.//;' \
         | sort -k2n > 7way.distances.txt
    sed -e 's/^/# /;' 7way.distances.txt
# SARS_CoV_1    0.735593
# NL63  1.720630
# MERS  1.797932
# CoV229E       1.879442
# OC43  1.962180
# HKU1  2.136617

##############################################################################
# phastCons 7-way (DONE - 2020-01-06 - Hiram)
    # split 7way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh hgwdev
    mkdir -p /hive/data/genomes/wuhCor1/bed/multiz7way/cons/SS
    cd /hive/data/genomes/wuhCor1/bed/multiz7way/cons/SS
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
       ../../defraged.multiz7way.maf -i MAF -o SS \
          -r multiz7way -w 10000000,0 -I 1000 -B 5000

    # Run phastCons

export len=45
export cov=0.3
export rho=0.3
export c=wuhCor1


    cd /hive/data/genomes/wuhCor1/bed/multiz7way/cons

sed -e 's/NC_045512v2/wuhCor1/g' ../4d/all.mod > all.mod

time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phastCons \
  SS/multiz7way.1-29903.ss all.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $c.bed --score \
     | sed -e "s/$c/NC_045512v2/;" > $c.pp
# real    0m0.134s

awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "NC_045512v2", $2, $3, $5, $5}' \
   wuhCor1.bed > tmpMostConserved.bed
/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed

    wigToBigWig -verbose=2 $c.pp /hive/data/genomes/wuhCor1/chrom.sizes $c.bw
# pid=130258: VmPeak:      43032 kB

    bigWigInfo wuhCor1.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 48,184
# primaryIndexSize: 6,436
# zoomLevels: 6
# chromCount: 1
# basesCovered: 29,903
# mean: 0.859734
# min: 0.000000
# max: 1.000000
# std: 0.264092

    wigEncode $c.pp phastCons7way.wig phastCons7way.wib
    #  Converted wuhCor1.pp, upper limit 1.00, lower limit 0.00

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/wuhCor1/bed/multiz7way/cons
    hgLoadBed wuhCor1 phastConsElements7way mostConserved.bed
    # Read 41 elements of size 5 from mostConserved.bed

    featureBits wuhCor1 phastConsElements7way
    #  26555 bases of 29903 (88.804%) in intersection

    ln -s `pwd`/phastCons7way.wib \
          /gbdb/wuhCor1/multiz7way/phastCons7way.wib
    hgLoadWiggle -pathPrefix=/gbdb/wuhCor1/multiz7way \
	wuhCor1 phastCons7way phastCons7way.wig

    # on human we often try for 5% overall cov, and 70% CDS cov
    # most bets are off here for that goal, these alignments are too few
    #	and too far between
    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
    featureBits wuhCor1 -enrichment ncbiGene:cds phastConsElements7way
# ncbiGene:cds 97.850%, phastConsElements7way 88.804%, both 86.654%,
#   cover 88.56%, enrich 1.00x

    wigTableStats.sh wuhCor1 phastCons7way
# db.table          min max mean       count sumData      stdDev  viewLimits
wuhCor1.phastCons7way   0 1 0.859734 29903 25708.6 0.264091 viewLimits=0:1

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -db=wuhCor1 \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    phastCons7way > histogram.data 2>&1
    #	real    2m40.179s

XXX - to be done
    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Ebola wuhCor1 Histogram phastCons7way track"
set xlabel " phastCons7way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#########################################################################
# phyloP for 7-way (DONE - 2020-01-06 - Hiram)
    # run phyloP with score=LRT
    mkdir /cluster/data/wuhCor1/bed/multiz7way/consPhyloP
    cd /cluster/data/wuhCor1/bed/multiz7way/consPhyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.226
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
	../4d/all.mod 0.226 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.387000 0.113000 0.113000 0.387000

    sed -e 's/wuhCor1/NC_045512v2/' ../cons/SS/multiz7way.1-29903.ss \
      > NC_045512v2.ss

    time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloP \
      --method LRT --mode CONACC --wig-scores --chrom NC_045512v2 \
        -i SS all.mod NC_045512v2.ss > NC_045512v2.wigFix
    # real    0m0.589s

    # check integrity of data with wigToBigWig
    wigToBigWig -verbose=2 NC_045512v2.wigFix \
      /hive/data/genomes/wuhCor1/chrom.sizes phyloP7way.bw
    # pid=139119: VmPeak:      43032 kB

    bigWigInfo phyloP7way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 50,209
# primaryIndexSize: 6,436
# zoomLevels: 6
# chromCount: 1
# basesCovered: 29,903
# mean: 0.727993
# min: -1.054000
# max: 3.454000
# std: 0.946768

    #	encode those files into wiggle data
    wigEncode NC_045512v2.wigFix phyloP7way.wig phyloP7way.wib
    # Converted NC_045512v2.wigFix, upper limit 3.45, lower limit -1.05

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP7way.wib /gbdb/wuhCor1/multiz7way/phyloP7way.wib
    hgLoadWiggle -pathPrefix=/gbdb/wuhCor1/multiz7way wuhCor1 \
	phyloP7way phyloP7way.wig

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh wuhCor1 phyloP7way
# db.table              min max   mean  count sumData
wuhCor1.phyloP7way  -1.054 3.454 0.727993 29903 21769.2
#    stdDev    viewLimits
#    0.946768  viewLimits=-1.054:3.454

#	that range is: 1.054 + 3.454 = 4.508 for hBinSize=0.004508

    #  Create histogram to get an overview of all the data
    hgWiggle -doHistogram -hBinSize=0.004508 -hBinCount=1000 \
       -hMinVal=-1.054 -verbose=2 \
	    -db=wuhCor1 phyloP7way > histogram.data 2>&1

XXX - to be done
    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
# Q1 0.000158
# median 0.000791
# Q3 0.002532
# average 0.002123
# min 0.000053
# max 0.045049
# count 471
# total 1.000058
# standard deviation 0.004132

    #	create plot of histogram:
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Ebola wuhCor1 Histogram phyloP7way track"
set xlabel " phyloP7way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# 7-way downloads (DONE - 2020-03-18 - Hiram)
    mkdir -p \
       /hive/data/genomes/wuhCor1/bed/multiz7way/downloads/phastCons7way
    cd /hive/data/genomes/wuhCor1/bed/multiz7way/downloads/phastCons7way

    cp -p ../../cons/wuhCor1.pp ./wuhCor1.phastCons7way.wigFix
    gzip wuhCor1.phastCons7way.wigFix
    ln -s ../../cons/all.mod ./wuhCor1.phastCons7way.mod
    ln -s ../../cons/wuhCor1.bw ./wuhCor1.phastCons7way.bw
    ln -s ../wuhCor1.7way.nameList.txt .

    # reusing the README.txt file from the 119way already completed:
# /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phastCons119way/README.txt
    # edited here to reflect this construction

    md5sum *.txt *.gz *.mod *.bw > md5sum.txt

    mkdir -p \
       /hive/data/genomes/wuhCor1/bed/multiz7way/downloads/phyloP7way
    cd /hive/data/genomes/wuhCor1/bed/multiz7way/downloads/phyloP7way

    ln -s ../../consPhyloP/all.mod ./wuhCor1.phyloP7way.mod
    cp -p ../../consPhyloP/NC_045512v2.wigFix ./wuhCor1.phyloP7way.wigFix
    ln -s ../../consPhyloP/phyloP7way.bw ./wuhCor1.phyloP7way.bw
    gzip *.wigFix

    ln -s ../../acc.date.description.list wuhCor1.7way.nameList.txt

    # reusing the README.txt file from:
# /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phyloP119way/README.txt
    # edited here to reflect this construction

    md5sum *.txt *.gz *.mod *.bw > md5sum.txt

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phastCons7way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phastCons7way
    ln -s \
     /hive/data/genomes/wuhCor1/bed/multiz7way/downloads/phastCons7way/* ./

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phyloP7way
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/phyloP7way
    ln -s \
     /hive/data/genomes/wuhCor1/bed/multiz7way/downloads/phyloP7way/* ./

##############################################################################
