
################################################################################
#### 2023-06-13 - fetch maf.gz - Hiram

mkdir -p /hive/data/genomes/hg38/bed/cactus447/singleCover
cd /hive/data/genomes/hg38/bed/cactus447/singleCover

obtained maf file from Glenn, from the 'courtyard' login:

/nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz

-rw-r--r-- 1 hickey cgl 881994218891 Jun  5 11:06 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz

There was some go around with a different version.  This one
is 'single coverage'

-rw-r--r-- 1 881994218891 Jun  5 11:06 447-mammalian-2022v1-single-copy.maf.gz

################################################################################
#### split this file into individual chromosome files 2023-06-21 - Hiram

  mkdir  /hive/data/genomes/hg38/bed/cactus447/splitSingleCover
  cd  /hive/data/genomes/hg38/bed/cactus447/splitSingleCover

time mafSplit -byTarget -useFullSequenceName /dev/null ./  ../singleCover/447-mammalian-2022v1-single-copy.maf.gz

Splitting 1 files by target sequence -- ignoring first argument /dev/null

real    804m35.124s
user    1061m9.580s
sys     101m19.894s


################################################################################
### convert the sequence names to hg38 chrom names 2023-06-12 to 06-14 - Hiram

  mkdir  /hive/data/genomes/hg38/bed/cactus447/ucscNames
  cd  /hive/data/genomes/hg38/bed/cactus447/ucscNames

   runOne script for processing each file:

#!/bin/bash

set -beEu -o pipefail

export cactusMaf="${1}"
export srcMaf="../splitSingleCover/${cactusMaf}"

cactusId=${cactusMaf%.maf}

export sedCommand=`grep -w "${cactusId}" cactus.to.ucsc.sed`
export ucscName=`grep -w "${cactusId}" 2022v1.hg38.chrom.equiv.txt | cut -f2`

case "${ucscName}" in
    chrM)
if [ ${srcMaf} -nt ${ucscName}.maf ]; then
  sed -f cactus.to.ucsc.sed ${srcMaf} > ${ucscName}.maf
  touch -r ${srcMaf} ${ucscName}.maf
fi
       ;;
    chr[0-9][0-9])
       rm -f "${ucscName}.maf"
       ln ${srcMaf} "${ucscName}.maf"
       ;;
    chr[0-9])
       rm -f "${ucscName}.maf"
       ln ${srcMaf} "${ucscName}.maf"
       ;;
    G*.maf)
    K*.maf)
if [ ${srcMaf} -nt ${ucscName}.maf ]; then
  sed -e "${sedCommand}" ${srcMaf} > ${ucscName}.maf
  touch -r ${srcMaf} ${ucscName}.maf
else
  printf "DONE: ${ucscName}.maf\n" 1>&2
fi
       ;;
esac

if [ "${ucscName}.maf" -nt "nameScan/${ucscName}.seqCount.txt" ]; then
  grep "^s." "${ucscName}.maf" | cut -d' ' -f2 | sort | uniq -c | sort -rn \
    > "nameScan/${ucscName}.seqCount.txt"
  touch -r "${ucscName}.maf" "nameScan/${ucscName}.seqCount.txt"
  sed -e "s/^ \+//;" "nameScan/${ucscName}.seqCount.txt" | cut -d' ' -f2 \
    | cut -d'.' -f1 | sort | uniq -c \
       | sort -rn > "nameScan/${ucscName}.species.txt"
fi
#####################################################

     template:
#LOOP
runOne $(path1)
#ENDLOOP

     using list of maf files:
     ls ../splitSingleCover  | grep maf > cactus.maf.list

     gensub2 cactus.maf.list single template jobList

     ### made mistake of running too many of those at the same time
     ### and their use of large amounts of memory caused hgwdev to
     ### hang up and needed a reboot to recover.  So, run these carefully
     ### only a couple at a time on hgwdev and on hgcompute-01, the large
     ### ones get as large as 800 Gb in memory.

     ### get the species names from the nameScan:

   sed -e 's/^ \+//;' nameScan/chr2.species.txt | cut -d' ' -f2 \
      | sort > ../species.list.txt

################################################################################
### get the chrom.sizes out of the maf files 2023-06-19 - Hiram

   mkdir /hive/data/genomes/hg38/bed/cactus447/chromSizes
   cd /hive/data/genomes/hg38/bed/cactus447/chromSizes

   ### runOne script to run each one:

#!/bin/bash

export inMaf="${1}"
export result="${2}"

grep "^s " "${inMaf}" | awk '{printf "%s\t%s\n", $2,$6}' | sort -u > "${result}

    ### working on the maf list:
    ls ../ucscNames/chr*.maf > maf.list

     ### template to construct jobList
#LOOP
runOne $(path1) {check out line+ result/$(root1).txt}
#ENDLOOP

     gensub2 maf.list single template jobList

     para -ram=6g create jobList

Completed: 101 of 101 jobs
CPU time in finished jobs:      90801s    1513.35m    25.22h    1.05d  0.003 y
IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
Average job time:                 762s      12.69m     0.21h    0.01d
Longest finished job:            7952s     132.53m     2.21h    0.09d
Submission to last job:          8022s     133.70m     2.23h    0.09d

    ### sizes.pl script to get the answer out of the result/*.txt files:

#!/usr/bin/env perl

use strict;
use warnings;

my %chromSizes; # key is assembly name, value is a pointer to a hash
                #  with key chromName, value size

open (my $FH, "-|", "ls result/*.txt") or die "can not read ls result/*.txt";
while (my $file = <$FH>) {
  chomp $file;
  printf STDERR "# reading: %s\n", $file;
  open (my $R, "<", $file) or die "can not read $file";
  while (my $line = <$R>) {
    chomp $line;
    my @a = split('\.', $line);
    if (! defined($chromSizes{$a[0]})) {
       my %h;
       $chromSizes{$a[0]} = \%h;
    }
    my $asm = $chromSizes{$a[0]};
    if (scalar(@a) == 3) {
      my ($a2, $size) = split('\s+', $a[2]);
      my $chr = "$a[1].$a2";
      if (defined($asm->{$chr})) {
         if ($asm->{$chr} != $size) {
           printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s.%s\n", $asm->{$chr}, $size, $a[0], $a[1], $a[2];
           exit 255;
         }
      } else {
         $asm->{$chr} = $size;
      }
#      printf "%s\t%s.%s\n", $a[0], $a[1], $a[2];
    } elsif (scalar(@a) == 2) {
      my ($chr, $size) = split('\s+', $a[1]);
      if (defined($asm->{$chr})) {
         if ($asm->{$chr} != $size) {
           printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s\n", $asm->{$chr}, $size, $a[0], $a[1];
           exit 255;
         }
      } else {
         $asm->{$chr} = $size;
      }
#      printf "%s\t%s\n", $a[0], $a[1];
    } else {
      printf STDERR "ERROR: not 2 or 3: '%s'\n", $line;
      exit 255;
    }
  }
  close ($R);
}
close ($FH);

foreach my $asm (sort keys %chromSizes) {
   printf STDERR "# output %s\n", $asm;
   my $out = "sizes/${asm}.chrom.sizes";
   open ($FH, "|-", "sort -k2,2nr > $out") or die "can not write to $out";
   my $h = $chromSizes{$asm};
   foreach my $chr (sort keys %$h) {
#      printf  "%s\t%s\t%d\n", $asm, $chr, $h->{$chr};
      printf $FH "%s\t%d\n", $chr, $h->{$chr};
   }
   close ($FH);
}

    mkdir sizes

    time (./sizes.pl) > sizes.log 2>&1
real    10m11.808s
user    6m40.128s
sys     3m37.475s


################################################################################
### add the iRow annotation 2023-06-19 to 2023-06-26 - Hiram

  mkdir /hive/data/genomes/hg38/bed/cactus447/iRows
  cd /hive/data/genomes/hg38/bed/cactus447/iRows

  # obtained the N.bed files from Glenn
  ##  symLink them into this directory:

ls /hive/data/genomes/hg38/bed/cactus447/Nbeds/*.N.bed | grep -v "hg38.N.bed" \
   | while read P
do
  B=`basename $P | sed -e 's/N.bed/bed/;'`
  rm -f ./${P}
  ln -s "${P}" ./${B}
done

  ### and symlink in the chrom.sizes files as .len files

  ls /hive/data/genomes/hg38/bed/cactus447/chromSizes/sizes/*.chrom.sizes \
 | grep -v hg38.chrom.sizes | while read P
do
  S=`basename ${P} | sed -e 's/chrom.sizes/len/;'`
  printf "%s\n" "${S}"
  ln -s "${P}" ./${S}
done

ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len

   ls *.bed > nBeds
   ls *.len > sizes

   ### list of maf file to process:

   ls ../ucscNames/chr*.maf > maf.list

   ### template file to construct jobList
#LOOP
mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit result/$(file1)
#ENDLOOP

   gensub2 maf.list single template jobList

   ### took a week to get these all done.
   ### convert each file to a bigMaf file:

   mkdir /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf
   cd /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf

   ### tried to sort them, but made a mistake on the sort -k argument:
   for F in ../result/chr*.maf
do
   B=`basename $F | sed -e 's/.maf//;'`
   mafToBigMaf hg38 "${F}" stdout | $HOME/bin/x86_64/gnusort -k2,2 -S1000G \
        --parallel=32 > "${B}.bigMaf"
done

real    1588m59.209s
user    1259m38.364s
sys     371m13.448s

   ### put these all together:
   ls -S chr*.bigMaf | while read M
do
  cat "${M}"
done > ../hg38.cactus447.bigMaf

real    322m25.741s
user    0m13.577s
sys     82m41.509s

   ### since they were sorted incorrectly above, sort this one huge file:

   $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \
     hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf
real    752m12.425s
user    47m34.825s
sys     371m7.394s

################################################################################
### make the bigBed file from the sorted bigMaf 2023-06-27

bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \
 -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \
     /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb

# pid=61008: VmPeak:     5682928 kB
# real    4157m14.091s

# that time is 69 hours

##############################################################################
# liftOver files to convert sequence names (DONE - 2023-08-04 - Hiram)
#
#   The sequence names in the assemblies in the alignment do not match
#   the sequence names in the corresponding genome browsers assemblies

    mkdir /hive/data/genomes/hg38/bed/cactus447/liftOver
    cd /hive/data/genomes/hg38/bed/cactus447/liftOver
    mkdir lifts
    ln -s ../idKeys/447way.equivalent.txt .

#  using this script to construct liftOver files
#!/bin/bash

TOP="/hive/data/genomes/hg38/bed/cactus447/liftOver"

cd "${TOP}"
mkdir -p lifts

cat 447way.equivalent.txt | while read L
do
  seqName=`printf "%s" "${L}" | cut -f1`
  asmId=`printf "%s" "${L}" | cut -f2`
  seqIdKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt"
  case ${asmId} in
      GCA_* | GCF_* )
        gcX="${asmId:0:3}"
        d0="${asmId:4:3}"
        d1="${asmId:7:3}"
        d2="${asmId:10:3}"
    ab="/hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmId}"
        buildDir=`realpath "${ab}"`
        asmKeys="${buildDir}/trackData/idKeys/${asmId}.idKeys.txt"
        asmSizes="${buildDir}/${asmId}.chrom.sizes"
#        asmSizes="${buildDir}/idKeys/chrom.sizes"
ls -og "${asmKeys}"
        ;;
     [a-zA-Z0-9]*)
        asmKeys="/hive/data/genomes/${asmId}/bed/idKeys/${asmId}.idKeys.txt"
        asmSizes="/hive/data/genomes/${asmId}/chrom.sizes"
        ;;
     *)  printf "ERROR not recognized $asmId\n" 1>&2
         exit 255
       ;;
  esac
  seqKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt"
  seq2Bit="/hive/data/genomes/hg38/bed/cactus447/2bitFiles/${seqName}.2bit"
  if [ ! -s  "${seqKeys}" ]; then
    printf "can not find seq idKeys\n%s\n" "${seqKeys}" 1>&2
    exit 255
  fi
  if [ ! -s  "${asmKeys}" ]; then
    printf "can not find asm idKeys\n%s\n" "${asmKeys}" 1>&2
    exit 255
  fi
  printf "%s\t%s\n" "${asmId}" "${seqName}" 1>&2
printf "lifts/${seqName}.${asmId}.lift\n" 1>&2
printf "$seqKeys\n" 1>&2
printf "$asmKeys\n" 1>&2
printf "$asmSizes\n" 1>&2
  if [ ! -s "lifts/${seqName}.${asmId}.lift" ]; then
  join -t$'\t' "${seqKeys}" "${asmKeys}" | sort -k3,3 \
    | join -t$'\t' -2 3 <(sort -k1,1 ${asmSizes}) - \
      | awk -F$'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $4, $2}' \
         > lifts/${seqName}.${asmId}.lift
  fi
done

    # a couple of those ended up with duplicate names in the liftOver
    # due to duplicate contigs in the assemblies.  They were edited
    # manually, one was really bad and needed this perl script to clean it:

#!/usr/bin/env perl

use strict;
use warnings;

my %idDone;     # key is sequence name, value is string of names used

open (my $fh, "<", "lifts/Otolemur_garnettii.otoGar3.lift") or die "can not read lifts/Otolemur_garnettii.otoGar3.lift";
while (my $line = <$fh>) {
  chomp $line;
  my @a = split('\t', $line);
  if (!defined($idDone{$a[1]})) {
    $idDone{$a[1]} = $a[3];
    printf "%s\n", $line;
  } else {
    $idDone{$a[1]} .= "," . $a[3];
  }
}
close ($fh);

foreach my $seq (sort keys %idDone) {
  printf STDERR "%s\t%s\n", $seq, $idDone{$seq};
}

##############################################################################
# MAF FRAMES (working - 2023-08-01 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/hg38/bed/cactus447/frames
    cd /hive/data/genomes/hg38/bed/cactus447/frames

    mkdir genes

#   survey all the genomes to find out what kinds of gene tracks they have

printf '#/bin/bash

egrep -v "GCA_|GCF_" ../idKeys/447way.equivalent.txt | cut -f2 | while read db
do
  printf "# ${db}:"
  for table in `hgsql -N -e "show tables;" $db | egrep "Gene|ncbiRefSeq" | egrep -v "Curated|Cds|Link|Other|PepTable|Predicted|Psl|wgEncode|gencode|uuGene|ToGeneName|ccds|encode|Ext|Old|Pep|Select|Bak|Verify|orfeome|Multi|transMap|Pfam|knownGeneMrna|sgpGene"`
  do
     count=`hgsql -N -e "select count(*) from $table;" $db`
     printf " %%s: %%s" "${table}" "${count}"
  done
  printf "\\n"
done
' > showGenes.sh


    chmod +x ./showGenes.sh
    ./showGenes.sh > gene.survey.txt 2>&1 &
    # most of them have ncbiRefSeq
    grep ncbiRefSeq gene.survey.txt  | cut -d' ' -f2 | sed -e 's/://;' \
       | sort | xargs echo | fold -s -w 72
# bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1
# mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1
# rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1

    # a couple do not:
    grep -v ncbiRefSeq gene.survey.txt  | cut -d' ' -f2 | sed -e 's/://;' \
       | sort | xargs echo | fold -s -w 72
# cavApe1 eidHel1 ptePar1
    # but they do have ensGene or augustusGene:
    egrep "cavApe1|eidHel1|ptePar1" gene.survey.txt
# cavApe1: augustusGene: 10114 ensGene: 22510 xenoRefGene: 322224
# eidHel1: augustusGene: 49674
# ptePar1: augustusGene: 68613

    #   1. ncbiRefSeq  - add hg38 here manually since it didn't come in above
#    for db in hg38
# do
#    asmName="${db}"

    for db in bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1
do
    asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${db} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > genes/${asmName}.gp.gz
    echo -n "# ${asmName}: "
    genePredCheck -db=${db} genes/${asmName}.gp.gz
done
# hg38: checked: 22882 failed: 0
# Bos_mutus: checked: 20457 failed: 0
# CanFam4: checked: 21143 failed: 0
# Chinchilla_lanigera: checked: 20482 failed: 0
# Colobus_angolensis: checked: 20042 failed: 0
# Dipodomys_ordii: checked: 19738 failed: 0
# Elephantulus_edwardii: checked: 20384 failed: 0
# Felis_catus: checked: 20051 failed: 0
# Fukomys_damarensis: checked: 19811 failed: 0
# Microtus_ochrogaster: checked: 20048 failed: 0
# Mus_musculus: checked: 23375 failed: 0
# Myotis_brandtii: checked: 19949 failed: 0
# Myotis_davidii: checked: 18815 failed: 0
# Myotis_lucifugus: checked: 19895 failed: 0
# Odobenus_rosmarus: checked: 19346 failed: 0
# Orcinus_orca: checked: 19136 failed: 0
# Otolemur_garnettii: checked: 19536 failed: 0
# Propithecus_coquerelli: checked: 19356 failed: 0
# Pteropus_alecto: checked: 18326 failed: 0
# Macaca_mulatta: checked: 21021 failed: 0
# Sorex_araneus: checked: 19160 failed: 0
# Ictidomys_tridecemlineatus: checked: 19892 failed: 0
# Sus_scrofa: checked: 24180 failed: 0
# Carlito_syrichta: checked: 19968 failed: 0
# Tupaia_chinensis: checked: 21047 failed: 0

    #   2. ensGene
    for db in cavApe1
do
asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from ensGene" ${db} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${db}.tmp.gz
    mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz
    echo -n "# ${asmName}: "
    genePredCheck -db=${db} genes/${asmName}.gp.gz
done
# Cavia_aperea: checked: 14182 failed: 0

    #   3. augustusGene
    for db in eidHel1 ptePar1
do
asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
hgsql -N -e "select
name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
from augustusGene" ${db} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /dev/shm/${db}.tmp.gz
    mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz
    echo -n "# ${asmName}: "
    genePredCheck -db=${db} genes/${asmName}.gp.gz
done
# Eidolon_helvum: checked: 31495 failed: 0
# Pteronotus_parnellii: checked: 47678 failed: 0

    # verify counts for genes are reasonable:
    for T in genes/*.gz
do
    printf "# $T: "
    zcat $T | cut -f1 | sort -u | wc -l
done
# genes/Bos_mutus.gp.gz: 20457
# genes/CanFam4.gp.gz: 21143
# genes/Canis_lupus_familiaris.gp.gz: 19812
# genes/Carlito_syrichta.gp.gz: 19968
# genes/Cavia_aperea.gp.gz: 14182
# genes/Cercocebus_atys.gp.gz: 20534
# genes/Chinchilla_lanigera.gp.gz: 20481
# genes/Colobus_angolensis.gp.gz: 20042
# genes/Condylura_cristata.gp.gz: 17842
# genes/Dipodomys_ordii.gp.gz: 19738
...
# genes/Pteropus_vampyrus.gp.gz: 19485
# genes/Rattus_norvegicus.gp.gz: 22854
# genes/Rhinopithecus_roxellana.gp.gz: 21027
# genes/Saimiri_boliviensis.gp.gz: 20933
# genes/Sorex_araneus.gp.gz: 19160
# genes/Sus_scrofa.gp.gz: 24043
# genes/Theropithecus_gelada.gp.gz: 20592
# genes/Tupaia_chinensis.gp.gz: 21047

# And, can pick up ncbiRefSeq for GCF equivalent assemblies for those
# outside the UCSC database set of genomes

grep GCF_ ../idKeys/447way.equivalent.txt | while read dbGCF
do
  asmName=`printf "%s" "${dbGCF}" | cut -f1`
  asmId=`printf "%s" "${dbGCF}" | cut -f2`
  gcX="${asmId:0:3}"
  d0="${asmId:4:3}"
  d1="${asmId:7:3}"
  d2="${asmId:10:3}"
  buildDir="/hive/data/genomes/asmHubs/refseqBuild/$gcX/$d0/$d1/$d2/${asmId}"
  if [ ! -d "${buildDir}" ]; then
     printf "not found: %s\n" "${buildDir}"
  else

  ncbiRefSeqGp="${buildDir}/trackData/ncbiRefSeq/process/${asmId}.ncbiRefSeq.gp"
    sizes="${buildDir}/${asmId}.chrom.sizes"
    if [ -s "${ncbiRefSeqGp}" ]; then
      genePredSingleCover "${ncbiRefSeqGp}" stdout | gzip -c > genes/${asmName}.gp.gz
      printf "# %s: " "${asmName}"
      genePredCheck -chromSizes="${sizes}" genes/${asmName}.gp.gz
    else
      printf "MISSING: ${ncbiRefSeqGp}\n" 1>&2
    fi
  fi

done

# verify counts for genes are reasonable:
    for T in genes/*.gz
do
    printf "# $T: "
    zcat $T | cut -f1 | sort -u | wc -l
done
# genes/Bos_mutus.gp.gz: 20457
# genes/CanFam4.gp.gz: 21143
# genes/Canis_lupus_familiaris.gp.gz: 19812
# genes/Carlito_syrichta.gp.gz: 19968
# genes/Cavia_aperea.gp.gz: 14182
# genes/Cercocebus_atys.gp.gz: 20534
# genes/Chinchilla_lanigera.gp.gz: 20481
# genes/Colobus_angolensis.gp.gz: 20042
# genes/Condylura_cristata.gp.gz: 17842
# genes/Dipodomys_ordii.gp.gz: 19738
...
# genes/Pteropus_vampyrus.gp.gz: 19485
# genes/Rattus_norvegicus.gp.gz: 22854
# genes/Rhinopithecus_roxellana.gp.gz: 21027
# genes/Saimiri_boliviensis.gp.gz: 20933
# genes/Sorex_araneus.gp.gz: 19160
# genes/Sus_scrofa.gp.gz: 24043
# genes/Theropithecus_gelada.gp.gz: 20592
# genes/Tupaia_chinensis.gp.gz: 21047

    # these gene predictions need lifting to change the sequence names
    mv genes genes.seqToAsm
    mkdir genes
    cd genes.seqToAsm

for F in *.gp.gz
do
  seqName=`echo $F | sed -e 's/.gp.gz//;'`
  liftFile="`ls -d /hive/data/genomes/hg38/bed/cactus447/liftOver/lifts/${seqName}.*.lift 2> /dev/null`"
  if [ -s "${liftFile}" ]; then
    printf "%s\n" "${seqName}" 1>&2
    liftUp -type=.genepred stdout "${liftFile}" warn "${F}" | gzip -c \
        > "../genes/${seqName}.gp.gz"
  else
    printf "can not find lift file for $seqName\n" 1>&2
  fi
done

    # kluster job to annotate each maf file
    screen -S hg38      # manage long running procedure with screen
    ssh ku
    cd /hive/data/genomes/hg38/bed/cactus447way/frames

    printf '#!/bin/bash
set -beEu -o pipefail

export C="${1}"
export G="${2}"

cat ../iRows/result/${C}.maf | genePredToMafFrames hg38 stdin stdout \
        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
' > runOne

    chmod +x runOne

    ls ../iRows/result | grep maf | sed -e "s/.maf//" > chr.list
    ls genes | sed -e "s/.gp.gz//" > gene.list

    printf '#LOOP
runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
' > template

    mkdir parts
    # run on the ku kluster
    gensub2 chr.list gene.list template jobList
    para -ram=128g create jobList
    para try ... check ... push
# Completed: 4444 of 4444 jobs
# CPU time in finished jobs:    3955725s   65928.75m  1098.81h   45.78d  0.125 y
# IO & Wait Time:               1735948s   28932.46m   482.21h   20.09d  0.055 y
# Average job time:                1281s      21.35m     0.36h    0.01d
# Longest finished job:           34637s     577.28m     9.62h    0.40d
# Submission to last job:         55372s     922.87m    15.38h    0.64d

    # collect all results into one file:
    cd /hive/data/genomes/hg38/bed/cactus447way/frames
    find ./parts -type f | while read F;
do
 zcat ${F} | awk '11 == NF'
done | sort -k1,1 -k2,2n > cactus447wayFrames.bed

    bedToBigBed -type=bed3+7 -as=$HOME/kent/src/hg/lib/mafFrames.as \
       -tab cactus447wayFrames.bed ../../../chrom.sizes cactus447wayFrames.bb

    bigBedInfo cactus447wayFrames.bb | sed -e 's/^/# /;'

# version: 4
# fieldCount: 11
# hasHeaderExtension: yes
# isCompressed: yes
# isSwapped: 0
# extraIndexCount: 0
# itemCount: 16,063,019
# primaryDataSize: 136,782,205
# primaryIndexSize: 1,013,720
# zoomLevels: 10
# chromCount: 82
# basesCovered: 65,486,909
# meanDepth (of bases covered): 22.709888
# minDepth: 1.000000
# maxDepth: 44.000000
# std of depth: 18.692097

    # -rw-rw-r-- 1 1192312943 Aug  5 15:28 cactus447wayFrames.bed
    # -rw-rw-r-- 1  166572857 Aug  5 15:29 cactus447wayFrames.bb

    gzip cactus447wayFrames.bed

    # verify there are frames on everything expected.  Since the
    #  frames on the HL* asemblies did not work due to sequence name
    #  differences, this won't be everything.
    # (ls genes | wc shows 44 'expected')
    zcat cactus447wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
        | sed -e 's/^/# /;' > species.check.list
    wc -l species.check.list
    # 44

#  391443 Bos_mutus
#  414721 CanFam4
#  417196 Canis_lupus_familiaris
#  328151 Carlito_syrichta
#  255097 Cavia_aperea
#  373053 Cercocebus_atys
#  398356 Chinchilla_lanigera
#  350442 Colobus_angolensis
#  346524 Condylura_cristata
#  363333 Dipodomys_ordii
...
#  379732 Pteropus_alecto
#  387484 Pteropus_vampyrus
#  389056 Rattus_norvegicus
#  360929 Rhinopithecus_roxellana
#  366019 Saimiri_boliviensis
#  336296 Sorex_araneus
#  402603 Sus_scrofa
#  357418 Theropithecus_gelada
#  372558 Tupaia_chinensis
#  195495 hg38

    #   load the resulting file, merely for measurement purposes here
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/cactus447way/frames
    time hgLoadMafFrames hg38 cactus447wayFrames cactus447wayFrames.bed.gz
    #   real    3m14.979s

    hgsql -e 'select count(*) from cactus447wayFrames;' hg38
    # +----------+
    # | count(*) |
    # +----------+
    # | 16063019 |
    # +----------+

    time featureBits -countGaps hg38 cactus447wayFrames
    # 65486909 bases of 3299210039 (1.985%) in intersection
    # real    1m50.539s

    #   enable the trackDb entries:
# frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus447way/cactus447wayFrames.bb
# irows on
    #   zoom to base level in an exon to see codon displays
    #	appears to work OK

    # do not need this loaded table:
    hgsql hg38 -e 'drop table cactus447wayFrames;'

#########################################################################
# Try loading this track data into database to see if download files
#    such as upstream gene mafs can be constructed
    mkdir -p /gbdb/hg38/cactus447way/maf
    cd /hive/data/genomes/hg38/bed/cactus447/iRows/result
    ln -s `pwd`/chr*.maf /gbdb/hg38/cactus447way/maf/

    # this generates an immense cactus447way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    #   This is going to take all day.
    cd /dev/shm
    time hgLoadMaf -pathPrefix=/gbdb/hg38/cactus447way/maf hg38 cactus447way

# -rw-rw-r--    1 55422976 Aug  8 09:22 cactus447way.tab

    hgsql -e 'select count(*) from cactus447way;' hg38

    # Loading cactus447way into database
    # Loaded 108225530 mafs in 101 files from /gbdb/hg38/cactus447way/maf

    # real    440m30.144s
    # user    404m46.183s
    # sys     19m27.664s

#########################################################################
# Phylogenetic tree from 447-way (DONE - 2022-08-26 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/cactus447/4d
    cd /hive/data/genomes/hg38/bed/cactus447/4d

    # the annotated maf's are in:
    ../iRows/result/chr*.maf

    # using ncbiRefSeq for hg38, only transcribed genes and nothing
    #	from the randoms and other misc.
    hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" hg38 \
      | egrep -E -v "chrM|chrUn|random|_alt|_fix" > ncbiRefSeq.gp
    wc -l *.gp
    #     130420 ncbiRefSeq.gp

    # verify it is only on the chroms:
    cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/    # /;'
    #   12841 chr1
    #    9518 chr2
    #    8490 chr3
    #    7521 chr19
    #    7519 chr11
    #    7329 chr17
    #    7287 chr12
    #    6155 chr6
    #    6151 chr10
    #    5905 chr7
    #    5561 chr5
    #    5504 chr9
    #    5351 chr4
    #    5235 chr16
    #    4964 chr8
    #    4296 chrX
    #    4255 chr15
    #    3959 chr14
    #    3089 chr20
    #    2721 chr22
    #    2521 chr18
    #    2471 chr13
    #    1411 chr21
    #     366 chrY

    genePredSingleCover ncbiRefSeq.gp stdout | sort > ncbiRefSeqNR.gp
    wc -l ncbiRefSeqNR.gp
    #	19542 ncbiRefSeqNR.gp

    ssh ku
    mkdir /hive/data/genomes/hg38/bed/cactus447/4d/run
    cd /hive/data/genomes/hg38/bed/cactus447/4d/run
    mkdir ../mfa

    # newer versions of msa_view have a slightly different operation
    # the sed of the gp file inserts the reference species in the chr name
    # these processes are going to be copying the whole chromosome maf file
    # to /scratch/tmp/ - these are huge files, make sure /scratch/tmp/
    # on the kluster nodes is clean enough to manage the largest chromosome
    # maf file, for example:
# -rw-rw-r-- 1 661829442743 Jun 20 18:55 chr1.maf
# -rw-rw-r-- 1 682410918389 Jun 21 16:21 chr2.maf

    cat << '_EOF_' > 4d.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin
set r = "/hive/data/genomes/hg38/bed/cactus447"
set c = $1
set infile = $r/iRows/result/$2
set outfile = $3
cd /scratch/tmp
# 'clean' maf, removes all chrom names, leaves only the db name
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' $r/4d/ncbiRefSeqNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp
set NL=`wc -l $c.gp| gawk '{print $1}'`
if ("$NL" != "0") then
    $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
else
    echo "" > $r/4d/run/$outfile
endif
rm -f $c.gp $c.maf $c.ss
'_EOF_'
    # << happy emacs
    chmod +x 4d.csh

    ls -1S /hive/data/genomes/hg38/bed/cactus447/iRows/result/chr*.maf \
	| sed -e "s#.*cactus447/iRows/result/##" \
        | egrep -E -v "chrM|chrUn|random|_alt|_fix" > maf.list

    printf '#LOOP
4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    para -ram=128g create jobList
    para try ... check ... push ... etc...
    para time
# CPU time in finished jobs:     290959s    4849.31m    80.82h    3.37d  0.009 y
# IO & Wait Time:                 17027s     283.79m     4.73h    0.20d  0.001 y
# Average job time:               15399s     256.65m     4.28h    0.18d
# Longest finished job:           36250s     604.17m    10.07h    0.42d
# Submission to last job:         36265s     604.42m    10.07h    0.42d

    # combine mfa files
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/cactus447/4d
    # verify no tiny files:
    ls -og mfa | sort -k3nr | tail -3
    # -rw-rw-r-- 1  2583636 Aug  8 10:30 chr21.mfa
    # -rw-rw-r-- 1  1759815 Aug  8 09:47 chrY.mfa

    #want comma-less species.list
    time /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/msa_view \
	--aggregate "`cat ../species.list.txt`" mfa/*.mfa | sed s/"> "/">"/ \
	    > 4d.all.mfa
    # real    0m9.416s
    # -rw-rw-r-- 1 303080280 Aug  8 21:48 4d.all.mfa

    # check they are all in there:
    grep "^>" 4d.all.mfa | wc -l
    #   447

    sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        ../hg38.447way.nh

    sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	../hg38.447way.nh > tree-commas.nh

    # use phyloFit to create tree model (output is phyloFit.mod)
    time /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/phyloFit \
	    --EM --precision MED --msa-format FASTA --subst-mod REV \
		--tree tree-commas.nh 4d.all.mfa
    #   real    880m32.863s

    mv phyloFit.mod all.mod

    grep TREE all.mod
# TREE: (((((((((((((hg38:0.00899231,(Pan_paniscus:0.00233221,
# Pan_troglodytes:0.00214362):0.00450706):0.00200006,
# (Gorilla_gorilla:0.00120794,
# Gorilla_beringei:0.00131286):0.00731999):0.00848699,
# (Pongo_abelii:0.00175881,Pongo_pygmaeus:0.0021602):0.015206):0.00273536,
# (((((Hylobates_lar:0.000904688,Hylobates_pileatus:0.000921279):0.00271202,
# (((Hylobates_abbotti:0.00182513,Hylobates_muelleri:0.00188051):0.000678509,
# Hylobates_agilis:0.00217126):0.00068124,
# ...
# (Ceratotherium_simum:0.000693402,
# Ceratotherium_simum_cottoni:0.000665708):0.00612505):0.00973125):0.0392953):0.0100285):0.0323073):0.00299066):0.00383198):0.00906124):0.0213096):0.012194,
# (((Dasypus_novemcinctus:0.0705749,(Tolypeutes_matacus:0.036879,
# Chaetophractus_vellerosus:0.0367115):0.0193664):0.0421356,
# ((Tamandua_tetradactyla:0.0243613,
# Myrmecophaga_tridactyla:0.0216596):0.0822034,
# (Choloepus_didactylus:0.0057887,
# Choloepus_hoffmanni:0.00621468):0.0702582):0.018824):0.0529836,
# (((Trichechus_manatus:0.0621184,(Procavia_capensis:0.0107201,
# Heterohyrax_brucei:0.0105857):0.149548):0.00440411,
# Loxodonta_africana:0.0800445):0.0234193,
# ((((Microgale_talazaci:0.118008,Echinops_telfairi:0.077048):0.158928,
# Chrysochloris_asiatica:0.138211):0.0147081,
# Elephantulus_edwardii:0.20827):0.00712897,
# Orycteropus_afer:0.112233):0.00664842):0.0534765):0.012194);

    # compare these calculated lengths to what we started with

    /cluster/bin/phast/all_dists ../hg38.447way.nh  | grep hg38 \
	| sed -e "s/hg38.//;" | sort > original.dists

    grep TREE all.mod | sed -e 's/TREE: //;' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 \
          | sed -e "s/hg38.//;"  | sort > hg38.dists

    # printing out the 'original', the 'new' the 'difference' and
    #    percent difference/delta
    join original.dists hg38.dists | awk '{
  printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }'       | sort -k4n | cut -f2,3,4,6 | sort -k4,4n
Chiropotes_albinasus    0.044221        0.153018        -71.100786
Hapalemur_gilberti      0.091144        0.238983        -61.861722
Hapalemur_meridionalis  0.091511        0.239866        -61.849116
Hapalemur_occidentalis  0.090759        0.237751        -61.826028
Hapalemur_griseus       0.091379        0.239368        -61.824889
Hapalemur_alaotrensis   0.091149        0.238685        -61.812012
Eulemur_rufus   0.090624        0.236415        -61.667407
Eulemur_fulvus  0.090508        0.236040        -61.655652
Lemur_catta     0.090748        0.236490        -61.627130
...
Tolypeutes_matacus      0.518057        0.332742        55.693300
Chaetophractus_vellerosus       0.520177        0.332575        56.408930
Catagonus_wagneri       0.578187        0.366272        57.857275
Megaderma_lyra  0.575967        0.360862        59.608659
Spermophilus_dauricus   0.530847        0.329338        61.186076
Choloepus_didactylus    0.537227        0.329232        63.175815
Vicugna_pacos   0.553177        0.338170        63.579561
Choloepus_hoffmanni     0.539357        0.329658        63.611076
Tursiops_truncatus      0.533317        0.325632        63.779051
Myrmecophaga_tridactyla 0.589117        0.357048        64.996583

    # also calculated with SSREV model:
    mkdir /cluster/data/hg38/bed/cactus447/4dSSREV
    cd /cluster/data/hg38/bed/cactus447/4dSSREV
    cp -p ../4d/4d.all.mfa .
    cp -p ../4d/tree-commas.nh .
    time /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/phyloFit \
       --EM --precision MED --msa-format FASTA --subst-mod SSREV \
          --tree tree-commas.nh 4d.all.mfa
    # real    1406m51.018s

    mv phyloFit.mod all.mod
    grep BACK all.mod
    # BACKGROUND: 0.254978 0.245022 0.245022 0.254978

#########################################################################
# phyloP for 447-way (DONE - 2017-11-06 - Hiram)
#
    # split SS files into 1M chunks, this business needs smaller files
    #   to complete

    ssh ku
    mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP
    cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP
    mkdir ss run.split
    cd run.split

    # the annotated maf's are in:
    ../iRows/result/chr*.maf

    printf '#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg38/bed/cactus447/iRows/result/$c.maf
set WINDOWS = /hive/data/genomes/hg38/bed/cactus447/consPhyloP/ss/$c
set NL = `grep -m 10 -c -v "^#" $MAF`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $NL > 0 ) then
/cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
' > doSplit.csh

    chmod +x doSplit.csh

    #	do the easy ones first to see some immediate results
    ls -1SL -r ../../iRows/result | grep chr | sed -e "s/.maf//;" > maf.list

    # this needs a {check out line+ $(root1.done)} test for verification:
    printf '#LOOP
./doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
' > template

    gensub2 maf.list single template jobList
    # this was tricky, some of the jobs couldn't complete even with
    #  128g, had to complete a number of these manually, and they
    #  take a long time, more than 12 hours for individual chromosomes
    para -ram=128g create jobList
    para try ... check ... push ... etc...

# Completed: 79 of 101 jobs
# Crashed: 22 jobs
# CPU time in finished jobs:       8058s     134.29m     2.24h    0.09d  0.000 y
# IO & Wait Time:                   380s       6.34m     0.11h    0.00d  0.000 y
# Average job time:                 107s       1.78m     0.03h    0.00d
# Longest finished job:            3239s      53.98m     0.90h    0.04d
# Submission to last job:          5350s      89.17m     1.49h    0.06d

    # run phyloP with score=LRT
    ssh ku
    mkdir /cluster/data/hg38/bed/cactus447/consPhyloP
    cd /cluster/data/hg38/bed/cactus447/consPhyloP

    mkdir run.phyloP
    cd run.phyloP
    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACK ../../4dSSREV/all.mod
    # BACKGROUND: 0.254978 0.245022 0.245022 0.254978
    # interesting, they are already paired up

    grep BACKGROUND ../../4dSSREV/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.490
    /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/modFreqs \
	../../4dSSREV/all.mod 0.490 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.255000 0.245000 0.245000 0.255000

    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin
set f = $1
set ssFile = $1:t
set out = $2
set cName = $f:h
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/hg38/bed/cactus447/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "$cons/ss/$cName/$ssFile"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
echo source: $ssSrc.ss
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $ssFile.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$ssFile.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
' > doPhyloP.csh

    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list
    # make sure the list looks good
    wc -l ss.list
    #	3031 ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    printf '#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
' > template

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP/all
    cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP/all
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    # beware overloading the cluster with these quick and high I/O jobs
    para -ram=64g create jobList
    para try ... check ...
    para -maxJob=16 push
    para time > run.time
Completed: 3031 of 3031 jobs
CPU time in finished jobs:   28585172s  476419.53m  7940.33h  330.85d  0.906 y
IO & Wait Time:                132590s    2209.83m    36.83h    1.53d  0.004 y
Average job time:                9475s     157.91m     2.63h    0.11d
Longest finished job:           13733s     228.88m     3.81h    0.16d
Submission to last job:         38568s     642.80m    10.71h    0.45d

    mkdir downloads
    time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'`
do
    echo "working: $D" 1>&2
    find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phyloP447way.wigFix.gz
done
    #   real    34m31.522s

    du -hsc downloads
    #   5.4G    downloads

    # check integrity of data with wigToBigWig
    zcat downloads/*.wigFix.gz > all.wigFix
  time (wigToBigWig -verbose=2 all.wigFix /hive/data/genomes/hg38/chrom.sizes \
	phyloP447way.bw) > bigWig.log 2>&1

    egrep "real|VmPeak" bigWig.log
    # pid=236037: VmPeak:   33037308 kB
    # real    25m42.759s

    bigWigInfo phyloP447way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 7,824,240,203
# primaryIndexSize: 90,693,508
# zoomLevels: 10
# chromCount: 98
# basesCovered: 2,874,168,445
# mean: -0.022678
# min: -20.000000
# max: 8.123000
# std: 1.692685

    #	encode those files into wiggle data
#     time (zcat downloads/*.wigFix.gz \
    wigEncode all.wigFix phyloP447way.wig phyloP447way.wib
# Converted all.wigFix, upper limit 8.12, lower limit -20.00
# real    8m8.231s


# -rw-rw-r--   1  2874168445 Aug 30 14:08 phyloP447way.wib
# -rw-rw-r--   1   298978228 Aug 30 14:08 phyloP447way.wig

    du -hsc *.wi?
    # 2.7G    phyloP447way.wib
    # 286M    phyloP447way.wig

    # Load gbdb and database with wiggle.
    rm -f /gbdb/hg38/cactus447way/phyloP447way.wib
    ln -s `pwd`/phyloP447way.wib /gbdb/hg38/cactus447way/phyloP447way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus447way hg38 \
	phyloP447way phyloP447way.wig
    # real    0m27.533s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh hg38 phyloP447way
# db.table          min   max     mean       count     sumData
hg38.phyloP447way   -20 8.123 -0.022678 2874168445 -6.51804e+07
#      1.69269 viewLimits=-8.4861:8.123
#       stdDev viewLimits

    #	that range is: 20+8.123= 28.123 for hBinSize=0.028123

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.028123 -hBinCount=1000 -hMinVal=-20 -verbose=2 \
	    -db=hg38 phyloP447way > histogram.data 2>&1
    #  real    1m39.861s


    # x-axis range:
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \
	| sed -e 's/^/# /;'
# Q1 -10.086650
# median -4.012075
# Q3 2.062495
# average -4.017054
# min -20.000000
# max 8.123000
# count 864
# total -3470.734684
# standard deviation 7.028195

    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \
      | sed -e 's/^/# /;'
# Q1 0.000016
# median 0.000074
# Q3 0.000431
# average 0.001157
# min 0.000000
# max 0.025240
# count 864
# total 1.000001
# standard deviation 0.003283

    #	create plot of histogram:
    printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb"
set output "hg38.phyloP447.histo.png"
set size 1.0, 1.0
set style line 1 lt 2 lc rgb "#ff88ff" lw 2
set style line 2 lt 2 lc rgb "#66ff66" lw 2
set style line 3 lt 2 lc rgb "#ffff00" lw 2
set style line 4 lt 2 lc rgb "#ffffff" lw 2
set border lc rgb "#ffff00"
set key left box ls 3
set key tc variable
set grid noxtics
set grid ytics ls 4
set y2tics border nomirror out tc rgb "#ffffff"
set ytics border nomirror out tc rgb "#ffffff"
set title " Human hg38 Histogram phyloP447way track" tc rgb "#ffffff"
set xlabel " hg38.phyloP447way score" tc rgb "#ffffff"
set ylabel " Relative Frequency" tc rgb "#ff88ff"
set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66"
set xrange [-4:2.5]
set yrange [0:0.04]
set y2range [0:1]

plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2
' | gnuplot

    # verify it looks sane
    display hg38.phyloP447.histo.png &
    # it now shows the spike at -1.000 for the artifical filling of
    # no data available.

    ######################   Running primate species  #######################
    # computing 4d-sites phyloFit for primates
    mkdir /cluster/data/hg38/bed/cactus447/4d/primates
    cd /cluster/data/hg38/bed/cactus447/4d/primates
    # the primates.list.txt was obtained from the hg38.447way.nh
    # tree by taking the first 243 in that tree, ending in:
    head -243 ../../hg38.447way.nh | tail -1
     Galagoides_demidoff:0.00802126):0.0124128):0.0334321):0.0146219):0.105,

    head -243 ../../hg38.447way.nh | sed -e 's/^ \+//;' \
       | tr -d '(' | sed -e 's/:.*//;' | sort  > primates.list.txt

    # then only using the mfa sequences for those specific sequences
    ls ../mfa/chr*.mfa | while read C
do
  B=`basename $C`
  faSomeRecords "${C}" primates.list.txt ${B}
  printf "%s\n" "${B}"
done
    # resulting in:
    faSize chr*.mfa
# 151201516 bases (18902 N's 151182614 real 151182614 upper 0 lower)
#	in 5832 sequences in 24 files

    # then running phyloFit:
    sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d'  primates.nh > tree-commas.nh

    # note: using the SSREV model here:
    /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/phyloFit \
      --EM --precision MED --msa-format FASTA --subst-mod SSREV \
        --tree tree-commas.nh primates.mfa
    # rename the result
    mv phyloFit.mod primates.mod

    # adjust the frequencies
    cd  /cluster/data/hg38/bed/cactus447/consPhyloP/run.phyloP
    grep BACK ../../4d/primates/primates.mod
    # BACKGROUND: 0.261935 0.238065 0.238065 0.261935
    # interesting, they are already paired up

    grep BACK ../../4d/primates/primates.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.476
    /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/modFreqs \
	../../4d/primates/primates.mod 0.476 > primates.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    #   BACKGROUND: 0.262000 0.238000 0.238000 0.262000

    # setup run for primate species
    mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP/primates
    cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP/primates
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para -ram=6g create jobList
    para try ... check ...
    para -maxJob=16 push
    para time > run.time
Completed: 2438 of 2438 jobs
CPU time in finished jobs:   18916928s  315282.13m  5254.70h  218.95d  0.600 y
IO & Wait Time:                 68909s    1148.49m    19.14h    0.80d  0.002 y
Average job time:                7787s     129.79m     2.16h    0.09d
Longest finished job:           12050s     200.83m     3.35h    0.14d
Submission to last job:         26707s     445.12m     7.42h    0.31d

    mkdir downloads
    time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'`
do
    echo "working: $D" 1>&2
    find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phyloP447wayPrimates.wigFix.gz
done
    #   real    33m18.006s

    du -hsc downloads
    #   4.9G    downloads

    # check integrity of data with wigToBigWig
    zcat downloads/*.wigFix.gz > primates.wigFix
    time (wigToBigWig -verbose=2 primates.wigFix \
        /hive/data/genomes/hg38/chrom.sizes \
	phyloP447wayPrimates.bw) > bigWig.log 2>&1

    egrep "real|VmPeak" bigWig.log
    # pid=236355: VmPeak:   33037308 kB
    # real    26m17.042s

    bigWigInfo phyloP447wayPrimates.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 6,954,238,655
# primaryIndexSize: 90,693,508
# zoomLevels: 10
# chromCount: 98
# basesCovered: 2,874,168,445
# mean: -0.066955
# min: -20.000000
# max: 1.587000
# std: 1.479093

    #	encode those files into wiggle data
#    time (zcat downloads/*.wigFix.gz \
	time wigEncode primates.wigFix phyloP447wayPrimates.wig phyloP447wayPrimates.wib
# Converted primates.wigFix, upper limit 1.59, lower limit -20.00
# real    9m17.113s
XXX - running - Wed Aug 30 13:59:09 PDT 2023

# -rw-rw-r--   1  2874168445 Aug 30 14:08 phyloP447wayPrimates.wib
# -rw-rw-r--   1   320006223 Aug 30 14:08 phyloP447wayPrimates.wig

    du -hsc *.wi?
    # 2.7G    phyloP447way.wib
    # 306M    phyloP447way.wig

    # Load gbdb and database with wiggle.
    rm -f /gbdb/hg38/cactus447way/phyloP447wayPrimates.wib
    ln -s `pwd`/phyloP447wayPrimates.wib /gbdb/hg38/cactus447way/phyloP447wayPrimates.wib
    time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus447way hg38 \
	phyloP447wayPrimates phyloP447wayPrimates.wig
    # real    0m27.140s

    # use to set trackDb.ra entries for wiggle min and max
    # and verify table is loaded correctly

    wigTableStats.sh hg38 phyloP447wayPrimates
# db.table                min   max     mean       count     sumData
hg38.phyloP447wayPrimates -20 1.587 -0.0669553 2874168445 -1.92441e+08
#      1.47909 viewLimits=-7.46242:1.587
#       stdDev viewLimits

    #	that range is: 20+1.587= 21.587 for hBinSize=0.021587

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
	-hBinSize=0.021587 -hBinCount=1000 -hMinVal=-20 -verbose=2 \
	    -db=hg38 phyloP447wayPrimates > histogram.data 2>&1
    #   real    1m30.401s
    # x-axis range:
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \
	| sed -e 's/^/# /;'
# Q1 -11.645800
# median -7.231290
# Q3 -2.816750
# average -7.238782
# min -20.000000
# max 1.565410
# count 818
# total -5921.323540
# standard deviation 5.110470

    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \
      | sed -e 's/^/# /;'
# Q1 0.000007
# median 0.000032
# Q3 0.000340
# average 0.001237
# min 0.000001
# max 0.025761
# count 818
# total 1.012033
# standard deviation 0.003386

    #	create plot of histogram:
    printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb"
set output "hg38.phyloP447Primates.histo.png"
set size 1.0, 1.0
set style line 1 lt 2 lc rgb "#ff88ff" lw 2
set style line 2 lt 2 lc rgb "#66ff66" lw 2
set style line 3 lt 2 lc rgb "#ffff00" lw 2
set style line 4 lt 2 lc rgb "#ffffff" lw 2
set border lc rgb "#ffff00"
set key left box ls 3
set key tc variable
set grid noxtics
set grid ytics ls 4
set y2tics border nomirror out tc rgb "#ffffff"
set ytics border nomirror out tc rgb "#ffffff"
set title " Human hg38 Histogram phyloP447wayPrimates track" tc rgb "#ffffff"
set xlabel " hg38.phyloP447wayPrimates score" tc rgb "#ffffff"
set ylabel " Relative Frequency" tc rgb "#ff88ff"
set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66"
set xrange [-3:2]
set yrange [0:0.04]
set y2range [0:1]

plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2
' | gnuplot

    # verify it looks sane
    display hg38.phyloP447Primates.histo.png &
    # it now shows the spike at -1.000 for the artifical filling of
    # no data available.

#############################################################################
