# for emacs: -*- mode: sh; -*-

# This file describes tracks built for the manuscript by Yatish Turakhia, Russ Corbet-Detig
# et al. about apparent recurrent mutations (some erroneous, some real) that can cause trouble for
# phylogenetic analyses, and comparisons of phylogenetic trees from different groups
# (mainly Nextstrain and COG-UK).
# Relevant github repos:
# https://github.com/yatisht/strain_phylogenetics
# https://github.com/lgozasht/COVID-19-Lab-Specific-Bias-Filter

#########################################################################
# Lab-associated mutations (DONE - 2020-06-03 - Angie)

    mkdir /hive/data/genomes/wuhCor1/bed/labAssocMuts
    cd /hive/data/genomes/wuhCor1/bed/labAssocMuts
    # Saved file '2020-04-19 - Table_S1_Lab_Associated.tsv' emailed from Landen Gozashti
    # Convert file to bed & bigBed with labAssocMuts.as:
    perl -wne 's/[\r\n]//g;
        @w = split("\t");
        next unless (@w and $w[0] =~ m/^[A-Z](\d+)[A-Z]$/);
        $pos = $1;
        # Tweak columns to match output of github.com/lgozasht/COVID-19-Lab-Specific-Bias-Filter
        ($name, $aaChange, $articPrimer, $altCount, $pars, $comment, $maf) = @w;
        $name =~ s/T/U/;
        $aaChange =~ s/^AACHANGE=//;
        $articPrimer =~ s/left/LEFT/g;
        $articPrimer =~ s/right/RIGHT/g;
        $articPrimer =~ s/^/nCoV-2019_/ if ($articPrimer ne "");
        $articPrimer =~ s/, /,nCoV-2019_/g;
        print join("\t", "NC_045512v2", ($pos-1), $pos, $name, $pars, $altCount, $maf,
                   $articPrimer, $aaChange, $comment) . "\n";' \
     '2020-04-19 - Table_S1_Lab_Associated.tsv' \
    | sort -k2n,2n \
      > labAssocMutsNs419.bed
    bedToBigBed -tab -type=bed4+9 -as=$HOME/kent/src/hg/lib/labAssocMuts.as \
      labAssocMutsNs419.bed ../../chrom.sizes labAssocMutsNs419.bb

    # Install in /gbdb
    mkdir /gbdb/wuhCor1/phylogenetics/
    ln -s `pwd`/labAssocMutsNs419.bb /gbdb/wuhCor1/phylogenetics/


#########################################################################
# SARS-CoV-2 PHYLOGENY (DONE - 2020-09-03 - Angie)  (sampleColorFile added 2020-09-27)
# First done 2020-07-15; updated 2020-07-22 with tree from Yatish Turakhia, collapsed to keep
# only nodes with associated mutations.
# Updated 2020-09-03 with latest release 28-08-20.
# NOTE: updates from 2020-11 onward are driven by scripts in hg/utils/otto/sarscov2phylo/ .

    releaseLabel=28-08-20
    mkdir -p /hive/data/genomes/wuhCor1/bed/sarsCov2Phylo/$releaseLabel
    cd /hive/data/genomes/wuhCor1/bed/sarsCov2Phylo/$releaseLabel
    wget https://github.com/roblanf/sarscov2phylo/releases/download/28-08-20/ft_SH.tree
    # aln_global_unmasked.fa.xz emailed from Rob

    # Figure out what fasta sequences are not in the tree and therefore need to be excluded from VCF:
    xzcat aln_global_unmasked.fa.xz \
    | grep ^\> \
    | sed -re 's/^>//;' \
    | sort > fa.ids.sorted
    sed -re 's/\)[0-9.]+:/\):/g; s/:[0-9e.:-]+//g; s/[\(\);]//g; s/,/\n/g;'" s/'//g;" ft_SH.tree \
    | sort > tree.ids.sorted
    # How many samples?
    wc -l tree.ids.sorted
#59600 tree.ids.sorted

    # Make sure the number of samples in tree but not fasta is 0
    # (if not, probably need to fix sed command):
    comm -13 fa.ids.sorted tree.ids.sorted | wc -l
#0
    # Make list of samples in fasta but not tree:
    comm -23 fa.ids.sorted tree.ids.sorted > rob-$releaseLabel.idsToExclude

    # Get the name for the reference (Wuhan-Hu-1 or Wuhan/Hu-1):
    grep EPI_ISL_402125 fa.ids.sorted
    #hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31

    # Run faToVcf.  First without -ambiguousToN and -minAc for Yatish Turakhia's tools:
    xzcat aln_global_unmasked.fa.xz \
    | faToVcf stdin gisaid-$releaseLabel.unfiltered.vcf -includeRef \
        -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \
        -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude
#Read 59889 sequences.
#Using hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31 as reference.
#Excluded 289 sequences named in rob-28-08-20.idsToExclude (59600 sequences remaining including reference)
    # Remove some redundant parts of the very long concatenated IDs.
    perl -wpi -e 's@hCoV-19/@@g;  s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.unfiltered.vcf
    ls -l gisaid-$releaseLabel.unfiltered.vcf
#-rw-rw-r-- 1 angie genecats 2155993752 Sep  2 17:59 gisaid-28-08-20.unfiltered.vcf
    wc -l gisaid-$releaseLabel.unfiltered.vcf
#18059 gisaid-28-08-20.unfiltered.vcf
    gzip -f gisaid-$releaseLabel.unfiltered.vcf

    # Cross-check number of samples in tree with VCF sample IDs:
    zcat gisaid-$releaseLabel.unfiltered.vcf.gz | head | g ^#CHROM | cut -f 10- | wc -w
#59600

    # Use -ambiguousToN to avoid complaints about ambiguous alleles from display code
    # (also to prevent more situations where it looks like CZB maybe saw an alt, but according
    # to them they didn't see an alt, it was a low-conf base).
    xzcat aln_global_unmasked.fa.xz \
    | faToVcf stdin gisaid-$releaseLabel.ambigToN.vcf -includeRef \
        -ambiguousToN \
        -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \
        -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude
    perl -wpi -e 's@hCoV-19/@@g;  s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.ambigToN.vcf
    ls -l gisaid-$releaseLabel.ambigToN.vcf
#-rw-rw-r-- 1 angie genecats 1705011225 Sep  2 18:05 gisaid-28-08-20.ambigToN.vcf
    wc -l gisaid-$releaseLabel.ambigToN.vcf
#14278 gisaid-28-08-20.ambigToN.vcf
    bgzip -f gisaid-$releaseLabel.ambigToN.vcf
    tabix -p vcf gisaid-$releaseLabel.ambigToN.vcf.gz

    # Then with -ambiguousToN and -minAc (0.1% = 59600 / 1000 = 60)
    # for not-terribly-slow browser display:
    xzcat aln_global_unmasked.fa.xz \
    | faToVcf stdin gisaid-$releaseLabel.minAf.001.vcf -includeRef \
        -ambiguousToN -minAc=60 \
        -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \
        -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude
    perl -wpi -e 's@hCoV-19/@@g;  s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.minAf.001.vcf
    ls -l gisaid-$releaseLabel.minAf.001.vcf
#-rw-rw-r-- 1 angie genecats 63014097 Sep  2 18:07 gisaid-28-08-20.minAf.001.vcf
    wc -l gisaid-$releaseLabel.minAf.001.vcf
#509 gisaid-28-08-20.minAf.001.vcf
    bgzip -f gisaid-$releaseLabel.minAf.001.vcf
    tabix -p vcf gisaid-$releaseLabel.minAf.001.vcf.gz

    # Make an even more filtered version with -minAc=596 (1%):
    xzcat aln_global_unmasked.fa.xz \
    | faToVcf stdin gisaid-$releaseLabel.minAf.01.vcf -includeRef \
        -ambiguousToN -minAc=596 \
        -ref='hCoV-19/Wuhan/Hu-1/2019|EPI_ISL_402125|2019-12-31' \
        -vcfChrom=NC_045512v2 -verbose=2 -excludeFile=rob-$releaseLabel.idsToExclude
    perl -wpi -e 's@hCoV-19/@@g;  s@\|20(\d\d)@|$1@g;' gisaid-$releaseLabel.minAf.01.vcf
    ls -l gisaid-$releaseLabel.minAf.01.vcf
#-rw-rw-r-- 1 angie genecats 10185589 Sep  2 18:12 gisaid-28-08-20.minAf.01.vcf
    wc -l gisaid-$releaseLabel.minAf.01.vcf
#66 gisaid-28-08-20.minAf.01.vcf
    bgzip -f gisaid-$releaseLabel.minAf.01.vcf
    tabix -p vcf gisaid-$releaseLabel.minAf.01.vcf.gz

    # Shorten tree IDs to match VCF:
    perl -wpe 's@hCoV-19/@@g;  s@\|20(\d\d)@|$1@g; '"s@'@@g;" ft_SH.tree \
      > ft_SH.shorterNames.tree
    # Set the root of the tree to the Wuhan/Hu-1 (NC_045512.2) reference not WIV04:
    ~/github/newick_utils/src/nw_reroot ft_SH.shorterNames.tree \
      'Wuhan/Hu-1/2019|EPI_ISL_402125|19-12-31' > ft_SH.reroot.nh
    # Use Yatish Turakhia's usher program to collapse nodes of the tree at which there is
    # no mutation, but not to condense identical samples into new nodes.
    #*** NOTE FOR NEXT TIME: use ~angie/github/yatish_usher/build/usher; output file names changed
    #*** and --print_uncondensed-final-tree changed to --write-uncondensed-final-tree
    ~angie/github/yatish_strain_phylogenetics/build/usher \
      --vcf gisaid-$releaseLabel.unfiltered.vcf.gz  \
      --tree ft_SH.reroot.nh \
      --collapse-final-tree \
      --print_uncondensed-final-tree \
    | tail -1 \
      > ft_SH.collapsed.nh

    # Get sample metadata from GISAID so we can color samples & branches by lineage.
    # This requires registering with gisaid.org.  Then go to https://www.epicov.org/epi3/frontend,
    # log in, click Downloads, and click "nextmeta" in the dialog that pops up.  That downloads
    # a local metadata_*.tsv.gz file with timestamp name.
    # Extract the EPI_ ID and lineage columns:
    zcat metadata_2020-09-01*.tsv.gz \
    | tail -n +2 \
    | cut -f3,18,19 \
    | sort > metadata_2020-09-01.epiToLineageAndGisaid
    # Map EPI ID to sample name used in tree & vcf
    awk -F\| '{print $2 "\t" $0;}' tree.ids.sorted \
    | perl -wpe 's@hCoV-19/@@g;  s@\|20(\d\d)@|$1@g; '"s@'@@g;" \
    | sort > epiToSample
    # Join on EPI ID to associate tree sample names with lineages.
    join epiToSample metadata_2020-09-01.epiToLineageAndGisaid -o 1.2,2.2,2.3 > sampleToLineage

    # Add files with lineage colors and GISAID clade colors (trackDb setting sampleColorFile).
    # Color choices adapted from Figure 1 of
    # https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.32.2001410
    $HOME/kent/src/hg/makeDb/doc/wuhCor1/cladeLineageColors.pl sampleToLineage \
    | gzip > lineageColors.gz
    $HOME/kent/src/hg/makeDb/doc/wuhCor1/gisaidCladeColors.pl sampleToLineage \
    | gzip > gisaidColors.gz

    # Install files in /gbdb.
    mkdir -p /gbdb/wuhCor1/sarsCov2Phylo
    for f in gisaid-$releaseLabel.{ambigToN,minAf.001,minAf.01}.vcf.gz; do
      destName=$(echo $f | sed -re "s/-$releaseLabel//")
      ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2Phylo/$destName
      ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2Phylo/$destName.tbi
    done
    ln -sf `pwd`/lineageColors.gz /gbdb/wuhCor1/sarsCov2Phylo/sarscov2phylo.lineageColors.gz
    ln -sf `pwd`/gisaidColors.gz /gbdb/wuhCor1/sarsCov2Phylo/sarscov2phylo.gisaidColors.gz
    #TODO
    ln -sf `pwd`/ft_SH.collapsed.nh \
      /gbdb/wuhCor1/sarsCov2Phylo/sarscov2phylo.ft.nh


#########################################################################
# SARS-CoV-2 PHYLOGENY - PUBLIC SEQUENCE (DONE - 2020-10-15 - Angie)
# NOTE: updates from 2020-11 onward are driven by scripts in hg/utils/otto/sarscov2phylo/ .
    releaseLabel=28-08-20
    cd /hive/data/genomes/wuhCor1/bed/sarsCov2Phylo/$releaseLabel
    # First, regenerate collapsed tree with latest usher so that branch lengths aren't munged.
    ~angie/github/yatish_usher/build/usher --vcf gisaid-28-08-20.unfiltered.vcf.gz --tree ft_SH.reroot.nh --collapse-final-tree --write-uncondensed-final-tree
    mv uncondensed-final-tree.nh ft_SH.collapsed.nh

    # Use Chris's latest mapping of EPI IDs to public sequence IDs to map tree IDs to public IDs.
    tawk '{if ($4 == "") { print $1, $2 "|" $3;} else { print $1, $2 "|" $3 "|" $4;} }' \
      /hive/users/angie/gisaid/epiToPublicAndDate.2020-10-08 \
    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/' \
      > epiToPublicName

    sed -re 's/[\):][^,]+,/\n/g; s/\(//g; s/,/\n/g; s/\)[0-9:]*;//;' ft_SH.collapsed.nh \
    | awk -F"|" '{print $2 "\t" $0;}' | sort > epiToTreeName

    join -t"	" epiToTreeName epiToPublicName | cut -f 2,3 > treeToPublic
    wc -l treeToPublic
#30241 treeToPublic

    # Use new utils to limit tree and VCF to just the public sequences:
    phyloRenameAndPrune ft_SH.collapsed.nh treeToPublic ft_SH.collapsed.public.nh
    sed -re 's/,/,\n/g' ft_SH.collapsed.public.nh | wc -l
#30241
    vcfRenameAndPrune gisaid-$releaseLabel.ambigToN.vcf.gz treeToPublic \
      public-$releaseLabel.ambigToN.vcf
    head public-$releaseLabel.ambigToN.vcf | grep ^#CHROM | wc
#      1   30250 1460345
    bgzip public-$releaseLabel.ambigToN.vcf
    tabix -p vcf public-$releaseLabel.ambigToN.vcf.gz

    # Make allele-frequency-filtered versions
    zcat public-$releaseLabel.ambigToN.vcf.gz \
    | perl -wne 'if (/^#/) { print; } else { die unless m/^(\S+\t){7}AC=([\d,]+);AN=(\d+)/; ($acVal, $an) = ($2, $3); @acs = sort {$b <=> $a} split(",", $acVal); $maxAc = $acs[0]; $freq = $maxAc / $an;  if ($freq >= 0.01) { print; } }' \
      > public-$releaseLabel.minAf.01.vcf
    wc -l public-$releaseLabel.minAf.01.vcf
#64 public-$releaseLabel.minAf.01.vcf
    bgzip public-$releaseLabel.minAf.01.vcf
    tabix -p vcf public-$releaseLabel.minAf.01.vcf.gz

    zcat public-$releaseLabel.ambigToN.vcf.gz \
    | perl -wne 'if (/^#/) { print; } else { die unless m/^(\S+\t){7}AC=([\d,]+);AN=(\d+)/; ($acVal, $an) = ($2, $3); @acs = sort {$b <=> $a} split(",", $acVal); $maxAc = $acs[0]; $freq = $maxAc / $an;  if ($freq >= 0.001) { print; } }' \
      > public-$releaseLabel.minAf.001.vcf
    wc -l public-$releaseLabel.minAf.001.vcf
#594 public-$releaseLabel.minAf.001.vcf
    bgzip public-$releaseLabel.minAf.001.vcf
    tabix -p vcf public-$releaseLabel.minAf.001.vcf.gz

    sed -re 's/\|/\\|/g;' treeToPublic \
    | tawk '{print "s@" $1 "@" $2 "@;";}' \
    > treeToPublic.sed
    # NOTE FOR NEXT TIME: this sed took 20 minutes!  Need a faster substitution method.
    # ALSO we ended up with some garbage at the end of the file!  lineFileUdcMayOpen on local file
    # somehow ignored the garbage which started with a large block of null characters, but
    # netLineFileMayOpen on remote compressed file did not ignore the garbage.  Fixed 10/21/20.
    zcat lineageColors.gz | sed -rf treeToPublic.sed | grep -v EPI_ISL_ > publicLineageColors
    wc -l publicLineageColors
#30239 publicLineageColors
    gzip publicLineageColors

    # Author credits file... strip GenBank version numbers because NCBI metadata doesn't have those
    cut -f 2 treeToPublic \
    | cut -d \| -f 1 \
    | sed -re 's/^([A-Z][A-Z][0-9]{6})\.[0-9]/\1/;' \
    | sort > publicIdsInTree
# Three categories of IDs, different metadata sources:
# CNA0013884 -- from https://bigd.big.ac.cn/ncov/release_genome
#   * Advanced search, database = CNGBdb
#   * Select Column button, enable originating & submitting labs [no authors option unfortunately]
#   * Download Table button
#   * That saves an Excel .xlsx file; export to TSV, save as cncb.metadata.<date>.tsv
# LC528232.1 -- from GenBank
#   * https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=SARS-CoV-2,%20taxid:2697049
#   * Download button
#   * Current table view result --> CSV format, Next button
#   * Download all records, Next button
#   * Select Accession and Authors [no labs options unfortunately]
#   * Download button, save as ncbi.authors.date.csv
# England/BIRM-5E2A3/2020 -- from COG-UK
#   * https://www.ebi.ac.uk/ena/browser/view/PRJEB37886
#   * select columns center_name, sample_accession, sample_alias
#   * Download report: TSV
#   * file saved to filereport_read_run_PRJEB37886_tsv.txt (extra first column, run_accession)
    tail -n+2 cncb.metadata.20-10-15.tsv \
    | cut -f 2,12,14 \
    | egrep -v '^[A-Z][A-Z][0-9]{6}' \
    | sed -e 's/"//g; s/$/\tn\/a/;' \
      > cncb.credits
    tail -n+2 ncbi.authors.20-10-15.csv \
    | csvToTab \
    | tawk '{print $1, "n/a", "n/a", $2;}' \
      > ncbi.credits
    tail -n+2 filereport_read_run_PRJEB37886_tsv.txt \
    | tawk '{print $4, $3, $3, "COVID-19 Genomics UK Consortium";}' \
    | sed -e 's@^COG-UK/@@;' \
    | sort -u \
      > cogUk.credits.partialIds
    grep / publicIdsInTree \
    | awk -F/ '{print $2 "\t" $0;}' \
    | sort \
      > cogUk.partialToFull
    join -a 2 -e "n/a" -t"	" -o 2.2,1.2,1.3,1.4 cogUk.credits.partialIds cogUk.partialToFull \
    | tawk '$4 == "n/a" { $4 = "COVID-19 Genomics UK Consortium"; }' \
      > cogUk.credits
    /bin/echo -e "accession\toriginating_lab\tsubmitting_lab\tauthors" > acknowledgements.tsv
    grep -Fwf publicIdsInTree cncb.credits >> acknowledgements.tsv
    grep -Fwf publicIdsInTree ncbi.credits >> acknowledgements.tsv
    grep -Fwf publicIdsInTree cogUk.credits >> acknowledgements.tsv
    gzip acknowledgements.tsv

    # Install
    mkdir /gbdb/wuhCor1/sarsCov2PhyloPub
    for f in public-$releaseLabel.{ambigToN,minAf.001,minAf.01}.vcf.gz; do
      destName=$(echo $f | sed -re "s/-$releaseLabel//")
      ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2Phylo/$destName
      ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2Phylo/$destName.tbi
    done
    ln -s `pwd`/ft_SH.collapsed.public.nh \
      /gbdb/wuhCor1/sarsCov2PhyloPub/sarscov2phylo.pub.ft.nh
    ln -s `pwd`/publicLineageColors.gz \
      /gbdb/wuhCor1/sarsCov2PhyloPub/sarscov2phylo.pub.lineageColors.gz
    ln -s `pwd`/acknowledgements.tsv.gz \
      /gbdb/wuhCor1/sarsCov2PhyloPub/acknowledgements.tsv.gz


#########################################################################
# PROBLEMATIC SITES (DONE - 2021-11-01 - Angie)
# Previously updated 2020-08-26, 2020-12-30, 2021-08-16

    today=$(date +%y-%m-%d)
    mkdir -p /hive/data/genomes/wuhCor1/bed/problematicSites/$today
    cd /hive/data/genomes/wuhCor1/bed/problematicSites/$today
    # They call the format VCF, but it is really just VCF-like with extra tab-sep columns.
    wget https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf
    # Make a bigBed4+.  First make sure the columns haven't changed since last time.
    columns=$(grep ^#CHROM problematic_sites_sarsCov2.vcf)
    expected=$(cat ../columns.expected)
    if [ "$columns" != "$expected" ]; then
      echo "STOP!  COLUMNS CHANGED!  Change the script."
    fi
    perl -we \
      'while (<>) {
         chomp; chomp;
         # The header defines some keywords and expanded values; store those for later use.
         if (/^##\s+(\w+) = (.+)/) {
           ($key, $val) = ($1, $2);
           if (exists $expand{$key}) { warn "Clash for key {$key}: |$expand{$key}| vs |$val|\n"; }
           $expand{$key} = $val;
         }
         if (/^#/) {
           next;
         }
         (undef, $pos, undef, $ref, $alt, undef, $filter, $info) = split("\t");
         %info = map { ($key, $val) = split("="); $key => $val } split(";", $info);
         # Used to be columns: $contrib, $exc, $country, $lab, $gene, $aaPos, $aaRef, $aaAlt)
         # Now in info: SUB (submitter), EXC, SRC_COUNTRY, SRC_LAB, GENE, AA_POS, AA_REF, AA_ALT
         foreach $tag (qw/SRC_COUNTRY SRC_LAB GENE AA_POS AA_REF AA_ALT/) {
           if (! exists $info{$tag} || $info{$tag} eq ".") {
             $info{$tag} = "";
           }
         }
         # Expand keywords used in a couple of the columns.
         @contribs = map { $expand{$_} || $_ } split(",", $info{SUB});
         @labs = map { $expand{$_} || $_ } split(",", $info{SRC_LAB});
         if ($info{SRC_LAB} ne "") {
            @countries = split(",", $info{SRC_COUNTRY});
            if (scalar @labs != scalar @countries) {
              if (scalar @countries == 1) {
                for ($i = 1;  $i < @labs;  $i++) {
                  $countries[$i] = $countries[0];
                }
              } else {
                die "Differing numbers of countries and labs";
              }
            } else {
              for ($i = 0;  $i < @labs;  $i++) {
                $labs[$i] .= " ($countries[$i])"
              }
            }
         }
         $info{GENE} =~ s/gene-//g;
         # Print out one joined record for each sequence of "seq_end" single-base annotations.
         if (defined $seqEndStart && $info{EXC} !~ /seq_end/) {
           print join("\t", "NC_045512v2", $seqEndStart, $seqEndEnd, "seq_end", "mask", "", "",
                      $expand{"NDM"}, "", "", "", "", "") . "\n";
           $seqEndStart = $seqEndEnd = undef;
         }
         if ($info{EXC} eq "seq_end") {
           if (! defined $seqEndStart) {
             $seqEndStart = $pos-1;
           }
           $seqEndEnd = $pos;
         } else {
           print join("\t", "NC_045512v2", $pos-1, $pos, $info{EXC}, $filter, $ref, $alt,
                      join(", ", @contribs), join(", ", @labs),
                      $info{GENE}, $info{AA_POS}, $info{AA_REF}, $info{AA_ALT}) . "\n";
         }
       }
       if (defined $seqEndStart) {
         print join("\t", "NC_045512v2", $seqEndStart, $seqEndEnd, "seq_end", "mask", "", "",
                    $expand{"NDM"}, "", "", "", "", "") . "\n";
       }' problematic_sites_sarsCov2.vcf \
         > problematicSites.bed
    tawk '$5 == "mask"' problematicSites.bed | cut -f 1-4,6- \
    | sort -k2n > problematicSitesMask.bed
    tawk '$5 == "caution"' problematicSites.bed | cut -f 1-4,6- \
    | sort -k2n > problematicSitesCaution.bed
    # Split into two subtracks so we can color them differently: red for mask, orange for caution
    bedToBigBed -type=bed4+ -as=$HOME/kent/src/hg/lib/problematicSites.as -tab \
      problematicSitesMask.bed ../../../chrom.sizes problematicSitesMask.bb
    bedToBigBed -type=bed4+ -as=$HOME/kent/src/hg/lib/problematicSites.as -tab \
      problematicSitesCaution.bed ../../../chrom.sizes problematicSitesCaution.bb
    echo "$columns" > ../columns.expected

    # Install files
    mkdir -p /gbdb/wuhCor1/problematicSites
    ln -sf `pwd`/problematicSites{Mask,Caution}.bb /gbdb/wuhCor1/problematicSites/


#########################################################################
#
# updates to SARS-CoV-2 PHYLOGENY (both GISAID and PUBLIC SEQUENCE) from
# 2020-11 onward are driven by scripts in hg/utils/otto/sarscov2phylo/ .
#

#########################################################################
# B.1.1.7 variants (DONE - 2021-01-05 - angie)

mkdir /hive/data/genomes/wuhCor1/bed/lineageB_1_1_7_US
cd /hive/data/genomes/wuhCor1/bed/lineageB_1_1_7_US
# I downloaded the first eight U.S. lineage B.1.1.7 sequences from GenBank (MW422255, MW422256)
# and GISAID (EPI_ISL_755593, EPI_ISL_755594, EPI_ISL_755595, EPI_ISL_755596, EPI_ISL_755940,
# EPI_ISL_755941), used hgPhyloPlace to align them to the genome, place in phylo tree and make
# custom tracks including a VCF track with the 8 samples.  Copy the custom track VCF and Newick:
cp /data/apache/userdata/sessions/fb/AngieHinrichs/0f8b7e37/ct/ct_hgwdev_1761a_38f864.vcf \
    lineageB_1_1_7_US_first8.vcf

# Run minimap2 to get VCF for deletion variants that hgPhyloPlace ignores:
minimap2  --cs NC_045512v2.fa MW422255.1.fa | paftools.js call -L 20000 -f NC_045512v2.fa -
#NC_045512v2     11287   .       GTCTGGTTTT      G
#NC_045512v2     21764   .       ATACATG A
#NC_045512v2     21990   .       TTTA    T
#NC_045512v2     28270   .       TA      T

# Edit lineageB_1_1_7_US_first8.vcf to add those deletion variants and remove no-call SNVs at
# overlapping positions.

# Add a column for B.1.1.7 consensus variants; default to match reference, then edit to set to alt
# allele in specific locations.
perl -wne 'chomp; @w = split("\t");
           if (@w == 1) { print $_ . "\n"; }
           elsif ($w[0] eq "#CHROM") { print join("\t", @w[0..8], "B.1.1.7", @w[9..$#w]) . "\n"; }
           else {
             if ($w[7] =~ /^(AC=\d+;AN=)(\d+)$/) {
               $newAN = $2 + 1;
               $w[7] = "$1$newAN";
             } else { die "Unexpected INFO column $w[7]"; }
             print join("\t", @w[0..8], "0", @w[9..$#w]) . "\n";
           }' \
    lineageB_1_1_7_US_first8.vcf > tmp

# Edit tmp to set genotype value for B.1.1.7 to 1 where appropriate.

# Exclude variants with no alt allele (included in hgPhyloPlace VCF only to highlight where a
# sample has N's at variants in the big tree; not needed here & not handled quite right by our
# VCF display...)
tawk '$5 != "*"' tmp > lineageB_1_1_7_US_first8.vcf

bgzip -f lineageB_1_1_7_US_first8.vcf
tabix -p vcf lineageB_1_1_7_US_first8.vcf.gz
# Copy the Newick tree file showing the relationship between the 8 samples:
cp /data/apache/userdata/sessions/fb/AngieHinrichs/0f8b7e37/ct/uploadedSamples_hgwdev_16d7d_38f4d0.nwk \
    lineageB_1_1_7_US_first8.nwk
# Edit the Newick file to add B.1.1.7 consensus.

# 2020-01-05: More sequences have appeard in GenBank; replace EPI_ IDs with GenBank accessions
# so we can share VCF:
zcat lineageB_1_1_7_US_first8.vcf.gz \
| sed -re 's/EPI_ISL_755593/MW430974.1/;
           s/EPI_ISL_755594/MW430966.1/;
           s/EPI_ISL_755595/MW430967.1/;
           s/EPI_ISL_755596/MW430968.1/;' \
> lineageB_1_1_7_US_first8.vcf
bgzip -f lineageB_1_1_7_US_first8.vcf
tabix -p vcf lineageB_1_1_7_US_first8.vcf.gz
sed -re 's/EPI_ISL_755593/MW430974.1/;
         s/EPI_ISL_755594/MW430966.1/;
         s/EPI_ISL_755595/MW430967.1/;
         s/EPI_ISL_755596/MW430968.1/;' \
    lineageB_1_1_7_US_first8.nwk > tmp
mv tmp lineageB_1_1_7_US_first8.nwk

# That VCF file will have to suppressed from download, but we can make a public-only VCF
# file to offer for download:
zcat lineageB_1_1_7_US_first8.vcf.gz \
| cut -f 1-16 \
| vcfFilter -minAc=1 stdin \
    > lineageB_1_1_7_US_first6.vcf
bgzip lineageB_1_1_7_US_first6.vcf
tabix -p vcf lineageB_1_1_7_US_first6.vcf.gz

# 2020-01-05: Sequence just became available from NY; add it in starting with hgPhyloPlace VCF...
tail -n+399 /data/apache/trash/ct/ct_pp_hgwdev_angie_bcfa_4f0e10.ct | tawk '$5 != "*"' > tmp
# Get deletion lines to add in:
zcat lineageB_1_1_7_US_first8.vcf.gz | grep del >> tmp
# Edit the deletion lines into the correct places so lines are sorted by position.
# Check for other edits to make:
zcat lineageB_1_1_7_US_first8.vcf.gz | cut -f 1-9 > /data/tmp/angie/1
cut -f 1-9 tmp > /data/tmp/angie/2
diff /data/tmp/angie/[12]
#26c26
#< NC_045512v2   17615   A17615G A       G       .       .       AC=5;AN=9       GT
#---
#> NC_045512v2   17615   A17615G A       G       .       .       AC=6;AN=9       GT
# -- aha, the new sequence adds 1 to the AC for A17615G.
# But the same variants are there in each file, so we can paste columns and then fix up AC and AN
# with vcfFilter.
cut -f 18 tmp > tmp2
zcat lineageB_1_1_7_US_first8.vcf.gz \
| paste - tmp2 \
| vcfFilter -minAc=1 stdin \
    > lineageB_1_1_7_US_first9.vcf
# Edit lineageB_1_1_7_US_first9.vcf to fix the pasted ##fileformat and ##reference header lines.
bgzip -f lineageB_1_1_7_US_first9.vcf
tabix -p vcf lineageB_1_1_7_US_first9.vcf.gz
# Now copy the Newick file:
cp /data/apache/trash/ct/uploadedSamples_hgwdev_angie_bcfa_4f0e10.nwk lineageB_1_1_7_US_first9.nwk
# Edit the Newick file to add B.1.1.7 consensus.

# Ha, I just missed the GenBank release!  Well, replace EPI_ISL_767594 with MW440433.1.  :)
zcat lineageB_1_1_7_US_first9.vcf.gz \
| sed -re 's/EPI_ISL_767594/MW440433.1/;' \
    > lineageB_1_1_7_US_first9.vcf
bgzip -f lineageB_1_1_7_US_first9.vcf
tabix -p vcf lineageB_1_1_7_US_first9.vcf.gz
sed -re 's/EPI_ISL_767594/MW440433.1/;' lineageB_1_1_7_US_first9.nwk > tmp
mv tmp lineageB_1_1_7_US_first9.nwk
# Update the GenBank subset VCF that we can offer for download:
zcat lineageB_1_1_7_US_first9.vcf.gz \
| cut -f 1-16,19 \
| vcfFilter -minAc=1 stdin \
    > lineageB_1_1_7_US_first7.vcf
bgzip lineageB_1_1_7_US_first7.vcf
tabix -p vcf lineageB_1_1_7_US_first7.vcf.gz

# Install in /gbdb
mkdir /gbdb/wuhCor1/lineageB_1_1_7_US
ln -s `pwd`/lineageB_1_1_7_US_first*.vcf.gz* /gbdb/wuhCor1/lineageB_1_1_7_US/
ln -s `pwd`/lineageB_1_1_7_US_first*.nwk /gbdb/wuhCor1/lineageB_1_1_7_US/


#########################################################################
# Normalize VCF+tree sample names in 13-11-20 public-all set; it will be basis of
# incrementally updated public tree going forward.  (DONE - 2021-01-07 - Angie)

cd /hive/data/outside/otto/sarscov2phylo/13-11-20
# Names currently used in VCF+tree:
zcat public-13-11-20.all.minAf.01.vcf.gz | head | grep ^#CHROM \
| sed -re 's/\t/\n/g;' | tail -n+10 \
    > public-13-11-20.all.names
wc -l public-13-11-20.all.names
#157816 public-13-11-20.all.names
# There are some inconsistent name and date formats that we need to clean up e.g.
#NorthernIreland/NIRE-102049/2020|Northern_Ireland/NIRE-102049/2020|20-03-23
#Northern_Ireland/NIRE-106F84/2020|2020-05-26
# i.e. gisaidStrain|cogUkName|shortenedDate vs. cogUkName|date
# --> Use only cogUkName since that's all we'll get going forward
# --> Shorten dates that are not shortened
# While we're at it, remove the rev-strand (doh!) CNCB version of GZMU0016
# (Guangzhou/GZMU0016/2020|CNA0013710|20-02-25) because mafft made a garbage alignment
# for it, while the GenBank version is fine.
awk -F\| '{ if ($2 ~ /^(England|Northern|Scotland|Wales)/) { print $0 "\t" $2 "|" $3; }
            else { print $0 "\t" $0; } }' \
    public-13-11-20.all.names \
| grep -v CNA0013710 \
| sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/; s/, /_/g; s/[, ]/_/g;' \
    > cleanup.renaming
wc -l cleanup.renaming
#157815 cleanup.renaming
# Make sure there aren't any dups
cut -f 1 cleanup.renaming | sort -u | wc -l
#157815
cut -f 2 cleanup.renaming | sort -u | wc -l
#157815
# Make new VCF+tree with cleaned-up names
phyloRenameAndPrune public-13-11-20.all.nh cleanup.renaming public-13-11-20.all.renamed.nh
vcfRenameAndPrune public-13-11-20.all.vcf.gz cleanup.renaming stdout \
| bgzip -c > public-13-11-20.all.renamed.vcf.gz
# Make a protobuf for UShER/hgPhyloPlace -- mask out problematic sites first..
problematicSitesVcf=/hive/data/genomes/wuhCor1/bed/problematicSites/20-12-30/problematic_sites_sarsCov2.vcf
tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
    $problematicSitesVcf > mask.vcf
time vcfFilter -excludeVcf=mask.vcf public-13-11-20.all.renamed.vcf.gz \
| gzip -c \
    > public-13-11-20.all.renamed.masked.vcf.gz
usherDir=~angie/github/usher
usher=$usherDir/build/usher
time $usher -c -u \
  -v public-13-11-20.all.renamed.masked.vcf.gz \
  -t public-13-11-20.all.renamed.nh \
  -o public-13-11-20.all.renamed.masked.pb
#real    21m48.333s
# Likewise for the notMasked.pb
time $usher -c -u \
  -v <(zcat public-13-11-20.all.renamed.vcf.gz) \
  -t public-13-11-20.all.renamed.nh \
  -o public-13-11-20.all.renamed.notMasked.pb
# Now a script in hg/utils/otto/sarscov2phylo/ can use those files as a base
# for incrementally adding in new public sequences.


#########################################################################
# Remove rev-strand version of GZMU0016 from cncb fasta (DONE - 2021-01-07 - Angie)

cd /hive/data/outside/otto/sarscov2phylo/cncb.2020-10-15
grep ^\> cncb.nonGenBank.fasta | sed -re 's/^>//;' | grep CNA0013710 | awk '{print $1;}' \
    > exclude.CNA0013710
faSomeRecords -exclude cncb.nonGenBank.fasta exclude.CNA0013710 tmp
faSize cncb.nonGenBank.fasta
#7754193 bases (35730 N's 7718463 real 7718459 upper 4 lower) in 260 sequences in 1 files
#Total size: mean 29823.8 sd 235.7 min 26973 (BetaCoV/Wuhan/WH19002/2019) max 29941 (Guangzhou/GZ8H0005/2020) median 29861
faSize tmp
#7724289 bases (35730 N's 7688559 real 7688555 upper 4 lower) in 259 sequences in 1 files
#Total size: mean 29823.5 sd 236.1 min 26973 (BetaCoV/Wuhan/WH19002/2019) max 29941 (Guangzhou/GZ8H0005/2020) median 29860
mv tmp cncb.nonGenBank.fasta


#########################################################################
