# for emacs: -*- mode: sh; -*-

#########################################################################
# Mutations in in COVID variants of interest, from Nick Keener and Angie Hinrichs
# 2/8/21 Kate

# Files from Nick Keener, UCSC GI (grad student).  His notes:
Here are the BED files that have the changes we discussed today. There's two for each strain, one with the nucleotide mutations and indels and one with amino acids mutations and indels. I decided to just drop the protein names from the name column since some of the other protein names are pretty long and they'll be able to see what gene it's in by looking at the protein track. But if you want to have the protein names except for spike let me know and I can add them back in. For indels I named them with the format : del_[1-based genome coordinate of 1st deleted nucleotide] or ins_[1-based genome coordinate of nucleotide after insertion site]. I'll list the sequence counts for each lineage below. Let me know if you need anything else or want me to make any changes. Enjoy your weekend!

Sequence Counts:
B.1.1.7: 9838
B.1.351: 793
B.1.429: 1360
P.1: 78

# 4 strains, 2 BEDs for each (AA and Nuc mutations).  BED4 format

# Format BED 9 to color by potential deleteriousness.  Initially:
#       bright for spike rbp, medium for other spike, light for other
# Later: integrate with antibody escape annotations

cd /hive/data/genomes/wuhCor1/bed/strains/keener3

$ wc -l *.aa.bed
  24 gisaid_B.1.1.7_2021_02_05_aa.bed
  16 gisaid_B.1.351_2021_02_05_aa.bed
  10 gisaid_B.1.429_2021_02_05_aa.bed
  25 gisaid_P.1_2021_02_05_aa.bed

$ wc -l *.nuc.bed
  32 gisaid_B.1.1.7_2021_02_05_nuc.bed
  20 gisaid_B.1.351_2021_02_05_nuc.bed
  14 gisaid_B.1.429_2021_02_05_nuc.bed
  35 gisaid_P.1_2021_02_05_nuc.bed

# Add # to header lines (and notify Nick)

cat > load.csh << 'EOF'
foreach v (B.1.1.7 B.1.351 B.1.429 P.1)
    echo $v
    set d = 2021_02_05
    foreach t (aa nuc)
        set f = ${v}_${d}_${t}
        echo $f
        bedToBigBed -type=bed4 gisaid_$f.bed ../../../chrom.sizes variantMuts_$f.bb
        ln -s `pwd`/variantMuts_${f}.bb /gbdb/wuhCor1/strainMuts
    end
end
'EOF'
EOF

csh load.csh >&! load.out

########################
# Update to include newly designated VOC, VOI, etc.  RM #27926.
#    September 2021, kate (as volunteer) & angie

# This required running Nick Keener's alignment and mutation calling pipeline, 
# 'lineageVariants', with minor mods (Angie added minAF cutoff, I added command-line
# options for cutoffs).
# Angie  downloaded sequences from GISAID and provided technical/scientific guidance
# for this track,
#
# 1 VOC Delta
# 2 VOI Mu, Lambda
# 3 VUM Eta, Iota, Kappa

# VOC
# Alpha   B.1.1.7
# Beta    B.1.351
# Gamma   P.1
# Delta   B.1617.2
# 
# VOI
# Lambda  C.37
# Mu      B.1.621
# 
# VUM (formerly VOI)
# Epsilon B.1.429
# Eta     B.1.525
# Iota    B.1.526
# Kappa   B.1.617.1

# Noting there are 2 additional WHO-named variants. These are former VOI's,
# but no longer under monitoring:  Zeta (P.2) and Theta (P.3).  
# These are not included in this track.

cd /hive/data/genomes/wuhCor1/bed/strains/2021-09-10

# Install pipeline

# Use this once Angie & Nick have updated pipeline at github
#git clone https://github.com/nickeener/sarscov2lineages.git
#cd sarscov2lineages/
#bash
#conda env create -f environment.yml

# For now, using modified environment: sarscov2lineagesAngie github dir, 
#                                       slAngie conda env
# This is a bit messy!

bash
pushd sarscov2lineagesAngie/  # caution: this is symlink into Angie's home dir
conda env create -f environment.yml -n slAngie
conda activate slAngie

# move old faToVcf out of the way so we pick up Angie's newer version (adds -minAF option)
# (noting newer faToVcf missing from Angie's slAngie conda bin)
pushd ~/miniconda3/envs/slAngie/bin/
mkdir old
mv faToVcf old/
popd

mkdir results
for i in B.1.525 B.1.526 B.1.617.1 B.1.617.2 B.1.621 C.37; do echo $i; cp $i.early3000.fa results/$i.fa; ./sarscov2lineagesAngie/lineageVariants.sh results/$i.fa sarscov2lineagesAngie/NC_045512.fa; done > results/log.txt 2>&1

# re-run to confirm, use new scripts and capture intermediate files and log file.  not used for track
#mkdir results2
#for i in B.1.525 B.1.526 B.1.617.1 B.1.617.2 B.1.621 C.37; do echo $i; cp $i.early3000.fa results2/$i.fa; ./sarscov2lineagesAngie/lineageVariants.params.sh -v results2/$i.fa sarscov2lineagesAngie/NC_045512.fa; done > results2/log.txt 2>&1

cd results
cat > load.csh << 'EOF'
foreach v (B.1.617.2 B.1.621 C.37 B.1.617.1 B.1.525 B.1.526 )
    echo $v
    set d = 2021_09_10
    foreach t (aa nuc)
        set if = ${v}_${t}.bed
        set T = `perl -e "print ucfirst($t)"`
        set of = variant${T}Muts_${v}_${d}
        # fix chrom name and strip protein prefix from variant identifiers
        sed -e 's/NC_045512.2/NC_045512v2/' -e 's/\t[^\t]*:/\t/' < $if > $of.bed
        bedToBigBed -type=bed4 $of.bed /hive/data/genomes/wuhCor1/chrom.sizes $of.bb
        ln -s `pwd`/$of /gbdb/wuhCor1/strainMuts
    end
end
'EOF'
EOF

wc -l *_aa.bed
  18 B.1.525_aa.bed
  12 B.1.526_aa.bed
  13 B.1.617.1_aa.bed
  12 B.1.617.2_aa.bed
  17 B.1.621_aa.bed
  18 C.37_aa.bed
wc -l *_nuc.bed
  30 B.1.525_nuc.bed
  16 B.1.526_nuc.bed
  18 B.1.617.1_nuc.bed
  16 B.1.617.2_nuc.bed
  25 B.1.621_nuc.bed
  27 C.37_nuc.bed
    
csh load.csh >&! load.log &
cd ../

# Run Delta with lower cutoffs, as per Angie's rec -- the early delta seqs
# included many gaps and the default params missed spike muts.
# Aiming here to get close to outbreak.info spike mut identifications.

mkdir delta
i=B.1.617.2
cp $i.early3000.fa delta/$i.fa
./sarscov2lineagesAngie/lineageVariants.params.sh -s 0.70 -i .50 -v \
        delta/B.1.617.2.fa sarscov2lineagesAngie/NC_045512.fa > delta/log.txt 2>&1

# Worked -- picked up both final SNV and the del
grep 'S:' *aa.bed
NC_045512.2     21616   21619   S:T19R
NC_045512.2     22028   22034   S:del_22029
NC_045512.2     22915   22918   S:L452R
NC_045512.2     22993   22996   S:T478K
NC_045512.2     23401   23404   S:D614G
NC_045512.2     23602   23605   S:P681R
NC_045512.2     24409   24412   S:D950N

# Load up

cd delta
set v = B.1.617.2
set d = 2021_09_10
    foreach t (aa nuc)
        set if = ${v}_${t}.bed
        set T = `perl -e "print ucfirst($t)"`
        set of = variant${T}Muts_${v}_${d}
        # fix chrom name and strip protein prefix from variant identifiers
        sed -e 's/NC_045512.2/NC_045512v2/' -e 's/\t[^\t]*:/\t/' < $if > $of.bed
        bedToBigBed -type=bed4 $of.bed /hive/data/genomes/wuhCor1/chrom.sizes $of.bb
        ln -s `pwd`/$of.bb /gbdb/wuhCor1/strainMuts
    end


################
# Sanity check from earlier runs.  Just here for historical reasons, as RM wasn't updated during
# track dev.

# Noting that Angie wants to keep non-delta SNV cutoff at .95, so we will miss some 
# outbreak listed variants as they use .75 cutoff.

# outbreak.info/situation-reports#voc  9/14/21
# WHO https://www.who.int/en/activities/tracking-SARS-CoV-2-variants/  Sept 2 2021

########
# Variants of Concern

B.1.617.2 (Delta)  India, Dec 2020  Nextstrain 21A/S, 1,039,720 seqs !!!

    spike muts from outbreak: T19R, E156G, del157/158, L452R, T478K, D614G, P681R, D950N

    UCSC pipeline: D514G, P681R (6 fewer)

#########
# Variants of interest:

B.1.525 (Eta)  UK and Nigeria Dec 2020  Nextrain 20A, 8057 seqs

    Spike muts from Outbreak: Q52R, A67V, 69del, 70del, 144del, E484K, D614G, Q677H, F888L

    UCSC pipeline: lacks 3 (Q52R, Q677H, F88L)

B.1.526 (Iota)  New York, Nov 2020  Nextstrain 20C, 39,674 seqs

    Spike muts from Outbreak: L5F, T95I, D253G, D614G

    UCSC pipeline: same

B.1.617.1 (Kappa)  India, Dec 2020   Nextstrain 20A/S, 6286 seqs

    Spike muts from Outbreak: L452R, E484Q, D614G, P681R, Q1071H

    UCSC pipeline: lacks 1 (Q1071H)

C.37 (Lambda) Peru Dec 2020, 6635 seqs

    Spike muts from Outbreak: G75V, T76I, R246N, Del247/253, L452Q, F490S, D614G, T859N

    UCSC pipeline: lacks 3 (G75V, R246N, Del247/253)

B.1.621 (Mu) Colombia, Jan 2021, 5731 seqs

     Spike muts from Outbreak: R346K, E484K, N501Y, D614G, P681H


NOTE: Outbreak includes Theta (P.3) first identified in Philipines in Mar 2021, with just 305 seqs,
        as a VOI

###################
# Exploratory runs of delta to identify more variants.  First run was stymied by prevalence of N's i# in seqs.  To address, Angie added -minAf option to vcfToFa.  We will also try lower cutoff,
# based observing that outbreak.info includes variants at 93% cutoff.  Also, more seqs -- 10K
# that Angie downloaded from GISAID.


#########################################################################
# UPDATE - Files prepared for Kate's runs of Nick's scripts - 2021-09-10 - Angie

mkdir /hive/data/genomes/wuhCor1/bed/strains/2021-09-10
cd /hive/data/genomes/wuhCor1/bed/strains/2021-09-10

gisaidDir=/hive/users/angie/gisaid

N=10000

# Pick the $N earliest B.1.1.7 sequences (that don't claim to be from before October 2020):
zcat $gisaidDir/metadata_batch_2021-09-10.tsv.gz \
| cut -f 1,5,19 \
| tawk '$3 == "B.1.1.7" && $2 ~ /202[01]-[0-9]{2}-[0-9]{2}/ && $2 > "2020-09-30"' \
| sort -k2,2 \
| head -$N \
    > B.1.1.7.early$N

time faSomeRecords <(xzcat $gisaidDir/sequences_batch_2021-09-10.fa.xz) \
    <(cut -f 1 B.1.1.7.early$N) B.1.1.7.early$N.fa
#real    3m38.874s
faSize B.1.1.7.early$N.fa
#89363373 bases (460913 N's 88902460 real 88902460 upper 0 lower) in 3002 sequences in 1 files

# Now for the rest, rejecting any before 2021:
# Delta (B.1.617.2)
# Eta (B.1.525)
# Iota (B.1.526)
# Kappa (B.1.617.1)
# Lambda (C.37)
# Mu (B.1.621)
for lineage in B.1.617.2 B.1.525 B.1.526 B.1.617.1 C.37 B.1.621; do
    zcat $gisaidDir/metadata_batch_2021-09-10.tsv.gz \
    | cut -f 1,5,19 \
    | tawk '$3 == "'$lineage'" && $2 ~ /2021-[0-9]{2}-[0-9]{2}/' \
    | sort -k2,2 \
    | head -$N \
        > $lineage.early$N
    time faSomeRecords <(xzcat $gisaidDir/sequences_batch_2021-09-10.fa.xz) \
        <(cut -f 1 $lineage.early$N) $lineage.early$N.fa
done


#########################################################################
# Update for Omicron Thu Dec  2 07:41:53 PST 2021
cd /hive/data/genomes/wuhCor1/bed/strains/2021-09-10/results/
wget https://raw.githubusercontent.com/cov-lineages/constellations/main/constellations/definitions/cB.1.1.529.json
echo 'C14408T > C241T > C3037T > A23403G > G28881A > G28883C > G28882A > C21846T > C23604A > A23063T > T22882G,C22995A,A23013C,C23525T,C24130A > A2832G,T5386G,G8393A,C10029T,C10449A,A11537G,T13195C,C15240T,A18163G,C21762T,G22578A,T22673C,C22674T,T22679C,G22813T,G22898A,G22992A,A23040G,G23048A,A23055G,C23202A,T23599G,C23854A,G23948T,A24424T,T24469A,C24503T,C25000T,C25584T,C26270T,A26530G,C26577G,G26709A,A27259C,C27807T,A28271T,C28311T' > B.1.1.529_nuc.txt
~/kent/src/hg/makeDb/scripts/variantsMuts/usherDescToBed B.1.1.529_nuc.txt cB.1.1.529.json B.1.1.529


#########################################################################
# Omicron additions - 2021-12-02 - Angie

# The cov-lineages/constellations/.../cB.1.1.529.json file actually had an incomplete list
# of deletions in Omicron, and omitted the insertion.

# Log in on gisaid.org, go to Search tab, select "VOC Omicron ..." from Variants menu
# 339 sequences match; click checkbox to select all, click Download, leave the checkbox
# "Replace spaces with underscores in FASTA header" checked, click Download,
# xz the downloaded file, upload file to hgwdev.

mkdir /hive/data/genomes/wuhCor1/bed/strains/2021-12-02
cd /hive/data/genomes/wuhCor1/bed/strains/2021-12-02
unxz gisaid_hcov-19_2021_12_02_22.fasta.xz
mv gisaid_hcov-19_2021_12_02_22.fasta B.1.1.529.2021-12-02.fa

ln -s ~angie/github/sarscov2lineages .
# I had previously created this conda environment
# (same conda env create shown in Kate's section above)
conda activate slAngie

./sarscov2lineages/lineageVariants.params.sh -c 50 -s 0.8 -i 0.7 -v \
    B.1.1.529.2021-12-02.fa sarscov2lineages/NC_045512.fa

# Omicron/B.1.1.529 has two mutations in Spike codon 371 that need to be translated together.
# The _aa.bed output has two separate translations (S371P and S371F) but really it should be
# S371L -- hand-edit to fix.
vi B.1.1.529.2021-12-02_aa.bed

# Fix reference name, make bigBed, install in /gbdb.  Use Max's names for bigBed files.
sed -re 's/NC_045512\.2/NC_045512v2/' B.1.1.529.2021-12-02_nuc.bed \
    > B.1.1.529_nuc.bed
sed -re 's/NC_045512\.2/NC_045512v2/' B.1.1.529.2021-12-02_aa.bed \
    > B.1.1.529_prot.bed
for suffix in nuc prot; do
    bedToBigBed -type=bed4 B.1.1.529_$suffix.bed /hive/data/genomes/wuhCor1/chrom.sizes \
       B.1.1.529_$suffix.bb
    ln -sf `pwd`/B.1.1.529_$suffix.bb /gbdb/wuhCor1/strainMuts/
done


#########################################################################
BA.4 and 5, based on files created by Angie (not sure how they were made)
Max, Wed May  4 06:47:30 PDT 2022

mkdir /hive/data/genomes/wuhCor1/bed/strains/2022-05-04
cd mkdir /hive/data/genomes/wuhCor1/bed/strains/2022-05-04

conda activate ~angie/miniconda3/envs/sarscov2lineages
ln -s ~angie/github/sarscov2lineages .

./sarscov2lineages/lineageVariants.params.sh -c 50 -s 0.8 -i 0.7 -v BA.5.2022-05-03.fa sarscov2lineages/NC_045512.fa
./sarscov2lineages/lineageVariants.params.sh -c 50 -s 0.8 -i 0.7 -v BA.4.2022-05-03.fa sarscov2lineages/NC_045512.fa

for lineage in BA.4 BA.5; do
for suffix in nuc aa; do
    sed -re 's/NC_045512\.2/NC_045512v2/' ${lineage}*_$suffix.bed > ${lineage}_$suffix.bed
    bedToBigBed -type=bed4 ${lineage}_$suffix.bed /hive/data/genomes/wuhCor1/chrom.sizes \
       ${lineage}_$suffix.bb
    ln -sf `pwd`/${lineage}_$suffix.bb /gbdb/wuhCor1/strainMuts/
done
done

#########################################################################
# Omicron updates - 2023-09-22 - Angie

# For each lineage in BA.2, BA.2.12.1, BA.2.75, BQ.1, XBB, XBB.1.5, XBB.1.16, CH.1.1, XBB.1.9,
# XBB.2.3, EG.5.1, do the following:

# *** NOT THIS TIME, just for future reference: ***
# Download method for a relatively new lineage
# * Log in on gisaid.org, go to Search tab, select Lineage: <whichever lineage>
# * If there are more than a couple thousand matching sequences, restrict by date to the earliest
#   range that gets ~2000 sequences
# * click checkbox to select all, click Download, leave the checkbox "Replace spaces with
#   underscores in FASTA header" checked, click Download, rename and xz the downloaded file,
#   upload file to hgwdev.
#*** END NOT THIS TIME ***

# At this point, those lineages are all huge and well-established.  Instead, pick a bunch of
# sequences from the giant lineage polytomy nodes in the latest UShER tree, using a sample-paths
# file generated from today's tree by matUtils extract.
today=2023-09-22
treeDir=/hive/data/outside/otto/sarscov2phylo/$today
cd $treeDir
# If sample-paths file has not already been created:
time ~angie/github/usher/build/matUtils extract -i gisaidAndPublic.$today.masked.pb -c BA.2 \
    -S sample-paths
#real    7m29.687s

mkdir /hive/data/genomes/wuhCor1/bed/strains/$today
cd /hive/data/genomes/wuhCor1/bed/strains/$today

#*** NOT THIS TIME ***
# unxz the downloaded & uploaded GISAID sequence files
unxz $lineage.$today.fa.xz
#*** END NOT THIS TIME ***

cladePaths=$treeDir/clade-paths
samplePaths=$treeDir/sample-paths

gisaidDir=/hive/users/angie/gisaid

# Find the BA.2 polytomy node:
lineage=BA.2
grep ^$lineage$'\t' $cladePaths | cut -f 2
#node_2161928
# Get a bunch of sequences on that exact node... ah, actually, BA.2 is annotated one node upstream
# of the real polytomy:
grep node_2161928: $samplePaths | head -1
# ... node_2161926:C9344T node_2161928:A9424G node_2162206:C2790T
# We want to include C2790T, so use node_2162206 instead.
linNode=node_2162206
# For simplicity, restrict to GISAID sequences and randomly select 2000:
grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
| cut -f 1 \
| grep \|EPI_ISL_ \
| randomLines stdin 2000 $lineage.fullNames
# Extract the sequences from the full GISAID fasta
time xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
| faSomeRecords stdin $lineage.fullNames $lineage.$today.fa
#real    14m47.830s
faSize $lineage.$today.fa
#59490045 bases (298890 N's 59191155 real 59191155 upper 0 lower) in 2000 sequences in 1 files
#Total size: mean 29745.0 sd 76.6 min 29296 (SouthKorea/GJ-HERI-A0956/2022|EPI_ISL_17703033|2022-04-29) max 29901 (India/MH-INSACOG-CSIR-NEERI1135/2022|EPI_ISL_11034322|2022-02-28) median 29751

# BA.2.12.1 is annotated a couple nodes upstream of its giant polytomy.
lineage=BA.2.12.1
grep ^$lineage$'\t' $cladePaths | cut -f 2
#node_2657907
grep node_2657907: $samplePaths | head -1
#... node_2657889:C11674T node_2657907:T22917A node_2657908:C21721T node_2657979:T15009C ...
linNode=node_2657979
grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
| cut -f 1 \
| grep \|EPI_ISL_ \
| randomLines stdin 2000 $lineage.fullNames
# Making FASTA is very slow, do that later in a big loop with all the other lineages.

# XBB.1.5 is practically nothing without 17124... and reversions have tossed XBB.1.5 on some dinky
# reversion path with only a few thousand sequences total (13k including sublineages without 17124).
# So use the XBB.1.5_17124 label for XBB.1.5.
lineage=XBB.1.5
linNode=$(grep ^XBB.1.5_17124 $cladePaths | cut -f 2)
echo $linNode
#node_2396649
grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
| cut -f 1 \
| grep \|EPI_ISL_ \
| randomLines stdin 2000 $lineage.fullNames
#stdin has 2203 lines.  Random lines will only output 1101 or less lines
#on a file this size. You asked for 2000. Sorry.
# Geez, OK, fine.
grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
| cut -f 1 \
| grep \|EPI_ISL_ \
    > $lineage.fullNames
wc -l $lineage.fullNames
#2203 XBB.1.5.fullNames

# XBB.1.16 is also annotated a couple nodes upstream of polytomy.  I should add a _dropout label
# for these and annotate at the polytomy... helpful for Freyja code too.
lineage=XBB.1.16
grep ^$lineage$'\t' $cladePaths | cut -f 2
#node_2301939
grep node_2301939: $samplePaths | head -1
#... node_2301939:A22101T node_2301940:T28297C node_2302169:C29386T ...
linNode=2302169
grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
| cut -f 1 \
| grep \|EPI_ISL_ \
    > $lineage.fullNames
wc -l $lineage.fullNames
#911 XBB.1.16.fullNames

# XBB.2.3 is annotated 3 nodes upstream of its polytomy
lineage=XBB.2.3
grep ^$lineage$'\t' $cladePaths | cut -f 2
#node_2505483
grep node_2505483: $samplePaths | head -1
#... node_2505483:T23018C node_2505484:G29706T node_2505485:C4234T node_2505521:G6536A ...
linNode=node_2505521
grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
| cut -f 1 \
| grep \|EPI_ISL_ \
    > $lineage.fullNames
wc -l $lineage.fullNames
#222 XBB.2.3.fullNames
# Wow, that's so few.  Can add several sublineages if we don't get good results from that.
# Or at least allow another node or two in the regex.


# The other lineages are annotated on their big polytomy nodes.
for lineage in BA.2.75 BQ.1 XBB CH.1.1 XBB.1.9 EG.5.1; do
    linNode=$(grep ^$lineage$'\t' $cladePaths | cut -f 2)
    grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
    | cut -f 1 \
    | grep \|EPI_ISL_ \
        > $lineage.fullNames
    wc -l $lineage.fullNames
done
#611 BA.2.75.fullNames
#168 BQ.1.fullNames
#63 XBB.fullNames
#88 CH.1.1.fullNames
#177 XBB.1.9.fullNames
#213 EG.5.1.fullNames

# OK, that is just way too few for all but BA.2.75.  Allow another node...
for lineage in BQ.1 XBB CH.1.1 XBB.1.9 EG.5.1; do
    linNode=$(grep ^$lineage$'\t' $cladePaths | cut -f 2)
    grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:[ACGT0-9,]+ node_[0-9]+:' \
    | cut -f 1 \
    | grep \|EPI_ISL_ \
        > $lineage.fullNames
    wc -l $lineage.fullNames
done
#1874 BQ.1.fullNames
#743 XBB.fullNames
#665 CH.1.1.fullNames
#561 XBB.1.9.fullNames
#975 EG.5.1.fullNames
# OK, now we're talking.


# Get FASTA for the ones for which I didn't already generate FASTA...
for lineage in BA.2.12.1 BA.2.75 BQ.1 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    time xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
    | faSomeRecords stdin $lineage.fullNames $lineage.$today.fa
    faSize $lineage.$today.fa
done
#real    15m5.060s
#65395512 bases (638366 N's 64757146 real 64757146 upper 0 lower) in 2203 sequences in 1 files
#Total size: mean 29684.8 sd 146.5 min 28394 (USA/NV-NSPHL-CL2023-00002852/2023|EPI_ISL_16735355|2023-01-18) max 29862 (USA/CT-YNH-2586/2023|EPI_ISL_16757123|2023) median 29722
#real    14m31.843s
#18130724 bases (537201 N's 17593523 real 17593523 upper 0 lower) in 611 sequences in 1 files
#Total size: mean 29673.9 sd 261.8 min 28578 (India/TG-CDFD-GH-276/2022|EPI_ISL_14196571|2022-07-19) max 29903 (India/MP-AIIMS_B-ICMR-INSACOG-WGS-918/2022|EPI_ISL_13574753|2022-06-09) median 29733
#real    14m13.970s
#55584411 bases (608337 N's 54976074 real 54976074 upper 0 lower) in 1874 sequences in 1 files
#Total size: mean 29660.8 sd 196.8 min 28312 (USA/AZ-ASU93160/2022|EPI_ISL_15667635|2022-10-27) max 29871 (Italy/CAM-TIGEM-IZSM-COLLI-38577/2022|EPI_ISL_15195631|2022-09-22) median 29718
#real    14m47.237s
#22057105 bases (490469 N's 21566636 real 21566636 upper 0 lower) in 743 sequences in 1 files
#Total size: mean 29686.5 sd 192.2 min 28032 (Malaysia/IMR_LF00509/2022|EPI_ISL_15852264|2022-11-08) max 29901 (India/DL-NII-1466/2022|EPI_ISL_16170535|2022-10-25) median 29737
#real    14m51.990s
#59412748 bases (723349 N's 58689399 real 58689399 upper 0 lower) in 2000 sequences in 1 files
#Total size: mean 29706.4 sd 126.6 min 28766 (USA/TX-DSHS-19573/2022|EPI_ISL_12543273|2022-03-29) max 29874 (Australia/VIC59851/2022|EPI_ISL_13275021|2022-05-16) median 29729
#real    15m6.092s
#27058700 bases (519324 N's 26539376 real 26539376 upper 0 lower) in 911 sequences in 1 files
#Total size: mean 29702.2 sd 170.5 min 28581 (India/TN-CDFD-M3-520/2023|EPI_ISL_17544408|2023-03-28) max 29946 (India/UP-ILSGS21449/2023|EPI_ISL_17957831|2023-04-12) median 29741
#real    13m2.308s
#19719791 bases (229086 N's 19490705 real 19490705 upper 0 lower) in 665 sequences in 1 files
#Total size: mean 29653.8 sd 186.4 min 28572 (NewZealand/22YA4644/2022|EPI_ISL_15957424|2022-11-21) max 29876 (Italy/LOM-UniMI130/2022|EPI_ISL_17518108|2022-12-01) median 29701
#real    13m0.853s
#16635498 bases (293858 N's 16341640 real 16341640 upper 0 lower) in 561 sequences in 1 files
#Total size: mean 29653.3 sd 202.2 min 28575 (Malaysia/MKAI-6779747/2022|EPI_ISL_16121077|2022-11-07) max 29870 (Italy/LOM-69338838/2023|EPI_ISL_17434186|2023-04-04) median 29727
#real    12m58.907s
#6585941 bases (133719 N's 6452222 real 6452222 upper 0 lower) in 222 sequences in 1 files
#Total size: mean 29666.4 sd 187.9 min 28572 (Spain/CT-HUGTiP-A6R129/2023|EPI_ISL_17199678|2023-02-22) max 29899 (India/HR-NII-1658/2023|EPI_ISL_17372239|2023-03-07) median 29726
#real    13m15.788s
#28983299 bases (308229 N's 28675070 real 28675070 upper 0 lower) in 975 sequences in 1 files
#Total size: mean 29726.5 sd 118.7 min 28578 (Belgium/UCL_H-245G0017/2023|EPI_ISL_18242731|2023-09-02) max 29900 (USA/TX-TAMGHRC-SHS_97566/2023|EPI_ISL_18111433|2023-07-14) median 29749


ln -s ~angie/github/sarscov2lineages .
# I had previously created this conda environment
# (same conda env create shown in Kate's section above)
conda activate slAngie

for lineage in BA.2 BA.2.12.1 BA.2.75 BQ.1 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    echo $lineage
    time ./sarscov2lineages/lineageVariants.params.sh -c 50 -s 0.8 -i 0.7 -v \
        $lineage.$today.fa sarscov2lineages/NC_045512.fa
done
#BA.2
#real    2m21.662s
#BA.2.12.1
#real    3m15.397s
#BA.2.75
#real    0m42.523s
#BQ.1
#real    2m6.535s
#XBB
#real    0m54.531s
#XBB.1.5
#real    2m7.850s
#XBB.1.16
#real    1m11.199s
#CH.1.1
#real    0m46.571s
#XBB.1.9
#real    0m42.110s
#XBB.2.3
#real    0m18.271s
#EG.5.1
#real    1m16.673s


# There are lots of codons, especially in Spike, with multiple nuc substitutions in the same
# codon.  The script used above translates each nuc mutation independently, so instead of a
# cumulative codon change it does two or three separate translations.  Find and fix those.
for f in *_aa.bed; do
    echo $f
    cut -f 2 $f | uniq -d
done
#BA.2.12.1.2023-09-22_aa.bed
#27381
#BA.2.2023-09-22_aa.bed
#27381
#BA.2.75.2023-09-22_aa.bed
#22576
#27381
#BQ.1.2023-09-22_aa.bed
#11287
#CH.1.1.2023-09-22_aa.bed
#22576
#27381
#EG.5.1.2023-09-22_aa.bed
#22576
#22894
#23017
#27381
#XBB.1.16.2023-09-22_aa.bed
#22576
#22894
#23017
#27381
#XBB.1.5.2023-09-22_aa.bed
#27381
#XBB.1.9.2023-09-22_aa.bed
#22576
#22894
#27381
#XBB.2.3.2023-09-22_aa.bed
#22576
#22894
#23017
#27381
#XBB.2023-09-22_aa.bed
#22576
#22894
#27381
# All of those except for BA.5-derived BQ.1 have BA.2's G27382C,A27383T,T27384C which is
# ORF6:D61L.  When translated separately, there are two coding changes, neither of which is
# the one we want:
# NC_045512.2     27381   27384   ORF6:D61H
# NC_045512.2     27381   27384   ORF6:D61V
# So we can change ORF6:D61H to ORF6:D61L and remove D61V.
sed -re 's/ORF6:D61H/ORF6:D61L/;
         /ORF6:D61V/ d;' -i.bak *_aa.bed

# BA.2.75, CH.1.1, EG.5.1, and XBB* have G22577C in addition to G22578A, so their _aa.bed
# files have both S:G339R and S:G339D but they should have S:G339H.
for lineage in BA.2.75 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    sed -re 's/S:G339R/S:G339H/;
             /S:G339D/ d;' -i.bak $lineage.${today}_aa.bed
done

# BQ.1 has conflicting annotations on 11288 from our separate indel and substitution pipelines:
#NC_045512.2     11287   11296   del_11288
#NC_045512.2     11287   11288   T11288A
# What did I do for BA.5?  ../2022-05-04/BA.5.2022-05-03_nuc.bed has only the del.
# Yep, minimap output BQ.1.2023-09-22.fa.vars shows just the deletion. Remove that one.
sed -re '/T11288A/ d;' -i.bak BQ.1.${today}_nuc.bed
sed -re '/nsp6:S106T/ d;' -i.bak BQ.1.${today}_aa.bed

# XBB* including EG.5.1.1 have G22895C and T22896C making S:V445P (not S:V445L and S:V445A).
for lineage in XBB XBB.1.5 XBB.1.16 XBB.1.9 XBB.2.3 EG.5.1; do
    sed -re 's/S:V445L/S:V445P/;
             /S:V445A/ d;' -i.bak $lineage.${today}_aa.bed
done

# EG.5.1, XBB.1.16 and XBB.2.3 have T23018C in addition to parents' T23019C making S:F486P
for lineage in EG.5.1 XBB.1.16 XBB.2.3; do
    sed -re 's/S:F486L/S:F486P/;
             /S:F486S/ d;' -i.bak $lineage.${today}_aa.bed
done

# Double-check that we fixed all the double annotations:
for f in *_aa.bed; do
    echo $f
    cut -f 2 $f | uniq -d
done
# Just the file names, good.


for lineage in BA.2 BA.2.12.1 BA.2.75 BQ.1 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    # Fix reference name, make bigBed, install in /gbdb.  Use Max's names for bigBed files.
    sed -re 's/NC_045512\.2/NC_045512v2/' $lineage.${today}_nuc.bed \
        > ${lineage}_nuc.bed
    sed -re 's/NC_045512\.2/NC_045512v2/' $lineage.${today}_aa.bed \
        > ${lineage}_prot.bed
    for suffix in nuc prot; do
        bedToBigBed -type=bed4 ${lineage}_$suffix.bed /hive/data/genomes/wuhCor1/chrom.sizes \
            ${lineage}_$suffix.bb
        ln -sf `pwd`/${lineage}_$suffix.bb /gbdb/wuhCor1/strainMuts/
    done
done

conda deactivate

# Now update the variantMuts.ra and .html files.

# BA.2 needs to go before BA.4 but the others can appear in order after BA.5.  Follow pre-existing
# conventions...

# AA muts:
ordLetter=N
prio=15
for lineage in BA.2.12.1 BA.2.75 BQ.1 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    linUnd=$(echo $lineage | tr . _)
    linSplat=$(echo $lineage | sed -re 's/\.//g;')
    cat <<EOF
        track variantAaMuts_$linUnd
        parent variantMuts off
        subGroups variant=${ordLetter}_$linSplat mutation=AA designation=VOC
        shortLabel Omicron $lineage AA Muts
        longLabel Omicron $lineage amino acid mutations from GISAID sequences (Sep 22, 2023)
        url https://outbreak.info/situation-reports?pango=$lineage
        urlLabel $lineage Situation Report at outbreak.info
        bigDataUrl /gbdb/wuhCor1/strainMuts/${lineage}_prot.bb
        color 219,40,35
        priority $prio

EOF
    ordLetter=$(echo $ordLetter | tr 'A-YZ' 'B-ZA')
    prio=$(($prio + 1))
done

# Nuc muts:
ordLetter=N
prio=125
for lineage in BA.2.12.1 BA.2.75 BQ.1 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    linUnd=$(echo $lineage | tr . _)
    linSplat=$(echo $lineage | sed -re 's/\.//g;')
    cat <<EOF
        track variantNucMuts_$linUnd
        parent variantMuts off
        subGroups variant=${ordLetter}_$linSplat mutation=NUC designation=VOC
        shortLabel Omicron $lineage Nuc Muts
        longLabel Omicron VOC ($lineage) nucleotide mutations identifed from GISAID sequences (Sep 2023)
        url https://outbreak.info/situation-reports?pango=$lineage
        urlLabel $lineage Situation Report at outbreak.info
        bigDataUrl /gbdb/wuhCor1/strainMuts/${lineage}_nuc.bb
        color 219,40,35
        priority $prio

EOF
    ordLetter=$(echo $ordLetter | tr 'A-YZ' 'B-ZA')
    prio=$(($prio + 1))
done

# Description table
color=$(perl -we 'print sprintf("%02X%02X%02X", 219,40,35) . "\n";')
echo $color
#DB2823
for lineage in BA.2.12.1 BA.2.75 BQ.1 XBB XBB.1.5 XBB.1.16 CH.1.1 XBB.1.9 XBB.2.3 EG.5.1; do
    cat <<EOF
  <tr>
    <td><span style="background-color: #$color;">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</span></td>
    <td>Omicron</td>
    <td>$lineage</td>
    <td>2??NS</td>
    <td>GRA</td>
    <td>???First det</td>
    <td>???Date</td>
    <td>VUM</td>
  </tr>

EOF
done


#########################################################################
# Omicron updates - 2024-01-29 - Angie

# For each lineage in XBB.1.5.70, HK.3, BA.2.86, JN.1, do the following:

# *** NOT THIS TIME, just for future reference: ***
# Download method for a relatively new lineage
# * Log in on gisaid.org, go to Search tab, select Lineage: <whichever lineage>
# * If there are more than a couple thousand matching sequences, restrict by date to the earliest
#   range that gets ~2000 sequences
# * click checkbox to select all, click Download, leave the checkbox "Replace spaces with
#   underscores in FASTA header" checked, click Download, rename and xz the downloaded file,
#   upload file to hgwdev.
#*** END NOT THIS TIME ***

# At this point, those lineages are all huge and well-established.  Instead, pick a bunch of
# sequences from the giant lineage polytomy nodes in the latest UShER tree, using a sample-paths
# file generated from today's tree by matUtils extract.
today=2024-01-29
treeDir=/hive/data/outside/otto/sarscov2phylo/$today
cd $treeDir
# If sample-paths file has not already been created:
time ~angie/github/usher/build/matUtils extract -i gisaidAndPublic.$today.masked.pb -c BA.2 \
    -S sample-paths
#real    9m24.485s

mkdir /hive/data/genomes/wuhCor1/bed/strains/$today
cd /hive/data/genomes/wuhCor1/bed/strains/$today

#*** NOT THIS TIME ***
# unxz the downloaded & uploaded GISAID sequence files
unxz $lineage.$today.fa.xz
#*** END NOT THIS TIME ***

cladePaths=$treeDir/clade-paths
samplePaths=$treeDir/sample-paths

gisaidDir=/hive/users/angie/gisaid

for lineage in XBB.1.5.70 HK.3 BA.2.86 JN.1; do
    linNode=$(grep ^$lineage$'\t' $cladePaths | cut -f 2)
    grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:' \
    | cut -f 1 \
    | grep \|EPI_ISL_ \
        > $lineage.fullNames
    wc -l $lineage.fullNames
done
#64 XBB.1.5.70.fullNames
#151 HK.3.fullNames
#1 BA.2.86.fullNames
#1684 JN.1.fullNames

# OK, we need to try again for all but JN.1, allowing another node:
for lineage in XBB.1.5.70 HK.3 BA.2.86; do
    linNode=$(grep ^$lineage$'\t' $cladePaths | cut -f 2)
    grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:[ACGT0-9,]+ node_[0-9]+:' \
    | cut -f 1 \
    | grep \|EPI_ISL_ \
        > $lineage.fullNames
    wc -l $lineage.fullNames
done
#221 XBB.1.5.70.fullNames
#944 HK.3.fullNames
#91 BA.2.86.fullNames

# Try allowing one more node for BA.2.86:
for lineage in BA.2.86; do
    linNode=$(grep ^$lineage$'\t' $cladePaths | cut -f 2)
    grep $linNode: $samplePaths | grep -vE $linNode':[ACGT0-9,]+ node_[0-9]+:[ACGT0-9,]+ node_[0-9]+:[ACGT0-9,]+ node_[0-9]+:' \
    | cut -f 1 \
    | grep \|EPI_ISL_ \
        > $lineage.fullNames
    wc -l $lineage.fullNames
done
#549 BA.2.86.fullNames

# Now we have hundreds for each lineage.

# Get FASTA.
for lineage in XBB.1.5.70 HK.3 BA.2.86 JN.1; do
    time xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
    | faSomeRecords stdin $lineage.fullNames $lineage.$today.fa
    faSize $lineage.$today.fa
done
#real    15m44.272s
#6563045 bases (103705 N's 6459340 real 6459340 upper 0 lower) in 221 sequences in 1 files
#Total size: mean 29697.0 sd 106.1 min 29289 (Brazil/CE-LACENCE-232200557/2023|EPI_ISL_17680974|2023-04-03) max 29847 (USA/CA-HLX-STM-JZKMW6PDA/2023|EPI_ISL_18288754|2023-08-27) median 29726
#real    17m1.099s
#28077756 bases (175271 N's 27902485 real 27902485 upper 0 lower) in 944 sequences in 1 files
#Total size: mean 29743.4 sd 107.2 min 28575 (Spain/MD-HRYC-72304619/2023|EPI_ISL_18098568|2023-08-02) max 29873 (Yunnan/YNCDC-2692/2023|EPI_ISL_18292825|2023-09-06) median 29754
#real    17m22.534s
#16279033 bases (276054 N's 16002979 real 16002979 upper 0 lower) in 549 sequences in 1 files
#Total size: mean 29652.2 sd 217.4 min 27512 (Belgium/LHUB-ULB_81752875/2023|EPI_ISL_18544196|2023-11-17) max 29891 (Sweden/S-1260893029R4/2023|EPI_ISL_18795991|2023-12-04) median 29711
#real    15m32.348s
#49988263 bases (568278 N's 49419985 real 49419985 upper 0 lower) in 1684 sequences in 1 files
#Total size: mean 29684.2 sd 173.2 min 28563 (Malaysia/MKAKB_CL_80975/2023|EPI_ISL_18798545|2023-12-12) max 29875 (Sweden/AB-01_SE100_23CS503157/2023|EPI_ISL_18522860|2023-11-10) median 29721

ln -s ~angie/github/sarscov2lineages .
# I had previously created this conda environment
# (same conda env create shown in Kate's section above)
conda activate slAngie

for lineage in XBB.1.5.70 HK.3 BA.2.86 JN.1; do
    echo $lineage
    time ./sarscov2lineages/lineageVariants.params.sh -c 50 -s 0.8 -i 0.7 -v \
        $lineage.$today.fa sarscov2lineages/NC_045512.fa
done
#XBB.1.5.70
#real    0m27.572s
#HK.3
#real    1m18.207s
#BA.2.86
#real    0m44.637s
#JN.1
#real    2m31.823s

# There are lots of codons, especially in Spike, with multiple nuc substitutions in the same
# codon.  The script used above translates each nuc mutation independently, so instead of a
# cumulative codon change it does two or three separate translations.  Find and fix those.
for f in *_aa.bed; do
    echo $f
    cut -f 2 $f | uniq -d
done
#BA.2.86.2024-01-29_aa.bed
#11287
#22030
#22576
#22894
#23017
#27381
#HK.3.2024-01-29_aa.bed
#22576
#22894
#23017
#27381
#JN.1.2024-01-29_aa.bed
#22030
#22576
#22894
#23017
#27381
#XBB.1.5.70.2024-01-29_aa.bed
#22576
#22894
#23017
#27381
# Checking vs. covariants.org's page for each variant is helpful.

# Uh-oh, BA.2.86 has a complex region where there's a deletion of 1 base (frameshift!) and then
# later an insertion that should rescue the frameshift:
#NC_045512.2     22030   22031   S:del_22031
#NC_045512.2     22030   22033   S:F157S
#NC_045512.2     22030   22033   S:F157L
#NC_045512.2     22033   22036   S:R158G
#NC_045512.2     22034   22035   S:ins_22035

# To really figure out what the amino acid sequence is there, better do a real alignment and
# translation of S:157-159...
faFrag -mixed sarscov2lineages/NC_045512.fa 22027 22036 stdout
#>NC_045512.2:22027-22036
#GAGTTCAGA

# BA.2.86.2024-01-29.fa.vcf.filtered has 3 substitutions in that range:
#NC_045512.2     22032   T22032C T       C       .       .       AC=529;AN=539
#NC_045512.2     22033   C22033A C       A       .       .       AC=528;AN=538
#NC_045512.2     22034   A22034G A       G       .       .       AC=529;AN=532

# minimap2 has the two balancing frameshift indels:
#V       NC_045512.2     22030   22031   493     60      t       -       Israel/ICH-741198454/2023|EPI_ISL_18096761|2023-07-31   21990   21990   +
#V       NC_045512.2     22034   22034   493     60      -       g       Israel/ICH-741198454/2023|EPI_ISL_18096761|2023-07-31   21993   21994   +

faOneRecord BA.2.86.2024-01-29.fa 'Israel/ICH-741198454/2023|EPI_ISL_18096761|2023-07-31' \
| faFrag -mixed stdin 21987 21996 stdout
#>Israel/ICH-741198454/2023|EPI_ISL_18096761|2023-07-31:21987-21996
#GAGTCAGGA

# OK, manually align:

# no indels:
#GAGTTCAGA
#    |||
#GAGTCAGGA

# indels:
#GAGTTCA-GA
#GAG-TCAGGA

# OK, so in this case, go with the substitution view over the indel view.
# GAG,TTC,AGA = E,F,R --> GAG,TCA,GGA --> E,S,G
# So ignore the del/ins and call it S:F157S and S:R158G.  That's what CoVariants has too, good.

# Double-check that we fixed all the double annotations:
for f in *_aa.bed; do
    echo $f
    cut -f 2 $f | uniq -d
done
# Just the file names, good.


for lineage in XBB.1.5.70 HK.3 BA.2.86 JN.1; do
    # Fix reference name, make bigBed, install in /gbdb.  Use Max's names for bigBed files.
    sed -re 's/NC_045512\.2/NC_045512v2/' $lineage.${today}_nuc.bed \
        > ${lineage}_nuc.bed
    sed -re 's/NC_045512\.2/NC_045512v2/' $lineage.${today}_aa.bed \
        > ${lineage}_prot.bed
    for suffix in nuc prot; do
        bedToBigBed -type=bed4 ${lineage}_$suffix.bed /hive/data/genomes/wuhCor1/chrom.sizes \
            ${lineage}_$suffix.bb
        ln -sf `pwd`/${lineage}_$suffix.bb /gbdb/wuhCor1/strainMuts/
    done
done

conda deactivate

#TODO
# Now update the variantMuts.ra and .html files.

# See above.
