# Genomic Answers for Kids (GA4K), Children's Mercy - 2026-04-16 Claude max
# GA4K is a pediatric rare-disease PacBio HiFi long-read cohort (Cohen et al.
# 2022, Genet Med, PMID 35305867). The release ships 24 per-chromosome VCFs of
# site-only small variants (SNVs and short indels), filtered to variants
# replicated in >=2 unrelated GA4K individuals or matched to an HPRC variant.
# Upstream data lives under /hive/data/genomes/hg38/bed/lrSv/GA4K (co-located
# with the matched GA4K structural-variant release; see the lrSv makedoc).
cd /hive/data/genomes/hg38/bed/lrSv/GA4K
bcftools concat -Oz -o ga4kSnv.vcf.gz \
    pacbio_snv_vcf/pb_joint_merged.snv.chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y}.vcf.gz
tabix -p vcf ga4kSnv.vcf.gz
# Symlinks placed under /gbdb/hg38/varFreqs/ga4k/ for the ga4kSnv stanza in
# trackDb/human/varFreqs.ra.

# Mexico Biobank, Max, Nov 8 2025
CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive
/data/genomes/hg19/bed/varFreqs/mexbb/MXBv2.vcf.gz /hive/data/genomes/hg38/p14Clean/hg38.p14.fa MXBv2.lift.hg19ToHg38.vcf && bgzip MXBv2.lift.hg19ToHg38.vcf && bcftools sort MXBv2.lift.hg19ToHg38.vcf -Oz -m 200G -T /data/tmp/ -o MXBv2.lift.hg19ToHg38.vcf.gz && tabix -p vcf MXBv2.lift.hg19ToHg38.vcf.gz

# Mexico City Prospective study, Max Oct 28 2025
cd /hive/data/genomes/hg38/bed/varFreqs/mcps/
for i in `seq 1 22` X; do wget https://rgc-mcps.regeneron.com/downloads/20230130/chr$i.freq.vcf.gz; done
for i in `seq 1 22` X; do wget https://rgc-mcps.regeneron.com/downloads/20230130/chr$i.freq.vcf.gz.tbi; done
mv *vcf* vcf/
bcftools concat  --threads 16  -Oz -o mcps.freq.vcf.gz vcf/chr{1..22}.freq.vcf.gz vcf/chrX.freq.vcf.gz
# make normal AC and AF and AN fields for mouseovers
zcat mcps.freq.vcf.gz | sed -e 's/_RAW//g' > mcps.fix.freq.vcf
mv -f mcps.fix.freq.vcf mcps.freq.vcf
bgzip mcps.freq.vcf
tabix -p vcf mcps.freq.vcf.gz 

# Regeneron million exomes, Max, Nov 3 2025
cd /hive/data/genomes/hg38/bed/varFreqs/me
for i in `seq 1 22` X Y; do wget https://rgc-research.regeneron.com/me/downloads/20231004/rgc_me_variant_frequencies_chr${i}_20231004.vcf.gz.tbi; done
bcftools concat  --threads 10  -Oz -o rgc_me_freqs_20231004.vcf.gz rgc_me_variant_frequencies_chr{1..22}_20231004.vcf.gz  rgc_me_variant_frequencies_chrX_20231004.vcf.gz rgc_me_variant_frequencies_chrY_20231004.vcf.gz 
zcat rgc_me_freqs_20231004.vcf.gz | sed -e 's/ALL_//g' > rgc_me_freqs_20231004.fix.vcf
tabix -p vcf rgc_me_freqs_20231004.vcf.gz

# GA south asia 100k pilot
cd /hive/data/genomes/hg38/bed/varFreqs/ga100k/
parallel -j 8 wget -q --no-check-certificate https://browser.genomeasia100k.org/service/web/download_files/{}.substitutions.annot.cont_withmaf.vcf.gz ::: {1..22} X Y
# fix the header line, remove "FORMAT"
for i in *.vcf.gz; do echo "zcat $i |   awk 'BEGIN{OFS=\"\\t\"} /^#CHROM/{NF=8; print; next} /^#/ {print; next} {NF=8; print}' |   bgzip -c > fixed/$i" >> cmds.txt; done
parallel -j 8 < cmds.txt
bcftools concat  --threads 16  -Oz -o ../ga100k.subst.vcf.gz fixed/{1..22}.substitutions.annot.cont_withmaf.vcf.gz
# add indels
wget -q --no-check-certificate https://browser.genomeasia100k.org/service/web/download_files/All.indels.annot.cont_withmaf.vcf.gz
# index
tabix -p vcf ../ga100k*.vcf.gz
tabix -p vcf All*.vcf.gz

# TOPMED Freeze 10
cd /hive/data/genomes/hg38/bed/varFreqs/topmed/
# need to download the VCFs manually, 22 VCFs, with one time links from https://bravo.sph.umich.edu/vcfs.html
# grrrr...
bcftools concat  --threads 10  -Oz -o topmed10.vcf.gz {1..22}.vcf.gz X.vcf.gz 
tabix -p vcf topmed10.vcf.gz

# Abraom brazil
# get unique download link from https://abraom.ib.usp.br/download/index.php
cd /hive/data/genomes/hg38/bed/varFreqs/abraom/
wget 'https://abraom.ib.usp.br/download/download-files.php?fid=RklEMTIzNDU2&key=1762266466-key690a0d62348de0.22872232' -O abraom.tar
tar xvfz abraom.tar
ln -s  /hive/data/genomes/hg38/p14Clean/hg38.p14.fa
samtools faidx hg38.p14.fa 
python ~/kent/src/hg/makeDb/scripts/varFreqs/abraomToVcf.py SABE1171.Abraom.clean.tsv abraom.vcf hg38.p14.fa
tabix -p vcf abraom.vcf.gz 

# SGDP
cd /hive/data/genomes/hg38/bed/varFreqs/sgp/
CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz /hive/data/genomes/hg19/bed/varFreqs/sgdp/SGDP.nh2.vcf.gz hg38.p14.fa sgdp.hg38.nh2.vcf
bgzip sgdp.hg38.nh2.vcf
bcftools sort sgdp.hg38.nh2.vcf.gz -Oz -m 200G -T /data/tmp/ -o sgdp.hg38.nh2.sort.vcf.gz 
mv sgdp.hg38.nh2.sort.vcf.gz SGDP.nh2.vcf.gz
tabix -p vcf SGDP.nh2.vcf.gz

# KOVA
cd /hive/data/genomes/hg38/bed/varFreqs/sgp/
# got tsv file via google drive link from 장인수 <insoo078@kribb.re.kr> 
# VCF converter, written by Claude Opus 4.1 using 2 lines of example input
python ~/kent/src/hg/makeDb/scripts/varFreqs/kovaToVcf.py 1_KOVA.v7.tsv.gz kova.v7.vcf
bgzip kova.v7.vcf
tabix -p vcf kova.v7.vcf.gz

# NPM Singapore
cd /hive/data/genomes/hg38/bed/varFreqs/npm/
# downloaded data manually from chorus website, https://chorus.grids-platform.io/vcfdl
bcftools concat  --threads 10  -Oz -o SG10K_Health_r5.3.2.sites.vcf.bgz  SG10K_Health_r5.3.2.sites.chr{1..22}.vcf.bgz SG10K_Health_r5.3.2.sites.chrX.vcf.bgz SG10K_Health_r5.3.2.sites.chrY.vcf.bgz 
tabiv -p vcf SG10K_Health_r5.3.2.sites.vcf.bgz

# Saudi 300 genomes
cd /hive/data/genomes/hg38/bed/varFreqs/saudi
wget https://figshare.com/ndownloader/files/51297884 -O 51297884.tsv.gz
python3 ~/kent/src/hg/makeDb/scripts/varFreqs/saudiToVcf.py
bgzip saudi.vcf
tabix -p vcf saudi.vcf.gz

# SFARI SPARK
cd /hive/data/genomes/hg38/bed/varFreqs/sparkExomes/
# used globus to download into vcf/
sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/SPARK.iWES_v3.2024_08.deepvariant 8
bcftools norm -m-  SPARK.iWES_v3.2024_08.deepvariant.sites.vcf.gz -Oz > SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz && tabix -p vcf SPARK.iWES_v3.2024_08.deepvariant.norm.vcf.gz

cd /hive/data/genomes/hg38/bed/varFreqs/sparkWgs/
# used globus to download into vcf/
sh ~/kent/src/hg/makeDb/scripts/varFreqs/sparkMergeVcfAddCounts.sh vcf/wgs_12519_genome.deepvariant 8
bcftools norm -m-  wgs_12519_genome.deepvariant.sites.vcf.gz -Oz > wgs_12519_genome.deepvariant.norm.vcf.gz
tabix -p vcf wgs_12519_genome.deepvariant.norm.vcf.gz

# NCBI ALFA bigBed to VCF, Max Jan 26 2026
# Source: ALFA R4 bigBed files, 904M variants, output 163M with non-zero AF
cd /hive/data/genomes/hg38/bed/varFreqs/alfa
python3 ~/kent/src/hg/makeDb/scripts/varFreqs/alfa_to_vcf.py --out ALFA.vcf --zero-af-file ALFA_zero.txt
# Compress and index
bgzip ALFA.vcf
tabix -p vcf ALFA.vcf.gz
# Final: 2.7GB, 163M variants (146M SNPs, 17M indels), ALFA_zero.txt has 26GB of zero-AF variants

# HRC (Haplotype Reference Consortium), Claude max, Mar 17 2026
# Source: HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz
# 40M variants from 32,488 WGS samples, originally on GRCh37
cd /hive/data/genomes/hg38/bed/varFreqs/hrc/
# download HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz from http://www.haplotype-reference-consortium.org/site
python3 ~/kent/src/hg/makeDb/scripts/varFreqs/hrcToVcf.py
# 40,405,505 variants read, 8,052 unmapped, 40,397,453 lifted to hg38
# sort, compress, index
bcftools sort hrc.vcf -Oz -o hrc.vcf.gz
tabix -p vcf hrc.vcf.gz
rm hrc.vcf
ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz
ln -s /hive/data/genomes/hg38/bed/varFreqs/hrc/hrc.vcf.gz.tbi /gbdb/hg38/varFreqs/hrc/hrc.vcf.gz.tbi

# Australia, Max, Jan 2026
# received files from m.hobbs@garvan.org.au
cd /hive/data/genomes/hg38/bed/varFreqs/mgrb/
bcftools norm -f hg38.fa -m-any MGRB.phase3.GRCh38.vcf.gz -o MGRB.phase3.GRCh38.norm.vcf.gz
tabix MGRB.phase3.GRCh38.norm.vcf.gz

# SCHEMA Schizophrenia Exome Meta-Analysis track for hg38, Max, Jan 22 2026
# source: https://schema.broadinstitute.org/
# Original is in hg19/GRCh37 coordinates
cd /hive/data/genomes/hg38/bed/varFreqs/schema
# SCHEMA_variant_results.vcf.bgz (384M, hg19 coordinates)
# Step 1: Add AC, AN, AF fields by summing case+control counts
~/kent/src/hg/makeDb/scripts/varFreqs/schema_addAcAnAf.py
bgzip SCHEMA_variant_results_withAF.vcf
tabix -p vcf SCHEMA_variant_results_withAF.vcf.gz
# Step 2: Liftover from hg19 to hg38
# prep hg38 reference FASTA
zcat /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/hg38.fa.gz > hg38.fa
samtools faidx hg38.fa
CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
    SCHEMA_variant_results_withAF.vcf.gz \
    hg38.fa \
    SCHEMA_variant_results_hg38.vcf
# Output stats: Total entries: 8865268, Failed to map: 780
# Sort
grep "^#" SCHEMA_variant_results_hg38.vcf > SCHEMA_variant_results_hg38_sorted.vcf
grep -v "^#" SCHEMA_variant_results_hg38.vcf | sort -k1,1V -k2,2n >> SCHEMA_variant_results_hg38_sorted.vcf
# Compress and index
bgzip SCHEMA_variant_results_hg38_sorted.vcf
tabix -p vcf SCHEMA_variant_results_hg38_sorted.vcf.gz
# Clean up temporary files
rm -f SCHEMA_variant_results_hg38.vcf SCHEMA_variant_results_hg38.vcf.unmap hg38.fa hg38.fa.fai

# Gregor rare disease project, Max, Mar 2026
cd /hive/data/genomes/hg38/bed/varFreqs/gregor/
# Downloaded from G Drive, pointed to by Jon Bernstein, Stanford
# https://drive.google.com/drive/folders/1v-BnW7nKcEjF-NyLqU1Up3YJuP5KJJAg
# created symlink into my UCSC G Drive, then used rclone
rclone copy mhaeussldrive:RO4 ./
bcftools concat --threads 16 -Oz -o gregor.vcf.gz chr{1..22}.vcf.gz chrX.vcf.gz chrY.vcf.gz
tabix -p vcf gregor.vcf.gz
# output ~20 GB, took 10 minutes.

# HGDP1k data from the phased Vars track, Max/Claude, Mar 18 2026
# Just flattening what we have and reducing details
# Source: 3.2TB VCF with 4094 genomes and per-population INFO fields for 80 populations
# Strip genotypes and keep only overall + continental group fields (drop per-population-per-sex)
# Already has chr prefix, no rename needed
# Note: first attempt kept all fields -> 169GB, too large. This version keeps only continental groups.
cd /hive/data/genomes/hg38/bed/varFreqs/hgdp1kFreq/
KEEP="INFO/AC,INFO/AF,INFO/AN,INFO/nhomalt,INFO/gnomad_AC,INFO/gnomad_AF,INFO/gnomad_AN,INFO/gnomad_AC_afr,INFO/gnomad_AF_a
fr,INFO/gnomad_AN_afr,INFO/gnomad_AC_ami,INFO/gnomad_AF_ami,INFO/gnomad_AN_ami,INFO/gnomad_AC_amr,INFO/gnomad_AF_amr,INFO/g
nomad_AN_amr,INFO/gnomad_AC_asj,INFO/gnomad_AF_asj,INFO/gnomad_AN_asj,INFO/gnomad_AC_eas,INFO/gnomad_AF_eas,INFO/gnomad_AN_
eas,INFO/gnomad_AC_fin,INFO/gnomad_AF_fin,INFO/gnomad_AN_fin,INFO/gnomad_AC_mid,INFO/gnomad_AF_mid,INFO/gnomad_AN_mid,INFO/
gnomad_AC_nfe,INFO/gnomad_AF_nfe,INFO/gnomad_AN_nfe,INFO/gnomad_AC_oth,INFO/gnomad_AF_oth,INFO/gnomad_AN_oth,INFO/gnomad_AC
_sas,INFO/gnomad_AF_sas,INFO/gnomad_AN_sas,INFO/gnomad_popmax,INFO/gnomad_faf95_popmax"
# This took days to complete, so asked Claude to make it parallel
#bcftools view -G /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz --threads 8 \
#| bcftools annotate -x "^${KEEP}" -Oz --threads 4 -o hgdp1k.freq.vcf.gz
# use 30 threads, and chunks of 50 Mbp
sh ~/kent/src/hg/makeDb/scripts/varFreqs/vcfFilterParallel.sh /gbdb/hg38/phasedVars/hgdp1k/gnomad.genomes.v3.1.2.hgdp_tgp.vcf.gz hgdp1k.freq.parallel.vcf.gz "$KEEP" 30 50 &
tabix -p vcf hgdp1k.freq.vcf.gz

# Swefreq, Max, Feb 2026
# downloaded files from https://swefreq.nbis.se/dataset/SweGen/download
# Access was approved through the website, but I emailed swefreq@scilifelab.se, it needed a reminder email
# Also got email from adam.ameur@igp.uu.se with followup info and do-no-allow-downloads instruction
cd /hive/data/genomes/hg38/bed/varFreqs/swefreq

# Indigenomes, Max Jan 2026
# downloaded from https://clingen.igib.res.in/indigen/, used as-is
cd /hive/data/genomes/hg38/bed/varFreqs/indigenomes/

# Japan Tommo 60k, Max Jan 2026
# downloaded from https://jmorp.megabank.tohoku.ac.jp/downloads
cd /hive/data/genomes/hg38/bed/varFreqs/tommo61kjpn/
# copied urls from website
wget -i urls.txt 
bcftools concat --threads 16 -Oz -o tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz \
    tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome-chr{1..22}.vcf.gz
tabix -p vcf tommo-61kjpn-20250616-GRCh38-snvindel-af-autosome.vcf.gz

# FinnGen, Max/Claude, Jan 2026
cd /hive/data/genomes/hg38/bed/varFreqs/finngen/                                                                           
# Source TSV was downloaded from FinnGen (via email link from Google Cloud bucket)                                         
# finnge_R12_annotated_variants_v1.gz (32 GB TSV)                                                                          
# Convert TSV to VCF using custom Python script (written by Claude Opus 4.5)                                               
python ~/kent/src/hg/makeDb/scripts/varFreqs/finngen_to_vcf.py \                                                                    
    finnge_R12_annotated_variants_v1.gz \                                                                                    
    finnge_R12_annotated_variants_v1.vcf                                                                                     
# Compress and index                                                                                                       
bgzip finnge_R12_annotated_variants_v1.vcf -@8                                                                             
tabix -p vcf finnge_R12_annotated_variants_v1.vcf.gz                                                                       

# All of Us, Max Feb 2026
# Received from Qudsi at UCSC in the Ioannidis group via phoenix
# only concated and ran tabix on it
cd /hive/data/genomes/hg38/bed/varFreqs/allofus/
bcftools concat --threads 16 -Oz -o allOfUs.locAncFreq.vcf.gz clean/allele_freq_chr{1..22}.NW.clean.conf90.oneline.vcf.gz
tabix allOfUs.locAncFreq.vcf.gz

##########
# 2026-03-27 Claude max

# Two phased SV VCF tracks moved into phasedVars superTrack from lrSv:
# - han945SvVcf: Per-sample genotypes for 945 Han Chinese SVs
# - lrSv1kgOntPhased: Phased SVs from 1,019 diverse humans (1KG ONT)
# Data files remain in /hive/data/genomes/hg38/bed/lrSv/
# Symlinks moved from /gbdb/{hg38,hs1}/lrSv/ to /gbdb/{hg38,hs1}/phasedVars/
# Build documentation for these tracks is in lrSv.txt

##########
# 2026-04-20 Claude max

# CoLoRSdb v1.2.0 long-read SNV/indel population frequencies added as
# the colorsDbSnv subtrack of varFreqs, for both hg38 and hs1.
#
# Upstream VCFs (GRCh38 and CHM13 releases) are already present in
# /hive/data/genomes/hg38/bed/lrSv/colorsDb/ (placed there when the
# CoLoRSdb SV track was first built under lrSv). We just add VCF
# symlinks under each assembly's varFreqs directory using a consistent
# filename so the shared trackDb stanza can use $D.

mkdir -p /gbdb/hg38/varFreqs/colorsDb /gbdb/hs1/varFreqs/colorsDb
ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.GRCh38.v1.2.0.deepvariant.glnexus.vcf.gz     /gbdb/hg38/varFreqs/colorsDb/colorsDbSnv.vcf.gz
ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.GRCh38.v1.2.0.deepvariant.glnexus.vcf.gz.tbi /gbdb/hg38/varFreqs/colorsDb/colorsDbSnv.vcf.gz.tbi
ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.CHM13.v1.2.0.deepvariant.glnexus.vcf.gz      /gbdb/hs1/varFreqs/colorsDb/colorsDbSnv.vcf.gz
ln -sf /hive/data/genomes/hg38/bed/lrSv/colorsDb/CoLoRSdb.CHM13.v1.2.0.deepvariant.glnexus.vcf.gz.tbi  /gbdb/hs1/varFreqs/colorsDb/colorsDbSnv.vcf.gz.tbi

# The varFreqs.ra trackDb file is already in human/ (shared for both
# hg38 and hs1 via the human/trackDb.ra include), so no move was needed.
# Only colorsDbSnv is expected to render on hs1 - the other varFreqs
# subtracks have hg38-only data and will silently show nothing there.

##########
# 2026-04-20 Claude max
#
# Rebuilt varFreqsAll combined bigBed to include GA4K and CoLoRSdb
# long-read PacBio subtracks that were added to varFreqs since the
# last build (Mar 20).
#
# Steps (in /hive/data/genomes/hg38/bed/varFreqs/all):
# 1. Added GA4K and CoLoRSdb rows to
#      ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv
#    and appended their /gbdb paths to files.txt.
# 2. Deleted merged.vcf.gz and merged.annotated.vcf.gz to force a full
#    merge + bcftools csq re-annotation (per-sample normalized VCFs
#    from the previous run were kept; only the two new VCFs were
#    normalized in Step 4).
# 3. Ran ./mergeAndAnnotate.sh (~55 min: 5 min per-file, ~15 min merge,
#    ~35 min csq).
# 4. Ran ./vcfToBigBed.py --output-prefix varFreqsAll --threads 8
#    (Phase 1 pre-extract ~90 min, Phase 2 chrom BED build ~30 min).
# 5. bedToBigBed on 275 GB sorted BED (~2 h) to produce 37.7 GB
#    varFreqsAll.bb with 1,165,666,478 records and 113 fields.
# 6. Updated varFreqs.ra filterValues.sources and added
#    filterByRange.GA4KAF/AC and filterByRange.CoLoRSdbAF/AC.
# Existing /gbdb/hg38/varFreqs/varFreqsAll.bb symlink was preserved.

# 2026-04-22 Claude max
# GWAS SVatalog small-variant (SNV/indel) allele frequencies from the 101
# SVatalog samples, sibling of the lrSv chirmade101Sv structural-variant
# track. Paper: Chirmade et al. 2026, Heredity, PMID 41203876.
# SNPs were called from 10X Genomics linked short-read WGS of the 101
# samples with GATK HaplotypeCaller v4.0.0.0 and phased with SHAPEIT v4.2.0.
# Data: the per-chromosome allele-frequency text files were downloaded into
# /hive/data/genomes/hg38/bed/varFreqs/svCatalog/ alongside the LD-stats
# files (see the companion Zenodo deposit 13367574).
cd /hive/data/genomes/hg38/bed/varFreqs/svCatalog/
# Convert the 23 per-chrom *_allele_freq.txt files to a single sites-only
# VCF with AF/AC/AN plus gnomAD v3.1 NFE AF and dbSNP RSID as INFO fields.
# AC is synthesized as round(AF * 202) and AN is fixed at 202 since the
# source release does not ship AC/AN.
python3 ~/kent/src/hg/makeDb/scripts/varFreqs/svatalogFreqToVcf.py \
    svatalog.vcf chr{1..22}_allele_freq.txt chrX_allele_freq.txt
bcftools sort svatalog.vcf -Oz -m 16G -T /tmp/ -o svatalog.vcf.gz
tabix -p vcf svatalog.vcf.gz
rm -f svatalog.vcf
# 8,814,835 variants -> 172 MB bgzipped + 1.6 MB tabix index.
# Symlinks placed under /gbdb/hg38/varFreqs/svatalog/ for the svatalogSnv
# stanza in trackDb/human/varFreqs.ra.

# varFreqsAll rebuild, 2026-04-22 Claude max
# Regenerate the All Databases Combined track to include SVatalog. Source
# count rises from 23 to 24 databases; final bigBed is 35.3 GB with
# 1,166,005,346 records and 115 fields (up from 113). Pipeline run in
# /hive/data/genomes/hg38/bed/varFreqs/all/:
# 1. Added /gbdb/hg38/varFreqs/svatalog/svatalog.vcf.gz to files.txt and
#    the SVatalog row to ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv.
# 2. Cleared stale merged.vcf.gz / merged.annotated.vcf.gz to force re-merge.
# 3. Ran ./mergeAndAnnotate.sh (~60 min: pre-extract skip+svatalog, ~20 min
#    merge, ~35 min csq).
# 4. Ran python3 vcfToBigBed.py --output-prefix varFreqsAll --threads 8
#    (Phase 1 ~90 min re-extract, Phase 2 chrom BED build ~30 min,
#    concat+sort ~45 min).
# 5. bedToBigBed on 260 GB sorted BED (~2h 15m) -> 35.3 GB varFreqsAll.bb.
# 6. Updated varFreqs.ra filterValues.sources and added
#    filterByRange.SVatalogAF / filterByRange.SVatalogAC.
# Existing /gbdb/hg38/varFreqs/varFreqsAll.bb symlink (pointing at
# /hive/data/genomes/hg38/bed/varFreqs/all/varFreqsAll.bb) was preserved
# and now resolves to the new 35.3 GB build.

##########
# 2026-04-29 Claude max
# Indigenous African 180 WGS allele frequencies (Tishkoff lab,
# Fan et al. 2023 Cell, PMID 36868214). 180 individuals (15 per
# population) from 12 indigenous African populations sequenced at >30x
# on Illumina HiSeq X Ten. Sites-only SNP VCF with aggregate AC/AF/AN
# (no per-population frequencies) was supplied directly by Matthew
# Hansen (mhansen@upenn.edu, Tishkoff lab) via Box. Original calls on
# hs37d5 (GRCh37) so we lift to hg38.
cd /hive/data/genomes/hg38/bed/varFreqs/tishkoff/
# Source: 180wgs.SNPs.sites.AF.vcf.gz (~316 MB, 33,618,897 SNPs,
# autosomes only, bare chromosome names "1"-"22").
# Step 1: rename bare chromosome names to chr-prefixed UCSC names.
bcftools annotate --rename-chrs chrRename.txt -Oz \
    -o tishkoff.hg19.chr.vcf.gz 180wgs.SNPs.sites.AF.vcf.gz
# Step 2: lift to hg38 with CrossMap (~5 min). hg38.fa is the same
# /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa used elsewhere.
CrossMap.py vcf /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
    tishkoff.hg19.chr.vcf.gz \
    /hive/data/genomes/hg38/bed/varFreqs/all/hg38.fa \
    tishkoff.hg38.unsorted.vcf
# CrossMap reports: 33,618,897 input -> 9,066 unmapped, 33,609,831 lifted.
# Step 3: drop variants that landed on alt/random/Un/fix contigs and
# fix the contig= header lines (CrossMap dropped the chr prefix from
# the contig IDs even though data lines have it).
awk 'BEGIN{OFS="\t"} /^#/ {next} $1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y|M)$/ {print}' \
    tishkoff.hg38.unsorted.vcf > tishkoff.hg38.canon.body
awk 'BEGIN{OFS="\t"} /^##contig=/ {sub(/<ID=/,"<ID=chr"); print; next} /^#/ {print}' \
    tishkoff.hg38.unsorted.vcf > tishkoff.hg38.canon.header
cat tishkoff.hg38.canon.header tishkoff.hg38.canon.body > tishkoff.hg38.canon.fixed.vcf
# 33,600,472 variants survive (9,359 of the lifted variants landed on
# alt/random/Un contigs and were dropped).
# Step 4: sort + bgzip + tabix.
bcftools sort tishkoff.hg38.canon.fixed.vcf -Oz -m 16G -T /data/tmp/ \
    -o tishkoff180.vcf.gz
tabix -p vcf tishkoff180.vcf.gz
rm tishkoff.hg38.unsorted.vcf tishkoff.hg38.canon.* tishkoff.hg19.chr.vcf.gz
# Final: 346 MB bgzip + 1.6 MB tabix index, 33,600,472 SNPs (autosomes
# + a handful of chrX entries from the PAR regions). 18,425 of the
# 33,618,897 source variants (0.055%) are not represented after lift
# (9,066 failed liftOver, 9,359 mapped to non-canonical contigs).
# Symlinks placed at /gbdb/hg38/varFreqs/tishkoff/ for the tishkoff180
# stanza in trackDb/human/varFreqs.ra.

# varFreqsAll rebuild, 2026-04-30 Claude max
# Regenerate the All Databases Combined track to include Tishkoff180.
# Source count rises from 24 to 25 databases; final bigBed is 35.6 GB
# (37.9 GB on disk before zoom indexes) with 1,169,063,801 records and
# 117 fields (up from 115). Pipeline run in
# /hive/data/genomes/hg38/bed/varFreqs/all/:
# 1. Added /gbdb/hg38/varFreqs/tishkoff/tishkoff180.vcf.gz to files.txt
#    and the Tishkoff180 row to
#    ~/kent/src/hg/makeDb/scripts/varFreqs/databases.tsv.
# 2. Cleared stale merged.vcf.gz / merged.annotated.vcf.gz to force re-merge.
# 3. Ran ./mergeAndAnnotate.sh: per-VCF strip+norm reused cached files for
#    the 24 unchanged DBs; only tishkoff180 was normalized (~5 min).
#    bcftools merge ~25 min; bcftools csq ~35 min.
#    Output: 1,169,064,168 merged variants (up from 1,166,005,346).
#    Tishkoff added ~3.06M sites that were not present in any other DB
#    (most of its 33.6M SNPs were already covered by the bigger DBs).
# 4. Ran python3 vcfToBigBed.py --output-prefix varFreqsAll --threads 8
#    (Phase 1 ~90 min re-extract of all 25 DBs, Phase 2 ~25 min chrom BED
#    build, concat+sort ~45 min).
# 5. bedToBigBed on the sorted BED -> 35.6 GB varFreqsAll.bb (1.169B
#    records, 117 fields).
# 6. Updated varFreqs.ra filterValues.sources and added
#    filterByRange.Tishkoff180AF / filterByRange.Tishkoff180AC.
# Existing /gbdb/hg38/varFreqs/varFreqsAll.bb symlink (pointing at
# /hive/data/genomes/hg38/bed/varFreqs/all/varFreqsAll.bb) was preserved
# and now resolves to the new 35.6 GB build.

# Performance follow-up note (Claude max, 2026-04-30):
# An incremental add currently takes ~10 hours. The biggest wins for a
# future incremental-add workflow are:
#   - Phase 1 of vcfToBigBed.py re-extracts per-DB TSVs for ALL databases
#     every run (~90 min). Skip extraction when the per-DB output dir is
#     already present and source mtime is unchanged. Estimated saving:
#     ~85 min when adding one DB.
#   - bcftools merge re-merges all 25 normalized VCFs (~25 min). For a
#     pure additive change, `bcftools merge old_merged.vcf.gz new.vcf.gz`
#     is much cheaper than re-merging from scratch. Estimated saving:
#     ~20 min.
# We tested a BCSQ cache (csqWithCache.sh, prototype in this directory's
# git history), splitting merged variants into "cached" vs "uncached" via
# bcftools isec and only running csq on the uncached subset. It is
# correct but NOT a win: bcftools csq runs at ~30M records/min and is
# already well-threaded; bcftools isec / annotate over the same volume
# is slower than just running csq again. Don't pursue. The script was
# removed after benchmarking; see this commit's history if you want to
# re-test on different hardware.

##########
# 2026-05-05 Claude max
# GenomeAsia Pilot (gasp + gaspIndel) GRCh37 -> hg38 lift, refs #36642
#
# Lou's QA (note 2026-05-04) showed both subtracks were GRCh37 sites
# served at /gbdb/hg38/, so positions were off by tens of kb to many
# Mb (e.g. rs1050171 at hg19 chr7:55,249,063 instead of hg38
# chr7:55,174,014). Three confirmations: contig lengths in the VCF
# header are GRCh37, every contig record declares assembly=b37, and
# the GATK reference FASTA is human_g1k_v37_decoy.fasta.
#
# Source files (sites-only, bare chromosome names "1"-"22", "X", "Y"):
#   /hive/data/genomes/hg38/bed/varFreqs/ga100k/ga100k.subst.vcf.gz
#       (66,236,516 SNVs, autosomes only)
#   /hive/data/genomes/hg38/bed/varFreqs/ga100k/All.indels.annot.cont_withmaf.vcf.gz
#       (4,415,156 indels, 1-22 + X/Y/MT plus GL*/hs37d5 alt contigs)
# The indel VCF has a malformed #CHROM line (declares FORMAT but has
# no sample columns and no FORMAT field on data lines); the lift
# script strips it on the fly.
#
# Lift script: ~/kent/src/hg/makeDb/scripts/varFreqs/gaspLift.sh
# Recipe is the same as tishkoff180: chr-prefix rename via
# bcftools annotate --rename-chrs, CrossMap.py vcf with
# hg19ToHg38.over.chain.gz, post-filter to canonical chr1-22/X/Y/M,
# fix contig= header (CrossMap drops the chr prefix from contig IDs),
# bcftools sort, bgzip, tabix.
mkdir -p /hive/data/genomes/hg38/bed/varFreqs/ga100k/lift
cd /hive/data/genomes/hg38/bed/varFreqs/ga100k/lift
# chrRename.txt is in this directory (1 -> chr1 ... MT -> chrM).
~/kent/src/hg/makeDb/scripts/varFreqs/gaspLift.sh \
    /hive/data/genomes/hg38/bed/varFreqs/ga100k/ga100k.subst.vcf.gz \
    ./ga100k.subst.hg38
~/kent/src/hg/makeDb/scripts/varFreqs/gaspLift.sh \
    /hive/data/genomes/hg38/bed/varFreqs/ga100k/All.indels.annot.cont_withmaf.vcf.gz \
    ./ga100k.indels.hg38
# Lift accounting is logged in subst.lift.log and indel.lift.log.
# After lift, /gbdb symlinks point at the lifted .vcf.gz / .tbi pair.
# trackDb bigDataUrl for gaspIndel was renamed to ga100k.indels.vcf.gz
# (the old All.indels.annot.cont_withmaf.vcf.gz file name was a
# verbatim copy of the GenomeAsia download name).

##########
# 2026-05-06 Claude max
# varFreqsAll rebuild: 5 vcfToBigBed.py fixes + add NPM (Singapore SG10K_Health),
# refs #36642 notes 49, 51, 53, 55.

# 1. Moved scripts from /hive/.../all/ into kent so the build is reproducible
#    from a fresh checkout (Lou's note 49):
#      ~/kent/src/hg/makeDb/scripts/varFreqs/vcfToBigBed.py   (driver)
#      ~/kent/src/hg/makeDb/scripts/varFreqs/mergeAndAnnotate.sh (merge+csq)
#    Original /hive/.../all/ paths are now symlinks into kent.

# 2. vcfToBigBed.py fixes in this round:
#    - normalize_consequence(): bcftools csq emits "&"-joined compounds
#      (e.g. "stop_gained&frameshift") which exact-match-failed the old
#      8-bucket consequence filter and orphaned ~8.5 M records. The
#      normalizer rewrites "&" to "," so a single record can match multiple
#      filter buckets, and appends ",others" to any token list with no named
#      filter present so nothing is orphaned.  trackdb gains 4 new buckets
#      (3_prime_utr, 5_prime_utr, non_coding, others) and switches to
#      filterType.consequence multipleListOr.
#    - Source-attribution bug: the old check only inspected the unified
#      AC/AF slot, so AllOfUs (which ships only per-population fields)
#      attributed to ZERO of its 67.5 M variants — ~43 M phantom rows in
#      the previous bigBed had an empty "sources" column. The fix scans
#      per-population slots before declaring "no data".
#    - parse_bcsq() now returns "" instead of "." for aaChange/dnaChange on
#      non-coding variants, so the mouseOver/detail page renders a clean
#      blank line instead of a stray dot.
#    - maxAF format: "{:.6g}" -> "{:.6f}" so very small AFs print as
#      "0.000003" instead of "3.31347e-06".
#    - autoSql `table varFreqs` -> `table varFreqsAll` (matches the bigBed
#      filename; required for hgIntegrator to wire up correctly).

# 3. NPM Singapore (SG10K_Health) added to databases.tsv + files.txt +
#    populations.tsv (SgChinese / SgMalay / SgIndian ancestry groups) +
#    trackDb filter UI.  The individual `npm` subtrack still has
#    tableBrowser off (license), but its data is folded into varFreqsAll
#    same as for finngen / kova / mgrb / swefreq / tishkoff180 (precedent).

cd /hive/data/genomes/hg38/bed/varFreqs/all/
# Force re-merge + re-csq (existing per-VCF normalize cache is reused).
rm -f merged.vcf.gz merged.vcf.gz.tbi merged.annotated.vcf.gz \
    merged.annotated.vcf.gz.tbi
rm -rf extracted beds varFreqsAll.bed
./mergeAndAnnotate.sh
python3 vcfToBigBed.py --output-prefix varFreqsAll --threads 8
# Build is in-place: /gbdb/hg38/varFreqs/varFreqsAll.bb stays symlinked
# at /hive/.../all/varFreqsAll.bb, which is overwritten at the end of
# the bedToBigBed step.

