# for emacs: -*- mode: sh; -*-

# dbSnpNNN ("dbSNP 2.0") bigDbSnp tracks for hg38 / GRCh38 and hg19 / GRCh37

##############################################################################
# dbSnp152: dbSNP build 152 (DONE 9/18/19 angie)

    topDir=/hive/data/outside/dbSNP/152
    mkdir -p $topDir/json
    cd $topDir/json
    wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
    md5sum -c CHECKSUMS
#refsnp-chr10.json.bz2: OK
#...
#refsnp-withdrawn.json.bz2: OK

    # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
    # by sampling first 10,000 variants on an arbitrary chrom:
    # Written in bash (plain assignments, for/do/done) to match the rest of this doc;
    # the csh forms "set var = ..." and "foreach ... end" do not work in bash.
    # Each path is a jsonQuery path expression; -countUniq reports how often each distinct
    # value occurs in the sampled records, and sort -nr lists the most frequent first.
    assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
    rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
    proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
    freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
    for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
      echo "$jPath"
      bzcat refsnp-chr3.json.bz2 \
      | head -10000 \
      | jsonQuery -countUniq -verbose=2 stdin "$jPath" stdout \
      | sort -nr
      echo ""
    done
    # Assemblies:
#     10229 "GRCh38.p12"
#     10111 "GRCh37.p13"
    # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
#    147701 "SO:0001627"
#     26809 "SO:0002153"
#     19013 "SO:0002152"
#      8334 "SO:0001624"
#      3082 "SO:0001986"
#      2657 "SO:0001580"
#      1772 "SO:0001619"
#      1741 "SO:0001987"
#       833 "SO:0001623"
#        30 "SO:0001575"
#        10 "SO:0001574"
#         4 "SO:0001590"
    # Protein SO terms -- ditto for soTerm.[ch]:
#       784 "SO:0001819"
#       713 "SO:0001583"
#         6 "SO:0001821"
#         5 "SO:0001587"
#         2 "SO:0000865"
    # Made sure all those are in hg/{inc,lib}/soTerm.[ch]

    # Projects reporting allele counts/frequencies:
#     18130 "GnomAD"
#     17473 "1000Genomes"
#     17376 "TOPMED"
#     15952 "TWINSUK"
#     15952 "ALSPAC"
#     15188 "Estonian"
#       555 "GnomAD_exomes"
#       473 "GoESP"
#       468 "ExAC"
    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,GnomAD,GoESP,ALSPAC,TWINSUK,Estonian

    cd $topDir
    # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name.
    hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
    | tawk '{print $1, "GRCh38.p12", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \
      > refSeqToUcsc.tab
    hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
    | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \
      >> refSeqToUcsc.tab

    # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37,
    # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings.
    refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions
    grep -v ^# \
      $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,38_GRCh38.p12}/*_assembly_structure/all_alt_scaffold_placement.txt \
    | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \
    | sort -k 1,1 -k2n,2n \
    | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \
      > equivRegions.tab
    # Add PARs:
    grep -w PAR \
      $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \
    | sort \
    | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \
    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
#PAR#1   NC_000023.10:60000:2699520
#PAR#1   NC_000024.9:10000:2649520
#PAR#2   NC_000023.10:154931043:155260560
#PAR#2   NC_000024.9:59034049:59363566
    echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
    echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab
    grep -w PAR \
      $refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \
    | sort \
    | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
#PAR#1   NC_000023.11:10000:2781479
#PAR#1   NC_000024.10:10000:2781479
#PAR#2   NC_000023.11:155701382:156030895
#PAR#2   NC_000024.10:56887902:57217415
    echo -e "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
    echo -e "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab


    # Run doBigDbSnp.pl...
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \
      >& do.log &
    tail -f do.log
# *** All done !  (through the 'install' step)  Elapsed time: 278m36s
# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17

    # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes
    topDir=/hive/data/outside/dbSNP/152
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug
# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
    cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
    # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process):
    rm split
    ln -s ../split split
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
      -buildDir=`pwd` -continue convert -stop install \
      >& do.log &
    tail -f do.log
# *** All done !  (through the 'install' step)  Elapsed time: 449m6s
# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17

    # 10/8/19: tally, per project, how many variants have freq counts.
    # Field 4 of the details file is split on commas; slot i belongs to the i-th project in
    # the list below (same order as freqSourceOrder set earlier for this build).  A project
    # name is emitted once for every line whose slot is Perl-true, then names are counted.
    cut -f 4 dbSnp152Details.tab \
    | perl -wne 'BEGIN { @projects = qw(1000Genomes GnomAD_exomes TOPMED ExAC GnomAD
                                         GoESP ALSPAC TWINSUK Estonian); }
        chomp; next unless $_;
        @slots = split ",";
        for $i (0 .. $#projects) { print "$projects[$i]\n" if $slots[$i]; }' \
    | sort | uniq -c | sort -nr
#437624857 TOPMED
#234158623 GnomAD
#84743526 1000Genomes
#44887599 TWINSUK
#44887599 ALSPAC
#31397792 Estonian
#11721224 GnomAD_exomes
#8854021 ExAC
#1973787 GoESP

    # 10/11/19: count up how many instances of each type of ucscNote:
    # Field 15 is split on commas into one note per line before counting.
    # NOTE(review): "g" is not defined in this doc -- presumably a personal alias for grep,
    # so "g ." would drop the empty lines produced by rows with no notes; confirm.
    cut -f 15 hg19.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
#   10680 altIsAmbiguous
#    4808 classMismatch
#  409132 clinvar
#  106941 clusterError
#12757487 commonAll
#18901486 commonSome
#  823424 diffMajor
#    7635 freqIsAmbiguous
#   23027 freqNotRefAlt
#  555144 multiMap
#99618012 overlapDiffClass
#14790469 overlapSameClass
#     101 refIsAmbiguous
# 2933684 refIsMinor
#  150892 refIsRare
#   45618 refIsSingleton
#       4 refMismatch
# 3761191 revStrand
    cut -f 15 hg38.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
#    10807 altIsAmbiguous
#     5103 classMismatch
#   408665 clinvar
#    94310 clusterError
# 13027110 commonAll
# 19258751 commonSome
#   836327 diffMajor
#     7736 freqIsAmbiguous
#    36306 freqNotRefAlt
#   130175 multiMap
#102260850 overlapDiffClass
# 15075710 overlapSameClass
#      110 refIsAmbiguous
#  3033691 refIsMinor
#   189809 refIsRare
#    63804 refIsSingleton
#       33 refMismatch
#  4439534 revStrand

    # 10/18/19: add subset tracks
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
       -buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log &
    tail -f subsets.log


##############################################################################
# dbSnp153: dbSNP build 153 (DONE 11/25/19 angie)

    topDir=/hive/data/outside/dbSNP/153
    mkdir -p $topDir/json
    cd $topDir/json
    wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
    md5sum -c CHECKSUMS
#refsnp-chr10.json.bz2: OK
#...
#refsnp-withdrawn.json.bz2: OK

    # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
    # by sampling first 10,000 variants on an arbitrary chrom:
    assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
    rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
    proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
    freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
    for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
      echo "$jPath"
      bzcat refsnp-chr3.json.bz2 \
      | head -10000 \
      | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \
      | sort -nr
      echo ""
    done
    # Assemblies:
#     10229 "GRCh38.p12"
#     10111 "GRCh37.p13"

    # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
#    144288 "SO:0001627"
#     25829 "SO:0002153"
#     19729 "SO:0002152"
#      8506 "SO:0001624"
#      3112 "SO:0001986"
#      2648 "SO:0001580"
#      1769 "SO:0001619"
#      1712 "SO:0001987"
#       878 "SO:0001623"
#        30 "SO:0001575"
#        10 "SO:0001574"
#         4 "SO:0001590"
    # Protein SO terms -- ditto for soTerm.[ch]:
#       770 "SO:0001819"
#       726 "SO:0001583"
#         6 "SO:0001821"
#         5 "SO:0001587"
#         2 "SO:0000865"
    # Made sure all those are in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)

    # Projects reporting allele counts/frequencies:
#     17493 "1000Genomes"
#     17392 "TOPMED"
#     16619 "GnomAD"
#     16306 "NorthernSweden"
#     15970 "TWINSUK"
#     15970 "ALSPAC"
#     15202 "Estonian"
#     13096 "Vietnamese"
#      1844 "PAGE_STUDY"
#       473 "GoESP"
#       468 "ExAC"
#       458 "GnomAD_exomes"

    # This time the JSON downloads include a file frequency_studies.json that describes each study.
    # Will be useful for making a details page, but some descriptions are just study names.
    # Get total_count values from refsnp-chr3.json.bz2, put 1000Genomes first, then order by
    # decreasing total_count:
    # 1000Genomes:  5008
    # GnomAD_exomes: 251006
    # TOPMED: 125568
    # ExAC: 121234
    # PAGE_STUDY: 78694
    # GnomAD: 31348
    # GoESP: 12494
    # Estonian: 4480
    # ALSPAC: 3854
    # TWINSUK: 3708
    # NorthernSweden: 600
    # Vietnamese: 212
    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
    # Hmm, the PAGE study is a population genetics study.  It excludes caucasians:
    # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf
    # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216),
    #  African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940),
    #  Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage,
    #  as well as participants who did not identify with any of the available options."
    # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost
    # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first.

    # Reuse assembly sequence mapping files from b152 since the assemblies are the same.
    cd $topDir
    cp ../152/refSeqToUcsc.tab .
    cp ../152/equivRegions.tab .

    # Run doBigDbSnp.pl (first with -debug to make runDir):
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -debug
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install \
      >& do.log &
    tail -f do.log
# *** All done !  (through the 'install' step)  Elapsed time: 2095m35s

    # 9/13/19: Now that checkDbSnp has been added to doBigDbSnp.pl, re-run from the check
    # stage onward (but don't cleanup just yet in case we need to debug files).
    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
      -continue check -stop install \
      >& check.log &
    tail -f check.log

    # 9/18/19: re-run from dbSnpJsonToTab onward after adding several more ucscNotes.
    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir \
      -buildDir=`pwd` -continue convert -stop install \
      >& redo.log &
    tail -f redo.log
# *** All done !  (through the 'install' step)  Elapsed time: 263m59s
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07

    #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding
    #*** the jobs and 'wait' hide errors?
    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
      -continue check -stop install \
      >& check.log &
    tail -f check.log

    # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
    # 10/30/19: and again after adding new ucscNotes (#23283).
    # 11/4/19: and again after finding that refIsMinor & diffMajor could be appended multiple times
    # 11/7/19: and again after finding that some cases of freqNotRefAlt are VCF normalization probs
    # 11/8/19: and again after adding badCoords.bed and warnings output files
    # 11/15/19: and again after adding ucscNotes freqIncomplete, freqNotMapped
    # 11/24/19: and again: support for alt alleles with different expansion ranges (rs782394990)
    topDir=/hive/data/outside/dbSNP/153
    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
    # Run doBigDbSnp.pl (first with -debug to make runDir):
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-24
    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-24
    # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
    rmdir split
    ln -s ../bigDbSnp.2019-08-07/split split
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
      -buildDir=`pwd` -continue convert -stop install \
      >& do.log &
    tail -f do.log
# *** All done !  (through the 'install' step)  Elapsed time: 579m31s
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-24

    wc -l dbSnp153Errors.tab
#0 dbSnp153Errors.tab
    # Yay!  Now we don't need to document any dropped variants.  :)
    awk '{print $1, $2, $3;}' dbSnp153Warnings.tab  | sort | uniq -c
#   5339 Frequency report not
#   3852 Incomplete freq data
#  65257 Mismatching SPDI del
# 197684 Mismatching SPDI pos

    # Statistics:
    bigBedInfo -extraIndex hg19.dbSnp153.bb
#extraIndexCount: 1
#    name (field 3) with 683142960 items
#itemCount: 683,142,960
#chromCount: 297
#basesCovered: 823,963,745
#meanDepth (of bases covered): 1.279107
#minDepth: 1.000000
#maxDepth: 321.000000
#std of depth: 1.993272
    bigBedInfo -extraIndex hg38.dbSnp153.bb
#extraIndexCount: 1
#    name (field 3) with 702599342 items
#itemCount: 702,599,342
#chromCount: 575
#basesCovered: 845,712,321
#meanDepth (of bases covered): 1.279667
#minDepth: 1.000000
#maxDepth: 321.000000
#std of depth: 1.990520
    bigBedInfo -extraIndex hg19.dbSnp153BadCoords.bb
#    name (field 3) with 121484 items
#itemCount: 121,484
#chromCount: 191
#basesCovered: 750,840
#meanDepth (of bases covered): 2.603311
#minDepth: 1.000000
#maxDepth: 150.000000
#std of depth: 4.266904
    bigBedInfo -extraIndex hg38.dbSnp153BadCoords.bb
#extraIndexCount: 1
#    name (field 3) with 149475 items
#itemCount: 149,475
#chromCount: 418
#basesCovered: 986,919
#meanDepth (of bases covered): 2.867679
#minDepth: 1.000000
#maxDepth: 119.000000
#std of depth: 4.568874

    # count up how many variants (mappings) have freq counts for each project
    # The perl script splits field 4 on commas, bumps a per-position counter for every
    # Perl-true entry, and finally prints one "count name" line per project, in
    # @freqSourceOrder order (positions with no hits print 0).
    # NOTE(review): the recorded output below is in descending-count order rather than
    # @freqSourceOrder order -- presumably it was piped through sort -nr; confirm.
    cut -f 4 dbSnp153Details.tab \
    | perl -we '
        my @freqSourceOrder = ("1000Genomes", "GnomAD_exomes", "TOPMED", "ExAC", "PAGE_STUDY",
            "GnomAD", "GoESP", "Estonian", "ALSPAC", "TWINSUK", "NorthernSweden", "Vietnamese");
        my @counts = ();
        while (<>) {
            chomp; next unless $_; @w = split ",";
            for (my $i = 0;  $i < @w;  $i++) {
                if ($w[$i]) { $counts[$i]++; }
            }
        }
        for (my $i = 0;  $i < @freqSourceOrder;  $i++) {
            print sprintf("%10d %s", $counts[$i] || 0, $freqSourceOrder[$i]) . "\n";
        }'
# 437625304 TOPMED
# 211192464 GnomAD
#  84744519 1000Genomes
#  44888412 TWINSUK
#  44888412 ALSPAC
#  31397949 Estonian
#  16351704 NorthernSweden
#  12283947 GnomAD_exomes
#  10004068 Vietnamese
#   8854135 ExAC
#   1973844 GoESP
#   1323049 PAGE_STUDY

    # count up how many instances of each type of ucscNote:
    time cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
#    10755 altIsAmbiguous
#     5998 classMismatch
#   454678 clinvar
#   143864 clinvarBenign
#     7932 clinvarConflicting
#    96242 clinvarPathogenic
#   114826 clusterError
# 12184521 commonAll
# 20541190 commonSome
#  1377831 diffMajor
#     3922 freqIncomplete
#     7656 freqIsAmbiguous
#     2685 freqNotMapped
#    17694 freqNotRefAlt
#   562180 multiMap
#   114095 otherMapErr
#107015341 overlapDiffClass
# 16915239 overlapSameClass
#662601770 rareAll
#670958439 rareSome
#      101 refIsAmbiguous
#  3272116 refIsMinor
#   136547 refIsRare
#    37832 refIsSingleton
#        4 refMismatch
#  3813702 revStrand
#real    34m57.796s
#user    47m49.283s
#sys     4m29.442s

    # Check count of rs's with at least one bad mapping:
    grep otherMapErr hg19.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l
#55454

    time cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
#    10888 altIsAmbiguous
#     6216 classMismatch
#   453996 clinvar
#   143736 clinvarBenign
#     7950 clinvarConflicting
#    95262 clinvarPathogenic
#   128306 clusterError
# 12438655 commonAll
# 20902944 commonSome
#  1399109 diffMajor
#     4673 freqIncomplete
#     7756 freqIsAmbiguous
#     6590 freqNotMapped
#    32170 freqNotRefAlt
#   132123 multiMap
#   204219 otherMapErr
#110007682 overlapDiffClass
# 17291289 overlapSameClass
#681696398 rareAll
#690160687 rareSome
#      111 refIsAmbiguous
#  3360435 refIsMinor
#   160827 refIsRare
#    50927 refIsSingleton
#       33 refMismatch
#  4532511 revStrand
#real    36m36.972s
#user    49m41.817s
#sys     4m43.806s

    # Check count of rs's with at least one bad mapping:
    grep otherMapErr hg38.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l
#86636


# Partial redo 2024-03-08 similar to Galt's dbSnp155 partial redo 2023-03-26 below, to fix the same
# comma-parsing problem with clinicalSignificance -- see RM #33070.  Finished 2024-03-14.
    topDir=/hive/data/outside/dbSNP/153
    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
    # Run doBigDbSnp.pl (first with -debug to make runDir):
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
        -assemblyList=GRCh37.p13,GRCh38.p12 -debug

# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08

    cd /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08

    # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
    rmdir split
    ln -s ../bigDbSnp.2019-08-07/split split
    rmdir splitProcessed
    ln -s ../bigDbSnp.2019-08-07/splitProcessed splitProcessed
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
        -assemblyList=GRCh37.p13,GRCh38.p12 \
        -buildDir=`pwd` -continue convert -stop install \
      >& do.log &
    tail -f do.log
    # It failed at the fixHg19ChrM step because the script expects chrom to be "chrMT" but instead
    # it's "NC_012920.1".  Maybe things changed after b153?  Anyway, run these modified commands
    # in place of doFixHg19ChrM.sh:
# For hg19, liftOver chrMT annotations to hg19 chrM.
sed -e 's/NC_012920 /NC_012920.1 /' \
  /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \
  > hg19.mitoLiftover.chain
# For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
# doesn't deal with 0-base items.
# already done: mv hg19.dbSnp153.bigDbSnp hg19.preChrMFix.dbSnp153.bigDbSnp
time (grep ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 += 1; print;}' \
      | liftOver -tab -bedPlus=3 stdin \
          hg19.mitoLiftover.chain stdout chrM.unmapped \
      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 -= 1; print;}' \
      | sort -k2n,2n \
        > hg19.chrM.dbSnp153.bigDbSnp)
#real    4m20.207s
wc -l hg19.chrM.dbSnp153.bigDbSnp chrM.unmapped
#  3312 hg19.chrM.dbSnp153.bigDbSnp
#    14 chrM.unmapped
# 7 "Partially deleted in new" items
time grep -v ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
     | sort --merge -k1,1 -k2n,2n - hg19.chrM.dbSnp153.bigDbSnp \
       > hg19.dbSnp153.bigDbSnp
#real    5m21.835s
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
        -assemblyList=GRCh37.p13,GRCh38.p12 \
        -buildDir=`pwd` -continue check -stop install \
      >& do2.log &
    tail -f do2.log
# *** All done !  (through the 'install' step)  Elapsed time: 177m56s
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08

    # Ah, but the compare-bot, Gerardo and Lou found that dbSnp153Common lost some items --
    # it turned out that no indels made it into dbSnp153Common due to a change for 155 in
    # dbSnpJsonToTab.c that broke something else for 153.  Run again (but repeat the manual
    # version of the fixHg19ChrM step).
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
        -assemblyList=GRCh37.p13,GRCh38.p12 \
        -buildDir=`pwd` -continue convert -stop mergeChroms \
      >& do.log &
    tail -f do.log
# *** All done !  (through the 'mergeChroms' step)  Elapsed time: 164m12s
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08

# For hg19, liftOver chrMT annotations to hg19 chrM.
sed -e 's/NC_012920 /NC_012920.1 /' \
  /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \
  > hg19.mitoLiftover.chain
# For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
# doesn't deal with 0-base items.
mv hg19.dbSnp153.bigDbSnp hg19.preChrMFix.dbSnp153.bigDbSnp
time (grep ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 += 1; print;}' \
      | liftOver -tab -bedPlus=3 stdin \
          hg19.mitoLiftover.chain stdout chrM.unmapped \
      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 -= 1; print;}' \
      | sort -k2n,2n \
        > hg19.chrM.dbSnp153.bigDbSnp)
#real    4m44.075s
wc -l hg19.chrM.dbSnp153.bigDbSnp chrM.unmapped
#  3312 hg19.chrM.dbSnp153.bigDbSnp
#    14 chrM.unmapped
# 7 "Partially deleted in new" items
time grep -v ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
     | sort --merge -k1,1 -k2n,2n - hg19.chrM.dbSnp153.bigDbSnp \
       > hg19.dbSnp153.bigDbSnp
#real    5m58.364s
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
        -assemblyList=GRCh37.p13,GRCh38.p12 \
        -buildDir=`pwd` -continue check -stop install \
      >& do2.log &
    tail -f do2.log
# *** All done !  (through the 'install' step)  Elapsed time: 176m58s
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08


##############################################################################
# dbSnp155: dbSNP build 155 (IN-PROGRESS 11/25/19 galt)

    topDir=/hive/data/outside/dbSNP/155
    mkdir -p $topDir/json
    cd $topDir/json

    wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
    md5sum -c CHECKSUMS
#refsnp-chr10.json.bz2: OK
#...
#refsnp-withdrawn.json.bz2: OK



    # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
    # by sampling first 10,000 variants on an arbitrary chrom:
    assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
    rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
    proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
    freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
    for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
      echo "$jPath"
      bzcat refsnp-chr3.json.bz2 \
      | head -10000 \
      | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \
      | sort -nr
      echo ""
    done
    # Assemblies:
#     10232 "GRCh38.p13"
#     10111 "GRCh37.p13"

    # RNA SO terms -- make sure all of these appear in soTerm.[ch]:

#    144288 "SO:0001627"
#     25829 "SO:0002153"
#     19729 "SO:0002152"
#      8506 "SO:0001624" 
#      3112 "SO:0001986"
#      2648 "SO:0001580"
#      1769 "SO:0001619"
#      1712 "SO:0001987"
#       878 "SO:0001623"
#        30 "SO:0001575"
#        10 "SO:0001574"
#         4 "SO:0001590"

    # Made sure all RNA SO terms are found in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)


    # Protein SO terms -- make sure all of these appear in soTerm.[ch]:

#       770 "SO:0001819"
#       726 "SO:0001583"
#         6 "SO:0001821"
#         5 "SO:0001587"
#         2 "SO:0000865"

    # Made sure all Protein SO terms are found in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)


    # Projects reporting allele counts/frequencies:

#     34713 "TOPMED"
#     22432 "dbGaP_PopFreq"
#     21128 "KOREAN"
#     17553 "SGDP_PRJ"
#     17470 "1000Genomes"
#     17069 "Qatari"
#     16284 "NorthernSweden"
#     16074 "Siberian"
#     15948 "TWINSUK"
#     15948 "TOMMO"
#     15948 "ALSPAC"
#     15408 "GENOME_DK"
#     15356 "GnomAD"
#     15340 "GoNL"
#     15198 "Estonian"
#     13086 "Vietnamese"
#     12000 "Korea1K"
#     11238 "HapMap"
#      3406 "PRJEB36033"
#      2454 "HGDP_Stanford"
#      2018 "Daghestan"
#      1846 "PAGE_STUDY"
#      1204 "Chileans"
#      1134 "MGP"
#      1116 "PRJEB37584"
#       474 "GoESP"
#       468 "ExAC"
#       460 "GnomAD_exomes"
#       318 "FINRISK"
#        46 "PharmGKB"
#         6 "PRJEB37766"

    # This time the JSON downloads include a file frequency_studies.json that describes each study.
    # Will be useful for making a details page, but some descriptions are just study names.

    # Got total_count values from refsnp-chr3.json.bz2 above, put ALFA 2.0 (dbGaP_PopFreq) first, then order by

    # decreasing total_count:

#     22432 "dbGaP_PopFreq"
#     34713 "TOPMED"
#     21128 "KOREAN"
#     17553 "SGDP_PRJ"
#     17470 "1000Genomes"
#     17069 "Qatari"
#     16284 "NorthernSweden"
#     16074 "Siberian"
#     15948 "TWINSUK"
#     15948 "TOMMO"
#     15948 "ALSPAC"
#     15408 "GENOME_DK"
#     15356 "GnomAD"
#     15340 "GoNL"
#     15198 "Estonian"
#     13086 "Vietnamese"
#     12000 "Korea1K"
#     11238 "HapMap"
#      3406 "PRJEB36033"
#      2454 "HGDP_Stanford"
#      2018 "Daghestan"
#      1846 "PAGE_STUDY"
#      1204 "Chileans"
#      1134 "MGP"
#      1116 "PRJEB37584"
#       474 "GoESP"
#       468 "ExAC"
#       460 "GnomAD_exomes"
#       318 "FINRISK"
#        46 "PharmGKB"
#         6 "PRJEB37766"



# NOTE: this freqSourceOrder puts ALFA2 (aka dbGaP_PopFreq) first, as the primary study.  That was later abandoned because the resulting common set was much smaller; we went back to putting 1000Genomes first.
    freqSourceOrder=dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,1000Genomes,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766

    # Hmm, the PAGE study is a population genetics study.  It excludes caucasians:
    # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf
    # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216),
    #  African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940),
    #  Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage,
    #  as well as participants who did not identify with any of the available options."
    # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost
    # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first.

    cd $topDir

    # dbSnp has changed since v152, it now uses patch13 too, so "GRCh38.p13" instead of "GRCh38.p12"

    # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name.
    hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
    | tawk '{print $1, "GRCh38.p13", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \
      > refSeqToUcsc.tab
    hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
    | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \
      >> refSeqToUcsc.tab

    # has 45 more alts and fixes for hg38 just as expected for patch13.
    

    #make equivRegions.tab

    # hg38 patch13 RefSeq assembly accession: GCF_000001405.39 (latest)

    # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37,
    # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings.
    refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions
    grep -v ^# \
      $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,39_GRCh38.p13}/*_assembly_structure/all_alt_scaffold_placement.txt \
    | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \
    | sort -k 1,1 -k2n,2n \
    | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \
      > equivRegions.tab

    # Add PARs:
    grep -w PAR \
      $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \
    | sort \
    | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \
    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'

#PAR#1   NC_000023.10:60000:2699520
#PAR#1   NC_000024.9:10000:2649520
#PAR#2   NC_000023.10:154931043:155260560
#PAR#2   NC_000024.9:59034049:59363566
    echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
    echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab

    grep -w PAR \
      $refseqAssemblies/GCF_000001405.39_GRCh38.p13/*_assembly_regions.txt \
    | sort \
    | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'

#PAR#1   NC_000023.11:10000:2781479
#PAR#1   NC_000024.10:10000:2781479
#PAR#2   NC_000023.11:155701382:156030895
#PAR#2   NC_000024.10:56887902:57217415
    # Append the GRCh38 PAR#1 and PAR#2 pairs (X region, then Y region, copied from the
    # output pasted above) to equivRegions.tab as two tab-separated columns.
    # printf is used instead of echo -e because echo's handling of "\t" depends on the
    # shell and its options; an echo without -e wrote a literal "\t" and broke the
    # convert step (see the note further down).
    printf '%s\t%s\n' "NC_000023.11:10000:2781479" "NC_000024.10:10000:2781479" >> equivRegions.tab
    printf '%s\t%s\n' "NC_000023.11:155701382:156030895" "NC_000024.10:56887902:57217415" >> equivRegions.tab

    # Patch doBigDbSnp.pl to update the default hg38 assembly to p13.
    vi $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl
    # around line 47, change to GRCh38.p13 and save and commit and push this new default.
    -my $assemblyList = 'GRCh37.p13,GRCh38.p12';
    +my $assemblyList = 'GRCh37.p13,GRCh38.p13';


    # Run doBigDbSnp.pl (first with -debug to make runDir).
    # FYI the system is currently using ku cluster to run parasol jobs.

    # -debug pass first (makes the run directory, per the note above); the script
    # reports the directory it used:
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug
# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
    cd /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
    # Real run with -stop install, in the background; stdout and stderr both go to
    # do.log, which is then followed with tail -f.
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -stop install \
      >& do.log &
    tail -f do.log


The convert step choked because the step above that creates equivRegions.tab originally used echo without -e,
which meant that the "\t" was NOT converted to an actual tab. Must re-run.

Re-ran from the start, but it died on step fixHg19ChrM, which was looking for NC_012920; that sequence has been renamed to just chrMT in the dbSNP data for hg19.
Fixed doFixHg19ChrM.sh by tweaking doBigDbSnp.pl, which creates it.

Even though perhaps $freqSourceOrder is only needed for the convert step,
the script requires it.

Lame that after starting up screen, it lost my local variables,
so I had to redefine topDir and freqSourceOrder after launching screen program for running background jobs.

# resuming
    # Resume in the existing build directory: -buildDir is set to the current directory,
    # -continue check picks up at the 'check' step and -stop install ends with 'install'.
    # stdout and stderr go to check.log in the background; tail -f follows it.
    cd /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder\
      -buildDir=`pwd` -continue check -stop install \
      >& check.log &
    tail -f check.log

 *** All done !  (through the 'install' step)  Elapsed time: 408m55s
 *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30

# GALT 2022-05-14

    # FYI
    # split   step runs on hgwdev
    # convert step runs on ku  which has 32 machines that have 32 CPUs each.
    # freqSourceOrder changed: back to using 1000Genomes as the primary study, instead of ALFA2 dbGaP_PopFreq

    # Top-level dbSNP 155 directory, passed as the first argument to doBigDbSnp.pl.
    topDir=/hive/data/outside/dbSNP/155
    # After trying ALFA2 as the primary study, it did not have enough SNPs, so I am going back to using 1000Genomes as the primary study.
    # Comma-separated frequency study names, with 1000Genomes listed first.
    freqSourceOrder=1000Genomes,dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766
    # Run doBigDbSnp.pl (first with -debug to make runDir):
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug

# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14


    cd /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14
    # Link to ../bigDbSnp.2021-09-30/split, -continue convert to avoid re-splitting (the slowest part of the process):
    # rmdir only succeeds on empty directories, so split and splitProcessed must be empty
    # here (presumably just created by the -debug run -- TODO confirm) before being
    # replaced by symlinks to the 2021-09-30 results.
    rmdir split
    ln -s ../bigDbSnp.2021-09-30/split split
    rmdir splitProcessed
    ln -s ../bigDbSnp.2021-09-30/splitProcessed splitProcessed
    # Continue at the 'convert' step in this directory, stopping with 'install';
    # stdout and stderr go to do.log in the background, followed with tail -f.
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
      -buildDir=`pwd` -continue convert -stop install \
      >& do.log &
    tail -f do.log
    # crashed on the mergeToChrom step

    # NOTICE that on mergeToChrom it uses /dev/shm/ virtual memory disk for temp files,
    # and it can run out of space, and if you have to stop and restart it,
    # be sure to clean it out properly like this:
# para stop  # seems to flush results too.
# para freeBatch  # should also fix any sick batches and machines.
#
# TO CLEAN:
# rm -fr /dev/shm/dbSnpMergeSortBed.*/
# rm -fr /dev/shm/dbSnpMergeSort.*/
#
# Note that it also overflowed the relatively small space in /scratch/tmp/ which
# is used by sort in the sort command.  So I had to drastically reduce the maximum
# number of simultaneous jobs, all the way down to just 6 jobs!
# added -maxJob=6 to the para make jobList command in doMergeToChrom.sh
/parasol/bin/para make -maxJob=6 jobList
# After cleaning out the temp files in /dev/shm/ and running para stop and para freeBatch,
# I re-ran doMergeToChrom.sh manually.

    # Continue the pipeline at the 'mergeChroms' step, stopping with 'install';
    # output is logged to do2.log.
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
      -buildDir=`pwd` -continue mergeChroms -stop install \
      >& do2.log &
    tail -f do2.log

# step bigBed choked on hg19 because of a subtle bug in resSizes counting which is now fixed.
    # Continue again at the 'bigBed' step after that fix; output is logged to do3.log.
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
      -buildDir=`pwd` -continue bigBed -stop install \
      >& do3.log &
    tail -f do3.log

 *** All done !  (through the 'install' step)  Elapsed time: 326m34s
 *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14

    # Count the lines in dbSnp155Errors.tab (result pasted below).
    wc -l dbSnp155Errors.tab

#13318 dbSnp155Errors.tab
# Nearly all of these "Errors" occur because a small study called 'ChromosomeY' was not picked up
# in the scan of the first 10000 lines of chrom 3 that we used above. However, this study is not big enough to be important.
1011954 NOSEQ   freqSourceOrder does not contains 'ChromosomeY'

I copied the track description page from 153 and updated the content:
src/hg/makeDb/trackDb/human/dbSnp155Composite.html

Informally, I made a couple of scripts in here to help update the studies list and the keywords counts sections of the description page.
/hive/data/outside/dbSNP/155/json/



redoing for 2023-03-15
   # re-do since commas in clinicalSignificance needed htmlencoding so main utility was updated.

    # Top-level dbSNP 155 directory, passed as the first argument to doBigDbSnp.pl.
    topDir=/hive/data/outside/dbSNP/155
    # After trying ALFA2 as the primary study, it did not have enough SNPs, so I am going back to using 1000Genomes as the primary study.
    # Comma-separated frequency study names, with 1000Genomes listed first.
    freqSourceOrder=1000Genomes,dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766
    # Run doBigDbSnp.pl (first with -debug to make runDir):
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug

# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-15

    cd /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-15

    # Link to ../bigDbSnp.2021-09-30/split, -continue convert to avoid re-splitting (the slowest part of the process):
    # rmdir only succeeds on empty directories, so split and splitProcessed must be empty
    # here (presumably just created by the -debug run -- TODO confirm) before being
    # replaced by symlinks to the 2021-09-30 results.
    rmdir split
    ln -s ../bigDbSnp.2021-09-30/split split
    rmdir splitProcessed
    ln -s ../bigDbSnp.2021-09-30/splitProcessed splitProcessed
    # Continue at the 'convert' step in this directory, stopping with 'install';
    # stdout and stderr go to do.log in the background, followed with tail -f.
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
      -buildDir=`pwd` -continue convert -stop install \
      >& do.log &
    tail -f do.log

 # convert step runs on ku cluster

   ssh ku

[ku.gi.ucsc.edu:~> parasol list batches
#user     run   wait   done crash pri max cpu  ram  plan min batch
galt      461   9755    334     0  10  -1   1  8.0g  496  10 /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-15/run.convert/



redoing for 2023-03-26
     # re-do since the clinicalSignificance fix had to be re-done in a completely different way that should be good now.

    # Top-level dbSNP 155 directory, passed as the first argument to doBigDbSnp.pl.
    topDir=/hive/data/outside/dbSNP/155
    # After trying ALFA2 as the primary study, it did not have enough SNPs, so I am going back to using 1000Genomes as the primary study.
    # Comma-separated frequency study names, with 1000Genomes listed first.
    freqSourceOrder=1000Genomes,dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766
    # Run doBigDbSnp.pl (first with -debug to make runDir):
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug

# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-26

    cd /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-26

    # Link to ../bigDbSnp.2021-09-30/split, -continue convert to avoid re-splitting (the slowest part of the process):
    # rmdir only succeeds on empty directories, so split and splitProcessed must be empty
    # here (presumably just created by the -debug run -- TODO confirm) before being
    # replaced by symlinks to the 2021-09-30 results.
    rmdir split
    ln -s ../bigDbSnp.2021-09-30/split split
    rmdir splitProcessed
    ln -s ../bigDbSnp.2021-09-30/splitProcessed splitProcessed
    # Continue at the 'convert' step in this directory, stopping with 'install';
    # stdout and stderr go to do.log in the background, followed with tail -f.
    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
      -buildDir=`pwd` -continue convert -stop install \
      >& do.log &
    tail -f do.log

 *** All done !  (through the 'install' step)  Elapsed time: 1028m23s
 *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2023-03-26

    # checked errors and warnings, did not see anything concerning.

#############################################################################

H3AFRICA is the dbsnp study with african variants that we want.

https://h3africa.org/

v155 and v156 did not have H3AFRICA.

V156 is still the latest release at this time (2024-06-10)


#############################################################################
