#!/usr/bin/env python3

import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
import tempfile, json
from collections import defaultdict
from os.path import join, basename, dirname, isfile, abspath, isdir
from datetime import date, datetime, timedelta

# maps an external database name (as it appears in ClinVar's OtherIDs /
# phenotype ID fields) to a URL template; %s is filled with the accession
# by accListToHtml() below
dbToUrl = {
    "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
    "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
    "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
    "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
    "OMIM" : "http://www.omim.org/entry/%s",
    "MONDO" : "https://monarchinitiative.org/disease/%s",
    "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s",
    "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
}

# since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
# if Clinvar ever adds a new value, this script must fail and tdb must be updated.
possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiator codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant"])

# these consequences are highlighted using a triangle decorator. Similar to Decipher and Gnomad browsers
truncConseqs = set(["nonsense", "frameshift variant", "splice acceptor variant", "splice donor variant"])
pinkConseqs = set(["splice acceptor variant", "splice donor variant"])

# make sure that if the decorator lists above are edited, they never go out of sync
# with the master list. The original code only validated truncConseqs; pinkConseqs
# is now checked too, and the redundant set() wrapper around the difference is gone.
assert truncConseqs.issubset(possMolConseqs)
assert pinkConseqs.issubset(possMolConseqs)

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed.

Output goes into the current dir

Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
""") 

parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb")
parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
(options, args) = parser.parse_args()

# -d switches on debug-level logging, default is INFO
if options.debug:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)

# guard against accidental runs under the wrong user account: the production
# output files must belong to 'otto', otherwise later automated runs fail with
# permission errors. --alpha (dev-only tracks) is exempt.
if not options.isAlpha and os.getlogin()!="otto":
    print("Please do not run this script as a normal user, you will run into all sorts of permission problems.")
    print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.")
    print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.")
    sys.exit(1)

# expected header of variant_summary.txt.gz; the script aborts below if ClinVar
# changes it again. The pre-2024 header is kept commented out for reference.
#clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
clinvarExpectHeaders = "#AlleleID|Type|Name|GeneID|GeneSymbol|HGNC_ID|ClinicalSignificance|ClinSigSimple|LastEvaluated|RS# (dbSNP)|nsv/esv (dbVar)|RCVaccession|PhenotypeIDS|PhenotypeList|Origin|OriginSimple|Assembly|ChromosomeAccession|Chromosome|Start|Stop|ReferenceAllele|AlternateAllele|Cytogenetic|ReviewStatus|NumberSubmitters|Guidelines|TestedInGTR|OtherIDs|SubmitterCategories|VariationID|PositionVCF|ReferenceAlleleVCF|AlternateAlleleVCF|SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity|SCVsForAggregateGermlineClassification|SCVsForAggregateSomaticClinicalImpact|SCVsForAggregateOncogenicityClassification".split("|")

# ==== FUNCTIONs =====

# benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP, 
# pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor
# colors were defined by Ana Benet Pages
# - WARNING ON FILTER CHANGES:  -  see #28562
# Never change these codes. They are in old sessions. If you change them, change them across the whole script
# and also change the name of the .as field, so the cart variables will be different. You can add new values through:
# RGB color strings for the BED itemRgb field, keyed first by the variant-type
# code returned by allTypeToCode() and then by the clinical-significance code
# returned by clinSignToCode()
cnvColors = {
        "INS" : {
            "OT" : "225,165,0",
            "BN" : "250,214,69",
            "LB" : "250,214,69",
            "CF" : "225,165,0",
            "RF" : "225,165,0",
            "VUS" : "225,165,0",
            "LP" : "199,114,3",
            "PG" : "199,114,3"
            },
        "LOSS" : {
            "OT" : "255,0,0",
            "BN" : "255,98,119",
            "LB" : "255,98,119",
            "CF" : "255,0,0",
            "VUS" : "255,0,0",
            "RF" : "255,0,0",
            "LP" : "153,0,0",
            "PG" : "153,0,0"
            },
        "GAIN" : {
            "OT" : "0,0,225",
            "BN" : "77,184,255",
            "LB" : "77,184,255",
            "CF" : "0,0,225",
            "VUS" : "0,0,225",
            "RF" : "0,0,225",
            "LP" : "0,0,179",
            "PG" : "0,0,179"
            },
        "STRUCT" : {
            "OT" : "0,210,0",
            "BN" : "0,255,0",
            "LB" : "0,255,0",
            "CF" : "0,210,0",
            "VUS" : "0,210,0",
            "RF" : "0,210,0",
            "LP" : "0,128,0",
            "PG" : "0,128,0"
            },
        "INV" : {
            "OT" : "192,0,192",
            "BN" : "255,128,255",
            "LB" : "255,128,255",
            "CF" : "192,0,192",
            "VUS" : "192,0,192",
            "RF" : "192,0,192",
            "LP" : "128,0,128",
            "PG" : "128,0,128"
            },
        "SEQALT" : {
            "OT" : "128,128,128",
            "BN" : "191,191,191",
            "LB" : "191,191,191",
            "CF" : "128,128,128",
            "VUS" : "128,128,128",
            "RF" : "128,128,128",
            "LP" : "64,64,64",
            "PG" : "64,64,64"
            },
        "SEQLEN" : {
            "OT" : "0,179,179",
            "BN" : "0,255,255",
            "LB" : "0,255,255",
            "CF" : "0,179,179",
            "VUS" : "0,179,179",
            "RF" : "0,179,179",
            "LP" : "0,102,102",
            "PG" : "0,102,102"
            },
        "SUBST" : {
            "OT" : "128,128,128",
            "BN" : "191,191,191",
            "LB" : "191,191,191",
            "CF" : "128,128,128",
            "VUS" : "128,128,128",
            "RF" : "128,128,128",
            "LP" : "64,64,64",
            "PG" : "64,64,64"
            },
}

def mustRun(cmd):
    " execute a shell command; abort the whole script if it fails "
    logging.debug(cmd)
    if os.system(cmd) != 0:
        print(("Could not run: %s" % cmd))
        sys.exit(1)

def shortenName(name):
    """make the clinvar name shorter and keep a long version for the mouse over.
    Returns a tuple (shortName, longName). Relies on the module-level regexes
    cnvRe and posRe, which are defined further down in this file, before main.
    """
    cnvMatch = cnvRe.match(name)
    #hgvsMatch = hgvsRe.match(name)

    # names with a prefix like NM_000274.3(OAT):... keep just the prefix as the
    # long name; names containing "chr" are kept in full
    if ":" in name and not "chr" in name:
        longName = name.split(":")[0]
    else:
        longName = name

    shortName = name

    if cnvMatch != None:
        # some names include the original assembly as a prefix
        # cnvRe group 1 = cytogenetic band(s), group 2 = optional "x<N>" copy number
        band, copy = cnvMatch.groups()
        if copy == None:
            copy = ""
        shortName = band+copy
    elif name.startswith("GRCh"):
        # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
        print(name)
        assert False # name that starts with GRCh but does not match our regex should not happen
    elif "." in name:
        # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)
        # so we remove the gene and the protein description
        if ":" in name:
            name = name.split(":")[1]
            longName = name.split(" ")[0]

        # handle genomic HGVS NC_000010.11:g.(?_87925503)_(87965482_?)del
        if name.startswith("NC_") and "g." in name and ")" in name:
            shortName = name
        else:
            # strip away the final description like " (p.Ala184del)"
            if not name.startswith("g.("):
                # do not strip anything anymore if all we have is the g. notation, as we would end up with just g.
                name = name.split(" ")[0].split("[")[0].split("(")[0]
            shortName = name

        # a plain del/ins/dup suffix is shown as just that word
        if name.endswith("del"):
            shortName = "del"
        elif name.endswith("ins"):
            shortName = "ins"
        elif name.endswith("dup"):
            shortName = "dup"
        else:
            # e.g. c.80A>T: posRe groups are (position, del/ins/dup/... operation, base change)
            posMatch = posRe.match(name)
            if posMatch!=None:
                pos, dupDelIns, change = posMatch.groups()
                if dupDelIns==None:
                    dupDelIns= ""

                # show "A>T" when there is a base change, otherwise position+operation
                if change==None or change=="":
                    shortName = pos+dupDelIns
                else:
                    shortName = dupDelIns+change

    return shortName, longName

def allTypeToCode(allType, debugId):
    """
    Map the allele type field from ClinVar to one of these classes for filtering:
        STRUCT = translocation, fusion, complex
        LOSS   = deletion, copy number loss
        GAIN   = duplication, copy number gain, tandem duplication
        INS    = insertion, indel
        INV    = inversion
        SEQALT = catch-all category: undetermined variant
        SEQLEN = short repeat, NT expansion, microsatellite
        SUBST  = single nucleotide variant, 'variation'
    (note: the old docstring listed 'indel' under SUBST, but the code has always
    mapped it to INS; the list above matches the actual behavior)

    debugId is only used in the error message when an unknown type is found.
    Raises an Exception if ClinVar introduces a new allele type.

    # categories were defined by Ana Benet Pages
    # see table at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing

    In 3/2020, these were all possible values:

        fusion  6       0.000%
        complex 61      0.005%
        protein only    103     0.008%
        NT expansion    142     0.011%
        Translocation   302     0.022%
        inversion       583     0.043%
        undetermined variant    866     0.064%
        insertion       7133    0.530%
        indel   8490    0.630%
        short repeat    18778   1.394%
        duplication     34148   2.535%
        copy number loss        40441   3.002%
        copy number gain        40671   3.019%
        deletion        77325   5.740%
        single nucleotide variant       1118033 82.997%

    """
    allType = allType.lower() # they changed the casing in Sep 2020
    # doing this first, as this is 80% of the data -> faster
    if allType in ["single nucleotide variant", "variation"]:
        # 'variation' doesn't fit anywhere, but we think that these are mostly OMIM imports, so SNV sounds OK
        return "SUBST"
    # the order here matches Ana's coloring table for easier reading
    elif allType in ["translocation", "fusion", "complex"]:
        return "STRUCT"
    elif allType in ["deletion", "copy number loss"]:
        return "LOSS"
    elif allType in ["duplication", "copy number gain", "tandem duplication"]:
        return "GAIN"
    elif allType in ["indel", "insertion"]:
        return "INS"
    elif allType=="inversion":
        return "INV"
    elif allType=="undetermined variant":
        return "SEQALT"
    elif allType in ["short repeat", "nt expansion", "microsatellite"]:
        return "SEQLEN"
    else:
        # don't use assert here: asserts are stripped under 'python -O', which would
        # make this function silently return None for a new ClinVar allele type
        raise Exception("found unknown allele type '%s' for accession %s" % (allType, debugId))

def originSimpleToCode(originSimple):
    """
    for filtering, group the simplified origins into simpler origin codes.
    return one of these values:
        "GERM" = germline
        "SOM" = somatic
        "GERMSOM" = germline/somatic
        "UNK" = unknown / not provided / not applicable / tested-inconclusive
    ("NOVO" = de novo was listed in the old docstring but was never returned)

    Raises an Exception when ClinVar adds a new origin string.

    as of 3/2020, these are the values in originSimple and how often they appear:

    tested-inconclusive     91      0.007%
    not applicable  193     0.014%
    germline/somatic        1808    0.134%
    somatic 7929    0.589%
    not provided    53173   3.947%
    unknown 72601   5.389%
    germline        1211288 89.919%
    """
    if originSimple=="germline":
        return "GERM"
    elif originSimple=="germline/somatic":
        return "GERMSOM"
    elif originSimple in ["not applicable", "not provided", "unknown", "tested-inconclusive"]:
        # the original code carried a second, identical branch without
        # "tested-inconclusive"; it was unreachable and has been removed
        return "UNK"
    elif originSimple=="somatic":
        return "SOM"
    else:
        # raise instead of print+assert: asserts are stripped under 'python -O'
        raise Exception("found new origin string: "+originSimple)

def clinSignToCode(pathoStr):
    """ flatten the hundreds of possible pathogenicity descriptions into one of
    eight codes:
    conflicting = CF, uncertain = VUS, risk factor = RF, likely pathogenic = LP,
    pathogenic = PG, likely benign = LB, benign = BN, anything else = OT

    The substrings below are tested in a fixed order that errs on the side of
    caution: a variant that is both uncertain and pathogenic is classified as VUS
    (=triggering manual review); one that is both pathogenic and benign is
    classified as pathogenic. Don't change the order unless you tested it.

    Do not change these codes. See WARNING ON FILTER CHANGES in this script.

    examples of input values:
    Benign/Likely benign,other      13      0.002%
    Conflicting interpretations of pathogenicity,risk factor        13      0.002%
    Pathogenic,drug response        28      0.005%
    Pathogenic/Likely pathogenic    5209    0.846%
    Benign/Likely benign    19489   3.167%
    Likely pathogenic       30200   4.907%
    Conflicting interpretations of pathogenicity    30910   5.023%
    Pathogenic      66679   10.835%
    Benign  77551   12.602%
    Likely benign   154870  25.166%
    Uncertain significance  217838  35.399%
    """
    lowered = pathoStr.lower()
    # ordered (substring, code) rules: the first matching substring wins
    rules = [
        ("conflicting", "CF"),
        ("uncertain", "VUS"),
        ("risk factor", "RF"),
        ("likely pathogenic", "LP"),
        ("pathogenic", "PG"),
        ("likely benign", "LB"),
        ("benign", "BN"),
    ]
    for needle, code in rules:
        if needle in lowered:
            return code
    return "OT"

def lastMonth(d, x=1):
    """ return the same day of the month, x months before date/datetime d, as a
    datetime, e.g. for 2013-01-01 return 2012-12-01.
    If the target month is shorter than d's month, the day is clamped to the last
    day of the target month. Uses calendar.monthrange so that leap years are
    handled correctly (the old hard-coded table always used 28 for February).
    """
    import calendar
    newmonth = (((d.month - 1) - x) % 12) + 1
    newyear = d.year + (((d.month - 1) - x) // 12)
    lastDay = calendar.monthrange(newyear, newmonth)[1]
    return datetime(newyear, newmonth, min(d.day, lastDay))

def compareWithLastMonth(fname, maxDiff):
    """ go back one month from current date, make sure that linecount diff is at most maxDiff
    (a ratio, e.g. 0.1 = 10%). The previous month's file is expected at the same
    path, with last month's ISO date substituted into the name. Raises an
    Exception when the line count shrank, grew by more than maxDiff, or when more
    than maxDiff of the lines changed (via diff). Returns without checking when
    the previous file does not exist.
    """
    todayStr = date.today().isoformat()
    previousDate = lastMonth(date.today())
    previousDateStr = previousDate.date().isoformat()
    previousFname = fname.replace(todayStr, previousDateStr)
    if not isfile(previousFname):
        print(("Cannot find %s, cannot compare new data with previous month" % previousFname))
        return
        
    previousCount = sum(1 for line in open(previousFname, encoding="utf8"))
    todayCount = sum(1 for line in open(fname, encoding="utf8"))
    percDiff = (todayCount-previousCount)/float(todayCount)

    # NOTE(review): 'and maxDiff' makes this check a no-op when maxDiff is None or
    # 0.0 - presumably intentional (no --maxDiff = no check); confirm before changing
    if percDiff < 0 and maxDiff:
        raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
            (previousFname, previousCount, fname, todayCount))
    if percDiff > maxDiff:
        raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
    # see RM 14350 do more than just linecount
    # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
    # count added/removed lines ('>' or '<' markers in diff output)
    cmd = "diff %s %s --speed-large-files | grep '^[><]'  | wc -l" % (fname, previousFname)
    diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
    logging.info("%s: %d lines changed" % (fname, diffCount))
    percDiff = float(diffCount)/todayCount
    if percDiff > maxDiff:
        logging.info(cmd)
        # keep the full diff on disk so the operator can inspect it
        diffFname =  abspath(join("downloads", "lastDiff.txt"))
        cmd = "diff %s %s --speed-large-files > %s" % (fname, previousFname, diffFname)
        os.system(cmd) # ret code is not 0, as there are differences, so no need to check
        raise Exception("too many differences: %d out of %d lines. Look at the differences n the file %s. If you are OK with them, run '/hive/data/outside/otto/clinvar/doUpdate.sh -nocheck'" % (diffCount, todayCount, diffFname))

# I'm leaving this in the code, because I would not be surprised if one day we have to go back and parse
# the other files again
#def parseVariationAlleleFile(varFname):
#    """ return dict allele Id -> variant ID """
#    logging.info("Parsing %s" % varFname)
#    allToVar = {}
#    foundHead = False
#    expHead = "#VariationID	Type	AlleleID	Interpreted"
#    for line in gzip.open(varFname):
#        line = line.rstrip("\n")
#        if line==expHead:
#            foundHead = True
#        if line.startswith("#Var") and "Type" in line:
#            if not line == expHead:
#                print(("Header of variation_allele.txt.gz has changed again. Code needs fixing"))
#                print("found:",line)
#                print("expected:",expHead)
#            else:
#                foundHead = True
#
#        if line.startswith("#"):
#            continue
#        if not foundHead:
#            raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
#        fields = line.split("\t")
#        allToVar[fields[2]]=fields[0]
#    return allToVar

def parseHgvsFile(hgvsFname):
    """ return two dicts (hg19, hg38), each mapping allele Id -> list of
    (hgvs coding, hgvs protein) tuples. Protein-only rows (assembly 'na') are
    added to both dicts; NCBI36 rows are skipped. """
    logging.info("Parsing %s" % hgvsFname)
    expHead = "\t".join(["#Symbol", "GeneID", "VariationID", "AlleleID", "Type",
        "Assembly", "NucleotideExpression", "NucleotideChange", "ProteinExpression",
        "ProteinChange", "UsedForNaming", "Submitted", "OnRefSeqGene"])
    hg19Hgvs = defaultdict(list)
    hg38Hgvs = defaultdict(list)
    headerSeen = False

    for rawLine in gzip.open(hgvsFname, "rt", encoding="utf8"):
        rawLine = rawLine.rstrip("\n")

        # verify the header before trusting any data lines
        if rawLine == expHead:
            headerSeen = True
        if rawLine.startswith("#Symbol") and "GeneID" in rawLine:
            if rawLine == expHead:
                headerSeen = True
            else:
                print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.")
                print(("found:",rawLine))
                print(("expected:",expHead))

        if rawLine.startswith("#"):
            continue
        if not headerSeen:
            raise Exception("Invalid Header. Code needs fixing, again.")

        (sym, geneId, varId, alleleId, varType, assembly, nuclExpr, nuclChange,
            protExpr, protChange, usedForNaming, submitted, onLrg) = rawLine.split("\t")
        hgvsRow = (nuclChange, protChange)

        # pick the target dict(s) based on the assembly column
        if assembly == "GRCh37":
            targets = [hg19Hgvs]
        elif assembly == "GRCh38":
            targets = [hg38Hgvs]
        elif assembly == "na":
            # protein changes have no assembly and must be shown on all assemblies
            targets = [hg19Hgvs, hg38Hgvs]
        elif assembly == "NCBI36":
            continue
        else:
            print("invalid assembly in HGVS: %s, in %s" % (assembly, rawLine))
            assert(False)

        for hgvsDict in targets:
            hgvsDict[alleleId].append(hgvsRow)

    return hg19Hgvs, hg38Hgvs

def accListToHtml(inStr):
    """
    given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343",
    convert the accessions to HTML links using the URL templates in dbToUrl.
    Returns (htmlString, dbVarAcc): dbVarAcc is the dbVar SSV accession, pulled
    out for its own field later, or "" if there is none.
    Accessions may themselves contain ':' (e.g. MONDO:MONDO:0019667), hence the
    maxsplit when separating the database name from the accession.
    """
    dbVarAcc = ""
    htmlParts = []
    for token in inStr.split(","):
        pieces = token.split(":", maxsplit=1)
        if len(pieces) != 2:
            # no db prefix -> pass through unchanged
            htmlParts.append(token)
            continue

        db, acc = pieces

        # can be nssv/essv/essv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv
        if db == "dbVar" and len(acc) > 5 and acc[1:4] == "ssv":
            dbVarAcc = acc

        if db not in dbToUrl:
            htmlParts.append(token)
            continue

        accForUrl = acc.replace(".", "#") if "OMIM" in db else acc
        url = dbToUrl[db] % accForUrl
        htmlParts.append("<a target=_blank href='%s'>%s:%s</a>" % (url, db, acc))

    return (", ".join(htmlParts)), dbVarAcc

def reviewStatusToHtmlStars(reviewStatus):
    """
    Map a ClinVar review status string to (htmlStr, asciiDesc, starCount).
    htmlStr shows starCount filled stars out of four plus the original status
    text; asciiDesc is e.g. "1 star" or "3 stars".

    In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this:
        1-criteria provided, conflicting interpretations
        2-criteria provided, multiple submitters, no conflicts
        1-criteria provided, single submitter
        0-no assertion criteria provided
        0-no assertion for the individual variant
        0-no assertion provided
        4-practice guideline
        3-reviewed by expert panel
    """
    normStatus = reviewStatus.strip().lower().replace(" ", "")

    # normalized exact status -> star count
    starsByStatus = {
        "nointerpretationforthesinglevariant": 0,
        "noclassificationprovided": 0,
        "noclassificationsfromunflaggedrecords": 0,
        "noclassificationforthesinglevariant": 0,
        "criteriaprovided,conflictinginterpretations": 1,
        "criteriaprovided,conflictingclassifications": 1, # first seen in Mar 2024
        "criteriaprovided,singlesubmitter": 1,
        "criteriaprovided,multiplesubmitters,noconflicts": 2,
        "reviewedbyexpertpanel": 3,
        "practiceguideline": 4,
    }
    starCount = starsByStatus.get(normStatus)

    # all the "no assertion ..." variants and "-" mean zero stars
    if "noassertion" in normStatus or normStatus == "-":
        starCount = 0

    if starCount is None:
        print(("Illegal review status: %s" % normStatus))
        assert(False)

    filledStars = ["&#9733;"] * starCount
    emptyStars = ["&#9734;"] * (4 - starCount)
    htmlStr = "".join(filledStars + emptyStars)
    htmlStr += "&nbsp;&nbsp;<small>based on: %s</small>" % reviewStatus

    asciiDesc = ("%d star" % starCount) if starCount == 1 else ("%d stars" % starCount)

    return htmlStr, asciiDesc, starCount

def loadSubmissions(summFname):
    """ load the submissions file into the clinvarSub Mysql table for the lollipop
    track details page, for both hg19 and hg38. """
    # not doing the load+rename strategy yet, I don't think it makes a difference here
    logging.info("Loading submissions table %s into mysql, hg19 and also hg38" % summFname)
    # keep fh referenced until the end of the function: the temp file is removed
    # when the NamedTemporaryFile object is garbage-collected
    fh = tempfile.NamedTemporaryFile(prefix="clinVarToBed", suffix=".tsv")
    tmpFname = fh.name

    # the old code wrapped these commands in 'assert(os.system(cmd)==0)': under
    # 'python -O' asserts are stripped, so the commands would never have run at
    # all. mustRun() logs the command and exits on failure instead.
    mustRun("zgrep -v '^#' %s > %s" % (summFname, tmpFname))

    # otherwise mysqld can't read the file
    mustRun("chmod a+r %s" % tmpFname)

    mustRun("hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % tmpFname)
    mustRun("hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % tmpFname)

def mostCurrentVersionName():
    " return name of version on FTP site, e.g. '2024-03-06' "
    # only the first two lines of the (huge) release XML are needed: the second
    # line carries the ReleaseDate attribute; head/zcat stop the transfer early
    cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
    verLine = subprocess.check_output(cmd, shell=True, encoding="utf8")
    #<ClinVarVariationRelease xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/ClinVar_VCV_2.0.xsd" ReleaseDate="2024-03-06">
    # pull the value out of the 4th space-separated attribute, ReleaseDate="..."
    version = verLine.split(" ")[3].split("=")[1].split('"')[1]
    return version

def downloadFromNcbi(skipDownload):
    """ download the ClinVar input files from NCBI into downloads/<today>/ and
    return their filenames as a tuple
    (variantSummary, variationAllele, hgvs, vcf, submissionSummary, versionFname).
    If skipDownload is True, the tab-delimited files are not re-downloaded when
    they all exist already; the VCF is skipped whenever it is present.
    """
    today = date.today().isoformat()
    downDir = join("downloads", today)
    try:
        os.mkdir(downDir)
    except FileExistsError:
        pass

    version = mostCurrentVersionName()
    logging.info("Version on NCBI server is %s" % version)
    versionFname = join(downDir, "version.txt")
    # use 'with' so the file handle is closed deterministically
    # (the old open(...).write(...) style leaked handles)
    with open(versionFname, "w") as verFh:
        verFh.write(version)

    baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/"
    url = baseUrl+"variant_summary.txt.gz"
    outFname = join(downDir, "variant_summary.txt.gz")
    filename = outFname

    # the new variant allele file
    varUrl = baseUrl+"variation_allele.txt.gz"
    varFname = join(downDir, "variation_allele.txt.gz")

    # the new hgvs allele file
    hgvsUrl = baseUrl+"hgvs4variation.txt.gz"
    hgvsFname = join(downDir, "hgvs4variation.txt.gz")

    # the summary file
    summUrl = baseUrl+"submission_summary.txt.gz"
    summFname = join(downDir, "submission_summary.txt.gz")

    # the new VCF file
    vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
    vcfFname = join(downDir, "GRCh38.clinvar.vcf.gz")

    def _fetch(fromUrl, toFname):
        # download one URL into one local file; files fit into memory
        logging.info("Downlading %s to %s" % (fromUrl, toFname))
        with open(toFname, "wb") as ofh:
            ofh.write(urllib.request.urlopen(fromUrl).read())

    if not isfile(vcfFname):
        _fetch(vcfUrl, vcfFname)

    if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)):
        _fetch(url, outFname)
        _fetch(hgvsUrl, hgvsFname)
        _fetch(varUrl, varFname)
        _fetch(summUrl, summFname)

    return filename, varFname, hgvsFname, vcfFname, summFname, versionFname

def parseVcf(vcfFname):
    """ parse the ClinVar VCF and return dict: int alleleId -> molecular consequence.
    Only the first consequence of the MC INFO field is kept; underscores are
    replaced with spaces. Exits the script when a consequence is found that is
    not in possMolConseqs, as the trackDb filter values would then be stale. """
    logging.info("Parsing %s" % vcfFname)
    ret = {}
    for line in gzip.open(vcfFname, "rt", encoding="utf_8"):
        # #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
        #1       930248  789256  G       A       .       .       ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
        if line.startswith("#"):
            continue
        # only the INFO column is used. The old code unpacked all eight columns
        # into locals with misleading names ('end' held the ID column) and
        # shadowed the builtin 'filter'
        fields = line.rstrip("\n").split("\t")
        infoField = fields[7]

        alleleId = None
        mc = None
        for keyVal in infoField.split(";"):
            # INFO can contain Flag-type entries without '=' per the VCF spec;
            # the old split("=") would have crashed on those with a ValueError
            if "=" not in keyVal:
                continue
            key, val = keyVal.split("=", 1)
            if key=="ALLELEID":
                alleleId = val
            elif key=="MC":
                mc = val.split("|")[1].split(",")[0].replace("_", " ")
                if not mc in possMolConseqs:
                    print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. "
                            "Please update otto script %s" % (mc, __file__))
                    sys.exit(1)

        if mc and alleleId:
            ret[int(alleleId)] = mc

    return ret

def shortenSeq(s):
    """ shorten a long sequence to its first and last 25 characters, joined by '...'.
    Sequences of up to 53 characters (25+3+25) are returned unchanged: the old
    code had no length check and returned a duplicated - and longer - string
    for short inputs (e.g. 'ACGT' became 'ACGT...ACGT'). """
    if len(s) <= 53:
        return s
    return s[:25]+"..."+s[-25:]

# ----------- MAIN --------------
# without any arguments and without --auto there is nothing to do: show the help
if args==[] and not options.auto:
    parser.print_help()
    exit(1)

outSuffix = ""

if options.auto:
    # fixed typo: this line used to assign 'varFName' (capital N), which left
    # 'varFname' - the name used everywhere else - undefined in --auto mode
    filename, varFname, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload)
else:
    # manual mode: take the input filenames from the command line
    # NOTE(review): summFname is not assigned in this branch - presumably manual
    # mode is never combined with the submission-table load; confirm before relying on it
    filename = args[0]
    varFname = args[1]
    hgvsFname = args[2]
    vcfFname = args[3]
    versionFname = args[4]

#allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
allToVcf = parseVcf(vcfFname)
hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname)

# regex for CNV-style variant names, e.g.:
# GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
# GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
# GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
# GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
# NC_000010.11:g.(?_87925503)_(87965482_?)del
cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')

# regex for HGVS strings: accession, optional gene symbol in parens, position part
hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')

# regex for the position part of an HGVS string, e.g.
# c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
# c.80A>T
# c.740+356C>T
# c.3839_3855del17
posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')

ifh = gzip.open(filename, "rt", encoding="latin_1")

# verify the header line: if ClinVar changes the column layout, the field
# unpacking further down would silently mis-assign columns, so bail out here
headerFields = ifh.readline().rstrip().split("\t")
if headerFields != clinvarExpectHeaders:
    logging.error("ClinVar has changed their header line again")
    logging.error("current header line: %s" % headerFields)
    logging.error("expected header line: %s" % clinvarExpectHeaders)
    raise Exception("Sorry. The code will need to be fixed.")

# open the six output BED files: main + CNV + decorator, one each per assembly
def _openBed(track, db):
    " open one output BED file under bed/, honoring the global outSuffix "
    return open("bed/%s.%s.%sbed" % (track, db, outSuffix), "w", encoding="latin1")

hg19Bed = _openBed("clinvarMain", "hg19")
hg38Bed = _openBed("clinvarMain", "hg38")

hg19DecorBed = _openBed("clinvarMainDecor", "hg19")
hg38DecorBed = _openBed("clinvarMainDecor", "hg38")

hg19BedCnv = _openBed("clinvarCnv", "hg19")
hg38BedCnv = _openBed("clinvarCnv", "hg38")


# counters for the summary statistics logged at the end
longCount = 0
noAssCount = 0
noMcCount = 0
minusAccs = []

# convert lines
# Every data row of the summary file becomes one BED line. Variants spanning
# >= 50 bp go into the CNV files, shorter ones into the main files, and
# truncating variants additionally get a glyph line in the decorator files.
for line in ifh:
    line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
    line = line.rstrip("\n")
    row = line.split("\t")
    row = [f.replace("\0", "") for f in row ]

    alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
      irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
      refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
      submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen, aggrGermClass, aggrSomClassImpact, aggrSomOnc = row
    # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity

    # skip rows without a chromosome or on assemblies we do not cover
    if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na":
        noAssCount += 1
        continue

    if assembly=="GRCh37":
        hgvs = hg19Hgvs
    elif assembly=="GRCh38":
        hgvs = hg38Hgvs
    else:
        print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row)))
        assert(False)

    hgvsTable = json.dumps(hgvs[alleleId])

    # the molecular consequence comes from the VCF file, not from the summary file
    molConseq = allToVcf.get(int(alleleId))
    isTrunc = False
    if molConseq is None:
        noMcCount += 1
        molConseq = ""
    else:
        if molConseq in truncConseqs:
            isTrunc = True

    # translate a few NCBI contig accessions to UCSC chrom names
    if chromAcc=="NT_187513.1":
        chrom = "chrUn_KI270742v1"
    elif chromAcc=="NT_167222.1":
        chrom = "chrUn_gl000228"
    elif chromAcc=='NT_187499.1':
        chrom = "chrUn_KI270744v1"
    elif chromAcc=="NT_113889.1":
        if assembly=="GRCh37":
            chrom = "chrUn_gl000218"
        else:
            chrom = "chrUn_GL000218v1"
    elif chrom=="Un":
        print("Unknown fix/alt/patch chrom, please adapt clinVarToBed: %s, row: %s" % (chrom, row))
        sys.exit(1)
    else:
        chrom = "chr"+chrom

    if chrom=="chrUn" and assembly=="GRCh38":
        print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
        continue

    if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
        if assembly=="GRCh37":
            chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added to hg19 in 2020
        else:
            chrom = "chrM"

    shortName, longName = shortenName(name)

    if len(shortName)>20:
        shortName = shortName[:17]+"..."
        longCount+=1

    if len(longName)>60:
        longName = longName[:60]+"..."

    if start=="-1" and end=="-1":
        minusAccs.append(irvcAcc)
        continue

    if start=="" or end=="":
        print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end))
        continue

    if int(start)<0 or int(end)<0:
        print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end))
        continue

    # sometimes they seem to inverse their coordinates
    if int(start)>int(end):
        start, end = end, start

    # blacklisted features, coordinates confirmed as wrong
    # NOTE(review): fixed an operator-precedence bug here. The original
    #   (chrom=="chr17" and end=="217748468" or end=="84062405")
    # blacklisted *every* feature with end==84062405 on any chromosome, since
    # 'and' binds tighter than 'or'. The parenthesization indicates that both
    # end positions were meant to apply to chr17 only.
    if (chrom=="chr17" and end in ("217748468", "84062405")) or  \
        (chrom=="chr9" and end=="140730116" and assembly=="GRCh38"): # VCV003340459.2 reported to clinvar May 2025
        print("blacklisted feature")
        continue

    # reorder columns for bigBed
    start = str(int(start)-1) # convert to zero-based, half-open
    strand = "."
    thickStart = start
    thickEnd = end
    itemRgb = "0"
    blockCount = "1"
    blockSizes = str(int(end)-int(start))
    blockStarts = "0"

    # used until Oct 2017
    # if dbVarAcc.startswith("nsv") or "copy number" in allType:
    # now using just size on genome, see #20268
    # shortening to >= 50bp to bring into line with LOVD and gnomAD
    isCnv = (int(end)-int(start)) >= 50

    pathoCode = clinSignToCode(clinSign)
    originCode = originSimpleToCode(originSimple)
    allTypeCode = allTypeToCode(allType, "|".join(row))

    if isCnv:
        # CNVs are colored by variant type and pathogenicity
        typeColors = cnvColors[allTypeCode]
        itemRgb = typeColors.get(pathoCode)
        if itemRgb is None:
            print("CNV encountered with unknown pathoCode: "+pathoCode)
            assert(False)
    else:
        # short variants are colored by pathogenicity only
        if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic
            itemRgb = "210,0,0" # red
        elif pathoCode in ["BN", "LB"]: # benign or likely benign
            itemRgb = "0,210,0" # green
        elif pathoCode in ["VUS", "RF"]: # uncertain
            itemRgb = "0,0,128" # dark-blue
        elif pathoCode in ["CF"]: # conflicting
            itemRgb = "137,121,212" # light-blue
        elif pathoCode in ["OT"]: # other
            itemRgb = "128,128,128" # grey
        else:
            assert(False)# should never happen

    varLen = int(end)-int(start)

    # gene field: "entrezId|symbol" when we have a numeric gene ID
    geneStr = geneSymbol
    if geneId.isdigit() and geneId!="-1":
        geneStr = geneId+"|"+geneSymbol
    if len(geneStr)>250:
        geneStr = "not shown, too many genes"

    mouseOverName = name

    otherIds, dbVarSsvAcc = accListToHtml(otherIds)

    if isCnv and dbVarSsvAcc!="":
        shortName = dbVarSsvAcc
    elif name=="":
        name = "No display name was provided by ClinVar for this variant"
        shortName = "NoName"
        mouseOverName = ""

    if len(name)>90:
        name = name[:90]+"..."

    vcvId = "VCV%09d" % int(varId)
    # NOTE(review): vcvIdVersion is identical to vcvId - the VCV version suffix
    # is not available in this input file; confirm whether this is intended
    vcvIdVersion = "VCV%09d" % int(varId)
    name = varId+"|"+vcvId+" : "+name

    if len(mouseOverName)>80:
        mouseOverName = mouseOverName[:80]+"..."

    phenotypeIds, _ = accListToHtml(phenotypeIds)

    starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)

    phenotypeList = ", ".join(phenotypeList.split("|"))

    mouseOverParts = [
            mouseOverName,
            "<b>Review Status: </b>"+starRatingHtml,
            "<b>Type: </b>"+allType,
            "<b>Consequence: </b>"+ molConseq,
            "<b>Significance: </b>"+clinSign,
            "<b>Origin: </b>"+origin,
            "<b>Phenotypes: </b>"+phenotypeList
            ]
    # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
    mouseOver = "<br>".join(mouseOverParts)

    if snpAcc=="-1":
        snpAcc=""
    else:
        snpAcc = "rs"+snpAcc # dbSnp links changed in mid/late 2022

    # shorten very long ref/alt alleles to keep the field under 255 characters
    # (indentation normalized: these lines used to mix tabs and spaces)
    vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf
    if len(vcfDesc)>=255:
        refAllVcf = shortenSeq(refAllVcf)
        altAllVcf = shortenSeq(altAllVcf)
        vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf

    somImpactDesc = ""
    if somClinImpact!="" and somClinImpact!="-":
        somImpactDesc = "%s (%s, %s)" % (somClinImpact, somClinLastEval, revStatusClinImpact)

    oncogenDesc = ""
    if oncogen!="" and oncogen!="-":
        oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen)

    mouseOverIdx = 33 # if you ever change the field order below, make sure that you adapt this number!

    row = [chrom, start, end, vcvId, str(starCount), strand, thickStart, thickEnd, itemRgb, \
        blockCount, blockSizes, blockStarts,
        name, clinSign, starRatingHtml, allType, geneStr, molConseq,
        snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
        hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
        vcfDesc,
        somImpactDesc,
        oncogenDesc,
        # the new classes
        aggrGermClass, aggrSomClassImpact, aggrSomOnc,
        # these fields are the for the filters, not for the display
        pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
        # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
        varId, dbVarSsvAcc, vcvId, vcvIdVersion,
        # the name was moved to the back of the field list in 2025, as we needed a unique field to link to decorator
        shortName
        ]

    # replace clinvar's placeholders with real empty fields
    row = ["" if x in ("-1", "-") else x for x in row]

    if assembly=="GRCh37":
        ofh = hg19Bed
        decorOfh = hg19DecorBed
        if isCnv:
            ofh = hg19BedCnv
    elif assembly=="GRCh38":
        ofh = hg38Bed
        decorOfh = hg38DecorBed
        if isCnv:
            ofh = hg38BedCnv
    else:
        # cannot be reached: unknown assemblies were rejected above. The
        # 'continue' was missing here before; without it, an unexpected
        # assembly would have written through the previous iteration's handle.
        noAssCount +=1
        continue

    ofh.write("\t".join(row))
    ofh.write("\n")

    if isTrunc:
        # truncating mutations get a special symbol written into another BED file, for decorators
        decorRow = list(row[:12])
        rgbColor = row[8]
        decorRow[8] = "255,255,255" # line color
        if molConseq in pinkConseqs:
            decorRow[8] = "255,192,203" # inspired by decipher colors

        mouseOverText = row[mouseOverIdx]

        # reference to the decorated item: chrom:start-end:name
        decorRef = row[0]+":"+row[1]+"-"+row[2]+":"+row[3]
        decorRow[3] = "" # clear the name field; the decorated item is identified via decorRef

        decorRow.append(decorRef)
        decorRow.append("glyph")
        decorRow.append(rgbColor) # fill color
        decorRow.append("Diamond")
        decorRow.append(mouseOverText)

        decorOfh.write("\t".join(decorRow))
        decorOfh.write("\n")

# close all six output files so bedSort sees complete data
hg19Bed.close()
hg38Bed.close()

hg19DecorBed.close()
hg38DecorBed.close()

hg19BedCnv.close()
hg38BedCnv.close()

logging.info("%d lines with feature name that was too long, shortened them" % longCount)
logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3]))

fnames = [hg19Bed.name, hg38Bed.name, hg19DecorBed.name, hg38DecorBed.name, hg19BedCnv.name, hg38BedCnv.name]

# sort the files
for fname in fnames:
    cmd = "bedSort %s %s" % (fname, fname)
    logging.debug(cmd)
    # was: assert(os.system(cmd)==0) - asserts are stripped when Python runs
    # with -O, so a failed bedSort would have gone unnoticed; raise instead
    if os.system(cmd) != 0:
        raise Exception("Command failed: %s" % cmd)

# check new bed files against last months's bed files
if options.maxDiff and not options.isAlpha:
    for fname in fnames:
        compareWithLastMonth(fname, options.maxDiff)

bigBedDir = "bigBed"
if options.isAlpha:
    bigBedDir = "bigBedAlpha"

# convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
tmpFnames = []
for fname in fnames:
    parts = fname.split(".")
    if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
        # renamed 'date' -> 'dateStr': the old name shadowed datetime.date,
        # which is still needed below for date.today()
        base, db, dateStr, ext = parts
    else:
        base, db, ext = parts

    base = basename(base)

    tmpFname = "%s.%s.temp.bb" % (base, db)

    asFname = "clinvar.as"
    extraOptions = "-extraIndex=_dbVarSsvId,snpId,vcvId,_vcvIdVersion "
    if "Decor" in fname:
        # decorator files use a different autoSql schema and no extra indexes
        asFname = "clinvarDecoration.as"
        extraOptions = ""

    cmd = "bedToBigBed %s -tab -type=bed9+ -as=%s %s /hive/data/genomes/%s/chrom.sizes %s" % (extraOptions, asFname, fname, db, tmpFname)
    mustRun(cmd)

    finalFname = "%s/%s.%s.bb" % (bigBedDir, base, db)
    # take the basename first: finalFname starts with the bigBed directory, so
    # splitting the full path used to yield 'bigBed/clinvarMain' here
    tableName = basename(finalFname).split('.')[0] # e.g. clinvarMain

    tmpFnames.append( (db, tableName, tmpFname, finalFname) )

# make it atomic by renaming the temp files to the final file names at the very end
for db, tableName, tmpFname, finalFname in tmpFnames:
    logging.debug("Renaming %s to %s" % (tmpFname, finalFname))
    os.rename(tmpFname, finalFname)

# create the version.txt file
with open(versionFname) as vfh:
    # strip whitespace: the raw file contents end with a newline, which used
    # to break the version string across two lines
    clinvarVersion = vfh.read().strip()
versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d"))
outFname = join(bigBedDir, "version.txt")
with open(outFname, "w") as ofh:
    ofh.write(versionString)

# put a copy of the files into the archive
# hoisted out of the loop: yearMonDay is also needed by the version.txt loop below
yearMonDay = date.today().strftime("%Y-%m-%d")

if not options.isAlpha:
    for db, tableName, tmpFname, finalFname in tmpFnames:
        # not doing this anymore, using bigDataUrl now in trackDb
        #cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
        #mustRun(cmd)

        gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
        # put a copy into the archive
        outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
        if not isdir(outDir):
            os.mkdir(outDir)
        archiveFname = join(outDir, basename(gbdbFname))
        logging.info("Creating %s" % archiveFname)
        cmd = "cp %s %s" % (finalFname, archiveFname)
        mustRun(cmd)

    # add the version.txt files
    for db in ["hg38", "hg19"]:
        archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
        cmd = "cp %s/version.txt %s" % (bigBedDir, archDir)
        mustRun(cmd)

loadSubmissions(summFname)

print("ClinVar update end: OK")