#!/usr/bin/env python
import sys, os, urllib.request, argparse, re, html, textwrap, requests
from xml.etree import ElementTree as ET

def parsePubmed(doc, id):
    """Collect citation fields from a PubMed esummary XML document.

    Only direct ``DocSum`` children of ``doc`` are examined, and within each
    only its direct ``Item`` children.

    Args:
        doc: Root element of the parsed esummary response.
        id: PubMed id, used only to build the PubMed page URL.

    Returns:
        A dict that always contains 'url'. Depending on what the document
        holds it may also contain 'Authors' (list of the text of each child
        of the AuthorList item), the identifiers 'pubmed', 'pmc' and 'doi',
        and any of the simple text fields listed in ``simpleFields``.
    """
    simpleFields = ['PubDate', 'Source', 'Title', 'Volume', 'Issue', 'Pages', 'SO', 'CollectiveName']
    wantedIds = ('pubmed', 'pmc', 'doi')
    record = {'url': "https://www.ncbi.nlm.nih.gov/pubmed/%s" % id}

    for summary in doc:
        if summary.tag != "DocSum":
            continue
        for item in summary.findall("Item"):
            itemName = item.attrib['Name']
            if itemName == 'AuthorList':
                # One child element per author; keep their text in document order.
                record['Authors'] = [author.text for author in item]
            elif itemName == "ArticleIds":
                # Keep only the identifier kinds the citation formatter uses.
                for articleId in item:
                    idKind = articleId.attrib['Name']
                    if idKind in wantedIds:
                        record[idKind] = articleId.text
            elif itemName in simpleFields:
                record[itemName] = item.text

    return record

def parsePmc(doc, id):
    """Resolve a PubMed Central summary to its PubMed record.

    Scans every ``ArticleId`` element of ``doc`` for an ``IdType`` child whose
    text is "pmid" followed by a ``Value`` child, and hands that value to
    ``parseInit`` to fetch the PubMed record.

    Args:
        doc: Root element of the parsed PMC esummary response.
        id: The PubMed Central id (used only in the error message).

    Returns:
        Whatever ``parseInit`` returns for the PubMed id that was found.

    Raises:
        SystemExit: with status 1 when no PubMed id is present, so callers
            and shell scripts can tell the lookup failed.
    """
    for node in doc.iter("ArticleId"):
        foundPubMedId = False
        for child in node:
            if child.tag == "IdType" and child.text == "pmid":
                foundPubMedId = True
            # The Value is only trusted once the preceding IdType said "pmid".
            if foundPubMedId and child.tag == "Value":
                return parseInit(child.text)

    sys.stderr.write("Unable to find pubmed id for pubmed central id: %s\n" % id)
    # Exit non-zero: a bare sys.exit() would report success despite the error.
    sys.exit(1)

def parseInit(id):
    """Fetch the NCBI esummary record for an id and parse it into a citation dict.

    Ids starting with "PMC" are looked up in the PMC database (with the "PMC"
    text removed from the id and ``version=2.0`` requested) and handed to
    ``parsePmc``; every other id is looked up in the Pubmed database and
    handed to ``parsePubmed``.

    Args:
        id: A PubMed id or a "PMC"-prefixed PubMed Central id, as a string.

    Returns:
        The value returned by ``parsePubmed`` or ``parsePmc`` for the record.
    """
    urlbase = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
    if id.startswith("PMC"):
        db = "PMC"
        id = id.replace("PMC", "")
        url = urlbase + "db=%s" % db + "&id=%s&version=2.0" % id
    else:
        db = "Pubmed"
        url = urlbase + "db=%s" % db + "&id=%s" % id
    sys.stderr.write("Accessing %s\n" % url)

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as fetch:
        doc = ET.XML(fetch.read())

    if db == "PMC":
        return parsePmc(doc, id)
    return parsePubmed(doc, id)

def htmlEscape(str):
    """Return the given text with &, <, > and both quote characters HTML-escaped."""
    escaped = html.escape(str, quote=True)
    return escaped

def _extractFullTextUrl(pageText):
    """Return the first full-text article link in a PubMed page, or None if there is none."""
    linkPrefix = r'<div class="full-text-links-list">\s*<a\s+(?:class="[^"]*"\s+)?href='
    strict = re.search(linkPrefix + r'"(\S+)"', pageText)
    if strict:
        return strict.group(1).replace("&amp;", "&")
    # Fallback for hrefs containing spaces: take everything up to the closing
    # quote of the attribute (never beyond it) and tidy the result.
    lenient = re.search(linkPrefix + r'"([^"\r\n]+)"', pageText)
    if lenient:
        return lenient.group(1).replace("&amp;", "&").replace(" ", "%20").replace("///", "//")
    return None


def _resolveArticleUrl(pubmedUrl, maxTries=10):
    """Return the publisher's full-text URL for a PubMed page, or pubmedUrl if none is found."""
    for _ in range(maxTries):
        try:
            # A timeout keeps a stalled connection from hanging the script forever.
            page = requests.get(pubmedUrl, timeout=30)
        except requests.RequestException:
            # Treat a failed request like a failed portlet: try again.
            continue
        articleUrl = _extractFullTextUrl(page.text)
        if articleUrl:
            return articleUrl
        # No link on the page.  Only retry when the page shows the marker of a
        # failed outlinks portlet; otherwise the article simply has no link.
        if 'Default output of portlet NCBIPageSection' not in page.text:
            return pubmedUrl
    sys.stderr.write("Failed to fetch complete links from NCBI after %d tries.  Try again later or just use the PubMed paper link.\n" % maxTries)
    return pubmedUrl


def makeHtml(infoDict, plain, verbose, doi):
    """Format one citation as an HTML paragraph or as a single plain-text entry.

    Args:
        infoDict: Citation fields as produced by ``parsePubmed``.  'url',
            'pubmed' and 'Title' are required, as is 'PubDate' when 'SO' is
            absent.  As a side effect, ``infoDict['url']`` is replaced by the
            publisher's full-text URL when one can be found on the PubMed page.
        plain: When true, produce plain text instead of HTML.
        verbose: When true, list every author instead of stopping after ten
            names (the collective name, if any, counts as one of the ten).
        doi: When true, prepend a DOI link in HTML output.

    Returns:
        A ``(citation, authStr)`` tuple: the formatted citation with each
        line wrapped at 100 columns, and the author string used in it.
    """
    # --- authors ---
    authors = list()
    if 'CollectiveName' in infoDict:
        authors.append(infoDict['CollectiveName'])
    etal = False
    for author in infoDict.get('Authors', []):
        if len(authors) == 10 and not verbose:
            etal = True
            break
        authors.append(author)
    authStr = ", ".join(authors)
    if not plain:
        authStr = htmlEscape(authStr)
    if etal:
        authStr = authStr + (" et al" if plain else " <em>et al</em>")
    authStr = authStr + "."

    # --- title, journal, date/volume/pages ---
    title = re.sub(r"\.$", "", infoDict['Title'])
    # Fall back to an empty journal name rather than failing on an unbound local.
    journal = infoDict.get('Source', infoDict.get('Journal', ""))
    if 'SO' in infoDict:
        dateStr = re.sub(";:", ":", infoDict['SO'])
    else:
        volumeStr = ""
        if 'Volume' in infoDict:
            volumeStr = volumeStr + ";%s" % infoDict['Volume']
        if 'Issue' in infoDict:
            volumeStr = volumeStr + "(%s)" % infoDict['Issue']
        if 'Pagination' in infoDict:
            volumeStr = volumeStr + "%s" % infoDict['Pagination']
        elif 'Pages' in infoDict:
            volumeStr = volumeStr + ":%s" % infoDict['Pages']
        dateStr = infoDict['PubDate'] + re.sub(r"\s+", "", volumeStr)
    if not re.search(r"\.$", dateStr):
        dateStr = dateStr + "."

    # --- identifiers (built from the PubMed URL, before it is replaced below) ---
    pubmedUrl = infoDict['url']
    if plain:
        idStr = "PMID: %s" % infoDict['pubmed']
        if 'pmc' in infoDict:
            idStr = idStr + "; PMC: %s" % infoDict['pmc']
        # NOTE(review): plain output has always shown the DOI regardless of the
        # `doi` flag; kept as is - confirm whether that is intended.
        if 'doi' in infoDict:
            idStr = ("DOI: %s; " % infoDict['doi']) + idStr
    else:
        # Escape displayed text as well as hrefs: DOIs may contain <, > or &.
        idStr = "PMID: <a href=\"%s\" target=\"_blank\">%s</a>" % (htmlEscape(pubmedUrl), htmlEscape(infoDict['pubmed']))
        if 'pmc' in infoDict:
            pmcId = htmlEscape(infoDict['pmc'])
            idStr = idStr + "; PMC: <a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/%s/\" target=\"_blank\">%s</a>" % (pmcId, pmcId)
        if doi and 'doi' in infoDict:
            doiId = htmlEscape(infoDict['doi'])
            idStr = ("DOI: <a href=\"https://doi.org/%s\" target=\"_blank\">%s</a>; " % (doiId, doiId)) + idStr

    # Now that the PubMed link has been built, point infoDict['url'] at the
    # original article when the PubMed page advertises a full-text link.
    infoDict['url'] = _resolveArticleUrl(pubmedUrl)

    # --- assemble output ---
    if plain:
        htmlLines = ["%s %s. %s. %s %s. %s" % (authStr, title, journal, dateStr, idStr, infoDict['url'])]
    else:
        htmlLines = [
            "<p>",
            "%s" % authStr,
            "<a href=\"%s\" target=\"_blank\">" % htmlEscape(infoDict['url']),
            "%s</a>." % htmlEscape(title),
            "<em>%s</em>. %s" % (htmlEscape(journal), htmlEscape(dateStr)),
            "%s" % idStr,
            "</p>",
        ]

    # Wrap at whitespace only: the textwrap defaults may split a long URL or
    # break after a hyphen inside one, corrupting the link.
    processLines = [
        textwrap.fill(line, 100, break_long_words=False, break_on_hyphens=False)
        for line in htmlLines
    ]
    htmlStr = "\n".join(processLines)
    return htmlStr, authStr

def main():
    """Parse the command line, look up every id and print the citations.

    Citations are printed to stdout sorted by author string.  Exact duplicates
    (the same id given twice) are printed once, but different papers that
    happen to share an author string are all kept.
    """
    parser = argparse.ArgumentParser(
        description='Turns PubMed Ids and PubMedCentral Ids into GB formatted citations in html',
        epilog='example: getTrackReferences PMC3039671 21347206'
        )
    parser.add_argument('ids', metavar='IDs', type=str, nargs='+', help='The list of PubMed and PubMedCentral Ids')
    parser.add_argument('-p', '--plain', action="store_true", help="Output the references in plain-text instead of html.")
    parser.add_argument('-v', '--verbose', action="store_true", help="Output the full author list instead of truncating after the first 10.")
    parser.add_argument('-d', '--doi', action="store_true", help="Include a DOI link.")
    args = parser.parse_args()

    # Key on (authors, citation) rather than authors alone, so two papers with
    # the same (possibly truncated) author list do not overwrite each other.
    references = set()
    for pubId in args.ids:
        infoDict = parseInit(pubId)
        citation, authStr = makeHtml(infoDict, args.plain, args.verbose, args.doi)
        references.add((authStr, citation))

    print("\n")
    for _, citation in sorted(references):
        print("%s\n" % citation)

# Run the command-line interface only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
