#!/usr/bin/env python
# pylint: disable=C0103,C0326,C0410,W0402
""" guess the best assembly given a bigWig, bigBed, or VCF file """

import logging, optparse, sys, re, glob
import os, gzip, subprocess
from collections import defaultdict
from os.path import join, isfile, expanduser, basename

# ==== functions =====

def parseArgs():
    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
    parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed, \
bigWig, or VCF file or URL, guess the assembly based on the chrom names and sizes.

Must have bigBedInfo and bigWigInfo in PATH for big* files. VCF files (.vcf, .vcf.gz)
are parsed directly from their ##contig header lines.
Also requires a bigGuessDb.txt.gz, an alpha version of which can be downloaded at
https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz

Example run:
$ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
$ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig
mm9
""")
    parser.add_option("-d", "--debug", dest="debug", action="store_true", \
        help="show debug messages")
    parser.add_option("", "--index", dest="index", action="store_true", \
        help="used by UCSC staff: go over /hive/data/genomes and build an index of all chromSizes")
    parser.add_option("-b", "--best", dest="best", action="store_true", \
        help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. " \
            "If multiple arguments are given or --fromFile is used, a tab-sep table is output.")
    parser.add_option("-i", "--indexFile", dest="indexFname", action="store", \
        help="Use specified index file, default is %default.", default="bigGuessDb.txt.gz")
    parser.add_option("", "--fromFile", dest="fromFile", action="store", \
        help="Read URLs to process from input file, can be /dev/stdin")
    (options, args) = parser.parse_args()

    if args==[] and not options.index and not options.fromFile:
        parser.print_help()
        exit(1)

    if options.debug:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)

    return args, options

def parseSizes(inFname, doSubset):
    """ given a chrom.sizes file, return a sorted list of (size, chrom) tuples.
    If doSubset is True, only the 20 largest chromosomes are returned, since
    small chroms carry less information and have fewer features.
    Returns None if the file contains no sequence lines. """
    sizes = list()
    logging.info("Reading %s", inFname)
    for line in open(inFname):
        if line.startswith("#"):
            continue
        chrom, size = line.rstrip("\n").split("\t")[:2]
        sizes.append( (int(size), chrom) )

    if len(sizes)==0:
        logging.error("bigBed/bigWig file is empty. Cannot guess assembly.")
        return None

    sizes.sort()

    if not doSubset:
        return sizes

    someSizes = sizes[-20:] # small chroms carry less information and have fewer features
    return someSizes

def writeSizes(allSizes, outFname):
    """ write all sizes to the index file, one line per assembly:
    db <tab> chrom1=size1,chrom2=size2,... """
    ofh = gzip.open(outFname, "wt") # "write" "text"
    for db, dbSizes in allSizes.items():
        sizeParts = ["%s=%d" % (chrom, size) for size,chrom in dbSizes]
        sizeStr = ",".join(sizeParts)
        ofh.write("%s\t%s\n" % (db, sizeStr))
    ofh.close()
    logging.info("Wrote %s", outFname)

def indexGenarkSizes(allSizes, inDir):
    """ walk the GenArk assembly hub directories under inDir/asmHubs/{GCA,GCF}
    and add one entry per assembly to the allSizes dict. Only the single largest
    chromosome is stored per GenArk assembly, to keep the index small. """
    for dbType in ["GCA", "GCF"]:
        baseDir = join(inDir, "asmHubs", dbType)
        for (dirpath, dirnames, fnames) in os.walk(baseDir):
            for fname in fnames:
                if not fname.endswith("chrom.sizes.txt"):
                    continue
                fullFname = join(dirpath, fname)
                # e.g. .../GCA/000/002/305/GCA_000002305.1/GCA_000002305.1.chrom.sizes.txt
                if not isfile(fullFname):
                    logging.error(fullFname+" does not exist.")
                    continue
                db = basename(fname).replace(".chrom.sizes.txt", "")
                sizes = parseSizes(fullFname, False)
                # bugfix: parseSizes returns None for empty files; .pop() used to crash here
                if not sizes:
                    continue
                size1 = sizes.pop() # largest chromosome, since sizes is sorted ascending
                allSizes[db] = [size1]
    return allSizes

def buildIndex(inDir, outFname):
    """ go over all direct subdirectories of inDir and find a chrom.sizes file,
    compact it to format db -> list of (chrom,size) and write to outFname """
    allSizes = dict()

    import json
    # this is not style guide conform, but makes sure that these packages don't lead to problems for users of this script
    from six.moves import urllib # works in python2 and 3
    apiData = json.load(urllib.request.urlopen("https://api.genome.ucsc.edu/list/ucscGenomes"))

    # sort assemblies by the UCSC orderKey so better-known genomes come first
    dbList = list()
    for db, dbData in apiData["ucscGenomes"].items():
        orderKey = dbData["orderKey"]
        dbList.append( (orderKey, db) )
    dbList.sort()

    for orderKey, db in dbList:
        subDir = join(inDir, db)
        chromFname = join(subDir, "chrom.sizes")
        if not isfile(chromFname):
            chromFname = join(subDir, db+".sizes")
        if not isfile(chromFname):
            print("not found "+chromFname)
            continue

        # keep all chroms for the most-used assemblies, a subset for the others
        doSubset = True
        if db.startswith("hg") or db.startswith("mm"):
            doSubset = False

        if os.path.getsize(chromFname) != 0:
            sizes = parseSizes(chromFname, doSubset)
            # bugfix: guard against files that contain only comment lines,
            # for which parseSizes returns None - writeSizes would crash on it
            if sizes:
                allSizes[db] = sizes

    allSizes = indexGenarkSizes(allSizes, inDir)

    writeSizes(allSizes, outFname)

def readSizeIndex(inFname):
    " read chrom sizes index and return as dict (chromName, size) -> list of dbs "
    sizeToDbs = defaultdict(list)
    for line in gzip.open(inFname, "rt"):
        db, sizeStr = line.rstrip("\n").split("\t")
        sizes = sizeStr.split(",")
        sizes = [x.split("=") for x in sizes]
        sizes = [(chrom, int(size)) for (chrom, size) in sizes]
        for chrom, size in sizes:
            # a given (chrom, size) pair can occur in several assemblies,
            # so a plain dict would lose entries - keep them all
            sizeToDbs[ (chrom, size) ].append(db)
    return sizeToDbs

def vcfSizes(inFname):
    " return list of (chrom, size) from VCF ##contig header lines "
    sizes = list()
    # .gz and .bgz files are gzip-compatible, everything else is read as plain text
    opener = gzip.open if inFname.endswith(".gz") or inFname.endswith(".bgz") else open
    with opener(inFname, "rt") as fh:
        for line in fh:
            if line.startswith("##contig="):
                # handles both ##contig=<ID=chr1,length=12345> and header lines
                # with extra comma-separated fields in any order
                m = re.search(r'ID=([^,>]+)', line)
                mLen = re.search(r'length=(\d+)', line)
                if m and mLen:
                    sizes.append((m.group(1), int(mLen.group(1))))
            elif not line.startswith("#"):
                break # stop at first data line
    return sizes

def isVcf(inFname):
    " return True if the filename looks like a VCF "
    lower = inFname.lower()
    return lower.endswith(".vcf") or lower.endswith(".vcf.gz") or lower.endswith(".vcf.bgz")

def bigSizes(inFname):
    """ return list of (chrom, size) from bigWig, bigBed, or VCF file.
    For big* files, requires bigWigInfo/bigBedInfo in PATH; exits with an
    error message if the tool is missing or fails. """
    if isVcf(inFname):
        return vcfSizes(inFname)

    sizes = list()
    cmdExec = "bigWigInfo"
    inExt = inFname.lower().split(".")[-1]
    # bugfix: was 'in ["bb", ".igbed"]' - inExt never contains a dot, so the
    # second entry could never match and .bigbed files went to bigWigInfo
    if inExt in ("bb", "bigbed", "igbed"):
        cmdExec = "bigBedInfo"

    cmd = [cmdExec, "-chroms", inFname]
    doParse = False
    try:
        if sys.version_info[0]==2:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        else:
            p = subprocess.Popen(cmd, encoding="latin1", stdout=subprocess.PIPE)
    except OSError:
        # bugfix: a missing binary raises OSError from Popen; the old code
        # never reached its error branch because it checked p.returncode
        # right after Popen(), where it is always None
        logging.error("Could not run '%s'. Are the tools bigBedInfo and bigWigInfo installed and in your PATH?" % (" ".join(cmd)))
        sys.exit(1)

    # the chrom list is printed between the "chromCount:" and "basesCovered:" lines
    for line in p.stdout:
        if line.startswith("chromCount: "):
            doParse = True
            continue
        if line.startswith("basesCovered: "):
            doParse = False
            continue
        if doParse:
            chrom, _idx, size = line.strip().split(" ")
            sizes.append((chrom, int(size)))

    # the exit status is only meaningful after the process has finished
    if p.wait() != 0:
        logging.error("Could not run '%s'. Are the tools bigBedInfo and bigWigInfo installed and in your PATH?" % (" ".join(cmd)))
        sys.exit(1)

    return sizes

def sortBySecondLen(e):
    " sort function that takes the second element of a tuple and returns its negative length -> put longest lists first "
    return -len(e[1])

def findBestDb(sizeIndex, fileSizes):
    """ given a list of file sizes, look up all (chrom, size) in chrom size index
    and report best DBs sorted by number of matches """
    dbChromMatch = defaultdict(list)
    for (chrom, size) in fileSizes:
        if (chrom, size) in sizeIndex:
            dbs = sizeIndex[(chrom, size)]
            for db in dbs:
                dbChromMatch[db].append(chrom)

    dbMatches = list(dbChromMatch.items())
    # dbMatches is now a list of db -> list of chromosomes
    dbMatches.sort(key=sortBySecondLen)
    return dbMatches

def printAllMatches(inFname, dbChromMatch):
    " print all matching dbs as a tsv "
    for db, chromList in dbChromMatch:
        print("\t".join([inFname, db, str(len(chromList)), ",".join(chromList)]))

# ----------- main --------------
def main():
    " entry point to script "
    args, options = parseArgs()

    if not options.indexFname:
        indexFname = "/hive/data/genomes/bigGuessDb.txt.gz"
        if not isfile(indexFname):
            indexFname = "bigGuessDb.txt.gz"
    else:
        indexFname = expanduser(options.indexFname)

    if options.index:
        buildIndex("/hive/data/genomes", indexFname)
        exit(0)

    if len(args)>0:
        inFnames = args
    else:
        inFnames = open(options.fromFile).read().splitlines()

    if options.best:
        if len(inFnames)>1:
            print("#fname\tbestDb")
    else:
        print("#fname\tdb\tmatchCount\tmatchList")

    for inFname in inFnames:
        fileSizes = bigSizes(inFname)
        if len(fileSizes) == 0:
            logging.debug("%s is empty. Cannot determine assembly.", inFname)
            hits = ( ("emptyFile", ["0"]), )
        else:
            sizeIndex = readSizeIndex(indexFname)
            hits = findBestDb(sizeIndex, fileSizes)

        if options.best:
            if len(hits)==0:
                bestDb = "notFound"
            else:
                #if (len(hits[0][1]) >= 2): # need more than a single match, as chrM often matches
                # - deactivated for now, as we store only a single size for GenArk genomes
                bestDb = hits[0][0]
            if len(inFnames)>1:
                print(inFname+"\t"+bestDb)
            else:
                print(bestDb)
        else:
            printAllMatches(inFname, hits)

# bugfix: bare main() ran on import; guard so the module can be imported safely
if __name__ == "__main__":
    main()