#!/usr/bin/env python2.7
import sys, os, re, argparse, subprocess, httplib


# Patterns are compiled once at import time instead of on every (recursive)
# call. Raw strings keep the regex escapes away from Python's own string
# escape processing; the pattern text itself is unchanged.

# Anchor whose link text does not end in "/": treated as a file entry.
# NOTE: "\S+[^/]" needs at least two characters of link text.
_FILE_LINK_RE = re.compile(r'.*a href.*\>(\S+[^/])\<\/a\>')
# Anchor whose link text ends in "/": treated as a subdirectory entry.
_DIR_LINK_RE = re.compile(r'\<a href.*\>(.*\/)\<\/a\>')
# Entries are only collected from lines after the one containing this text.
_PARENT_DIR_RE = re.compile(r'Parent Directory')


def parseHtml(fileset, basepath, conn, subdirpath=None):
    """Recursively collect file paths from HTML directory index pages.

    Issues ``GET basepath + subdirpath`` on ``conn`` and scans the response
    body line by line (presumably an Apache-style auto-generated index --
    confirm against the server). Everything up to and including the line
    containing "Parent Directory" is skipped, so links in the page header
    are not mistaken for entries. After that line:

    * link text not ending in "/" is added to ``fileset`` as
      ``subdirpath + name``;
    * link text ending in "/" is remembered as a subdirectory and parsed
      by a recursive call once the current page has been processed.

    Args:
        fileset: set that receives the discovered paths (mutated in place).
        basepath: URL path of the top-level directory, ending in "/".
        conn: connection object providing ``request(method, path)`` and
            ``getresponse()`` whose result has ``read()``
            (e.g. ``httplib.HTTPConnection``).
        subdirpath: path of the directory to list, relative to
            ``basepath``; ``None`` means the top-level directory.

    Returns:
        The same ``fileset`` object that was passed in.
    """
    if subdirpath is None:
        subdirpath = ""

    conn.request("GET", basepath + subdirpath)
    # The body must be read completely before the connection can be reused
    # for the next request.
    body = conn.getresponse().read()

    subdirs = set()
    pastParentLink = False
    for line in body.split("\n"):
        if _PARENT_DIR_RE.search(line):
            pastParentLink = True
            continue
        if not pastParentLink:
            continue

        # The two checks are deliberately independent (not if/elif): a line
        # holding several anchors may match both patterns.
        fileMatch = _FILE_LINK_RE.search(line)
        if fileMatch:
            fileset.add(subdirpath + fileMatch.group(1))
        dirMatch = _DIR_LINK_RE.search(line)
        if dirMatch:
            subdirs.add(dirMatch.group(1))

    for subdir in subdirs:
        parseHtml(fileset, basepath, conn, subdirpath + subdir)

    return fileset

def main():
    """Compare a local list of file names against a download server.

    Parses the command line (database, composite, path to a file list, and
    the optional ``--dev`` / ``--server`` switches), walks the directory
    index at ``/goldenPath/<database>/encodeDCC/<composite>/`` on
    ``<server>.soe.ucsc.edu`` with ``parseHtml``, and prints:

    * the names in the list that were not found on the server
      ("README.txt" is never reported), and
    * when ``--dev`` is given, the names found on the server that are
      absent from the list.

    With no command-line arguments the help text is printed instead.
    Progress and error messages go to stderr; the report goes to stdout.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        prog='encodeQaCheckRRFiles',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Compares files on the RR against a list of files',
        epilog=
"""Example:
encodeQaCheckHgdownloadFiles hg19 wgEncodeSydhTfbs files.list
encodeQaCheckHgdownloadFiles hg18 wgEncodeHudsonalphaChipSeq checkPushFilesList
encodeQaCheckHgdownloadFiles hg19 wgEncodeCshlLongRnaSeq checkPushFilesList -s hgdownload-sd
"""
    )
    parser.add_argument('database', help='The database, typically hg19 or mm9')
    parser.add_argument('composite', help='The composite name, wgEncodeCshlLongRnaSeq for instance')
    parser.add_argument('files', help='The list of files')
    parser.add_argument('-d', '--dev', action="store_true", default=False, help='Check for files missing on dev that are present on the RR')
    parser.add_argument('-s', '--server', default="hgdownload", help="The server to use, default: hgdownload.")

    if len(sys.argv) == 1:
        parser.print_help()
        return

    args = parser.parse_args(sys.argv[1:])

    # Example URL:
    # http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwDnase/
    server = "%s.soe.ucsc.edu" % args.server
    basepath = "/goldenPath/%s/encodeDCC/%s/" % (args.database, args.composite)

    # Report the server actually selected with -s rather than a fixed name.
    sys.stderr.write("Initiating connection to %s\n" % args.server)
    conn = httplib.HTTPConnection(server)
    try:
        # print(...) with one parenthesised expression behaves the same as
        # the print statement in Python 2, so no __future__ import is needed.
        print("%s%s" % (server, basepath))
        sys.stderr.write("Getting index of files\n")

        # Probe the top-level directory first so a wrong database/composite
        # is reported clearly instead of yielding an empty file set.
        conn.request("GET", basepath)
        response = conn.getresponse()
        body = response.read()
        if response.status == 404 or re.search('404 Not Found', body):
            sys.stderr.write("404 Not Found, did you specify the right composite or database?\n")
            return

        RRFiles = parseHtml(set(), basepath, conn)
    finally:
        # Release the socket on every exit path.
        conn.close()

    DevFiles = set()
    with open(args.files, "r") as listFile:
        for line in listFile:
            # Tolerate CRLF line endings and skip blank lines, which would
            # otherwise be reported as a missing file with an empty name.
            name = line.rstrip("\r\n")
            if name:
                DevFiles.add(name)

    missingFromRR = DevFiles - RRFiles
    missingFromDev = RRFiles - DevFiles

    missingFromRR.discard("README.txt")

    print("")
    if missingFromRR:
        print("Missing from %s (%s):" % (args.server, len(missingFromRR)))
        for name in sorted(missingFromRR):
            print(name)
    else:
        print("Nothing missing from %s" % args.server)

    print("")
    if args.dev:
        if missingFromDev:
            print("Missing from list (%s):" % len(missingFromDev))
            for name in sorted(missingFromDev):
                print(name)
        else:
            print("Nothing missing on list")

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if  __name__ == "__main__":
    main()
