#!/usr/bin/perl

#################################
###########  Purpose  ###########
#################################
# This program is designed to look in a download directory, 
# or the current working directory for wgEncode downloadable files.
# The result is 3 files: files.txt md5sum.txt and md5sum.history.
# It makes a list of files in files.txt, and looks up the associated metadata with the file.
# With -md5, the files are md5summed in parallel threads (2 at a time, 7 with -superfast).
# A history file is consulted so that unchanged files are not md5summed over and over again.
# Only files that pass a fileMask are allowed to go into files.txt and md5sum.txt;
# all files are md5summed and recorded in md5sum.history.

use warnings;
use strict;
use Cwd;
use Getopt::Long;
use Digest::MD5;
#multithread module to contain the md5 function, so we can run multiple at once
use threads;
use threads::shared;
use Time::HiRes qw(usleep nanosleep);
use POSIX;

# for debug only, you'll need to find the package on your own
# use DataBrowser qw(browse);

use lib "/cluster/bin/scripts";
use Encode;

use HgDb;


sub usage {
    # Prints the command-line help text to STDERR and terminates the program
    # with exit status 1.  Never returns to the caller.
    #
    # The option list mirrors the GetOptions() call in the INIT section; keep
    # the two in sync (and the default fileMask) when options change.
    print STDERR <<END;
usage: encodeMkFilesList <db> [options]

Creates files.txt and optionally md5sum.txt.
The md5summing function will md5sum the internal contents of any gzipped file, 
but stores the external md5sum in md5sum.txt and metaDb.

options:

    -md5            md5 sums the files in the directory
    -superfast      use 7 threads instead of 2
    -noupdate       don't update md5sums in metaDb
    -table          selects which metadata table to use
    -verbose        verbose output (levels 1 - 3)
    -quiet          quiet mode, silences all output
    -cv             custom cv directory (default is /data/apache/cgi-bin/encode)
    -directory      custom download directory (default is working directory)
    -history        md5sum history file to read (default is md5sum.history)
    -fileMask       only allows selected pattern matches (must be in quotes)
                E.g. -fileMask "wgEncode*" (default)
                Regex Example:  -fileMask "wgEncode\\S+Huvec.* wgEncode\\S+GM\\d+Input"
                Normal example: -fileMask "*.gz *GM12878* *GM12*Tnfa"

END
    exit 1;
}


sub processFilesList {
    # Lists the regular files in the current working directory by parsing the
    # output of `ls -lh`.
    #
    # Argument:
    #   $_[0]  hash ref whose keys are regex patterns (the file masks)
    #
    # Returns an array ref with four elements:
    #   [0] hash ref   file name => human readable size string as printed by ls
    #   [1] array ref  every regular file found
    #   [2] array ref  the files whose name matches at least one mask
    #   [3] scalar ref polling-interval code derived from the largest size unit
    #                  seen: "1" (no unit), "U" (K), "U2" (M) or 2 (G)
    #
    # The result is returned even when the directory holds no files, and the
    # interval code takes every file into account.
    my %fileMask = %{$_[0]};
    my %sizes;
    my @files;
    my @fileMask;
    my $ls = `ls -lh 2>/dev/null`;

    # parse through the listing to get the sizes and filenames
    foreach my $row (split "\n", $ls) {
        my @row = split /\s+/, $row;

        # skip lines without a name column (e.g. the "total" line)
        next unless $row[8];

        # only regular files: skip directories, links, etc.
        next unless $row =~ m/^\-/;

        push @files, $row[8];

        foreach my $mask (keys %fileMask) {
            if ($row[8] =~ m/$mask/) {
                push @fileMask, $row[8];
                last;
            }
        }
        $sizes{$row[8]} = $row[4];
    }

    # set the sleep time between thread status polling, based on the largest
    # order of magnitude (K, M, G) encountered; the code only ever moves upward
    my $sleepTime = "1";
    foreach my $name (keys %sizes) {
        if ($sizes{$name} =~ m/K/ && $sleepTime eq "1") {
            $sleepTime = "U";
        }
        if ($sizes{$name} =~ m/M/ && ($sleepTime eq "1" || $sleepTime eq "U")) {
            $sleepTime = "U2";
        }
        if ($sizes{$name} =~ m/G/) {
            $sleepTime = 2;
        }
    }

    # the return must happen after the loop: returning from inside it skipped
    # all but one file and produced no result at all for an empty directory
    return [\%sizes, \@files, \@fileMask, \$sleepTime];
}

sub sortTypeOfTermPriority {
    # Derives the output order of metadata terms from the cv "typeOfTerm"
    # section.
    #
    # Arguments:
    #   $_[0]  hash ref: term name => hash ref of attributes; the attributes
    #          'hidden' and 'priority' are the only ones looked at
    #   $_[1]  hash ref of program options ('quiet' and 'verbose' are read)
    #
    # Returns an array ref of term names ordered by numeric priority, ties
    # broken by string comparison of the name.  Terms whose 'hidden' value is
    # anything other than 'no', and terms without a priority, are left out.
    # Non-digit characters are stripped from a priority in place (the
    # attribute hashes are shared with the caller).
    my ($termsRef, $optRef) = @_;
    my %typeofterms = %{$termsRef};
    my %opt = %{$optRef};

    # each entry is a [label, priority] pair
    my @ranked;
    foreach my $term (sort keys %typeofterms) {
        # only keep terms that say hidden = no, or that have no hidden value
        next if exists $typeofterms{$term}{'hidden'}
             && $typeofterms{$term}{'hidden'} ne 'no';

        # a term without a priority cannot be placed
        next unless exists $typeofterms{$term}{'priority'};

        if ($typeofterms{$term}{'priority'} =~ m/\D/) {
            print STDERR "$term has non-numeric characters in the priority field\n"
                unless $opt{'quiet'};
            $typeofterms{$term}{'priority'} =~ s/\D//g;
        }

        # special cases: cellType is reported as cell, Antibody as antibody
        my $label = $term;
        $label = "cell" if $label =~ m/cellType/;
        $label = "antibody" if $label =~ m/Antibody/;

        push @ranked, [$label, $typeofterms{$term}{'priority'}];
    }

    # priority first, then alphanumerically on the label
    @ranked = sort { $a->[1] <=> $b->[1] || $a->[0] cmp $b->[0] } @ranked;

    if ($opt{'verbose'} >= 2) {
        print STDERR "\npriority list\n";
        print STDERR "$_->[0] = $_->[1]\n" for @ranked;
    }

    # flatten to the labels only for easy iteration by the caller
    return [ map { $_->[0] } @ranked ];
}

sub getMd5Files {
    # Works out which files still need to be md5summed, using the md5sum
    # history file to skip files whose modification time has not changed.
    #
    # Arguments:
    #   $_[0]  array ref of file names to consider
    #   $_[1]  hash ref of already known md5sums (file => sum); copied
    #   $_[2]  name of the history file; each line holds
    #          [filename] [md5sum or md5sum,gzmd5sum] [last modified]
    #   $_[3]  array ref of labels for the stat() fields (verbose output only)
    #   $_[4]  hash ref of program options ('verbose' is read)
    #
    # Returns an array ref:
    #   [0] array ref of files that have to be (re)summed
    #   [1] hash ref file => md5sum taken from the history
    #   [2] hash ref file => md5sum of the gunzipped content from the history
    #
    # Dies if the history file exists but cannot be opened.
    #
    # NOTE(review): a history entry whose timestamp no longer matches still
    # puts its old sum into [1]; callers that do not recompute the files in
    # [0] will see that old value.
    my @files = @{$_[0]};
    my %md5sums = %{$_[1]};
    my $md5hist = $_[2];
    my @statlist = @{$_[3]};
    my %opt = %{$_[4]};
    my %gzmd5sums;
    my %lastmod;
    my %pending;

    # every file starts out as pending with a last modification time of 0
    foreach my $file (@files) {
        $pending{$file} = 1;
        $lastmod{$file} = 0;
    }

    my @md5files;
    if (-e $md5hist) {
        # 3-arg open: the name is user supplied (-history) and must be taken
        # literally, without mode characters or whitespace trimming
        open(my $history, '<', $md5hist) or die "die: can't open history file\n";
        while (my $line = <$history>) {
            chomp $line;
            my ($file, $md5, $lastmod) = split " ", $line;

            # blank or truncated lines carry no usable information
            next unless defined $file && defined $md5;

            my $gzmd5;
            if ($md5 =~ m/(\S+),(\S+)/) {
                $md5 = $1;
                $gzmd5 = $2;
            }
            # make sure file exists, not deleted, etc....
            next unless -e $file;
            # and that it is one of the files we were asked about
            next unless exists $pending{$file};

            $md5sums{$file} = $md5;
            if ($gzmd5) {
                $gzmd5sums{$file} = $gzmd5;
            }
            # a missing or malformed timestamp counts as "never summed"
            $lastmod{$file} = (defined $lastmod && $lastmod =~ m/^\d+$/) ? $lastmod : 0;
        }
        close $history;

        # remove files that have the same file modification time as the history file says;
        # the assumption is that a file is identical if name and modification time are
        foreach my $file (keys %pending) {
            my @stat = stat("$file");
            if ($opt{'verbose'} >= 3) {
                my $count = 0;
                print STDERR "\n";
                print STDERR "$file statistics:\n";
                foreach my $stat (@stat) {
                    print STDERR "$statlist[$count] = $stat\n";
                    $count++;
                }
            }

            # $stat[9] is the last modify value of the stat array; it is
            # undefined when stat() fails, which is treated as 0 here
            my $filelastmod = defined $stat[9] ? $stat[9] : 0;
            if ($lastmod{$file} == $filelastmod) {
                delete $pending{$file};
            }
        }

        # whatever is still pending has to be processed
        @md5files = keys %pending;

    } else {
        # no history file means all files have to be processed
        @md5files = @files;
    }
    return [\@md5files, \%md5sums, \%gzmd5sums];
}

sub metaDbUpdate {
    # Brings the md5sum value of one metadata object in line with the sums
    # computed for its files.
    #
    # Arguments (positional):
    #   0  hash ref file => md5sum
    #   1  hash ref var => val of the object's metadata ('fileName' and
    #      'md5sum' are read; fileName may be a comma separated list)
    #   2  hash ref of program options ('quiet' and 'verbose' are read)
    #   3  database object providing insert() and execute()
    #   4  name of the file being processed
    #   5  table name derived from the file name (not used here)
    #   6  name of the metadata table
    #   7  name of the metadata object
    #
    # Returns 1 when a row was inserted or updated, 0 otherwise.
    my ($sumsRef, $stanzaRef, $optRef, $db, $filename, $tableName, $metadb, $objName) = @_;
    my %md5sums = %{$sumsRef};
    my %fileHash = %{$stanzaRef};
    my %opt = %{$optRef};

    my @fileList = $fileHash{'fileName'} =~ m/,/
                 ? split(",", $fileHash{'fileName'})
                 : ($fileHash{'fileName'});
    @fileList = sort @fileList;

    # the file we were called for has to be one of the object's files
    my $inlist = grep { $_ eq $filename } @fileList;
    unless ($inlist) {
        print STDERR "$filename is not in fileName field\n" unless $opt{'quiet'};
        return (0);
    }

    # comma separated sums, in file name order, of the files that have one
    my $md5Str = join ",", map { $md5sums{$_} } grep { $md5sums{$_} } @fileList;

    if (!exists($fileHash{'md5sum'})) {
        # no md5sum row yet: add one
        $db->insert($metadb, ['obj', 'var', 'val'], [$objName, 'md5sum', $md5Str]);
        print STDERR "Inserted md5sum for $fileHash{'fileName'}\n" if $opt{'verbose'} >= 1;
        return (1);
    }

    # the stored value is already right: nothing to do
    return (0) if $md5Str eq $fileHash{'md5sum'};

    $db->execute("update $metadb set val = ? where obj = ? and var = ?",
                 $md5Str, $objName, 'md5sum');
    print STDERR "Updated md5sum for $fileHash{'fileName'}\n" if $opt{'verbose'} >= 1;
    return (1);
}


############
### INIT ###
############

# login name, used to build the default sandbox table name metaDb_<user>
my $user = "$ENV{'USER'}";

# lower our scheduling priority as far as possible
my $nice = POSIX::nice(19);

unless ($nice) {
    print STDERR "Can't set nice to 19\n";
}
my %opt;
my $goodopts = GetOptions(\%opt,
                    "md5",
                    "fileMask=s",
                    "table=s",
                    "verbose=i",
                    "quiet",
                    "cv=s",
                    "directory=s",
                    "history=s",
                    "noupdate",
                    "superfast"
);

#make sure that GetOptions returns 1 and every option is good
unless ($goodopts) {
    usage();
}

#custom cv dir
my $configdir = "/data/apache/cgi-bin/encode";
if ($opt{'cv'}) {
    $configdir = $opt{'cv'};
}

if (scalar(@ARGV) < 1) {
    print STDERR "db is required\n";
    usage();
}

# metadata table to read and (unless disabled below) write
my $metadb = "metaDb_$user";
my $writedb = 1;
if ($opt{'table'}) {
    $metadb = $opt{'table'};
    
    #don't allow people to use metaDb directly
    if ($metadb eq "metaDb") {
        print STDERR "Direct modification of metaDb is not allowed, please use a sandbox table.". 
        " Writing to common metaDb table has been disabled.\n";
        $writedb = 0;
    }
}
if ($opt{'noupdate'}) {
    $writedb = 0;
}

# directory to scan: -directory if given, otherwise where we were started;
# a doubled leading slash and a trailing slash are removed
my $workdir = getcwd();
if ($opt{'directory'}) {
    $workdir = "$opt{'directory'}";
    $workdir =~ s/^\/\//\//;
    $workdir =~ s/\/$//;
}

unless ($opt{'verbose'}) {
    $opt{'verbose'} = 1;
}

# go quiet when we are not the foreground process group of the controlling
# terminal; without a controlling terminal the settings are left alone
if (open(my $tty, '<', '/dev/tty')) {
    my $tpgrp = tcgetpgrp(fileno($tty));
    my $pgrp = getpgrp();
    close $tty;
    if ($tpgrp != $pgrp) {
        $opt{'verbose'} = 0;
        $opt{'quiet'} = 1;
    }
}
if ($opt{'quiet'}) {
    $opt{'verbose'} = 0;
}
if ($opt{'verbose'} >= 1) {
    print STDERR "using dir: $workdir\n";
}

# the composite name is the wgEncode... part of the directory
my $composite = "";
if ($workdir =~ m/wgEncode/) {
    # only trust $1 when this very match succeeded
    if ($workdir =~ m/(wgEncode\S+)/) {
        $composite = $1;
    }
} else {
    print STDERR "Directory is not an encode directory: $workdir. Must match wgEncode*\n";
    usage();
}

my $err = chdir($workdir);
unless ($err) {
    die "die: can't change to directory $workdir\n";
}

# default file Masking, if updated, please update usage statement
# %fileMask holds one regex per whitespace separated mask given by the user
my %fileMask;
my $fileMask = "wgEncode*";
if ($opt{'fileMask'}) {
    $fileMask = $opt{'fileMask'};
}
my @mask = split /\s+/, $fileMask;

if ($opt{'verbose'} >= 1) {
    print STDERR "fileMasks (regex's):";
}
foreach my $mask (@mask) {
    # convert $mask to regex if user used * as wildcard so that it can be inserted into 
    # regex properly: a leading or trailing * is simply dropped (the match is
    # unanchored anyway)
    $mask =~ s/^\*//;
    $mask =~ s/\*$//;
    # every remaining * becomes .* unless it already is regex syntax, i.e.
    # it follows . | ] or a backslash, or it quantifies an escape such as \d.
    # Look-behinds keep the characters in front of the * intact (GM12*Tnfa
    # must become GM12.*Tnfa) and /g converts every wildcard in the mask.
    $mask =~ s/(?<![.|\]\\])(?<!\\.)\*/.*/g;
    $fileMask{$mask} = 1;
    if ($opt{'verbose'} >= 1) {
        print STDERR " $mask";
    }
}
if ($opt{'verbose'} >= 1) {
    print STDERR "\n";
}

############
### MAIN ###
############

# database (assembly) name: the one mandatory command line argument
my $assm = $ARGV[0];

# connect, then make sure the metadata table is present in that database
my $db = HgDb->new(DB => $assm) or die "die: Can't connect to DB: $assm\n";
my $sth = $db->execute("show tables") or die "die: Can't show tables\n";
my %tables;
while (my ($name) = $sth->fetchrow_array) {
    $tables{$name} = 1;
}
die "die: The metaDb table: $metadb does not exist.\n" unless exists $tables{$metadb};

# file => md5sum, md5sum => first file seen with it, file => md5sum of the
# gunzipped content
my %md5sums;
my %seenmd5s;
my %gzmd5sums;

# structure {fileInfo}->{filename}->{key}
my %fileInfo;

# structure returned from Encode.pm is {cv}->{type}->{term}->{key};
# if the cv structure changes drastically, please change Encode.pm as well
my %cv = Encode::getControlledVocab($configdir);

# file names and human readable sizes, parsed from the output of ls
my @pflReturn = @{ processFilesList(\%fileMask) };
my ($sizesRef, $filesRef, $maskedRef, $sleepTime) = @pflReturn;
my %sizes = %{$sizesRef};
my @files = @{$filesRef};
my @fileMask = @{$maskedRef};

# order in which metadata terms are printed: cv priority, then alphanumeric
my %typeofterms = %{$cv{'typeOfTerm'}};
my @sortorder = @{ sortTypeOfTermPriority(\%typeofterms, \%opt) };

# labels for the fields of stat(), used for clean verbose output
my @statlist = qw(dev inode file linknum uid grpid devid 
                    file lastaccess lastmodify inochng blksize blkamt);

# thread status, shared between threads; has to be declared before the subs
# at the end of the file that use it
my %threadStatus : shared;

# md5sum history from earlier runs: [filename] [md5sum] [last modified]
my $md5hist = $opt{'history'} ? $opt{'history'} : "md5sum.history";

# the history supplies the sums already known and tells which files changed
my @md5return = @{ getMd5Files(\@files, \%md5sums, $md5hist, \@statlist, \%opt) };
%md5sums = %{$md5return[1]};

#md5 is threaded, files done in parallel
if ($opt{'md5'}) {
    my @md5files = @{$md5return[0]};
    %gzmd5sums = %{$md5return[2]};

    # the connection is dropped while the threads run and reopened afterwards
    # (presumably so the handle is not held across thread creation - confirm)
    $db->disconnect;
    if (@md5files) {
        # doMd5Threads sits at the bottom of the file, because threadStatus
        # must be encountered by the interpreter before it is seen in a sub
        my @return = @{ doMd5Threads(\%md5sums, \@md5files, \%opt, \%gzmd5sums) };
        %md5sums = %{$return[0]};
        %gzmd5sums = %{$return[1]};
    }
    $db = HgDb->new(DB => $assm) or die "die: Can't connect to DB: $assm\n";

    # md5sum.txt lists only the files that passed the file mask
    open(my $sumOut, '>', 'md5sum.txt') or die "die: can't open md5sum.txt file to write\n";
    if ($opt{'verbose'} >= 1) {
        print STDERR "\n";
        print STDERR "writing md5sum.txt\n";
    }
    foreach my $file (sort @fileMask) {
        print $sumOut "$file $md5sums{$file}\n";
    }
    # buffered write errors only surface at close
    close($sumOut) or die "die: can't finish writing md5sum.txt: $!\n";

    # the history records every summed file together with its current
    # modification time.
    # NOTE(review): this always writes md5sum.history, even when -history
    # named a different file to read from - confirm that is intended.
    open(my $histOut, '>', 'md5sum.history') or die "die: can't open md5sum.history to write\n";
    if ($opt{'verbose'} >= 1) {
        print STDERR "\n";
        print STDERR "writing md5sum.history\n";
    }
    foreach my $file (sort keys %md5sums) {
        my @stat = stat("$file");
        my $sum = $md5sums{$file};
        if (exists $gzmd5sums{$file}) {
            $sum = "$sum,$gzmd5sums{$file}";
        }
        print $histOut "$file $sum $stat[9]\n";
    }
    close($histOut) or die "die: can't finish writing md5sum.history: $!\n";
}

if ($opt{'verbose'} >= 2) {
    print STDERR "\n";
    print STDERR "files to be processed:\n";
    foreach my $name (@fileMask) {
        print STDERR "\t$name\n";
    }
}

# becomes 1 as soon as any md5sum was inserted into or changed in the table
my $updated = 0;
my %seentables;
# go through each file that passed the mask, look up its metadata and store
# it (plus type and md5sum) in %fileInfo
foreach my $file (@fileMask) {
    my @fileparts = split /\./, $file;
    my $tableName = $fileparts[0];
    my $test = $fileparts[$#fileparts];
    my $type;

    # get the type: for gzipped files it is the extension before the gz
    if ($test =~ m/gz/) {
        $type = $fileparts[($#fileparts - 1)];
    }
    elsif ($test eq 'bai') {
        $type = "bai";
    }
    elsif ($test eq 'bam') {
        $type = 'bam';
    }
    elsif (scalar (@fileparts) == 2) {
        $type = $test;
    } else {
        # reported at every verbose level, like the other level 1 messages
        if ($opt{'verbose'} >= 1) {
            print STDERR "something weird parsing name:\n";
            print STDERR "file = $file\n";
        }
        $type = "unknown";
    }

    #hash to store info about file MDB + extraneous, except size;
    my %fileHash;

    #only process if the first part of the file matches wgEncode
    if ($file =~ m/^wgEncode/) {

        # there are two different methods to get the stanza, one makes the assumption that the base
        # filename is the table name, that may not be true, so if it fails to grab a stanza
        # a second method that uses the full filename to look up the stanza is used.
        # File names are passed as bind values so that quotes or other special
        # characters in a name cannot break or alter the statement.
        my $objName = 0;
        my $query = $db->execute("select obj,var,val from $metadb where obj = ?", $tableName);

        while (my $ref = $query->fetchrow_arrayref) {
            my ($obj, $var, $val) = @{$ref};
            $objName = $obj;
            $fileHash{$var} = $val;
        }
        unless ($objName) {
            my $statement = "select a.obj,a.var,a.val from $metadb a , $metadb b where a.obj=b.obj "
                            . "and b.var='fileName' and b.val like ?";
            my $retry = $db->execute($statement, "%$file%");
            while (my $ref = $retry->fetchrow_arrayref) {
                my ($obj, $var, $val) = @{$ref};
                $objName = $obj;
                last unless $objName;
                $fileHash{$var} = $val;
            }
        }

        if ($opt{'verbose'} >= 3) {
            print STDERR "mdb info for table $tableName:\n";
            foreach my $mdbobj (keys %fileHash) {
                print STDERR "$mdbobj = $fileHash{$mdbobj}\n";
            }
            print STDERR "\n";
        }
        unless ($opt{'quiet'}) {
            if (!$objName) {
                print STDERR "$file is an orphaned file, there is no matching metadata\n";
            } elsif (!(exists $fileHash{'fileName'})) {
                print STDERR "There was metadata found for $file, but no fileName field\n";
            }
        }
        if (exists $md5sums{$file}) {
            # only touch the table when writing is allowed and the object
            # lists its files
            if ($writedb == 1 && exists($fileHash{'fileName'})) {
                my $localup = metaDbUpdate(\%md5sums, \%fileHash, \%opt, $db, $file,
                                        $tableName, $metadb, $objName);
                $updated = 1 if $localup;
            }
            # files.txt always reports the sum of this very file
            $fileHash{'md5sum'} = $md5sums{$file};
        }
    }
    $fileHash{'type'} = $type;

    #put hash into fileInfo container
    $fileInfo{$file} = \%fileHash;
}


# sum used to detect duplicate content: the sum of the gunzipped content when
# there is one, the plain md5sum otherwise
my %comparemd5;
foreach my $file (keys %md5sums) {
    $comparemd5{$file} = exists $gzmd5sums{$file} ? $gzmd5sums{$file} : $md5sums{$file};
}

# report every file whose sum was already seen on an earlier file (in sorted
# order); files whose objStatus matches the pattern below are left out
for my $file (sort keys %comparemd5) {
    next if exists $fileInfo{$file}
         && exists $fileInfo{$file}->{'objStatus'}
         && $fileInfo{$file}->{'objStatus'} =~ m/re.*/;

    my $sum = $comparemd5{$file};
    if (exists $seenmd5s{$sum}) {
        print STDERR "$file and $seenmd5s{$sum} have the same md5sum: $sum\n"
            unless $opt{'quiet'};
    } else {
        $seenmd5s{$sum} = $file;
    }
}

# if the metaDb was updated, refresh the composite's .ra file in the kent
# source tree; this is only attempted for human (hg) and mouse (mm) assemblies
if ($updated) {

    #determine species
    my $species = "";
    if ($assm =~ m/hg/) {
        $species = "human";
    } elsif ($assm =~ m/mm/) {
        $species = "mouse";
    }

    if ($species) {

        #assume kent src, if not found, warn user to manually update
        my $kentsrc = "$ENV{'HOME'}/kent/src/hg/makeDb/trackDb/$species/$assm/metaDb/alpha";
        my $rafile = "$kentsrc/$composite.ra";
        if (-e $rafile) {
            # capture the output first and only replace the file when
            # mdbPrint succeeded and printed something; redirecting straight
            # into the file would empty it before mdbPrint even ran
            my $stanzas = `mdbPrint $assm -composite=$composite 2>/dev/null`;
            my $written = 0;
            if ($? == 0 && defined $stanzas && length $stanzas) {
                if (open(my $ra, '>', $rafile)) {
                    print $ra $stanzas;
                    $written = close($ra) ? 1 : 0;
                }
            }
            unless ($opt{'quiet'}) {
                print STDERR "\n\n";
                if ($written) {
                    print STDERR "Update .ra file, please commit and push out md5sum changes\n";
                } else {
                    print STDERR "Couldn't regenerate $rafile with mdbPrint, please manually".
                             " update the .ra file and push out md5sum changes\n";
                }
            }
        } else {
            unless ($opt{'quiet'}) {
                print STDERR "\n\n";
                print STDERR "Couldn't find composite .ra file, please manually update the .ra file".
                         " and push out md5sum changes\n";
            }
        }
    } else {
        unless ($opt{'quiet'}) {
            #dunno if this will ever be hit, but just in case
            print STDERR "Could not determine species, please manually update metaDb\n";
        }
    }
}

# files.txt: one line per file that passed the mask, holding the file name, a
# tab, the metadata as "key=value; " pairs in cv priority order and the size
open(my $filesOut, '>', 'files.txt') or die "die: can't open files.txt to write\n";
if ($opt{'verbose'} >= 1) {
    print STDERR "writing files.txt\n";
}
#alphanumeric sort
foreach my $file (sort keys %fileInfo) {
    my $info = $fileInfo{$file};
    print $filesOut "$file\t";
    #print each element according to sort order
    foreach my $key (@sortorder) {
        #don't repeat fileName in metadata strings
        next if $key eq "fileName";
        if (exists $info->{$key}) {
            print $filesOut "$key=$info->{$key}; ";
        }
    }
    print $filesOut "size=$sizes{$file}";
    print $filesOut "\n";
}
# buffered write errors only surface at close
close($filesOut) or die "die: can't finish writing files.txt: $!\n";

# These subs have to go at the end of the file because the shared variable must be declared
# before the interpreter sees the code that uses it.
sub doMd5Threads {
    # Md5sums a list of files with a small pool of worker threads.
    #
    # Arguments:
    #   $_[0]  hash ref file => md5sum known so far (copied, then extended)
    #   $_[1]  array ref of files to sum; processed in sorted order
    #   $_[2]  hash ref of program options ('superfast' and 'verbose' are read)
    #   $_[3]  hash ref file => md5sum of gunzipped content (copied, extended)
    #
    # Returns an array ref: [0] the extended md5sum hash ref, [1] the extended
    # gz md5sum hash ref.
    #
    # Each worker runs md5sum() in a numbered slot and announces completion by
    # setting $threadStatus{slot}; this sub polls that hash, joins finished
    # workers and reuses their slot for the next file.
    my %md5sums = %{$_[0]};
    my @md5files = sort @{$_[1]};
    my %opt = %{$_[2]};
    my %gzmd5sums = %{$_[3]};
    my %threadContainer;    # slot number => thread object
    my %slotFile;           # slot number => file that thread is working on

    # only 2 threads allowed, filesystem limitation, ~ 125 MB/s from disk only
    my $threads = $opt{'superfast'} ? 7 : 2;
    # in case there are fewer files than threads
    if (scalar @md5files < $threads) {
        $threads = scalar @md5files;
    }

    # starts a worker for the next file in the given slot
    my $launch = sub {
        my ($slot) = @_;
        my $file = shift @md5files;
        $slotFile{$slot} = $file;
        # create the thread (reference to subroutine, file, slot number)
        $threadContainer{$slot} = threads->create(\&md5sum, $file, $slot);

        # unique thread id for verbosity purposes
        my $tid = $threadContainer{$slot}->tid();
        if ($opt{'verbose'} >= 1) {
            print STDERR "Thread $tid started on $file\n";
        }
    };

    # joins the worker of the given slot, stores its result and frees the slot
    my $harvest = sub {
        my ($slot) = @_;
        my $sum = $threadContainer{$slot}->join();
        # the file name comes from our own bookkeeping, so it is known even
        # if the worker never got to set its status
        my $file = delete $slotFile{$slot};
        delete $threadContainer{$slot};
        delete $threadStatus{$slot};

        # join() yields nothing when the worker died
        unless (defined $sum) {
            unless ($opt{'quiet'}) {
                print STDERR "md5sum thread for $file returned no result\n";
            }
            return;
        }

        # a worker returns "sum" or, for gzipped files, "sum,gzsum"
        my $gzsum;
        if ($sum =~ m/(\S+),(\S+)/) {
            $sum = $1;
            $gzsum = $2;
        }
        $md5sums{$file} = $sum;
        if ($gzsum) {
            $gzmd5sums{$file} = $gzsum;
        }
    };

    # fill every slot once
    for my $slot (0 .. $threads - 1) {
        $launch->($slot);
    }

    # while files are waiting: hand each finished slot the next file
    while (@md5files) {
        foreach my $slot (keys %threadContainer) {
            # no status entry yet means the thread is still running
            next unless exists($threadStatus{$slot});

            $harvest->($slot);
            if (@md5files) {
                $launch->($slot);
            }
        }
        # short pause between polls
        usleep(10000);
    }

    # no files left to hand out: wait for the remaining threads
    foreach my $slot (keys %threadContainer) {
        $harvest->($slot);
    }

    return [\%md5sums, \%gzmd5sums];
}

sub md5sum {
    # Computes the md5sum of one file; meant to run in its own thread.
    #
    # Arguments:
    #   $_[0]  name of the file
    #   $_[1]  slot number of the calling thread
    #
    # Returns the md5sum of the file as a hex string.  For names ending in
    # .gz the md5sum of the gunzipped content is appended after a comma
    # ("sum,gzsum").  Before returning, $threadStatus{slot} is set to the file
    # name to signal doneness, so the main thread can avoid blocking in join.
    #
    # The sums come from the external md5sum and zcat programs.  The file is
    # handed to them on standard input with a shell-quoted name, so names with
    # spaces or shell metacharacters are read correctly and cannot be taken
    # for options or commands.
    my ($file, $id) = @_;

    # single-quote for the shell; an embedded ' becomes '\''
    (my $quoted = $file) =~ s/'/'\\''/g;
    $quoted = "'$quoted'";

    my $gzsum;
    if ($file =~ m/\.gz$/) {
        $gzsum = `zcat < $quoted | md5sum`;
        chomp $gzsum;
        # keep the hex digits only (drops the "  -" file name column)
        $gzsum =~ s/[^0-9a-f]//g;
    }
    my $sum = `md5sum < $quoted`;
    chomp $sum;
    $sum =~ s/[^0-9a-f]//g;

    #open (FILE, $file) or die "die: Can't open '$file': $!\n";
    #binmode(FILE);
    #my $sum = Digest::MD5->new->addfile(*FILE)->hexdigest;

    # lock shared threadstatus variable for thread-safeness
    lock(%threadStatus);
    $threadStatus{$id} = $file;
    if ($gzsum) {
        return ("$sum,$gzsum");
    }
    return ($sum);
}
