#!/usr/bin/perl
#
# gbAlignFinish [options] database ...
#
# Finish up the BLAT alignments: sort, combine, and install the PSL
# and associated files.
#
# Options:
#   -workdir=work/align - Must be the same as specified for gbAlignSetup.
#   -verbose - print details.
#   -keep - keep tmp files for debugging.
#   -noMigrate - Don't migrate alignments.
#
# Arguments:
#   - database - Database to process.
#
# Obtains other parameters from etc/genbank.conf
#
# $Id: gbAlignFinish,v 1.15 2006/03/26 19:18:16 markd Exp $
#
use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use gbCommon;
#use Data::Dumper qw(Dumper);  # useful for debugging

# globals from command line
# -keep: when true, the combined output files and the tmp directory are
# left in place instead of being removed
my $gKeep = 0;
# -workdir: directory containing partitions.lst, align.expected, align.done
# and the per-update work directories
my $gWorkDir = "work/align";
# -noMigrate: when true, -noMigrate is passed through to gbAlignInstall
my $noMigrate = 0;

# Read the partition list from $gWorkDir/partitions.lst.
#
# Each line of the file is tab-separated columns of:
#   srcDb release update type accPrefix database orgCat
#
# Returns a list of hash references, one per line in file order, with the
# keys srcDb (lower-cased), release, update, type, accPrefix, database,
# orgCat, typeAccPrefix (type, with "." and accPrefix appended when accPrefix
# is not empty) and typeAccPrefixOrgCat (typeAccPrefix, ".", orgCat).
# Dies if the file can't be opened or closed.
sub readPartitionList() {
    my @partitionList;
    my $partFile = "$gWorkDir/partitions.lst";
    # three-argument open with a lexical handle, so an unusual work directory
    # name can never be interpreted as an open mode
    open(my $partFh, "<", $partFile) || die("can't open $partFile: $!");
    while (my $line = <$partFh>) {
        # chomp rather than chop: only a trailing newline is removed, so a
        # final line without one doesn't lose the last character of orgCat
        chomp($line);
        my @row = split("\t", $line);
        my $typeAccPrefix = $row[3] . (($row[4] eq "") ? "" : ("." . $row[4]));
        my $part = {
            srcDb => lc($row[0]),
            release => $row[1],
            update => $row[2],
            type => $row[3],
            accPrefix => $row[4],
            database => $row[5],
            orgCat => $row[6],
            typeAccPrefix => $typeAccPrefix,
            typeAccPrefixOrgCat => $typeAccPrefix . "." . $row[6]
        };
        push(@partitionList, $part);
    }
    close($partFh) || die("close of $partFile failed: $!");
    return @partitionList;
}

# Debugging aid: write a one-line, space-separated summary of a partition
# record (srcDb, release, update, typeAccPrefix, database, orgCat) to STDOUT.
sub printPartition($) {
    my ($part) = @_;
    my @fields = map { $part->{$_} }
        qw(srcDb release update typeAccPrefix database orgCat);
    print join(" ", @fields), "\n";
}

# Load the partition list, which comes from the partitions.lst file created by
# gbAlignGet, and return it in processing order.  The list is a mechanism to
# know which partitions actually have data aligned or migrated.  The fact
# that some partitions might only be migrated but have no new alignments
# makes this necessary in addition to other mechanisms.
# The format of each line is tab-separated columns of:
#   srcDb release update type accPrefix database orgCat
sub loadPartitionList() {
    # Order two update names: identical names tie, "full" is placed before
    # any other update, and the remaining updates are in ascending string
    # order.
    my $cmpUpdate = sub {
        my ($updA, $updB) = @_;
        my $order = $updA cmp $updB;
        return 0 if ($order == 0);
        return -1 if ($updA eq "full");
        return 1 if ($updB eq "full");
        return $order;
    };

    my @unsorted = readPartitionList();

    # sort by database, srcDb, release, update (full first), type, accPrefix
    # and finally orgCat; the first key that differs decides
    my @ordered = sort {
        $a->{database} cmp $b->{database}
            || $a->{srcDb} cmp $b->{srcDb}
            || $a->{release} cmp $b->{release}
            || $cmpUpdate->($a->{update}, $b->{update})
            || $a->{type} cmp $b->{type}
            || $a->{accPrefix} cmp $b->{accPrefix}
            || $a->{orgCat} cmp $b->{orgCat}
    } @unsorted;
    return @ordered;
}

# Build the relative path of the aligned-data directory for a partition:
# data/aligned/<release>/<database>/<update>.
sub getAlignDir($) {
    my ($part) = @_;
    return join("/", "data/aligned", $part->{release}, $part->{database},
                $part->{update});
}

# Look up the pslCDnaFilter arguments for a partition in the config file,
# using the key <srcDb>.<type>.<orgCat>.pslCDnaFilter for the partition's
# database.  A configured value of "no" means PSL filtering is skipped and
# undef is returned.  When hapRegions is configured for the database, a
# -hapRegions option is appended to the arguments.
sub getPslCDnaFilterArgs($) {
    my ($part) = @_;
    my $db = $part->{database};
    my $confKey = join(".", $part->{srcDb}, $part->{type}, $part->{orgCat},
                       "pslCDnaFilter");

    my $filterArgs = getDbConf($db, $confKey);
    return undef if ($filterArgs eq "no");

    my $hapRegions = getDbConfUndef($db, "hapRegions");
    return defined($hapRegions)
        ? "$filterArgs -hapRegions=$hapRegions"
        : $filterArgs;
}

# Combine the PSLs from one set (the results from one query file), appending
# them to the output file, and record the offsets of the start and end of the
# PSLs that were added.
#
# This is done as a pipeline, to avoid intermediate disk I/O if possible.
# The PSLs must be sorted by query for pslCDnaFilter; sort is used rather
# than pslSort, since pslSort can't read from a pipe (and sort is much
# faster).  The gbBlat naming convention, where the query file is the first
# level of directory, is used to group the PSLs into small batches for
# pslCDnaFilter processing.  The output of pslCDnaFilter is needed later by
# selectWithPsl; instead of writing the data twice, only the offsets into the
# output PSL file are saved.  selectWithPsl uses these to load just that
# subset of the PSLs.
sub combinePslSet($$$$$$) {
    my($queryDir, $pslCDnaFilterArgs, $tmpDir, $outPsl, $outRawPsl, $pslQueryOffsets) = @_;

    # size of the output before anything from this set is appended
    my $startOff = 0;
    if (-e $outPsl) {
        $startOff = getFileSize($outPsl);
    }

    # stages of the pipeline, joined with pipes below; sort by query for
    # pslCDnaFilter and drop duplicate lines
    my @stages = ("find $queryDir -name '*.psl' ",
                  "xargs cat ",
                  "sort -T $tmpDir -k 10,10 -k 12n,12n ",
                  "uniq ");
    if (defined($outRawPsl)) {
        # also keep a copy of the unfiltered alignments
        push(@stages, "tee -a $outRawPsl ");
    }
    if (defined($pslCDnaFilterArgs)) {
        # ugly hack, generate poly-A file name from queryDir by dropping the
        # first /psl/ path component and adding .polya
        (my $polyA = "$queryDir.polya") =~ s{/psl/}{/};
        $pslCDnaFilterArgs =~ s/-polyASizes/-polyASizes=$polyA/;
        push(@stages, "pslCDnaFilter $pslCDnaFilterArgs stdin stdout ");
    }
    runPipe(join("| ", @stages) . " >> $outPsl");

    my $endOff = 0;
    if (-e $outPsl) {
        $endOff = getFileSize($outPsl);
    }

    # always save offsets, even if empty
    $pslQueryOffsets->{$queryDir} = [$startOff, $endOff];
}

# Combine the PSLs of all query directories of a partition into
# $updWorkDir/<typeAccPrefixOrgCat>.psl, removing duplicates and running
# pslCDnaFilter when the configuration supplies arguments for it.  For mRNAs
# that are being filtered, the unfiltered alignments are also saved in a
# .rawPsl file.  The start and end offsets of each query directory's PSLs in
# the combined file are stored in the hash referenced by $pslQueryOffsets.
#
# Returns the pair (outPsl, outRawPsl); an element is undef when that file
# was not produced or is empty.
sub combinePsls($$$$@) {
    my($part, $updWorkDir, $tmpDir, $pslQueryOffsets, @queryDirs) = @_;

    my $pslCDnaFilterArgs = getPslCDnaFilterArgs($part);
    my $pslPrefix = "$updWorkDir/$part->{typeAccPrefixOrgCat}";

    # tmp file for combined PSLs; the pipelines append, so remove any file
    # left behind by an earlier run
    my $outPsl = "$pslPrefix.psl";
    if (-e $outPsl) {
        unlink($outPsl) || die("unlink $outPsl: $!");
    }

    # for mRNAs, we save the alignments before filtering for experimental
    # work.  This is not done for ESTs, as there are so many.  It is also not
    # done if we are not filtering, as this would be redundant.
    my $outRawPsl;
    if (($part->{type} eq "mrna") && defined($pslCDnaFilterArgs)) {
        $outRawPsl = "$pslPrefix.rawPsl";
        if (-e $outRawPsl) {
            unlink($outRawPsl) || die("unlink $outRawPsl: $!");
        }
    }

    # Process each query dir in a pipeline with error checking, use find to
    # avoid possible command line overflow.  Sort by qName,qStart,qEnd for
    # pslCDnaFilter.
    foreach my $queryDir (@queryDirs) {
        combinePslSet($queryDir, $pslCDnaFilterArgs, $tmpDir, $outPsl, $outRawPsl, $pslQueryOffsets);
    }

    # Return pair of output files, or undef if empty.  Each file is judged by
    # its own size: the filter may discard every alignment, leaving an empty
    # PSL file while the raw file still has data worth saving.  The files are
    # only created by the pipelines, so check that they exist before asking
    # for their size.
    my $outPslSize = (-e $outPsl) ? getFileSize($outPsl) : 0;
    my $outRawPslSize = (defined($outRawPsl) && (-e $outRawPsl))
        ? getFileSize($outRawPsl) : 0;
    return (($outPslSize > 0) ? $outPsl : undef,
            ($outRawPslSize > 0) ? $outRawPsl : undef);
}

# Get the start and end offsets recorded for a query directory in the hash
# referenced by $pslQueryOffsets.  Returns the pair (startOff, endOff).  A
# missing entry is reported with gbError as a bug.
sub getPslQueryOffsets($$) {
    my($queryDir, $pslQueryOffsets) = @_;
    if (!defined(${$pslQueryOffsets}{$queryDir})) {
        gbError("BUG: no PSL offsets found for queryDir $queryDir");
    }
    return (${$pslQueryOffsets}{$queryDir}[0],
            ${$pslQueryOffsets}{$queryDir}[1]);
}

# Combine one set of orientation info (.oi) files, those of one query
# directory.  When PSLs were saved for the directory (its offset range is not
# empty), the .oi records are passed through selectWithPsl with that offset
# range of $outPsl and appended to $outOI; if $outRawOI is defined, all of the
# records are appended to it as well.  When no PSLs were saved, only the raw
# file is appended to, and nothing at all is done if there is no raw file
# either.  No meaningful value is returned.
sub combineOISet($$$$$) {
    my($queryDir, $outPsl, $pslQueryOffsets, $outOI, $outRawOI) = @_;

    my($startOff, $endOff) = getPslQueryOffsets($queryDir, $pslQueryOffsets);
    if (($startOff == $endOff) && !defined($outRawOI)) {
        return; # nothing to do
    }
    my $cmd = "find $queryDir -name '*.oi' | xargs cat ";
    if ($startOff < $endOff) {
        # at least create selected OI
        if (defined($outRawOI)) {
            $cmd .= "| tee -a $outRawOI ";
        }
        $cmd .= "| selectWithPsl -fmt=oi -startOff=$startOff -endOff=$endOff $outPsl stdin stdout"
            . " >> $outOI";
    } else {
        # just create raw OI
        $cmd .= " >> $outRawOI";
    }
    runPipe($cmd);
}

# Combine orientation info.  Only called if there are PSLs, and hence there
# must be orientInfo files.  This processing parallels combinePsls.
#
# Returns the pair (outOI, outRawOI); an element is undef when that file was
# not produced or is empty.  The raw file is only produced for mRNAs.
sub combineOIs($$$$$@) {
    my($part, $updWorkDir, $tmpDir, $outPsl, $pslQueryOffsets, @queryDirs) = @_;

    # Process each query dir in a pipeline, selecting the ones kept
    # by pslCDnaFilter.
    my $outOI = "$updWorkDir/$part->{typeAccPrefixOrgCat}.oi";
    if (-e $outOI) {
        unlink($outOI) || die("unlink $outOI: $!");
    }
    # for mRNAs, also save all alignments
    my $outRawOI;
    if ($part->{type} eq "mrna") {
        $outRawOI = "$updWorkDir/$part->{typeAccPrefixOrgCat}.rawOi";
        if (-e $outRawOI) {
            unlink($outRawOI) || die("unlink $outRawOI: $!");
        }
    }
    foreach my $queryDir (@queryDirs) {
        combineOISet($queryDir, $outPsl, $pslQueryOffsets, $outOI, $outRawOI);
    }

    # Return pair of output files, or undef if empty.  Each file is judged by
    # its own size, as the raw file can have data when nothing was selected.
    # $outOI is never created when no query directory had PSLs saved, so
    # check that the files exist before asking for their size.
    my $outOISize = (-e $outOI) ? getFileSize($outOI) : 0;
    my $outRawOISize = (defined($outRawOI) && (-e $outRawOI))
        ? getFileSize($outRawOI) : 0;
    return (($outOISize > 0) ? $outOI : undef,
            ($outRawOISize > 0) ? $outRawOI : undef);
}

# Combine the intron PSLs of one query directory: the .intronPsl files are
# passed through selectWithPsl with the directory's offset range of $outPsl
# and the result is appended to $outIntronPsl.
sub combineIntronPslSet($$$$) {
    my($queryDir, $outPsl, $pslQueryOffsets, $outIntronPsl) = @_;

    my($startOff, $endOff) = getPslQueryOffsets($queryDir, $pslQueryOffsets);
    # an empty offset range means no PSLs were saved for this directory
    return if ($startOff == $endOff);

    runPipe("find $queryDir -name '*.intronPsl' | xargs cat "
            . "| selectWithPsl -fmt=psl -startOff=$startOff -endOff=$endOff $outPsl stdin stdout"
            . " >> $outIntronPsl");
}

# Combine intronPsl files into $updWorkDir/<typeAccPrefixOrgCat>.intronPsl.
# This processing parallels combinePsls; however, intronPsl files only exist
# for native ESTs and not all EST PSLs will have intronPsl files.
# Returns the name of the output file, or undef if it is empty.
sub combineIntronPsls($$$$$@) {
    my($part, $updWorkDir, $tmpDir, $outPsl, $pslQueryOffsets, @queryDirs) = @_;

    # the per-directory pipelines append, so start without an old file
    my $outIntronPsl = "$updWorkDir/$part->{typeAccPrefixOrgCat}.intronPsl";
    if (-e $outIntronPsl) {
        unlink($outIntronPsl) || die("unlink $outIntronPsl");
    }

    # Each query dir is processed in its own pipeline, selecting the
    # alignments kept by pslCDnaFilter.
    foreach my $dir (@queryDirs) {
        combineIntronPslSet($dir, $outPsl, $pslQueryOffsets, $outIntronPsl);
    }

    return (getFileSize($outIntronPsl) > 0) ? $outIntronPsl : undef;
}

# Install the raw PSL file and, when there is one, the raw OI file in the
# partition's alignment directory as gzip-compressed files.
sub installRawPsls($$$) {
    my($outRawPsl, $outRawOI, $part) = @_;
    my $alignDir = getAlignDir($part);

    # [source, installed path] pairs; the raw OI file is optional
    my @toInstall = ([$outRawPsl, $alignDir . "/" . basename($outRawPsl) . ".gz"]);
    if (defined($outRawOI)) {
        push(@toInstall, [$outRawOI, $alignDir . "/" . basename($outRawOI) . ".gz"]);
    }

    makeDir($alignDir);

    # compress everything to .tmp names first and only then rename the
    # files to their final names
    foreach my $inst (@toInstall) {
        my($src, $dest) = @{$inst};
        runProg("gzip -1c $src >$dest.tmp");
    }
    foreach my $inst (@toInstall) {
        my($src, $dest) = @{$inst};
        renameFile("$dest.tmp", $dest);
    }
}

# Do the work of combining and installing the PSLs and associated files for
# one partition: combine the files found in the partition's query
# directories, install the raw PSLs, run gbAlignInstall and, unless -keep was
# given, remove the combined files.
sub finishInstall($$) {
    my($part, $tmpDir) = @_;
    my $updWorkDir = join("/", $gWorkDir, $part->{release}, $part->{database},
                          $part->{update});

    my($outPsl, $outRawPsl, $outOI, $outRawOI, $outIntronPsl);
    my %pslQueryOffsets;
    my @queryDirs = glob("$updWorkDir/psl/$part->{typeAccPrefixOrgCat}.*");
    if (@queryDirs) {
        ($outPsl, $outRawPsl)
            = combinePsls($part, $updWorkDir, $tmpDir, \%pslQueryOffsets, @queryDirs);

        # orient info and intron PSLs are only built if we have PSLs and
        # the partition is native
        my $havePsls = defined($outPsl) || defined($outRawPsl);
        if ($havePsls && ($part->{orgCat} eq "native")) {
            ($outOI, $outRawOI)
                = combineOIs($part, $updWorkDir, $tmpDir, $outPsl, \%pslQueryOffsets, @queryDirs);
            if ($part->{type} eq "est") {
                $outIntronPsl
                    = combineIntronPsls($part, $updWorkDir, $tmpDir, $outPsl, \%pslQueryOffsets, @queryDirs);
            }
        }
    }

    # The raw PSLs are saved first, as the index file created by
    # gbAlignInstall is the flag of successful completion.
    if (defined($outRawPsl)) {
        installRawPsls($outRawPsl, $outRawOI, $part);
    }

    # Copy PSLs, migrate alignments, and build the index.  This is done even
    # if output files were not produced, as the index lists unaligned seqs.
    my @installCmd = ("gbAlignInstall ");
    if ($noMigrate) {
        push(@installCmd, " -noMigrate");
    }
    push(@installCmd, " -workdir=$gWorkDir -orgCats=$part->{orgCat} $part->{release} $part->{update} $part->{typeAccPrefix} $part->{database}");
    runProg(join("", @installCmd));

    if (!$gKeep) {
        # remove whichever combined files were produced
        foreach my $f ($outPsl, $outRawPsl, $outOI, $outRawOI, $outIntronPsl) {
            next if (!defined($f));
            unlink($f);
        }
    }
}

# Process a partition, combining cluster-generated files, migrating
# alignments, and building the index file.  A partition whose .alidx file
# already exists in the alignment directory is considered complete and is
# skipped.
sub finishAligns($) {
    my($part) = @_;
    print STDERR "finishAligns: $part->{release}, $part->{database} $part->{update} $part->{typeAccPrefix} $part->{orgCat}\n"
        if ($gbCommon::verbose);

    # check if already completed
    my $alidx = getAlignDir($part) . "/$part->{typeAccPrefixOrgCat}.alidx";
    if (-e $alidx) {
        print STDERR "finishAligns: $alidx exists, skipping,\n"
            if ($gbCommon::verbose > 1);
        return;
    }

    my $tmpDir = getTmpDir("gbAlign");
    finishInstall($part, $tmpDir);
    if (!$gKeep) {
        removeDir($tmpDir);
    } else {
        print STDERR "keeping tmp directory $tmpDir\n";
    }
}

# Verify that all expected PSLs exist.  $gWorkDir/align.expected lists one
# PSL path per line, relative to $gWorkDir.  Each missing file is reported on
# STDERR; dies if any are missing or if the list can't be opened or closed.
sub verifyExpectedPsls() {
    my $expected = "$gWorkDir/align.expected";
    # three-argument open with a lexical handle, so an unusual work directory
    # name can never be interpreted as an open mode
    open(my $expectedFh, "<", $expected) || die("can't open $expected: $!");
    my $missingCnt = 0;
    while (my $relPsl = <$expectedFh>) {
        chomp($relPsl);
        my $psl = "$gWorkDir/$relPsl";
        if (! -e $psl) {
            print STDERR "Error: missing PSL: $psl\n";
            $missingCnt++;
        }
    }
    close($expectedFh) || die("close of $expected failed: $!");
    if ($missingCnt > 0) {
        die("$missingCnt missing PSLs");
    }
}

# Entry
setTaskName("gbAlignFinish");

# parse leading options into the globals used in the program
while (@ARGV && ($ARGV[0] =~ /^-/)) {
    my $opt = shift(@ARGV);
    if ($opt =~ /^-workdir($|=)/) {
        $gWorkDir = parseOptEq($opt);
    } elsif ($opt eq "-verbose") {
        $gbCommon::verbose = 1;
    } elsif ($opt =~ /^-verbose=/) {
        $gbCommon::verbose = parseOptEq($opt);
    } elsif ($opt eq "-keep") {
        $gKeep = 1;
    } elsif ($opt eq "-noMigrate") {
        $noMigrate = 1;
    } else {
        gbError("invalid option \"$opt\"");
    }
}
if (!@ARGV) {
    # the usage lists every option accepted above, including -noMigrate
    gbError("wrong # args: gbAlignFinish [-workdir=dir] [-keep] [-verbose] [-noMigrate] database ...");
}

# the remaining arguments are the databases
# NOTE(review): @databases is collected but not consulted below; every
# partition listed in partitions.lst is processed regardless of the databases
# given.  Confirm whether restricting to these databases was intended.
my @databases = @ARGV;
@ARGV = ();

# check that the empty done file was created
my $doneFlag = "$gWorkDir/align.done";
if (! -e $doneFlag) {
    gbError("alignment appears not to have completed, $doneFlag does not exist");
}

verifyExpectedPsls();
my @partitionList = loadPartitionList();

# process each update and type for each database
foreach my $part (@partitionList) {
    finishAligns($part);
}
