#!/usr/bin/env perl
# File: filterPrimers
# Author: Terry Furey
# Date: 10/2001
# Description: Filter output from blat to get believed primer placements

# $Id: filterSTSPrimers,v 1.3 2007/08/03 16:45:12 hiram Exp $

use strict;
use warnings;

# Usage message
if ($#ARGV < 3) {
  print STDERR "USAGE: filterPrimers [-mouse -rat -chicken -nohead] <sts info file> <psl file> <all.primers> <all.epcr>\n";
  exit(1);
}

# Read and check arguments
my $mouse = 0;
my $rat = 0;
my $nohead = 0;
my $chicken = 0;
my $i;
my $j;

while ($#ARGV > 3) {
  my $param = shift(@ARGV);
  if ($param eq "-mouse") {
    $rat = 1;
  } elsif ($param eq "-rat") {
    $rat = 1;
  } elsif ($param eq "-chicken") {
    $chicken = 1;
  } elsif ($param eq "-nohead") {
    $nohead = 1;
  } else {
    die("$param is not a valid option\n");
  }
}
my $names = shift(@ARGV);
open(FILE, "<$names") || die ("Could not open $names\n");
my $psl = shift(@ARGV);
open(PSL, "<$psl") || die("Could not open $psl\n");
my $dbsts = shift(@ARGV);
open(DBSTS, "<$dbsts") || die("Could not open $dbsts\n");
my $epcr = shift(@ARGV);
open(EPCR, "<$epcr") || die("Could not open $epcr\n");

my @fields;
my %id;

# Read in the name information and record which ids have matching STS
print STDERR "Reading name info from: $names\n";
if ($mouse) {
  my $line = <FILE>;
  while ($line = <FILE>) {
    chomp($line);
    @fields = split("\t",$line);
    $fields[2] =~ s/MGI://;
    $fields[3] =~ s/MGI://;
    $id{$fields[2]} = $fields[3];
  }
} elsif ($rat) {
  while (my $line = <FILE>) {
    chomp($line);
    @fields = split("\t",$line);
    $id{$fields[0]} = $fields[0];
  }
} elsif ($chicken) {
  while (my $line = <FILE>) {
    chomp($line);
    @fields = split("\t",$line);
    $id{$fields[0]} = $fields[0];
  }
} else {
  while (my $line = <FILE>) {
    chomp($line);
    @fields = split("\t",$line);
    $id{$fields[8]} = $fields[0];
  }
}
close(FILE);

my @primersize;
# Read in and record primer size info
print STDERR "Reading primer info from: $dbsts\n";
while (my $line = <DBSTS>) {
  chomp($line);
  my ($primerid, $left, $right, $distance, $id) = split(' ',$line);
  if ($rat) {
    my ($primerid, $name) = split("_",$primerid);
  }
  $primersize[$primerid] = 0;
  my @left = split(//,$left);
  for ($i = 0; $i <= $#left; $i++) {
    if ($left[$i] =~ /[acgtACGT]/) { 
      $primersize[$primerid]++;
    }
  }
  my @right = split(//,$right);
  for ($i = 0; $i <= $#right; $i++) {
    if ($right[$i] =~ /[acgtACGT]/) { 
      $primersize[$primerid]++;
    }
  }
}
close(DBSTS);

# Read in ePCR alignments
my @epcr;
my @found;
my @ctg;
my @start;
my @end;
my @ucsc;
my $ctg;
my $place;
my $ucsc;
my $name;
print STDERR "Reading ePCR info from: $epcr\n";
while (my $line = <EPCR>) {
  chomp($line);
  ($ctg, $place, $dbsts, $ucsc) = split('\s+',$line);
  if ($rat) {
    ($dbsts, $name) = split("_",$dbsts);
  }
  if (defined($epcr[$dbsts])) {
    $i = $epcr[$dbsts];
  } else {
    $i = 0;
  }
  $ctg[$dbsts][$i] = $ctg;
  ($start[$dbsts][$i], $end[$dbsts][$i]) = split(/\.\./,$place);
  if ($rat) {
    $ucsc[$dbsts] = $dbsts;
  } else {
    $ucsc[$dbsts] = $ucsc;
  }
  $found[$dbsts][$i] = 0;
  $epcr[$dbsts]++;
}
close(EPCR);

# Read in and discard header lines
if (!$nohead) {
  for ($i = 0; $i < 5; $i++) {
    my $line = <PSL>;
  }
}

my $count = 0;
# Process each line
open(TEMP, ">$$.temp");
my $prevline = "";
my $previd = "";
my $prevstart = "";
my $prevend = "";
my $prevctg = "";
my @count;
my $primerid;

print STDERR "Reading alignment results from: $psl\n";
while (my $line = <PSL>) {
  next if ($line eq "\n");
  next if ($line =~ "psLayout");
  next if ($line =~ "----");
  next if ($line =~ "tart");

  $count++;
  if ($count%100000 == 0) {
    print STDERR "$count\n";
  }
  chomp($line);
  @fields = split('\t',$line);
  if ($mouse) {
    $primerid = $fields[9];
  } elsif ($rat) {
      ($primerid, $name) = split("_",$fields[9]);
  } elsif ($chicken) {
      $line =~ s/dbSTS\_//;
      @fields = split('\t',$line);
      $primerid = $fields[9];
  } else {
    ($dbsts,$primerid) = split("_",$fields[9]);
  }
  my $qsize = $fields[10];
  my $matchsize =  $fields[16] - $fields[15] + 1;

  # Check if epcr placed this marker, and if this record matches one of the placements
  my $gpstart = $fields[15];
  my $gpend = $fields[16];
  my $gpctg = $fields[13];
  my ($acc,$piece) = split("_",$fields[12]);
  my $foundline = 0;
  if ($epcr[$primerid]) {
    if ($epcr[$primerid] <= 20) {
      my $print = 0;
      for ($i = 0; $i < $epcr[$primerid]; $i++) {
	if ((abs($gpstart - $start[$primerid][$i]) < 5) && (abs($gpend - $end[$primerid][$i] < 5))
	    && ($gpctg eq $ctg[$primerid][$i])) {
	  if ((($primersize[$primerid] - $fields[0]) < 4)
	      && (($previd ne $primerid) || ($prevstart ne $gpstart)
		  || ($prevend ne $gpend) || ($prevctg ne $gpctg))) {
	    #      && ($line ne $prevline))
	    $found[$primerid][$i] = 1;
	    $print = 1;
	    $i = $epcr[$primerid] + 1;
	    $prevstart = $gpstart;
	    $prevend = $gpend;
	    $prevctg = $gpctg;
	    $previd = $primerid;
	  }
	}
      }
      if ($print) {
	print TEMP "$line\n";
	$count[$primerid]++;
	$foundline = 1;
      }
    }
  }
  if (!$foundline) {
    # Based on how much of the primer was matched, determine how much slop allowable
    # for total STS length and keep those that match criteria
    if ((((($primersize[$primerid] - $fields[0]) < 4) && ((abs($qsize - $matchsize) < 25)) && (!$epcr[$primerid]))
	|| ((($primersize[$primerid] - $fields[0]) < 3) && (abs($qsize - $matchsize) < 50))
	|| ((($primersize[$primerid] - $fields[0]) < 2) && (abs($qsize - $matchsize) < 150))
	|| ((($primersize[$primerid] - $fields[0]) < 1) && (abs($qsize - $matchsize) < 200))
	|| ((($primersize[$primerid] - $fields[0]) < 5) && (abs($qsize - $matchsize) < 50) && ($fields[1] == 0) && (!$epcr[$primerid])))
	&& (($previd ne $primerid) || ($prevstart ne $gpstart) 
	    || ($prevend ne $gpend) || ($prevctg ne $gpctg))) {
	#&& ($line ne $prevline)) 
      $prevstart = $gpstart;
      $prevend = $gpend;
      $prevctg = $gpctg;
      $previd = $primerid;
      if ($id{$primerid}) {
	print TEMP "$line\n";
	$count[$primerid]++;
      } elsif ($mouse) {
	print STDERR "No translation for MGI:$primerid\n";
      }
    }
  }
  $prevline = $line;
}	#	while (my $line = <PSL>)
close(OUT);
close(PSL);
close(TEMP);

# Remove entries with too many placements
open(TEMP, "$$.temp");
while (my $line = <TEMP>) {
  @fields = split('\t',$line);
  if ($mouse) {
    $primerid = $fields[9];
  } elsif ($rat) {
    ($primerid, $name) = split("_",$fields[9]);
  } else {
    ($dbsts,$primerid) = split("_",$fields[9]);
  }
  if ($count[$primerid] < 200) {
    print $line;
  }
}
close(TEMP);
`rm $$.temp`;


# Check which of the epcr placements were not found by BLAT
print STDERR "Determining ePCR not found from ePCR results\n"; 
my $epcrNotFound = 0;
my $epcrChecked = 0;
open(OUT, ">epcr.not.found");
for ($i = 0; $i <= $#epcr; $i++) {
    if (defined($epcr[$i])) {
    ++$epcrChecked;
      if ($epcr[$i] <= 5) {
	for ($j = 0; $j < $epcr[$i]; $j++) {
	  if ($start[$i][$j] && !$found[$i][$j]) {
	    print OUT "$ctg[$i][$j]\t$start[$i][$j]..$end[$i][$j]\t$i\t$ucsc[$i]\n";
	    ++$epcrNotFound;
	  }
	}
      }
    }
}
close(OUT);

print STDERR "Out of $epcrChecked ePCR alignments examined, not found: $epcrNotFound\n";
