#!/usr/bin/env perl
# File: epcrToHgPsl
# Author: Terry Furey
# Date: 7/17/2002
# Description:  Convert an output file from ePCR to a psl file

# DO NOT EDIT the /cluster/bin/scripts copy of this file -- 
# edit ~/kent/src/hg/stsMarkers/epcrToPsl instead.

# $Id: epcrToPsl,v 1.5 2009/05/06 21:20:33 hiram Exp $

use warnings;
use strict;

# USAGE message
if ($#ARGV < 2) {
  print STDERR "USAGE: epcrToHgPsl  [-mouse -rat] <epcr file> <primers file> <db dir>\n";
  print STDERR "<db dir> is something like /hive/data/genomes/db/\n";
  exit(1);
}

my $rat = 0;
my $mouse = 0;
my $inf = "";

while ($#ARGV > 4) {
  my $arg = shift(@ARGV);
  if ($arg eq "-rat") {
    $rat = 1;
  } elsif ($arg eq "-mouse") {
    $mouse = 1;
  }
}

my $file = shift(@ARGV);
open(FILE, "<$file") || die "Can not open epcr file '$file'";
my $epcrCount = `cat $file | wc -l`;
chomp $epcrCount;
my $primers = shift(@ARGV);
open(PRIMERS, "<$primers") || die "Can not open primers file '$primers'";
my $dir = shift(@ARGV);
my $db = `basename $dir`;
chomp $db;
my $twoBit = "$dir/$db.2bit";

my %chrSizes;
# Get chrom chrom sizes
print STDERR "Reading chrom sizes $dir/chrom.sizes\n";
open(CHR, "$dir/chrom.sizes") || die "Can not open '$dir/chrom.sizes'";
while (my $line = <CHR>) {
    chomp($line);
    my ($chr, $size) = split('\s+', $line);
    $chrSizes{$chr} = $size;
}

my %primerLeft;
my %primerRight;
my %primerLeftSize;
my %primerRightSize;
my %rightStart;
my %size;

# Read in the primers file
print STDERR "Reading primer info from $primers\n";
while (my $line = <PRIMERS>) {
    chomp($line);
    my ($primerid, $left, $right, $distance, $id) = split(' ',$line);
    $primerLeft{$primerid} = substr($left,0,7);
    $primerRight{$primerid} = substr($right,0,7);
    my @left = split(//,$left);
    $primerLeftSize{$primerid}=$#left + 1;
    my @right = split(//,$right);
    $primerRightSize{$primerid} = $#right + 1;
    if ($distance eq "-") {
	$size{$primerid} = 500;
    } elsif ($distance =~ /-/) {
	my ($start,$end) = split("-",$distance);
	$size{$primerid} = int(($end - $start)/2) + $start;
    } else {
	$size{$primerid} = $distance;
    }
    $rightStart{$primerid} = $size{$primerid} - $primerRightSize{$primerid};
}
close(PRIMERS);

my $done = 0;
# Read in epcr file and output psl file
print STDERR "Creating psl file ${file}.psl and ${file}.nomatch\n";
open(OUT, ">${file}.psl");
open(ERR, ">${file}.nomatch");
my %ratName;
while (my $line = <FILE>) {
    chomp($line);
    ++$done;
    printf STDERR "%d of %d complete == %% %.2f\n",
	$done, $epcrCount, 100.0*$done/$epcrCount if (0 == $done%500);
    my ($chr, $place, $dbsts, $ucsc) = split(' ',$line);
    if (($rat) || ($mouse)) {
	$ratName{$ucsc} = $dbsts;
	$dbsts = $ucsc;
    }
    my ($start, $end) = split(/\.\./,$place);
    $start--;
    my $match = $primerLeftSize{$dbsts} + $primerRightSize{$dbsts};
    my $tsize = $end - $start;
    my $tgap = 1;
    my $tgapsize = $tsize - $match;
    my $qgap = 1;
    my $qgapsize = $size{$dbsts} - $match;
    my $leftstart = $start;
    my $rightStart = $end -  $primerRightSize{$dbsts};
    
    # Determine strand
    my $strand = 'x';
    my $to = $start + 7;

    open(FA, "/cluster/bin/x86_64/twoBitToFa $twoBit:$chr:$start-$to stdout|") || die "Can not read from $twoBit:$chr:$start-$to";
    my $find = <FA>;
    $find = <FA>;
    close(FA);
    chomp($find);
    $find =~ tr/a-z/A-Z/;
    if ($find eq $primerLeft{$dbsts}) {
	$strand = "+";
    } elsif ($find eq $primerRight{$dbsts}) {
	$strand = "-";
    } else {
	my @chars = split(//,$find);
	my @left = split(//,$primerLeft{$dbsts});
	my @right = split(//,$primerRight{$dbsts});
	my $r = 0;
	my $l = 0;
	for (my $i = 0; $i < 7; $i++) {
	    if ($chars[$i] eq $left[$i]) {
		$l++;
	    }
	    if ($chars[$i] eq $right[$i]) {
		$r++;
	    }
	}
	if ($l > 5) {
	    $strand = "+";
	} elsif ($r > 5) {
	    $strand = "-";
	} else {
	    $strand = "";
	    print ERR "Could not determine strand for $line - $find ($primerLeft{$dbsts}-$l,$primerRight{$dbsts}-$r)\n";
	}
    }

    # Print out record
    if ($strand) {
	print OUT "$match\t0\t0\t0\t$qgap\t$qgapsize\t$tgap\t$tgapsize\t$strand\t";
	if (($rat) || ($mouse)) {
	    print OUT "$ratName{$dbsts}\t$size{$dbsts}\t0\t$size{$dbsts}\t$chr\t0\t$start\t$end\t";
	} else {
	    print OUT "dbSTS_$dbsts\t$size{$dbsts}\t0\t$size{$dbsts}\t$chr\t$chrSizes{$chr}\t$start\t$end\t";
	}
	print OUT "2\t$primerLeftSize{$dbsts},$primerRightSize{$dbsts},\t0,$rightStart{$dbsts},\t";
	print OUT "$leftstart,$rightStart,\n";
    }
}
close(FILE);
close(OUT);
