#!/usr/bin/env perl

# File: compileAccInfo
# Author: Terry Furey
# Date: 4/13/01
# Project: Human
# Description: Create a file listing all of the
# contig and accession information

# $Id: compileAccInfo,v 1.3 2007/08/06 23:26:04 hiram Exp $

use strict;
use warnings;
use Carp ();
local $SIG{__WARN__} = \&Carp::cluck;

# Usage message
if ($#ARGV < 1) {
  print STDERR "USAGE: compileAccInfo [-pre <golden_pos.rdb> -nb -ncbi -mouse -rat] <ctg dir> <seq inf>\n";
  exit(1);
}

# global variables
my %ntacc;
my $randomStart;

# Read parameters
my $pre = 0;
my $nb = 0;
my $done = 0;
my $ncbi = 0;
my $mouse = 0;
my $rat = 0;
while ($#ARGV > 1) {
  my $arg = shift(@ARGV);
  if ($arg eq "-pre") {
    $pre = 1;
    my $gp = shift(@ARGV);
    open(GP, "<$gp") || die("Could not open $gp\n");
  } elsif ($arg eq "-ncbi") {
    $ncbi = 1;
  } elsif ($arg eq "-mouse") {
    $mouse = 1;
  } elsif ($arg eq "-rat") {
    $rat = 1;
  } elsif ($arg eq "-nb") {
    $nb = 1;
  }
}
my $dir = shift(@ARGV);
my $seqinfo = shift(@ARGV);
if (!-e $dir) {
  die("$dir does not exist\n");
}

my @chroms;

# Set the chromosomes being processed
if ($ncbi) {
  @chroms = (1..22,qw(X),qw(Y),qw(M),qw(Un));
} elsif ($mouse) {
  @chroms = (1..19,qw(X),qw(Y),qw(M),qw(Un));
} elsif ($rat) {
  @chroms = (1..20,qw(X),qw(M),qw(Un));
} else {
  @chroms = (1..22,qw(X),qw(Y),qw(NA),qw(UL));
}

# Read in the phases of the accessions (human)
my %phase;
if ((!$mouse) && (!$rat)) {
  open(SEQ, "<$seqinfo") || die("Could not open $seqinfo\n");
  while (my $line = <SEQ>) {
    chomp($line);
    my ($thisacc, $gi, $bases, $phase, $draft, $chr, $FISH, $lab, $clone) = split(' ',$line);
    my ($acc, $vers) = split(/\./, $thisacc);
    $phase{$acc} = $phase;
  }
  close(SEQ);
}

# Read in golden_pos if creating from WashU map
my %accchr;
my %allordered;
my %allstart;
my %allend;
if ($pre) {
  while (my $line = <GP>) {
    chomp($line);
    my ($chr, $ord, $acc, $start, $end, $mid, $size) = split("\t", $line);
    $accchr{$acc} = $chr;
    $allordered{$acc} = $ord;
    $allstart{$acc} = $start;
    $allend{$acc} = $end;
  }
}

# Process all of the chromosomes including NA, UL, and random regions
open(OUT, ">accession_info.rdb");
print OUT "Chr\tOrd\tStart\tEnd\tMiddle\tContig\tCtgIdx\tBargeIdx\tClone\tAcc\tBridged\tNT\tPhase\n";
print OUT "5S\t2N\t10N\t10N\t10N\t10S\t5N\t5N\t10S\t10S\t5S\t20S\t5N\n";
my $count = 0;
my @contig;
my @ctgstart;
my @ctgend;
foreach my $chr (@chroms) {
  my $border = 0;	#	for barge reading
  my @btype;	#	barge types
  my @ctgidx;	#	for barge reading, apparently unused
  my @bctg;	#	for barge reading, apparently unused
  my @bargeidx;	#	for barge reading
  my @bacc;	#	for barge reading, apparently unused
  my %accbarge;	#	accessions in barge
  my %clone;	#	accession barge info
  my %ntname;	#	accession barge info
  my %accExists;	#	accession barge info, apparently unused
  my %accctg;
  my %accctgidx;
  my $start;
  my $thisctg;
  my $ctglength;
  my $thischr;
  my $chrlength;
  my $ctg;

  print STDERR "Processing chrom $chr\n";

  # First, read in ordering information for the contigs
  my $numctg = 0;
  if (!$pre) {
    if (open(LIFT, "$dir/$chr/lift/ordered.lft")) {
    print STDERR "reading contig information from $chr/lift/ordered.lft\n";
      while (my $line = <LIFT>) {
	chomp($line);
	($start, $thisctg, $ctglength, $thischr, $chrlength) = split(' ',$line);
	$ctg = $thisctg;
	if ($thisctg =~ m#/#) {
	    ($thischr, $ctg) = split(/\//,$thisctg);
	}
	if ($ctglength != 0) {
	  $contig[$numctg] = $ctg;
	  $ctgstart[$numctg] = $start;
	  $ctgend[$numctg] = $start + $ctglength;
	  $numctg++;
	}
      }
    }
    close(LIFT);
    $randomStart = $numctg;
    if (open(LIFT, "$dir/$chr/lift/random.lft")) {
    print STDERR "reading contig information from $chr/lift/random.lft\n";
      while (my $line = <LIFT>) {
	chomp($line);
	my ($start, $thisctg, $ctglength, $thischr, $chrlength) = split(' ',$line);
	$ctg = $thisctg;
	if ($thisctg =~ m#/#) {
	    ($thischr, $ctg) = split(/\//,$thisctg);
	}
	if ($ctglength != 0) {
	  $contig[$numctg] = $ctg;
	  $ctgstart[$numctg] = $start;
	  $ctgend[$numctg] = $start + $ctglength;
	  $numctg++;
	}
      }
    }
    close(LIFT);
  } else {
    @contig = split(' ',`ls -d $dir/$chr/ctg*`);
    my $numctg = $#contig + 1;
    for (my $i = 0; $i < $numctg; $i++) {
      my @subdir = split(/\//, $contig[$i]);
      $contig[$i] = $subdir[$#subdir];
      $contig[$i] =~ s/\///;
    }
  }

  # Read in NT contig information, if necessary
  if ((!$mouse) && (!$rat)) {
    for (my $i = 0; $i < $numctg; $i++) {
      if (open(NT, "<$dir/$chr/$contig[$i]/nt.noNt")) {
	my $currnt = "";
	while (my $line = <NT>) {
	  my ($nt, $ntacc, $start, $direction, $length) = split(' ',$line);
	  if ($nt eq $currnt) {
	    $ntacc{$nt} .= ";$ntacc";
	  } else {
	    $ntacc{$nt} = "$ntacc";
	  }
	  $currnt = $nt;
	}
      }
    }
  }

  # Next, check if the barge information is available
  for (my $i = 0; $i <= $border; $i++) {
    $btype[$border] = "";
  }
  $border = -1;
  for (my $i = 0; $i < $numctg; $i++) {
    next if (!defined $contig[$i]);
    if (open(BARGE, "<$dir/$chr/$contig[$i]/mmBarge")) {
      print STDERR "Processing $contig[$i]\n";
      my $line = <BARGE>;
      $line = <BARGE>;
      my $firstacc = 1;
      $border++;
      my $bidx = 0;
      while ($line = <BARGE>) {
	chomp($line);
	
	# Record if barge is bridged or not
	if ($line =~ /gap/) {
	  if (!($line =~ /weak/)) {
	    if ($line =~ /bridged/) {
	      $btype[$border] = "true";
	      #print STDERR "Barge $border bridged\n";
	    } else {
	      $btype[$border] = "false";
	      #print STDERR "Barge $border open\n";
	    }
	    $border++;
	    $firstacc = 1;
	  }
	} else {
	  my ($start, $name, $phase, $clone, $direction, $size, $before,
	    $after, $center, $plc, $mappos) = split(' ',$line);
	  # Record contig info
	  if ($firstacc) {
	    $ctgidx[$border] = $i;
	    $bctg[$border] = $contig[$i];
	    $bargeidx[$border] = $bidx;
	    $bidx++;
	    $bacc[$border] = "";
	    $firstacc = 0;
	  }

	  # Create a list of accessions for the barge
	  if ($name =~ /NT/) {
	    $bacc[$border] .= "$ntacc{$name};";
	    my @accs = split(";",$ntacc{$name});
	    my $ntcount = 0;
	    for (my $j = 0; $j <= $#accs; $j++) {
	      $accbarge{$accs[$j]} = $border;
	      $clone{$accs[$j]} = $clone;
	      $ntcount++;
	      $ntname{$accs[$j]} = "${name}_$ntcount";
	      $accExists{$accs[$j]} = 1;
	      # print STDERR "ntname for $accs[$j] is $ntname{$accs[$j]}\n";
	    }
	  } else {
	    $bacc[$border] .= "$name;";
	    $accbarge{$name} = $border;
	    $clone{$name} = $clone;
	    $ntname{$name} = "";
	    $accExists{$name} = 1;
	  }
	}
      }
      close(BARGE);
      $nb = 0;
    } else {
      $nb = 1;
    }
  }

  # Lastly, read in AGP position information
  my %ordered;
  my %start;
  my %end;
  if (!$pre) {
    my $agp = "$dir/$chr/chr$chr.agp";
    my $ranAgp = "$dir/$chr/chr" . $chr . "_random.agp";
    my $tmpAgp = "$$.agp";
    if ( -f $ranAgp && -f $agp ) {
	`cat $agp $ranAgp > $tmpAgp`;
    } elsif ( -f $ranAgp ) {
	`cat $ranAgp > $tmpAgp`;
    } else {
	`cat $agp > $tmpAgp`;
    }
    open(AGP, "$tmpAgp") or die "Can not open created file $tmpAgp:$!";
    while (my $line = <AGP>) {
      chomp($line);
      my ($thischr, $start, $end, $index, $type, $accession) = split(' ',$line);
      if ($type eq "N") {
	next;
      }
      my ($acc, $version) = split(/\./,$accession);
      if ($thischr =~ /random/) {
	$ordered{$acc} = 0;
      } else {
	$ordered{$acc} = 1;
      }
      if(!exists $start{$acc}) {
	$start{$acc} = $start;
      }
      $end{$acc} = $end;
      if (!$ordered{$acc}) {
	$index = $randomStart;
      } else {
	$index = 0;
      }
      while ($start{$acc} > $ctgend[$index]) {
	$index++;
      }
      $accctg{$acc} = $contig[$index];
      $accctgidx{$acc} = $index;
      $count++;
    }
    close(AGP);
    `rm $tmpAgp`;
  } else {
    foreach my $acc (keys %accchr) {
      if ($accchr{$acc} eq $chr) {
	$count++;
	$ordered{$acc} = $allordered{$acc};
	$start{$acc} = $allstart{$acc};
	$end{$acc} = $allend{$acc};
	$accExists{$acc} = 1;
      }
    }
  }

  # Print it all out for each accession
  foreach my $acc (keys %start) {
    my $middle = int(($start{$acc} + $end{$acc})/2);
    my $bargeidx = 0;
    my $btype = "false";
    if (exists($accbarge{$acc})) {
	$border = $accbarge{$acc};
	if (defined($btype[$border]) && (!$btype[$border])) {
	  $btype[$border] = "false";
	}
	$btype = $btype[$border];
	$bargeidx = $bargeidx[$border];
    }
    my $phase = -1;
    if (exists($phase{$acc}) && ($phase{$acc} =~ /0|1|2|3/)) {
      $phase = $phase{$acc};
    }
    my $clone = "";
    if (exists($clone{$acc})){$clone=$clone{$acc};}
    my $ntname = "";
    if (exists($ntname{$acc})){$ntname=$ntname{$acc};}
    if (!exists($ordered{$acc})){print STDERR "ordered{$acc} does not exist\n";}
    if (!exists($accctgidx{$acc})){print STDERR "accctgidx{$acc} does not exist\n";}
    die "chr not defined" if (!defined($chr));
    die "acc not defined" if (!defined($acc));
    die "ordered{$acc} not exist" if (!exists($ordered{$acc}));
    die "start{$acc} not exist" if (!exists($start{$acc}));
    die "end{$acc} not exist" if (!exists($end{$acc}));
    die "middle not defined" if (!defined($middle));
    die "accctg{$acc} not exist" if (!exists($accctg{$acc}));
    die "accctgidx{$acc} not exist" if (!exists($accctgidx{$acc}));
    die "bargeidx not defined" if (!defined($bargeidx));
    die "clone not defined" if (!defined($clone));
    die "btype not defined" if (!defined($btype));
    die "ntname not defined" if (!defined($ntname));
    die "phase not defined" if (!defined($phase));
    if (!$nb) {
      print OUT "chr0XX${chr}YY\t1XX$ordered{$acc}YY\t2XX$start{$acc}YY\t3XX$end{$acc}YY\t4XX${middle}YY\t5XX$accctg{$acc}YY\t6XX$accctgidx{$acc}YY\t7XX${bargeidx}YY\t8XX${clone}YY\t9XX${acc}YY\t10XX${btype}YY\t11XX${ntname}YY\t12XX${phase}YY\n";
    } else {
      print OUT "chr$chr\t$ordered{$acc}\t$start{$acc}\t$end{$acc}\t$middle\t$accctg{$acc}\t$accctgidx{$acc}\t0\t$clone\t$acc\tfalse\t$ntname\t$phase\n";
    }
  }
}
print STDERR "$count processed\n";
close(OUT);
