#!/usr/bin/perl -W

# Reads dbSnp XML from stdin (or from files named on the command line)
# and prints condensed information to stdout, one tab-delimited line
# per SNP.  This is the first step of parsing the dbSnp data for
# loading into the fixed browser table hgFixed.dbSnpRS (supplemental
# details).

use strict;
use warnings;

# Output: one line per <NSE-rs> ... </NSE-rs> record, tab separated:
#   rs<id>  het  hetSE  validation-terms  allele1  allele2
#   last 20 chars of seq-5  first 20 chars of seq-3  function-classes
# Validation terms and function classes are comma-separated lists.

# new_snp()
# Returns a reference to a fresh, empty per-SNP accumulator.
sub new_snp {
    return {
        rs    => "",
        a1    => "",
        a2    => "",
        seq5  => "",
        seq3  => "",
        het   => 0,
        hetSE => 0,
        valid => [],    # validation terms, in order of first appearance
        func  => [],    # function classes, in order of first appearance
    };
}

# add_unique(\@terms, $term)
# Appends $term to @terms unless that exact term is already present.
# Whole terms are compared (not substrings), so a term such as "coding"
# is still recorded when "coding-synon" was collected earlier.
sub add_unique {
    my ($terms, $term) = @_;
    push @{$terms}, $term unless grep { $_ eq $term } @{$terms};
    return;
}

# format_snp($snp)
# Returns the output line (including the trailing newline) for one
# accumulator built by new_snp().  Missing values get placeholders:
# "n" for sequences, "?" for alleles, "no-information" for validation
# and "X" for function class.
sub format_snp {
    my ($snp) = @_;

    my ($a1, $a2) = @{$snp}{qw(a1 a2)};
    if ($a1 =~ m{/}) {
        # More than two alleles: the greedy match left "x/y" in the first
        # allele, so report the complete list in the second column.
        # (Alleles should eventually be read into a hash for proper
        # support of triallelic polymorphisms.)
        $a2 = "$a1/$a2";
        $a1 = "(not_biallelic)";
    }
    $a1 = "?" if $a1 eq "";
    $a2 = "?" if $a2 eq "";

    my $seq5  = $snp->{seq5} eq "" ? "n" : $snp->{seq5};
    my $seq3  = $snp->{seq3} eq "" ? "n" : $snp->{seq3};
    my $valid = @{ $snp->{valid} } ? join(",", @{ $snp->{valid} })
                                   : "no-information";
    my $func  = @{ $snp->{func} }  ? join(",", @{ $snp->{func} })
                                   : "X";

    return join("\t",
                "rs$snp->{rs}", $snp->{het}, $snp->{hetSE}, $valid,
                $a1, $a2,
                substr($seq5, -20),      # 20 characters nearest the SNP
                substr($seq3, 0, 20),
                $func) . "\n";
}

# main()
# Reads the XML line by line and prints one line per completed record.
# Everything before the first <NSE-rs> line (and that line itself) is
# skipped; a record still open at end of input is not printed.
sub main {
    my $snp;    # undef until the first <NSE-rs> has been seen

    # A single read loop: once <> has returned end-of-file it must not be
    # called again, because Perl would then start reading STDIN afresh
    # (which blocks when STDIN is a terminal).
    while (my $line = <>) {
        if (!defined $snp) {
            $snp = new_snp() if $line =~ m{<NSE-rs>};
            next;
        }

        if ($line =~ m{</NSE-rs>}) {    # end of current SNP
            print format_snp($snp);
            $snp = new_snp();
        }
        elsif ($line =~ m{<(NSE-rs_refsnp-id)>(\S+)</\1>}) {
            $snp->{rs} = $2;
        }
        elsif ($line =~ m{<(NSE-rs_observed)>(\S+)/(\S+)</\1>}) {
            @{$snp}{qw(a1 a2)} = ($2, $3);
        }
        elsif ($line =~ m{<(NSE-rs_seq-5_E)>(\S+)</\1>}) {
            $snp->{seq5} .= $2;         # sequence may span several lines
        }
        elsif ($line =~ m{<(NSE-rs_seq-3_E)>(\S+)</\1>}) {
            $snp->{seq3} .= $2;
        }
        elsif ($line =~ m{<(NSE-rs_het)>(\S+)</\1>}) {
            $snp->{het} += $2;
        }
        elsif ($line =~ m{<(NSE-rs_het-SE)>(\S+)</\1>} && $2 ne "NaN") {
            $snp->{hetSE} += $2;
        }
        elsif ($line =~ m{<(NSE-rs_validated-)(\S+) value="true"/>}) {
            add_unique($snp->{valid}, $2);
        }
        elsif ($line =~ m{<(NSE-FxnSet_fxn-class-contig value=")(\S+)"/>}) {
            add_unique($snp->{func}, $2);
        }
    }
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without consuming input.
main() unless caller;


