#!/usr/bin/awk -f
#
# bcgscSageToBed -prefix=prefix mapFile ... >bed8
#
# Convert long-SAGE sequences in files from BCGSC to BED8
#
#  1                 2               3            4                 5                                   6
#  TagSequence <tab> Tag count <tab> Contig <tab> NlaIII site <tab> Position of mapping on Contig <tab> strand
#
# - Positions in the file are 1-based and indicate the location of the start
#   of the CATG; the tag sequence is downstream of this (in the direction of
#   transcription) and does not contain the CATG.
#    - for + strand, the tag sequence starts 4 bases after the position
#    - for - strand, the tag sequence ends 4 bases before the position

# Parse leading command-line options before any input is read.
#
# Options must come before the input files: scanning stops at the first
# argument that does not begin with "-".  Recognized options:
#   -prefix=STR   string prepended to the record number to form the BED name
# Any other argument beginning with "-" is reported on stderr and the
# program exits with status 1.
BEGIN {
    OFS = "\t";
    prefix = "";
    # Valid ARGV indices are 0 .. ARGC-1 (ARGV[0] is the command name), so
    # the scan must stop before ARGC; referencing ARGV[ARGC] would silently
    # create a spurious empty element.
    for (i = 1; i < ARGC; i++) {
        if (ARGV[i] !~ "^-") {
            break;
        } else if (ARGV[i] ~ "^-prefix=") {
            # "-prefix=" is 8 characters long, so the value starts at 9.
            prefix = substr(ARGV[i], 9);
            # awk skips empty ARGV elements, so blanking the option keeps it
            # from being opened as an input file.
            ARGV[i] = "";
        } else {
            print "Error: invalid option:",ARGV[i] >"/dev/stderr";
            exit(1);
        }
    }
}

# Runs for every input record: copy the columns that both strand rules and
# the output rule need into named variables.
{
    tagSeq = $1;
    tagCnt = $2;
    contig = toupper($3);
    strand = $6;

    # The raw tag count goes into the score column.  It is not really a
    # browser score; the disabled alternative was int(log(tagCnt)*100),
    # which stays <= 1000 as long as log(tagCnt) <= 10.
    score = tagCnt;

    # NR is the cumulative record number across all input files.
    name = prefix NR;
}

# Plus strand: $5 is the 1-based first base of the CATG.  The feature spans
# the CATG plus the tag that follows it; the thick region is the CATG alone.
strand == "+" {
    start = $5 - 1;                         # convert to 0-based
    thickStart = start;
    thickEnd = $5 + 3;                      # 4 bases of CATG, half-open end
    end = start + length(tagSeq) + 4;       # CATG + tag
}
# Minus strand: the CATG ends at position $5 and the tag lies immediately
# before it in contig coordinates.  The thick region is the CATG alone.
strand == "-" {
    end = $5;
    thickEnd = end;
    thickStart = end - 4;                       # 4 bases of CATG
    start = ($5 - 1) - length(tagSeq) - 3;      # 0-based start of the tag
}


# Emit one BED8 line per input record.
#
# The coordinate variables are only assigned by the strand=="+" and
# strand=="-" rules.  For any other strand value (header line, blank line,
# malformed row) they would still hold the previous record's values, so such
# records are reported on stderr and skipped instead of being printed with
# stale coordinates.
{
    if (strand != "+" && strand != "-") {
        print "Warning: " FILENAME ":" FNR ": skipping record with invalid strand \"" strand "\"" >"/dev/stderr";
        next;
    }
    print contig,start,end,name,score,strand,thickStart,thickEnd;
}
