#!/usr/bin/env python2.7
# cosmicToBed
"""Convert a cosmic file into a bed file"""
import os
import sys
import collections
import argparse

def parseArgs(args):
    """
    Parse the command line arguments.

    args is the full argument vector in sys.argv form: args[0] is the
    program name and everything after it is a user-supplied argument.

    Returns the argparse namespace with three attributes: inputFile (a file
    object opened for reading), outputFile (a file object opened for
    writing) and verbose (a bool, False unless --verbose is given).

    Prints the help text and exits with status 1 when no user arguments
    were supplied.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("inputFile",
                        help=" The input file. ",
                        type=argparse.FileType("r"))
    parser.add_argument("outputFile",
                        help=" The output file. ",
                        type=argparse.FileType("w"))
    parser.add_argument("--verbose",
                        help=" Show runtime messages.",
                        action="store_true")
    parser.set_defaults(verbose=False)

    # Parse the vector we were handed rather than reaching for sys.argv, so
    # the function honours its parameter; args[0] is the program name.
    userArgs = list(args[1:])
    if not userArgs:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args(userArgs)

# The highest input column read below is index 34, so a usable line needs
# at least this many tab-separated fields.
_MIN_FIELDS = 35

# COSMIC chromosome labels that are renamed rather than just prefixed.
_SPECIAL_CHROMS = {"23": "chrX", "24": "chrY", "25": "chrM"}

# Input column indexes whose empty values are replaced by a placeholder.
_EMPTY_DEFAULTS = {
    2: "0",     # gene CDS length
    3: "0",     # gene id
    5: "0",     # sample id
    6: "0",     # tumor id
    28: "0.0",  # FATHMM score
    30: "0",    # pubmed id
    31: "0",    # study ids
    34: "0",    # age
}

# Input columns copied, in this order, after the eight leading BED columns.
_EXTRA_COLUMNS = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                  17, 18, 19, 20, 21, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34)


def _fieldOrDefault(fields, index):
    """Return fields[index], or its placeholder when it is empty."""
    value = fields[index]
    if value == "" and index in _EMPTY_DEFAULTS:
        return _EMPTY_DEFAULTS[index]
    return value


def _chromName(chromNum):
    """Return the chr-prefixed name for a COSMIC chromosome label."""
    # Exact match only: a substring test would also rename any label that
    # merely contains "23", "24" or "25".
    return _SPECIAL_CHROMS.get(chromNum, "chr" + chromNum)


def _mutationNumber(cosmicId):
    """Return the COSMIC mutation id without its leading "COSM" prefix."""
    # str.strip("COSM") removes any of the characters C, O, S, M from both
    # ends, which is not the same as removing the prefix.
    prefix = "COSM"
    if cosmicId.startswith(prefix):
        return cosmicId[len(prefix):]
    return cosmicId


def _bedColumns(fields):
    """Build the list of output columns for one mapped input line."""
    chromNum, span = fields[23].split(":")[:2]
    bounds = span.split("-")
    chromStart = str(int(bounds[0]) - 1)  # start is written one less than the input value
    chromEnd = bounds[1]
    name = _mutationNumber(fields[16])

    columns = [
        _chromName(chromNum),  # chrom
        chromStart,            # chromStart
        chromEnd,              # chromEnd
        name,                  # name
        "0",                   # score
        fields[24],            # strand
        chromStart,            # repeated start
        chromEnd,              # repeated end
    ]
    columns.extend(_fieldOrDefault(fields, index) for index in _EXTRA_COLUMNS)
    columns.append("COSM" + name)
    # Label: gene name, amino acid change and tumor id.
    columns.append(fields[0] + " " + fields[18] + " " + _fieldOrDefault(fields, 6))
    return columns


def main(args):
    """
    Convert the COSMIC input file named in args into the output file.

    args is the argument vector handed to parseArgs. The first input line is
    treated as a header and skipped. Every later line must have at least 35
    tab-separated fields; otherwise a message is printed and the program
    exits with status 1. Lines whose coordinate field (column index 23) has
    no ":" are copied to "unmappedLines.txt" in the current directory and
    counted as skipped. Every other line becomes one 40-column
    tab-separated output row. A summary of skipped lines is printed at the
    end.
    """
    options = parseArgs(args)
    skippedLines = 0

    # The context manager closes the unmapped-lines file on every exit
    # path, including the abort below.
    with open("unmappedLines.txt", "w") as unmappedLines:
        for count, line in enumerate(options.inputFile, 1):
            if count == 1:
                continue  # header line
            # Drop only the line terminator so empty trailing fields survive.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < _MIN_FIELDS:
                print ("This line has less than the expected number of fields " + str(count))
                print ("Aborting, please check that the input file has not been corrupted")
                sys.exit(1)
            if len(fields[23].split(":")) < 2:
                if options.verbose:
                    print(" Line number " + str(count) + " has no genomic coordinates and will be excluded. ")
                unmappedLines.write(line)
                skippedLines += 1
                continue
            options.outputFile.write("\t".join(_bedColumns(fields)) + "\n")

    options.outputFile.flush()
    print("This many lines were skipped, " + str(skippedLines) +
          " for not having genomic coordinate")


if __name__ == "__main__":
    # Run the converter on the process arguments and hand its return value
    # to the interpreter as the exit status.
    exitStatus = main(sys.argv)
    sys.exit(exitStatus)
