#!/usr/bin/python3 # When this script is executed in the strelka workflow directory (directly above all the analysis folders), it will create in each result directory a CSV file of all nonsynonymous mutant calls. It will also compile a list of mutation genes and all the samples that have it. # -- Li Tai Fang # -- 5/8/2013 import sys, os import regex as re # identify the analysis directories: analysis_dir = [dir_i for dir_i in os.listdir() if os.path.exists(dir_i + os.sep + 'results' )] analysis_dir.sort() # Check if the analysis directory is empty (meaning strelka did not complete successfully) # If empty, remove them from the list for n, dir_i in enumerate(analysis_dir): if os.listdir( dir_i + os.sep + 'results' ) == []: analysis_dir.pop( n ) print( dir_i + '\'s results directory is empty.') # Directory where the annovar software is installed: annovar_dir = '/home/lethalfang/Documents/UCSF/annovar/' dbsnp_dir = '/home/lethalfang/Documents/UCSF/annovar/humandb/' if ( not os.path.exists(annovar_dir + 'convert2annovar.pl') ) or \ ( not os.path.exists(annovar_dir + 'annotate_variation.pl') ): raise Exception('Annovar not properly installed or correctly located.') # header for .exonic_variant_function (created by annovar): annovar_annotated_header = '\t'.join( ['#Line', 'Variant Annotation', 'Gene Annotation', 'Chromosome', 'Start Position', 'End Position', 'Ref Base', 'Alt Base', 'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'NORMAL', 'TUMOR\n'] ) # Keep a collection of Gene, and their respective locations called_genes = [] genes_locations = [] genes_chr = [] # Make a dictionary for A=4, C=5, G=6, T=7 in order of the tumor/normal info list: base_info_loc = {'A':4, 'C':5, 'G':6, 'T':7} # Regex patterns: qss_patt = re.compile(r'\bQSS=([0-9]+)') qsi_patt = re.compile(r'\bQSI=([0-9]+)') # Captures: GeneSymbol, Exon, and Codon Change gene_anno_patt = re.compile( r'^(?[\w-.]+):\w+:(?\w+):c\.(?\w+):p\.(?\w+)\b' ) # Annotate and get gene list for every project: for proj_i in analysis_dir: # Calling annovar to perform some tasks: # 1. Create annovar file for SNV: create_snv_anov = 'perl {}convert2annovar.pl -format vcf4 -includeinfo -allallele {} > {}'.format(annovar_dir, proj_i+'/results/passed.somatic.snvs.vcf', proj_i+'/results/passed.somatic.snvs.anov') # 2. Annotate SNV annotate_snv_anov = 'perl {}annotate_variation.pl --buildver hg19 --geneanno {} {}'.format(annovar_dir, proj_i+'/results/passed.somatic.snvs.anov', dbsnp_dir) # 3. Create annovar file for INDEL: create_indel_anov = 'perl {}convert2annovar.pl -format vcf4 -includeinfo -allallele {} > {}'.format(annovar_dir, proj_i+'/results/passed.somatic.indels.vcf', proj_i+'/results/passed.somatic.indels.anov') #4. Annovate INDELS annotate_indel_anov = 'perl {}annotate_variation.pl --buildver hg19 --geneanno {} {}'.format(annovar_dir, proj_i+'/results/passed.somatic.indels.anov', dbsnp_dir) # Proceed if annovar has *not* been run in this directory: if not os.path.exists(proj_i+'/results/passed.somatic.snvs.anov.exonic_variant_function'): os.system(create_snv_anov) os.system(annotate_snv_anov) os.system(create_indel_anov) os.system(annotate_indel_anov) # Then, passed.somatic.snvs.anov.exonic_variant_function and # passed.somatic.indels.anov.exonic_variant_function are created. # Create an Excel Style output CSV: excelstyle_output = open(proj_i+'/results/excelstyle_nonsynonymous_SNV-INDEL.csv', 'w') excelstyle_output.write( annovar_annotated_header ) # Also create the refined style that's easier to read: refined_output = open(proj_i+'/results/refined_excelstyle_nonsynonymous.csv', 'w') refined_output.write('\t'.join(['Variant Annotation', 'Gene', 'Exon', 'Codon', 'Chromosome', 'Start Position', 'Variant Call', 'N (A or DP1)', 'N (C or DP2)', 'N (G or TAR)', 'N (T or TIR)', 'T (A or DP1)', 'T (C or DP2)', 'T (G or TAR)', 'T (T or TIR)', 'QSS or QSI', 'Frequency\n'])) # Discard synonymous mutations, and combine SNV and INDEL: # SNV first with open(proj_i+'/results/passed.somatic.snvs.anov.exonic_variant_function', 'r') as exon_snv: snv_line_i = exon_snv.readline() while True: if not snv_line_i: break line_items = snv_line_i.split('\t') if not( line_items[1] == 'synonymous SNV' or line_items[1] == 'unknown' ): ## Write the regular excel style output: excelstyle_output.write( snv_line_i ) ## Items for the refined excel style: variant_annotation = line_items[1] # gene_annotation = line_items[2] gene_anno_search = re.search( gene_anno_patt, gene_annotation ) # Gene Symbol, Exon, Nucleic Acid Change, Codon, and Variant Call if gene_anno_search: if 'gene' in gene_anno_search.groupdict().keys(): gene_symbol = gene_anno_search.groupdict()['gene'] else: gene_symbol = 'N/A' if 'exon' in gene_anno_search.groupdict().keys(): exon = gene_anno_search.groupdict()['exon'] else: exon = 'N/A' if 'codon' in gene_anno_search.groupdict().keys(): codon = gene_anno_search.groupdict()['codon'] else: codon = 'N/A' if 'na' in gene_anno_search.groupdict().keys(): variant_call = gene_anno_search.groupdict()['na'] else: variant_call = 'N/A' else: gene_symbol = 'N/A' exon = 'N/A' codon = 'N/A' variant_call = 'N/A' # chrom = line_items[3] start_position = line_items[4] # # metrics = line_items[15] qss_search = re.search(qss_patt, metrics) qss = qss_search.groups()[0] # normal_info = line_items[17].split(':') normal_A = normal_info[4] normal_C = normal_info[5] normal_G = normal_info[6] normal_T = normal_info[7] tumor_info = line_items[18].rstrip('\n').split(':') tumor_A = tumor_info[4] tumor_C = tumor_info[5] tumor_G = tumor_info[6] tumor_T = tumor_info[7] ## ref_allele = line_items[6] alt_allele = line_items[7] ref_allele_count = int(tumor_info[ base_info_loc[ref_allele] ].split(',')[1]) alt_allele_count = int(tumor_info[ base_info_loc[alt_allele] ].split(',')[1]) mutant_freq = alt_allele_count / (alt_allele_count + ref_allele_count) # refined text refined_text = '\t'.join( (variant_annotation, gene_symbol, exon, codon, chrom, start_position, variant_call, normal_A, normal_C, normal_G, normal_T, tumor_A, tumor_C, tumor_G, tumor_T, qss, '%.2g' % mutant_freq, '\n') ) # Write it: refined_output.write( refined_text ) # Tallying purposes called_genes.append( gene_symbol ) genes_locations.append( ','.join([chrom, start_position]) ) genes_chr.append( chrom ) snv_line_i = exon_snv.readline() # Now INDEL: with open(proj_i+'/results/passed.somatic.indels.anov.exonic_variant_function', 'r') as exon_indel: indel_line_i = exon_indel.readline() while True: if not indel_line_i: break ## Write the regular excel style output: line_items = indel_line_i.split('\t') excelstyle_output.write( indel_line_i ) ## Items for the refined excel style: variant_annotation = line_items[1] # gene_annotation = line_items[2] gene_anno_search = re.search( gene_anno_patt, gene_annotation ) # Gene Symbol, Exon, Nucleic Acid Change, Codon, and Variant Call if gene_anno_search: if 'gene' in gene_anno_search.groupdict().keys(): gene_symbol = gene_anno_search.groupdict()['gene'] else: gene_symbol = 'N/A' if 'exon' in gene_anno_search.groupdict().keys(): exon = gene_anno_search.groupdict()['exon'] else: exon = 'N/A' if 'codon' in gene_anno_search.groupdict().keys(): codon = gene_anno_search.groupdict()['codon'] else: codon = 'N/A' if 'na' in gene_anno_search.groupdict().keys(): variant_call = gene_anno_search.groupdict()['na'] else: variant_call = 'N/A' else: gene_symbol = 'N/A' exon = 'N/A' codon = 'N/A' variant_call = 'N/A' # chrom = line_items[3] start_position = line_items[4] # TIR = Reads strongly supporting indel allele for tiers 1,2 # TAR = Reads strongly supporting alternate allele for tiers 1,2 # DP1/DP2 = tier 1 and tier 2 reads normal_info = line_items[17].split(':') normal_DP1 = normal_info[0] normal_DP2 = normal_info[1] normal_TAR = normal_info[2] normal_TIR = normal_info[3] # metrics = line_items[15] qsi_search = re.search(qsi_patt, metrics) qsi = qsi_search.groups()[0] tumor_info = line_items[18].rstrip('\n').split(':') tumor_DP1 = tumor_info[0] tumor_DP2 = tumor_info[1] tumor_TAR = tumor_info[2] tumor_TIR = tumor_info[3] TIR_count = int( tumor_TIR.split(',')[1] ) TAR_count = int( tumor_TAR.split(',')[1] ) indel_freq = TIR_count / (TAR_count + TIR_count) # refined text refined_text = '\t'.join( (variant_annotation, gene_symbol, exon, codon, chrom, start_position, variant_call, normal_DP1, normal_DP2, normal_TAR, normal_TIR, tumor_DP1, tumor_DP2, tumor_TAR, tumor_TIR, qsi, '%.2g' % indel_freq, '\n') ) # Write it: refined_output.write( refined_text ) ## Tallying: called_genes.append( line_items[2].split(':')[0] ) genes_locations.append( ','.join([line_items[3], line_items[4]]) ) genes_chr.append( line_items[3] ) indel_line_i = exon_indel.readline() # Close the loop: excelstyle_output.close() refined_output.close() # Set of genes: set_of_gene_positions = set(genes_locations) pos2gene_dict = dict( zip(genes_locations, called_genes) ) ##### ##### # Figure out if each "project" contains a mutation in the set print('#GeneSymbol', 'Chromosome', 'Start Position', '\t'.join( analysis_dir ), 'Total', sep='\t' ) for pos_i in set_of_gene_positions: print(pos2gene_dict[pos_i], pos_i.replace(',', '\t'), sep='\t', end='') # Tally how many patients have a variant in this gene_i tally = 0 for proj_i in analysis_dir: position_list = [] with open(proj_i+'/results/excelstyle_nonsynonymous_SNV-INDEL.csv', 'r') as excel_style: # 1st line is just a header: line_i = excel_style.readline() while True: if not line_i: break line_items = line_i.split('\t') position_list.append( ','.join( [line_items[3],line_items[4]] ) ) line_i = excel_style.readline() position_list_set = set(position_list) # See if this "project" contains gene_i: if pos_i in position_list_set: print('\t', '1', sep='', end='') tally = tally + 1 else: print('\t0', end='') print('\t', tally, '\n', sep='', end='')