# Source code for variants

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import regex as re
import sys

'''
Functions specific to variants: reading in the variant information and creating a variants track.
'''

def get_variant_df(transcript, variant_file):
    '''Read the variants of one transcript from a tsv file into a DataFrame.

    The file is read in chunks of 1000 rows and only the rows whose
    "featureID" equals ``transcript`` are kept.

    Args:
        | transcript (str): Ensembl transcript ID.
        | variant_file (str): path to the tab-separated file containing the variants.

    Returns:
        variant_df (DataFrame): the matching rows, re-indexed from 0, with
        missing values replaced by the string "NULL". When nothing matches,
        an empty frame that still carries the file's columns.

    Raises:
        KeyError: if the file has no "featureID" column.
    '''
    matching_chunks = []
    # First empty selection; kept so that an empty result keeps the column layout.
    empty_selection = None

    chunk_reader = pd.read_csv(variant_file, sep="\t", chunksize=1000)
    try:
        for chunk in chunk_reader:
            selected = chunk[chunk["featureID"] == transcript]
            if len(selected) > 0:
                matching_chunks.append(selected)
            elif empty_selection is None:
                empty_selection = selected
    finally:
        # Release the file handle even if filtering raised part-way through.
        chunk_reader.close()

    # Concatenate once: growing a frame inside the loop re-copies all rows
    # per chunk, and starting from a column-less empty frame can change dtypes.
    if matching_chunks:
        variant_df = pd.concat(matching_chunks, ignore_index=True)
    elif empty_selection is not None:
        variant_df = empty_selection.reset_index(drop=True)
    else:
        variant_df = pd.DataFrame([])

    return variant_df.fillna("NULL")
def make_track(variant_track, transcript_len, bound_l, color_l, edge_color_l, variant_df, setting_dict, variant_key):
    '''Make the variants track.

    Draws a horizontal color bar on ``variant_track``, annotates it with the
    variants and fills ``variant_key`` with the variant annotations key.

    Note: this switches on LaTeX text rendering globally via ``plt.rc``.

    Args:
        | variant_track (matplotlib.axes.Axes): axis for the variants track
        | transcript_len (int): transcript length, passed on to the variant annotation step.
        | bound_l (list of ints): contains the exons and utr bounds to be used in making the color bar.
        | color_l (list of strs): contains the list of colors to be used in making the color bar.
        | edge_color_l (list): contains the list of edge colors; accepted but not used by this function.
        | variant_df (DataFrame): contains the variants information.
        | setting_dict (dictionary): settings for making the png; "v_track_y_axis_label" and "v_track_fontsize" are read here.
        | variant_key (matplotlib.axes.Axes): axis for the variants key.
    '''
    #(1) Make the color bar.
    cmap = mpl.colors.ListedColormap(color_l)
    norm = mpl.colors.BoundaryNorm(bound_l, cmap.N)
    mpl.colorbar.ColorbarBase(variant_track,
                              cmap=cmap,
                              norm=norm,
                              boundaries=bound_l,
                              spacing='proportional',
                              orientation='horizontal',
                              drawedges=False)
    plt.setp(variant_track.get_xticklabels(), visible=False)
    plt.rc('text', usetex=True)
    variant_track.set_ylabel(setting_dict["v_track_y_axis_label"],
                             rotation='horizontal',
                             ha='right',
                             va='center',
                             size=setting_dict["v_track_fontsize"])

    #(2) Annotate variant track with variants.
    annotate_track_with_variants(variant_track, variant_df, transcript_len, setting_dict)

    #(3) Make the variant annotations key.
    make_variant_annotations_key(variant_key, variant_df, setting_dict)
def annotate_track_with_variants(variant_track, variant_df, transcript_len, setting_dict):
    '''Annotate the variants track with arrows for the variants.

    Neighbouring variants that lie within "v_track_merge_pixel_thresh" pixels
    of the previous row and are drawn on the same side of the color bar share
    one arrow, labelled with their comma-joined ids.

    The helper columns "id", "top", "trans_pos_pc", "num_pixels_diff",
    "merge_wt_prev" and "arrow_bin" are added to ``variant_df`` in place.

    Args:
        | variant_track (matplotlib.axes.Axes): axis for the variant track.
        | variant_df (DataFrame): contains the variant information; the "effect" and "tp" columns are read.
        | transcript_len (int): transcript length.
        | setting_dict (dictionary): settings for making the png.

    Returns:
        variant_track (matplotlib.axes.Axes): axis for the variant track.
    '''
    num_variants = len(variant_df.index)
    if num_variants == 0:
        return variant_track

    #Add columns to variant_df for annotating variants with arrows: ID, top/bottom flag, axes x coordinates.
    variant_df["id"] = [str(number) for number in range(1, num_variants + 1)]
    top_or_bottom = setting_dict["v_track_vars_t_or_b"]
    variant_df["top"] = variant_df["effect"].map(lambda effect: 1 if top_or_bottom[effect] == "T" else 0)
    variant_df["trans_pos_pc"] = variant_df["tp"] / transcript_len

    #Distance to the previous variant in display pixels.
    #NOTE(review): the merge test only makes sense if rows are ordered by "tp" - confirm against the input files.
    origin_px = variant_track.transAxes.transform((0, 0))[0]

    def axes_width_to_pixels(axes_width):
        return variant_track.transAxes.transform((axes_width, 0))[0] - origin_px

    variant_df["num_pixels_diff"] = variant_df["trans_pos_pc"].diff().apply(axes_width_to_pixels)
    variant_df["merge_wt_prev"] = variant_df["num_pixels_diff"] <= setting_dict["v_track_merge_pixel_thresh"]

    #A new arrow starts whenever a variant is not merged with the previous one or switches side.
    #The first row has no predecessor (NaN distance, NaN shifted side), so it always opens bin 1.
    #Computed on whole columns so it does not depend on the frame having a 0..n-1 index.
    side_changed = variant_df["top"] != variant_df["top"].shift()
    starts_new_arrow = ~variant_df["merge_wt_prev"] | side_changed
    variant_df["arrow_bin"] = starts_new_arrow.cumsum()

    #Create new dataframe where each row corresponds to 1 arrow.
    arrows = variant_df.groupby("arrow_bin")
    top_s = arrows["top"].first() #Whether the arrow is a top or bottom arrow.
    variant_annotation_df = pd.DataFrame({
        "x": arrows["trans_pos_pc"].mean(),
        "text": arrows["id"].agg(",".join), #Annotation text string for a variant arrow.
        "top": top_s,
        "height": get_arrow_height_s(top_s, setting_dict),
    })

    #Annotate the variants. A plain loop draws every arrow exactly once;
    #DataFrame.apply is not meant for functions that are called for their side effects.
    for _, arrow in variant_annotation_df.iterrows():
        annotate_track_with_arrow(arrow, variant_track, setting_dict)

    #Add text to indicate which variants types are annotated above and below the colorbar.
    variant_track.text(setting_dict["v_track_vars_text_top_x"],
                       setting_dict["v_track_vars_text_top_y"],
                       setting_dict["v_track_vars_text_top"],
                       ha='center', va='bottom', size=setting_dict["v_track_fontsize"])
    variant_track.text(setting_dict["v_track_vars_text_bot_x"],
                       setting_dict["v_track_vars_text_bot_y"],
                       setting_dict["v_track_vars_text_bot"],
                       ha='center', va='top', size=setting_dict["v_track_fontsize"])
    return variant_track
def get_arrow_height_s(top_s, setting_dict):
    '''Get the height level of each arrow.

    Arrows above and below the colorbar are counted separately; on each side
    the level cycles through 0 .. "v_track_num_arrow_heights" - 1 in row order.

    Args:
        | top_s (Series): indicates whether a variant is annotated with an arrow above (non-zero) or beneath (0) the colorbar.
        | setting_dict (dictionary): settings for making the png; "v_track_num_arrow_heights" is read.

    Returns:
        height_s (Series): indicates the height level of the arrow used to annotate each variant, indexed like top_s.
    '''
    num_heights = setting_dict["v_track_num_arrow_heights"]
    arrows_seen = {"bottom": 0, "top": 0}
    height_l = []
    for top_flag in top_s.tolist():
        side = "bottom" if top_flag == 0 else "top"
        height_l.append(arrows_seen[side] % num_heights)
        arrows_seen[side] += 1
    return pd.Series(height_l, index=top_s.index)
def annotate_track_with_arrow(arrow_bin, variant_track, setting_dict):
    '''Annotate the variants track with an arrow.

    Args:
        | arrow_bin (Series or mapping): one arrow, with the entries "text" (label), "x" (x position), "top" (1 for an arrow above the colorbar) and "height" (height level).
        | variant_track (matplotlib.axes.Axes): axis for the variant track.
        | setting_dict (dictionary): settings for making the png.
    '''
    #Top arrows stack upwards from their start position, bottom arrows stack downwards.
    if arrow_bin["top"] == 1:
        head_y = setting_dict["v_track_t_arrow_head"]
        text_y = setting_dict["v_track_t_start"] + arrow_bin["height"] * setting_dict["v_track_t_inc"]
    else:
        head_y = setting_dict["v_track_b_arrow_head"]
        text_y = setting_dict["v_track_b_start"] - arrow_bin["height"] * setting_dict["v_track_b_inc"]

    #NOTE(review): "x" is passed in 'data' coordinates; confirm this matches the x range of the colorbar axis.
    variant_track.annotate(arrow_bin["text"],
                           xy=(arrow_bin["x"], head_y),
                           xycoords='data',
                           xytext=(arrow_bin["x"], text_y),
                           textcoords='data',
                           ha='center',
                           arrowprops=dict(arrowstyle=setting_dict["v_track_arrow_style"], alpha=0.75),
                           fontsize=8)
def make_variant_annotations_key(variant_key, variant_df, setting_dict):
    '''Make a variant annotations key.

    Builds a LaTeX tabular with one cell per variant ("id: dnachange,
    prot_change, (abbreviation)") and draws it on ``variant_key``. Cells longer
    than "v_key_max_chars_per_col" characters are wrapped in a 2-column
    multicolumn cell and counted as two columns when filling the rows.

    The columns "var_type_abbrev" and "variant_txt_str" (and "id", if it is
    missing) are added to ``variant_df`` in place. LaTeX text rendering is
    switched on globally via ``plt.rc``.

    Args:
        | variant_key (matplotlib.axes.Axes): axis for the variant key.
        | variant_df (DataFrame): contains the variant information; the "effect", "dnachange" and "prot_change" columns are read.
        | setting_dict (dictionary): settings for making the png.

    Returns:
        variant_key (matplotlib.axes.Axes): axis for the variant key.
    '''
    plt.rc('text', usetex=True)
    variant_key.set_axis_off()

    num_cols = setting_dict["v_key_num_cols"]
    max_chars_per_col = setting_dict["v_key_max_chars_per_col"]
    wide_cell_span = 2 #Number of table columns taken by a multicolumn cell.

    #The ids are normally added while annotating the track, but that step returns
    #early for an empty frame; number the variants here if that has not happened.
    if "id" not in variant_df.columns:
        variant_df["id"] = [str(number) for number in range(1, len(variant_df.index) + 1)]

    #Abbreviate a variant type; effects without an abbreviation get an empty string.
    abbrevs = setting_dict.get("v_track_var_abbrevs", {})
    variant_df["var_type_abbrev"] = variant_df["effect"].map(lambda effect: abbrevs.get(effect, ""))

    variant_df["variant_txt_str"] = (variant_df["id"] + ": " + variant_df["dnachange"] + ", "
                                     + variant_df["prot_change"] + ", (" + variant_df["var_type_abbrev"] + ")")
    variant_df["variant_txt_str"] = variant_df["variant_txt_str"].map(mark_up_special_chars)

    #Wrap over-long cells and remember how many columns each cell takes, so the
    #span does not have to be parsed back out of the LaTeX text.
    cell_txt_l = []
    cell_span_l = []
    for cell_txt in variant_df["variant_txt_str"].tolist():
        if len(cell_txt) > max_chars_per_col:
            cell_txt_l.append("\\multicolumn{2}{l}{" + cell_txt + "}")
            cell_span_l.append(wide_cell_span)
        else:
            cell_txt_l.append(cell_txt)
            cell_span_l.append(1)
    variant_df["variant_txt_str"] = cell_txt_l

    #Fill the table row by row, starting a new row when the next cell does not fit.
    key_parts = ["\\begin{tabular}{" + "l" * num_cols + "} \\\\ "]
    col_idx = 0
    for cell_txt, cell_span in zip(cell_txt_l, cell_span_l):
        if col_idx + cell_span > num_cols:
            key_parts.append(" \\\\ " + cell_txt)
            col_idx = cell_span
        else:
            if col_idx > 0:
                key_parts.append(" & ")
            key_parts.append(cell_txt)
            col_idx += cell_span
    key_parts.append("\\end{tabular}")
    variant_key_txt = "".join(key_parts)

    variant_key.text(setting_dict["v_key_x"], setting_dict["v_key_y"], variant_key_txt,
                     verticalalignment='top', ha='center', size=setting_dict["v_key_fontsize"],
                     transform=variant_key.transAxes)
    return variant_key
def mark_up_special_chars(some_text):
    '''Mark up special characters for latex text.

    Underscores are escaped with a backslash and ">" is wrapped in math mode;
    all other characters are left untouched.

    Args:
        | some_text (str): text to be rendered by latex.

    Returns:
        some_text (str): the text with "_" and ">" marked up.
    '''
    return some_text.replace("_", "\\_").replace(">", "$>$")