# Source code for variants

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import regex as re
import sys

'''
Functions specific to variants: reading in the variant information and creating a variants track.
'''

def get_variant_df(transcript, variant_file):

    '''Read the variant information for one transcript from a tsv file into a DataFrame.

    The file is read in chunks of 1000 rows so only the rows whose "featureID"
    equals the requested transcript are kept in memory.

    Args:
        | transcript (str): Ensembl transcript ID.
        | variant_file (str): path to file containing the variants.

    Returns:
        variant_df (DataFrame): contains the variant information, re-indexed from 0,
        with missing values replaced by the string "NULL".
    '''

    matching_chunks = []
    empty_template = None  # Zero-row frame that remembers the file's columns.
    for variant_df_chunk in pd.read_csv(variant_file, sep="\t", chunksize=1000):
        if empty_template is None:
            empty_template = variant_df_chunk.iloc[0:0]
        chunk_hits = variant_df_chunk[variant_df_chunk["featureID"] == transcript]
        if not chunk_hits.empty:
            matching_chunks.append(chunk_hits)

    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # every previously accumulated row for each new chunk.
    if matching_chunks:
        variant_df = pd.concat(matching_chunks, ignore_index=True)
    elif empty_template is not None:
        variant_df = empty_template.copy()  # No hits: keep the columns, zero rows.
    else:
        variant_df = pd.DataFrame([])  # The reader yielded no chunks at all.

    variant_df = variant_df.fillna("NULL")

    return variant_df


def make_track(variant_track, transcript_len, bound_l, color_l, edge_color_l, variant_df, setting_dict, variant_key):

    '''Make the variants track.

    Draws a horizontal color bar on the track axis, annotates it with the
    variants and then fills in the variants key axis.

    Args:
        | variant_track (matplotlib.axes.Axes): axis for the variants track
        | transcript_len (int): transcript length, passed on to annotate_track_with_variants.
        | bound_l (list of ints): contains the exons and utr bounds to be used in making the color bar.
        | color_l (list of strs): contains the list of colors to be used in making the color bar.
        | edge_color_l (list): contains the list of edge colors; accepted but not used by this function.
        | variant_df (DataFrame): contains the variants information.
        | setting_dict (dictionary): settings for making the png.
        | variant_key (matplotlib.axes.Axes): axis for the variants key.
    '''

    #(1) Make the color bar.
    cmap = mpl.colors.ListedColormap(color_l)
    norm = mpl.colors.BoundaryNorm(bound_l, cmap.N)
    # The colorbar draws itself onto variant_track when constructed; the object is not needed afterwards.
    mpl.colorbar.ColorbarBase(variant_track, cmap=cmap, norm=norm, boundaries=bound_l, spacing='proportional',
                              orientation='horizontal', drawedges=False)
    plt.setp(variant_track.get_xticklabels(), visible=False)
    # NOTE: this switches on LaTeX text rendering globally (matplotlib rc), not just for this axis.
    plt.rc('text', usetex=True)
    variant_track.set_ylabel(setting_dict["v_track_y_axis_label"], rotation='horizontal', ha='right', va='center',
                             size=setting_dict["v_track_fontsize"])

    #(2) Annotate variant track with variants.
    annotate_track_with_variants(variant_track, variant_df, transcript_len, setting_dict)

    #(3) Make the variant annotations key.
    make_variant_annotations_key(variant_key, variant_df, setting_dict)


def annotate_track_with_variants(variant_track, variant_df, transcript_len, setting_dict):

    '''Annotate the variants track with arrows for the variants.

    Variants that are close together (in display pixels) and on the same side
    of the color bar share one arrow, labelled with their comma-separated IDs.

    The following columns are added to variant_df in place: "id", "top",
    "trans_pos_pc", "num_pixels_diff", "merge_wt_prev" and "arrow_bin".
    make_variant_annotations_key reads the "id" column afterwards.

    Args:
        | variant_track (matplotlib.axes.Axes): axis for the variant track.
        | variant_df (DataFrame): contains the variant information; the "effect" and "tp" columns are read.
        | transcript_len (int): transcript length.
        | setting_dict (dictionary): settings for making the png.

    Returns:
        variant_track (matplotlib.axes.Axes): axis for the variant track.
    '''

    if variant_df.shape[0] == 0:
        return variant_track

    #Add columns to variant_df for annotating variants with arrows: ID, axes x coordinates, arrow bin.
    num_variants = variant_df.shape[0]
    variant_df["id"] = pd.Series([str(i) for i in range(1, num_variants + 1)], index=variant_df.index)
    top_or_bottom_d = setting_dict["v_track_vars_t_or_b"]
    #Column for whether variant should be annotated with a top (1) or bottom (0) arrow.
    variant_df["top"] = variant_df["effect"].map(lambda effect: 1 if top_or_bottom_d[effect] == "T" else 0)

    #Add column to determine whether each arrow should be merged with the previous.
    #NOTE(review): the row-to-row difference only makes sense if rows are ordered by "tp" -- confirm with the caller.
    variant_df["trans_pos_pc"] = variant_df["tp"]/transcript_len #axes x coordinates.
    axes_left_px = variant_track.transAxes.transform((0, 0))[0]
    axes_width_px = variant_track.transAxes.transform((1, 0))[0] - axes_left_px
    #Distance to the previous variant in display pixels (NaN for the first row).
    variant_df["num_pixels_diff"] = variant_df["trans_pos_pc"].diff() * axes_width_px
    variant_df["merge_wt_prev"] = variant_df["num_pixels_diff"] <= setting_dict["v_track_merge_pixel_thresh"]

    #A new arrow starts whenever a variant is not merged with the previous one or switches side.
    #The first row always starts bin 1: its merge flag is False (NaN comparison) and it has no previous side.
    #Computed positionally, so it does not depend on variant_df having a 0..n-1 index.
    starts_new_arrow = ~variant_df["merge_wt_prev"] | variant_df["top"].ne(variant_df["top"].shift())
    variant_df["arrow_bin"] = starts_new_arrow.cumsum()

    #Create new dataframe where each row corresponds to 1 arrow.
    by_arrow = variant_df.groupby("arrow_bin")
    x_pos_s = by_arrow["trans_pos_pc"].mean()
    text_s = by_arrow["id"].agg(lambda ids: ",".join(ids)) #Make the annotation text string for a variant arrow.
    top_s = by_arrow["top"].first() #Get whether the arrow is a top or bottom arrow.
    heights_s = get_arrow_height_s(top_s, setting_dict)
    variant_annotation_df = pd.DataFrame({"x": x_pos_s, "text": text_s, "top": top_s, "height": heights_s})

    #Annotate the variants (explicit loop: the call is made purely for its drawing side effect).
    for _, arrow_row in variant_annotation_df.iterrows():
        annotate_track_with_arrow(arrow_row, variant_track=variant_track, setting_dict=setting_dict)

    #Add text to indicate which variants types are annotated above and below the colorbar.
    variant_track.text(setting_dict["v_track_vars_text_top_x"], setting_dict["v_track_vars_text_top_y"], setting_dict["v_track_vars_text_top"],
                        ha='center', va='bottom', size=setting_dict["v_track_fontsize"])
    variant_track.text(setting_dict["v_track_vars_text_bot_x"], setting_dict["v_track_vars_text_bot_y"], setting_dict["v_track_vars_text_bot"],
                        ha='center', va='top', size=setting_dict["v_track_fontsize"])

    return variant_track


def get_arrow_height_s(top_s, setting_dict):

    '''Get the height level of each arrow.

    Arrows on the same side of the colorbar cycle through the height levels
    0 .. v_track_num_arrow_heights-1 in order; each side has its own counter.

    Args:
        | top_s (Series): indicates whether an arrow is above (non-zero) or beneath (0) the colorbar.
        | setting_dict (dictionary): settings for making the png; "v_track_num_arrow_heights" is read.

    Returns:
        height_s (Series): height level of each arrow, indexed like top_s.
    '''

    bottom_count, top_count = 0, 0
    height_l = []
    for side in top_s.tolist():
        if side == 0:
            height_l.append(bottom_count % setting_dict["v_track_num_arrow_heights"])
            bottom_count += 1
        else:
            height_l.append(top_count % setting_dict["v_track_num_arrow_heights"])
            top_count += 1

    return pd.Series(height_l, index=top_s.index)

    
def annotate_track_with_arrow(arrow_bin, variant_track, setting_dict):

    '''Annotate the variants track with an arrow.

    Args:
        | arrow_bin (Series or mapping): one arrow; read by the keys "top", "text", "x" and "height".
        | variant_track (matplotlib.axes.Axes): axis for the variant track.
        | setting_dict (dictionary): settings for making the png.
    '''

    x_pos = arrow_bin["x"]
    if arrow_bin["top"] == 1:
        #Top arrow: the label is stacked upwards from the start position.
        head_y = setting_dict["v_track_t_arrow_head"]
        text_y = setting_dict["v_track_t_start"] + arrow_bin["height"]*setting_dict["v_track_t_inc"]
    else:
        #Bottom arrow: the label is stacked downwards from the start position.
        head_y = setting_dict["v_track_b_arrow_head"]
        text_y = setting_dict["v_track_b_start"] - arrow_bin["height"]*setting_dict["v_track_b_inc"]

    variant_track.annotate(arrow_bin["text"], xy=(x_pos, head_y), xycoords='data',
                           xytext=(x_pos, text_y), textcoords='data', ha='center',
                           arrowprops=dict(arrowstyle=setting_dict["v_track_arrow_style"], alpha=0.75), fontsize=8)
    

def make_variant_annotations_key(variant_key, variant_df, setting_dict):

    '''Make a variant annotations key.

    The key is a LaTeX tabular with v_key_num_cols left-aligned columns. Each
    cell reads "id: dnachange, prot_change, (abbreviated effect)"; cells longer
    than v_key_max_chars_per_col are wrapped in a two-column multicolumn.

    The columns "var_type_abbrev" and "variant_txt_str" are added to
    variant_df in place (and "id" too, if it is not already present).

    Args:
        | variant_key (matplotlib.axes.Axes): axis for the variant key.
        | variant_df (DataFrame): contains the variant information.
        | setting_dict (dictionary): settings for making the png.

    Returns:
        variant_key (matplotlib.axes.Axes): axis for the variant key.
    '''

    # NOTE: this switches on LaTeX text rendering globally (matplotlib rc), not just for this axis.
    plt.rc('text',usetex=True)
    variant_key.set_axis_off()

    #Nothing to list: leave the (hidden) axis empty instead of failing on missing columns.
    if variant_df.shape[0] == 0:
        return variant_key

    num_cols = setting_dict["v_key_num_cols"]
    max_chars_per_col = setting_dict["v_key_max_chars_per_col"]
    abbrev_d = setting_dict.get("v_track_var_abbrevs", {})

    #"id" is normally added by annotate_track_with_variants; number the rows the same way if it is absent.
    if "id" not in variant_df.columns:
        variant_df["id"] = pd.Series([str(i) for i in range(1, variant_df.shape[0]+1)], index=variant_df.index)

    #Abbreviate a variant type; effects without an abbreviation get an empty string.
    variant_df["var_type_abbrev"] = variant_df["effect"].map(lambda effect: abbrev_d.get(effect, ""))

    def add_multicol(cell_txt):
        #Let an over-long cell span two table columns.
        if len(cell_txt) > max_chars_per_col:
            return r"\multicolumn{2}{l}{" + cell_txt + "}"
        return cell_txt

    cell_txt_s = (variant_df["id"] + ": " + variant_df["dnachange"] + ", " + variant_df["prot_change"]
                  + ", (" + variant_df["var_type_abbrev"] + ")")
    variant_df["variant_txt_str"] = cell_txt_s.map(mark_up_special_chars).map(add_multicol)

    #Raw pattern with an escaped backslash: in the regex module a bare "\m" is the
    #start-of-word anchor, so an unescaped pattern never matches "\multicolumn".
    multicol_pattern = re.compile(r"\\multicolumn\{(\d+)\}")

    key_parts = [r"\begin{tabular}{" + "l"*num_cols + r"} \\ "]
    col_idx = 0 #Number of table columns already used in the current row.
    for var_txt_str in variant_df["variant_txt_str"].tolist():
        m = multicol_pattern.search(var_txt_str)
        cols_for_str = int(m.group(1)) if m is not None else 1
        if col_idx + cols_for_str > num_cols:
            #Cell does not fit: start a new table row.
            key_parts.append(r" \\ " + var_txt_str)
            col_idx = cols_for_str
        else:
            if col_idx > 0:
                key_parts.append(" & ")
            key_parts.append(var_txt_str)
            col_idx += cols_for_str

    key_parts.append(r"\end{tabular}")
    variant_key_txt = "".join(key_parts)
    variant_key.text(setting_dict["v_key_x"], setting_dict["v_key_y"], variant_key_txt, verticalalignment='top', ha='center',
                     size=setting_dict["v_key_fontsize"], transform=variant_key.transAxes)

    return variant_key


def mark_up_special_chars(some_text):

    '''Mark up special characters for latex text.

    Underscores are escaped as "\\_" and ">" is wrapped in math mode as "$>$".

    Args:
        | some_text (str): text to mark up.

    Returns:
        some_text (str): the marked-up text.
    '''

    #Raw string: "\_" in a normal literal is an invalid escape sequence.
    latex_markup = str.maketrans({"_": r"\_", ">": "$>$"})

    return some_text.translate(latex_markup)