Source code for protdomains

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sys
#import random
import numpy as np


'''
Functions specific to protein domains: reading in the protein domains and creating a protein domains track.
'''

[docs]def get_protein_domain_df(protein_domain_file, transcript_l, database, sortby_col_l): '''Read the protein domain information from a tsv file into a DataFrame. Args: | protein_domain_file (str): path to protein domain file. | transcript_l (list of strs): list of Ensembl transcript IDs. | database (str): protein domain database. | sortby_col_l (list of strs): columns to sort the DataFrame by. Returns: protein_domain_df (DataFrame): contains the protein domain information. ''' protein_domain_df = pd.DataFrame([]) protein_domain_df_chunker = pd.read_csv(protein_domain_file, sep="\t", chunksize=1000) for protein_domain_df_chunk in protein_domain_df_chunker: protein_domain_df_chunk = protein_domain_df_chunk[["TranscriptID","Length","DomainID","Start","End","Domain_type","name","UniprotID"]] protein_domain_df_chunk = protein_domain_df_chunk[(protein_domain_df_chunk["TranscriptID"].isin(transcript_l)) & (protein_domain_df_chunk["Domain_type"]==database)] protein_domain_df = pd.concat([protein_domain_df, protein_domain_df_chunk], ignore_index=True) protein_domain_df.sort_values(by=sortby_col_l, inplace=True) return protein_domain_df
[docs]def get_protein_domain_color_s(protein_domain_df): '''Create a protein domain color series where the indexes are the protein domains, and the values are the colors. Args: | protein_domain_df (DataFrame): contains the protein domain information Returns: protein_domain_color_s (Series): contains colors indexed by protein domain. ''' protein_domain_l = pd.unique(protein_domain_df["DomainID"]) #random.shuffle(protein_domain_l) cmap = mpl.cm.get_cmap(name="Set1", lut=len(protein_domain_l)) cmap_tmp_l = [cmap(i) for i in range(len(protein_domain_l))] cmap_l = [] for c in cmap_tmp_l: if isinstance(c, str): cmap_l.append(c) elif isinstance(c, tuple): cmap_l.append(",".join([str(f) for f in list(c)])) protein_domain_color_s = pd.Series(data=cmap_l, index=protein_domain_l) protein_domain_color_s.ix["none"] = "#D1D0CE" protein_domain_color_s.ix["utr"] = "white" return protein_domain_color_s
[docs]def make_track(track, protein_domain_df, utr_df, protein_domain_color_s, setting_dict, variant_track): '''Make the protein domain track. Args: | track (matplotlib.axes.Axes): axis for the protein domain track. | protein_domain_df (DataFrame): contains the protein domain information. | utr_df (DataFrame): contains the utr information. | protein_domain_color_s (Series): colors indexed by protein domain IDs. | setting_dict (dictionary): settings for making the png. | variants_track (matplotlib.axes.Axes): axis for the variant track.''' #Work out the protein domain start and end transcript positions. protein_domain_df["start_tp_pc"] = (protein_domain_df["Start"]/protein_domain_df["Length"]).astype(float) protein_domain_df["end_tp_pc"] = (protein_domain_df["End"]/protein_domain_df["Length"]).astype(float) transcript_len = utr_df["end_tp"].max() tr_start = utr_df[utr_df["utr"]=="5'"]["end_tp"].max() tr_end = utr_df[utr_df["utr"]=="3'"]["start_tp"].min() - 1 tr_diff = tr_end - tr_start protein_domain_df["start_tp"] = (tr_start+tr_diff*protein_domain_df["start_tp_pc"]).astype(int) protein_domain_df["end_tp"] = (tr_start+tr_diff*protein_domain_df["end_tp_pc"]).astype(int) #Make the colorbar. [protein_domain_bound_l, bound_l, color_l] = get_bound_color_ls_for_cb(protein_domain_df, utr_df, protein_domain_color_s, setting_dict["pd_track_stripe_min_bases"]) cmap = mpl.colors.ListedColormap(color_l) norm = mpl.colors.BoundaryNorm(bound_l, cmap.N) cb = mpl.colorbar.ColorbarBase(track, cmap=cmap, norm=norm, boundaries=bound_l, spacing='proportional', orientation='horizontal', ticks=[], drawedges=False)#, alpha=0.5) plt.setp(track.get_yticklabels(),visible=False) plt.rc('text',usetex=True) track.set_ylabel(setting_dict["pd_track_y_label"], rotation='horizontal', ha='right', va=setting_dict["pd_track_y_label_va"], position=(setting_dict["pd_track_y_label_x"], setting_dict["pd_track_y_label_y"]), size=setting_dict["pd_track_y_label_fontsize"]) #Make the protein domain track legend. make_legend(track, protein_domain_df, protein_domain_color_s, setting_dict["pd_track_legend_fontsize"], setting_dict["pd_track_legend_max_chars_per_row"], setting_dict["pd_track_legend_bbox"]) #Add tick marks for the protein domains to the variant track. if variant_track != None: variant_track = set_colorbar_ticks(variant_track, protein_domain_bound_l, transcript_len, "both", 5, 1)
[docs]def get_bound_color_ls_for_cb(protein_domain_df, utr_df, protein_domain_color_s, stripe_min_bases): ''' Get the bound and color lists for the colorbar. These are (1) pd_bound_l, list of protein domain bounds which can be used to mark the protein domains on the variant track; (2) whole_trans_stripe_bound_l, list of regions covering the whole transcript in which contiguous regions have different contents (utr, protein domain(s), empty), and regions containing > 1 protein domain are striped. This list will be used to make the protein domain colorbar; (3) whole_trans_stripe_color_l, list of colors corresponding to whole_trans_stripe_bound_l. Args: | protein_domain_df (DataFrame): contains the protein domain information. | utr_df (DataFrame): contains the utr information. | protein_domain_color_s (Series): colors indexed by protein domain IDs. | stripe_min_bases: minimum stripe width in bases. Returns: | pd_bound_l (list of ints): list of protein domain bounds | whole_trans_stripe_bound_l (list of ints): list of utr and protein domain bounds | whole_trans_stripe_color_l (list of strs and tuples of floats): list of corresponding colors for the latter. ''' #Get the bounds without the utrs. protein_domain_start_tp_l = protein_domain_df["start_tp"].tolist() protein_domain_end_tp_l = protein_domain_df["end_tp"].tolist() pd_bound_l = [[protein_domain_start_tp_l[i], protein_domain_end_tp_l[i]] for i in range(len(protein_domain_start_tp_l))] pd_bound_l = [bound for bound_l in pd_bound_l for bound in bound_l] #Add the utrs to the protein domains. bound_color_df = pd.concat([protein_domain_df, utr_df]) bound_color_df = bound_color_df[["DomainID", "start_tp", "end_tp"]] bound_color_df["DomainID"].fillna(value="utr", inplace=True) bound_color_df.sort_values(["start_tp", "end_tp", "DomainID"], inplace=True) #print bound_color_df #Divide the whole transcript into regions such that contiguous regions differ wrt their contents (utr, protein domain(s), empty) domain_l = bound_color_df["DomainID"].tolist() start_bound_l = bound_color_df["start_tp"].tolist() end_bound_l = bound_color_df["end_tp"].tolist() pd_utr_bound_ll = [[start_bound_l[i], end_bound_l[i]] for i in range(len(start_bound_l))] whole_trans_bound_ll = get_whole_trans_bound_ll(pd_utr_bound_ll) whole_trans_domain_ll = get_whole_trans_domain_ll(whole_trans_bound_ll, pd_utr_bound_ll, domain_l) #print whole_trans_domain_ll #Add the none regions. whole_trans_domain_ll = [["none"] if len(whole_trans_domain_ll[i]) == 0 else whole_trans_domain_ll[i] for i in range(len(whole_trans_domain_ll))] #print whole_trans_domain_ll #Add bounds for stripes in the regions containing > 1 protein domain. whole_trans_stripe_bound_l, whole_trans_stripe_color_l = [],[] for i in range(len(whole_trans_bound_ll)): if len(whole_trans_domain_ll[i]) == 1: whole_trans_stripe_bound_l.append(whole_trans_bound_ll[i][0]) whole_trans_stripe_color_l.append(protein_domain_color_s.ix[whole_trans_domain_ll[i][0]]) else: [stripe_bound_l, stripe_domain_l] = generate_stripe_ls(whole_trans_bound_ll, whole_trans_domain_ll, i, stripe_min_bases) whole_trans_stripe_bound_l.extend(stripe_bound_l[:-1])#Don't need the region end bound (use next region start bound in its place). whole_trans_stripe_color_l.extend([protein_domain_color_s.ix[domain] for domain in stripe_domain_l]) whole_trans_stripe_bound_l.append(whole_trans_bound_ll[-1][1]) #print whole_trans_stripe_bound_l #print len(whole_trans_stripe_bound_l) #print whole_trans_stripe_color_l #print len(whole_trans_stripe_color_l) return [pd_bound_l, whole_trans_stripe_bound_l, whole_trans_stripe_color_l]
[docs]def get_whole_trans_bound_ll(pd_utr_bound_ll): '''Divide the whole transcript into regions such that contiguous regions differ wrt their contents (utr, protein domain(s), empty), and return their bounds. Args: | pd_utr_bound_ll (list of list of ints): the start and end bounds of the utrs and protein domains. Returns: whole_trans_bound_ll (list of list of ints): lists containing start and end bound coordinates covering the whole transcript, where contiguous regions differ wrt their contents (utr, protein domain(s), neither). ''' #print "pd_utr_bound_ll: " + str(pd_utr_bound_ll) se_ll = [["s","e"] for bound_l in pd_utr_bound_ll] se_l = [bound_type for se_l in se_ll for bound_type in se_l] bound_l = [bound for bound_l in pd_utr_bound_ll for bound in bound_l] idx_l = sorted(range(len(bound_l)), key=lambda k: bound_l[k]) bound_l = sorted(bound_l) se_l = [se_l[idx] for idx in idx_l] #print bound_l #print se_l whole_trans_bound_ll = [] for i in range(len(bound_l)-1): bound = [] if se_l[i] == "s": bound.append(bound_l[i]) else: bound.append(bound_l[i] + 1) if se_l[i+1] == "s": bound.append(bound_l[i+1] - 1) else: bound.append(bound_l[i+1]) if i > 0: if bound[0] <= whole_trans_bound_ll[-1][1]: bound[0] = whole_trans_bound_ll[-1][1] + 1 if bound[1] >= bound[0]: whole_trans_bound_ll.append(bound) return whole_trans_bound_ll
[docs]def get_whole_trans_domain_ll(whole_trans_bound_ll, pd_utr_bound_ll, domain_l): '''Get the utr or protein domain(s) in each region defined in whole_trans_bound_ll. Args: | whole_trans_bound_ll (list of lists of ints): lists containing start and end bound coordinates covering the whole transcript, where contiguous regions differ wrt their contents (utr, protein domain(s), empty). | pd_utr_bound_ll (list of lists of ints): the start and end bounds of the utrs and protein domains. | domain_l (list of strs): list of protein domains corresponding to the bounds in pd_utr_bound_ll. Returns: whole_trans_domain_ll (list of lists of strs): the contents (utr, protein domain(s), empty) of each region in whole_trans_bound_ll. ''' whole_trans_domain_ll = [] for i in range(len(whole_trans_bound_ll)): overlap_domain_l = [] new_set = set(range(whole_trans_bound_ll[i][0],whole_trans_bound_ll[i][1]+1)) for j in range(len(pd_utr_bound_ll)): old_set = set(range(pd_utr_bound_ll[j][0],pd_utr_bound_ll[j][1]+1)) if len(new_set.intersection(old_set)) > 0: overlap_domain_l.append(domain_l[j]) whole_trans_domain_ll.append(overlap_domain_l) #print whole_trans_domain_ll return whole_trans_domain_ll
[docs]def generate_stripe_ls(whole_trans_bound_ll, whole_trans_domain_ll, i, stripe_min_bases): '''Generate the stripe lists for regions which contain greater than 1 protein domain. The 1st color will be for a protein domain not in the left-neighbouring region, and the last will be for a protein domain not in the right-neigbouring region. Args: | whole_trans_bound_ll (list of lists of ints): lists containing start and end bound coordinates covering the whole transcript, where contiguous regions differ wrt their contents (utr, protein domain(s), empty). | whole_trans_domain_ll (list of lists of strs): the contents (utr, protein domain(s), empty) of each region in whole_trans_bound_ll. | i (int): index in whole_trans_bound_ll. | stripe_min_bases: minimum stripe width in bases. Returns: | stripe_bound_l (list of ints): bounds for stripes, including end bound of region. | stripe_domain_l (list of strs): protein domains corresponding to the stripe bounds (alternates through the protein domains in this region). ''' def get_domain_removed(domain_l_1, domain_l_2): diff_l = list(set(domain_l_1) - set(domain_l_2)) new_domain = None if len(diff_l) == 0 else diff_l[0] return new_domain #Work out the 1st domain, last domain and other domains. first_domain = None if i == 0 else get_domain_removed(whole_trans_domain_ll[i], whole_trans_domain_ll[i-1]) last_domain = None if i >= len(whole_trans_domain_ll) - 1 else get_domain_removed(whole_trans_domain_ll[i], whole_trans_domain_ll[i+1]) other_domain_l = [domain for domain in whole_trans_domain_ll[i] if domain not in [first_domain, last_domain]] domain_l = list(filter(lambda x: x != None, pd.unique([first_domain] + other_domain_l + [last_domain]))) #Work out the number of intervals. num_intervals = len(list(filter(lambda x: x != None, [first_domain] + other_domain_l + [last_domain]))) interval_inc = num_intervals - 1 if first_domain == last_domain and first_domain != None else num_intervals region_len = whole_trans_bound_ll[i][1] - whole_trans_bound_ll[i][0] while float(region_len)/num_intervals >= stripe_min_bases: num_intervals += interval_inc #Generate the stripe bounds and corresponding domains. stripe_bound_l = list(np.linspace(whole_trans_bound_ll[i][0], whole_trans_bound_ll[i][1], num_intervals+1)) stripe_domain_l = [domain_l[k % len(domain_l)] for k in range(len(stripe_bound_l)-1)] return [stripe_bound_l, stripe_domain_l]
[docs]def make_legend(protein_domain_track, protein_domain_df, protein_domain_color_s, fontsize, max_chars_per_row, bbox): '''Make the protein domains track legend. Args: | protein_domain_track (matplotlib.axes.Axes): axis for the protein domain track. | protein_domain_df (DataFrame): contains the protein domain information. | protein_domain_color_s: colors indexed by protein domain IDs. | fontsize (int): fontsize for text in the legend. | max_chars_per_row: maximum number of characters for 1 row of the legend. | bbox: where to anchor the legend (uses the axes coordinate system so (0,0) is bottom left and (1.0,1.0) is top right. ''' protein_domain_df = protein_domain_df.drop_duplicates(["DomainID"]) pds_df_minus_utrs = protein_domain_df[~protein_domain_df["DomainID"].isin(["utr_3","utr_5"])] handles = [mpl.patches.Patch(facecolor=protein_domain_color_s.ix[protein_domain]) for protein_domain in pds_df_minus_utrs["DomainID"]] labels = [description for description in pds_df_minus_utrs["name"].dropna().tolist()] n_cols = get_legend_num_cols(pds_df_minus_utrs["name"].dropna().tolist(),max_chars_per_row) legend = protein_domain_track.legend(handles, labels, bbox_to_anchor=bbox, ncol=n_cols, mode="expand", bbox_transform=protein_domain_track.transAxes) for label in legend.get_texts(): label.set_fontsize(fontsize)
[docs]def get_legend_num_cols(protein_domain_description_l, max_num_chars_per_row): '''Work out the number of columns to use in the protein domain track legend. Args: | protein_domain_description_l (list of strs): protein domain descriptions | max_num_chars_per_row (int): maximum number of characters for 1 row of the legend. Returns: num_cols (int): number of columns. ''' patch_len = 6 min_patch_text_gap = 2 min_text_patch_gap = 5 num_cols = len(protein_domain_description_l) while num_cols >= 1: zero_rows_too_long = True num_chars_in_row = 0 for i in range(len(protein_domain_description_l)): num_chars_in_row += patch_len + min_patch_text_gap + len(protein_domain_description_l[i]) + min_text_patch_gap if num_chars_in_row > max_num_chars_per_row: zero_rows_too_long = False if (i+1) % num_cols == 0: num_chars_in_row = 0 if zero_rows_too_long == True: break num_cols -= 1 return num_cols
[docs]def set_colorbar_ticks(track, bound_l, transcript_len, top_or_bottom, markersize, markeredgewidth): '''Set the ticks on a colorbar. Args: | track (matplotlib.axes.Axes): axis for a track. | bound_l: list of ints | transcript_len (int): length of the transcript. | top_or_bottom (str): indicates whether the ticks should be below or above or both. | markersize (int): tick size. | markeredgewidth (int): tick width. Returns: track (matplotlib.axes.Axes): the axis with ticks added. ''' for i in range(len(bound_l)): bound_l[i] = float(bound_l[i]-1)/float(transcript_len-1) #This assumes the xlim is (1, transcript_length). plt.setp(track.xaxis.get_ticklines(),'markersize', markersize) plt.setp(track.xaxis.get_ticklines(),'markeredgewidth', markeredgewidth) track.xaxis.set_ticks(bound_l) track.xaxis.set_ticks_position(top_or_bottom) track.set_xticklabels([""]*len(bound_l)) return track