import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sys
#import random
import numpy as np
'''
Functions specific to protein domains: reading in the protein domains and creating a protein domains track.
'''
[docs]def get_protein_domain_df(protein_domain_file, transcript_l, database, sortby_col_l):
'''Read the protein domain information from a tsv file into a DataFrame.
Args:
| protein_domain_file (str): path to protein domain file.
| transcript_l (list of strs): list of Ensembl transcript IDs.
| database (str): protein domain database.
| sortby_col_l (list of strs): columns to sort the DataFrame by.
Returns:
protein_domain_df (DataFrame): contains the protein domain information.
'''
protein_domain_df = pd.DataFrame([])
protein_domain_df_chunker = pd.read_csv(protein_domain_file, sep="\t", chunksize=1000)
for protein_domain_df_chunk in protein_domain_df_chunker:
protein_domain_df_chunk = protein_domain_df_chunk[["TranscriptID","Length","DomainID","Start","End","Domain_type","name","UniprotID"]]
protein_domain_df_chunk = protein_domain_df_chunk[(protein_domain_df_chunk["TranscriptID"].isin(transcript_l)) & (protein_domain_df_chunk["Domain_type"]==database)]
protein_domain_df = pd.concat([protein_domain_df, protein_domain_df_chunk], ignore_index=True)
protein_domain_df.sort_values(by=sortby_col_l, inplace=True)
return protein_domain_df
[docs]def get_protein_domain_color_s(protein_domain_df):
'''Create a protein domain color series where the indexes are the protein domains, and the values are the colors.
Args:
| protein_domain_df (DataFrame): contains the protein domain information
Returns:
protein_domain_color_s (Series): contains colors indexed by protein domain.
'''
protein_domain_l = pd.unique(protein_domain_df["DomainID"])
#random.shuffle(protein_domain_l)
cmap = mpl.cm.get_cmap(name="Set1", lut=len(protein_domain_l))
cmap_tmp_l = [cmap(i) for i in range(len(protein_domain_l))]
cmap_l = []
for c in cmap_tmp_l:
if isinstance(c, str):
cmap_l.append(c)
elif isinstance(c, tuple):
cmap_l.append(",".join([str(f) for f in list(c)]))
protein_domain_color_s = pd.Series(data=cmap_l, index=protein_domain_l)
protein_domain_color_s.ix["none"] = "#D1D0CE"
protein_domain_color_s.ix["utr"] = "white"
return protein_domain_color_s
[docs]def make_track(track, protein_domain_df, utr_df, protein_domain_color_s, setting_dict, variant_track):
'''Make the protein domain track.
Args:
| track (matplotlib.axes.Axes): axis for the protein domain track.
| protein_domain_df (DataFrame): contains the protein domain information.
| utr_df (DataFrame): contains the utr information.
| protein_domain_color_s (Series): colors indexed by protein domain IDs.
| setting_dict (dictionary): settings for making the png.
| variants_track (matplotlib.axes.Axes): axis for the variant track.'''
#Work out the protein domain start and end transcript positions.
protein_domain_df["start_tp_pc"] = (protein_domain_df["Start"]/protein_domain_df["Length"]).astype(float)
protein_domain_df["end_tp_pc"] = (protein_domain_df["End"]/protein_domain_df["Length"]).astype(float)
transcript_len = utr_df["end_tp"].max()
tr_start = utr_df[utr_df["utr"]=="5'"]["end_tp"].max()
tr_end = utr_df[utr_df["utr"]=="3'"]["start_tp"].min() - 1
tr_diff = tr_end - tr_start
protein_domain_df["start_tp"] = (tr_start+tr_diff*protein_domain_df["start_tp_pc"]).astype(int)
protein_domain_df["end_tp"] = (tr_start+tr_diff*protein_domain_df["end_tp_pc"]).astype(int)
#Make the colorbar.
[protein_domain_bound_l, bound_l, color_l] = get_bound_color_ls_for_cb(protein_domain_df, utr_df, protein_domain_color_s,
setting_dict["pd_track_stripe_min_bases"])
cmap = mpl.colors.ListedColormap(color_l)
norm = mpl.colors.BoundaryNorm(bound_l, cmap.N)
cb = mpl.colorbar.ColorbarBase(track, cmap=cmap, norm=norm, boundaries=bound_l,
spacing='proportional', orientation='horizontal', ticks=[], drawedges=False)#, alpha=0.5)
plt.setp(track.get_yticklabels(),visible=False)
plt.rc('text',usetex=True)
track.set_ylabel(setting_dict["pd_track_y_label"], rotation='horizontal', ha='right', va=setting_dict["pd_track_y_label_va"],
position=(setting_dict["pd_track_y_label_x"], setting_dict["pd_track_y_label_y"]), size=setting_dict["pd_track_y_label_fontsize"])
#Make the protein domain track legend.
make_legend(track, protein_domain_df, protein_domain_color_s, setting_dict["pd_track_legend_fontsize"], setting_dict["pd_track_legend_max_chars_per_row"],
setting_dict["pd_track_legend_bbox"])
#Add tick marks for the protein domains to the variant track.
if variant_track != None:
variant_track = set_colorbar_ticks(variant_track, protein_domain_bound_l, transcript_len, "both", 5, 1)
[docs]def get_bound_color_ls_for_cb(protein_domain_df, utr_df, protein_domain_color_s, stripe_min_bases):
'''
Get the bound and color lists for the colorbar. These are (1) pd_bound_l, list of protein domain bounds
which can be used to mark the protein domains on the variant track; (2) whole_trans_stripe_bound_l, list of
regions covering the whole transcript in which contiguous regions have different contents (utr, protein
domain(s), empty), and regions containing > 1 protein domain are striped. This list will be used to make the
protein domain colorbar; (3) whole_trans_stripe_color_l, list of colors corresponding to
whole_trans_stripe_bound_l.
Args:
| protein_domain_df (DataFrame): contains the protein domain information.
| utr_df (DataFrame): contains the utr information.
| protein_domain_color_s (Series): colors indexed by protein domain IDs.
| stripe_min_bases: minimum stripe width in bases.
Returns:
| pd_bound_l (list of ints): list of protein domain bounds
| whole_trans_stripe_bound_l (list of ints): list of utr and protein domain bounds
| whole_trans_stripe_color_l (list of strs and tuples of floats): list of corresponding colors for the latter.
'''
#Get the bounds without the utrs.
protein_domain_start_tp_l = protein_domain_df["start_tp"].tolist()
protein_domain_end_tp_l = protein_domain_df["end_tp"].tolist()
pd_bound_l = [[protein_domain_start_tp_l[i], protein_domain_end_tp_l[i]] for i in range(len(protein_domain_start_tp_l))]
pd_bound_l = [bound for bound_l in pd_bound_l for bound in bound_l]
#Add the utrs to the protein domains.
bound_color_df = pd.concat([protein_domain_df, utr_df])
bound_color_df = bound_color_df[["DomainID", "start_tp", "end_tp"]]
bound_color_df["DomainID"].fillna(value="utr", inplace=True)
bound_color_df.sort_values(["start_tp", "end_tp", "DomainID"], inplace=True)
#print bound_color_df
#Divide the whole transcript into regions such that contiguous regions differ wrt their contents (utr, protein domain(s), empty)
domain_l = bound_color_df["DomainID"].tolist()
start_bound_l = bound_color_df["start_tp"].tolist()
end_bound_l = bound_color_df["end_tp"].tolist()
pd_utr_bound_ll = [[start_bound_l[i], end_bound_l[i]] for i in range(len(start_bound_l))]
whole_trans_bound_ll = get_whole_trans_bound_ll(pd_utr_bound_ll)
whole_trans_domain_ll = get_whole_trans_domain_ll(whole_trans_bound_ll, pd_utr_bound_ll, domain_l)
#print whole_trans_domain_ll
#Add the none regions.
whole_trans_domain_ll = [["none"] if len(whole_trans_domain_ll[i]) == 0 else whole_trans_domain_ll[i] for i in range(len(whole_trans_domain_ll))]
#print whole_trans_domain_ll
#Add bounds for stripes in the regions containing > 1 protein domain.
whole_trans_stripe_bound_l, whole_trans_stripe_color_l = [],[]
for i in range(len(whole_trans_bound_ll)):
if len(whole_trans_domain_ll[i]) == 1:
whole_trans_stripe_bound_l.append(whole_trans_bound_ll[i][0])
whole_trans_stripe_color_l.append(protein_domain_color_s.ix[whole_trans_domain_ll[i][0]])
else:
[stripe_bound_l, stripe_domain_l] = generate_stripe_ls(whole_trans_bound_ll, whole_trans_domain_ll, i, stripe_min_bases)
whole_trans_stripe_bound_l.extend(stripe_bound_l[:-1])#Don't need the region end bound (use next region start bound in its place).
whole_trans_stripe_color_l.extend([protein_domain_color_s.ix[domain] for domain in stripe_domain_l])
whole_trans_stripe_bound_l.append(whole_trans_bound_ll[-1][1])
#print whole_trans_stripe_bound_l
#print len(whole_trans_stripe_bound_l)
#print whole_trans_stripe_color_l
#print len(whole_trans_stripe_color_l)
return [pd_bound_l, whole_trans_stripe_bound_l, whole_trans_stripe_color_l]
[docs]def get_whole_trans_bound_ll(pd_utr_bound_ll):
'''Divide the whole transcript into regions such that contiguous regions differ wrt their contents (utr, protein domain(s), empty), and return their bounds.
Args:
| pd_utr_bound_ll (list of list of ints): the start and end bounds of the utrs and protein domains.
Returns:
whole_trans_bound_ll (list of list of ints): lists containing start and end bound coordinates covering the whole transcript, where contiguous regions differ wrt their contents (utr, protein domain(s), neither).
'''
#print "pd_utr_bound_ll: " + str(pd_utr_bound_ll)
se_ll = [["s","e"] for bound_l in pd_utr_bound_ll]
se_l = [bound_type for se_l in se_ll for bound_type in se_l]
bound_l = [bound for bound_l in pd_utr_bound_ll for bound in bound_l]
idx_l = sorted(range(len(bound_l)), key=lambda k: bound_l[k])
bound_l = sorted(bound_l)
se_l = [se_l[idx] for idx in idx_l]
#print bound_l
#print se_l
whole_trans_bound_ll = []
for i in range(len(bound_l)-1):
bound = []
if se_l[i] == "s":
bound.append(bound_l[i])
else:
bound.append(bound_l[i] + 1)
if se_l[i+1] == "s":
bound.append(bound_l[i+1] - 1)
else:
bound.append(bound_l[i+1])
if i > 0:
if bound[0] <= whole_trans_bound_ll[-1][1]:
bound[0] = whole_trans_bound_ll[-1][1] + 1
if bound[1] >= bound[0]:
whole_trans_bound_ll.append(bound)
return whole_trans_bound_ll
[docs]def get_whole_trans_domain_ll(whole_trans_bound_ll, pd_utr_bound_ll, domain_l):
'''Get the utr or protein domain(s) in each region defined in whole_trans_bound_ll.
Args:
| whole_trans_bound_ll (list of lists of ints): lists containing start and end bound coordinates covering the whole transcript, where contiguous regions differ wrt their contents (utr, protein domain(s), empty).
| pd_utr_bound_ll (list of lists of ints): the start and end bounds of the utrs and protein domains.
| domain_l (list of strs): list of protein domains corresponding to the bounds in pd_utr_bound_ll.
Returns:
whole_trans_domain_ll (list of lists of strs): the contents (utr, protein domain(s), empty) of each region in whole_trans_bound_ll.
'''
whole_trans_domain_ll = []
for i in range(len(whole_trans_bound_ll)):
overlap_domain_l = []
new_set = set(range(whole_trans_bound_ll[i][0],whole_trans_bound_ll[i][1]+1))
for j in range(len(pd_utr_bound_ll)):
old_set = set(range(pd_utr_bound_ll[j][0],pd_utr_bound_ll[j][1]+1))
if len(new_set.intersection(old_set)) > 0:
overlap_domain_l.append(domain_l[j])
whole_trans_domain_ll.append(overlap_domain_l)
#print whole_trans_domain_ll
return whole_trans_domain_ll
[docs]def generate_stripe_ls(whole_trans_bound_ll, whole_trans_domain_ll, i, stripe_min_bases):
'''Generate the stripe lists for regions which contain greater than 1 protein domain. The 1st color will be for
a protein domain not in the left-neighbouring region, and the last will be for a protein domain not in the
right-neigbouring region.
Args:
| whole_trans_bound_ll (list of lists of ints): lists containing start and end bound coordinates covering the whole transcript, where contiguous regions differ wrt their contents (utr, protein domain(s), empty).
| whole_trans_domain_ll (list of lists of strs): the contents (utr, protein domain(s), empty) of each region in whole_trans_bound_ll.
| i (int): index in whole_trans_bound_ll.
| stripe_min_bases: minimum stripe width in bases.
Returns:
| stripe_bound_l (list of ints): bounds for stripes, including end bound of region.
| stripe_domain_l (list of strs): protein domains corresponding to the stripe bounds (alternates through the protein domains in this region).
'''
def get_domain_removed(domain_l_1, domain_l_2):
diff_l = list(set(domain_l_1) - set(domain_l_2))
new_domain = None if len(diff_l) == 0 else diff_l[0]
return new_domain
#Work out the 1st domain, last domain and other domains.
first_domain = None if i == 0 else get_domain_removed(whole_trans_domain_ll[i], whole_trans_domain_ll[i-1])
last_domain = None if i >= len(whole_trans_domain_ll) - 1 else get_domain_removed(whole_trans_domain_ll[i], whole_trans_domain_ll[i+1])
other_domain_l = [domain for domain in whole_trans_domain_ll[i] if domain not in [first_domain, last_domain]]
domain_l = list(filter(lambda x: x != None, pd.unique([first_domain] + other_domain_l + [last_domain])))
#Work out the number of intervals.
num_intervals = len(list(filter(lambda x: x != None, [first_domain] + other_domain_l + [last_domain])))
interval_inc = num_intervals - 1 if first_domain == last_domain and first_domain != None else num_intervals
region_len = whole_trans_bound_ll[i][1] - whole_trans_bound_ll[i][0]
while float(region_len)/num_intervals >= stripe_min_bases:
num_intervals += interval_inc
#Generate the stripe bounds and corresponding domains.
stripe_bound_l = list(np.linspace(whole_trans_bound_ll[i][0], whole_trans_bound_ll[i][1], num_intervals+1))
stripe_domain_l = [domain_l[k % len(domain_l)] for k in range(len(stripe_bound_l)-1)]
return [stripe_bound_l, stripe_domain_l]
[docs]def make_legend(protein_domain_track, protein_domain_df, protein_domain_color_s, fontsize, max_chars_per_row, bbox):
'''Make the protein domains track legend.
Args:
| protein_domain_track (matplotlib.axes.Axes): axis for the protein domain track.
| protein_domain_df (DataFrame): contains the protein domain information.
| protein_domain_color_s: colors indexed by protein domain IDs.
| fontsize (int): fontsize for text in the legend.
| max_chars_per_row: maximum number of characters for 1 row of the legend.
| bbox: where to anchor the legend (uses the axes coordinate system so (0,0) is bottom left and (1.0,1.0) is top right.
'''
protein_domain_df = protein_domain_df.drop_duplicates(["DomainID"])
pds_df_minus_utrs = protein_domain_df[~protein_domain_df["DomainID"].isin(["utr_3","utr_5"])]
handles = [mpl.patches.Patch(facecolor=protein_domain_color_s.ix[protein_domain]) for protein_domain in pds_df_minus_utrs["DomainID"]]
labels = [description for description in pds_df_minus_utrs["name"].dropna().tolist()]
n_cols = get_legend_num_cols(pds_df_minus_utrs["name"].dropna().tolist(),max_chars_per_row)
legend = protein_domain_track.legend(handles, labels, bbox_to_anchor=bbox, ncol=n_cols, mode="expand", bbox_transform=protein_domain_track.transAxes)
for label in legend.get_texts():
label.set_fontsize(fontsize)
[docs]def get_legend_num_cols(protein_domain_description_l, max_num_chars_per_row):
'''Work out the number of columns to use in the protein domain track legend.
Args:
| protein_domain_description_l (list of strs): protein domain descriptions
| max_num_chars_per_row (int): maximum number of characters for 1 row of the legend.
Returns:
num_cols (int): number of columns.
'''
patch_len = 6
min_patch_text_gap = 2
min_text_patch_gap = 5
num_cols = len(protein_domain_description_l)
while num_cols >= 1:
zero_rows_too_long = True
num_chars_in_row = 0
for i in range(len(protein_domain_description_l)):
num_chars_in_row += patch_len + min_patch_text_gap + len(protein_domain_description_l[i]) + min_text_patch_gap
if num_chars_in_row > max_num_chars_per_row:
zero_rows_too_long = False
if (i+1) % num_cols == 0:
num_chars_in_row = 0
if zero_rows_too_long == True:
break
num_cols -= 1
return num_cols
[docs]def set_colorbar_ticks(track, bound_l, transcript_len, top_or_bottom, markersize, markeredgewidth):
'''Set the ticks on a colorbar.
Args:
| track (matplotlib.axes.Axes): axis for a track.
| bound_l: list of ints
| transcript_len (int): length of the transcript.
| top_or_bottom (str): indicates whether the ticks should be below or above or both.
| markersize (int): tick size.
| markeredgewidth (int): tick width.
Returns:
track (matplotlib.axes.Axes): the axis with ticks added.
'''
for i in range(len(bound_l)):
bound_l[i] = float(bound_l[i]-1)/float(transcript_len-1) #This assumes the xlim is (1, transcript_length).
plt.setp(track.xaxis.get_ticklines(),'markersize', markersize)
plt.setp(track.xaxis.get_ticklines(),'markeredgewidth', markeredgewidth)
track.xaxis.set_ticks(bound_l)
track.xaxis.set_ticks_position(top_or_bottom)
track.set_xticklabels([""]*len(bound_l))
return track