import utrs as u
import variants as v
import protdomains as pds
import coverage as c
import matplotlib.pyplot as plt
import sys
import pandas as pd
def make_protein_domain_color_file(protein_domain_file, transcript_l, database, sortby_col_l, out_path):
    '''Make a protein domain color file.

    The protein domains are loaded with pds.get_protein_domain_df, a color Series is
    derived from them with pds.get_protein_domain_color_s, and that Series is written
    to out_path as a header-less CSV.

    Args:
        | protein_domain_file (str): path to file containing the protein domain information.
        | transcript_l (list of strs): list of Ensembl transcript IDs.
        | database (str): protein domain database.
        | sortby_col_l (list of strs): list of names of columns to sort the DataFrame by.
        | out_path (str): path to write the protein domain color file to.
    '''
    print("make_protein_domain_color_file")
    protein_domain_df = pds.get_protein_domain_df(protein_domain_file, transcript_l, database, sortby_col_l)
    protein_domain_color_s = pds.get_protein_domain_color_s(protein_domain_df)
    #make_png reads this file back with header=None, so no header row must be written.
    #Recent pandas versions write a header for a Series unless told otherwise.
    protein_domain_color_s.to_csv(out_path, header=False)
    print("Written protein domain color file to {0}\n".format(out_path))
def make_exon_coord_file(cov_file, transcript, out_path):
    '''Make an exon coordinate file from a coverage file.

    The coverage DataFrame is loaded with c.get_cov_df (passing None, None and an empty
    list for its remaining arguments), converted with c.get_exon_coord_df and written
    to out_path as CSV including the index.

    Args:
        | cov_file (str): path to coverage file.
        | transcript (str): Ensembl transcript ID.
        | out_path (str): path to write the exon coordinate file to.
    '''
    print("make_exon_coord_file")
    cov_df = c.get_cov_df(cov_file, transcript, None, None, [])
    exon_coord_df = c.get_exon_coord_df(cov_df)
    #The index is written out because make_png reads this file with index_col="exon".
    exon_coord_df.to_csv(out_path, index=True)
    print("Written exon coordinate file to {0}\n".format(out_path))
def make_png(transcript_l, title_l, track_l, sample_ll, utr_file_l, exon_coord_file_l, cov_file_l,
             variant_file_l, protein_domain_file_l, protein_domain_color_file, setting_dict, png_file):
    '''Make a png which contains coverage/variants/protein domain tracks for 1 or more transcripts, subject to space limitations.

    The list parameters are parallel: element i of each one describes transcript i.
    The figure is created through pyplot and is left open after it has been saved.

    Args:
        | transcript_l (list of strs): Ensembl transcript ID(s)
        | title_l (list of strs): title to use for each transcript.
        | track_l (list of strs): list of strings of length 3 which encode whether to generate each of the 3 tracks (coverage, variants, protein domains). A track is generated when its character is "1".
        | sample_ll (list of list of strs): list of lists of sample IDs.
        | utr_file_l (list of strs): list of utr file paths.
        | exon_coord_file_l (list of strs): list of exon coordinate file paths.
        | cov_file_l (list of strs): list of coverage file paths.
        | variant_file_l (list of strs): list of variant file paths.
        | protein_domain_file_l (list of strs): list of protein domain file paths.
        | protein_domain_color_file (str): protein domain color file.
        | setting_dict (dictionary): settings for making the png.
        | png_file (str): path to write the png file to.

    Returns:
        False if the list parameters differ in length, or if the requested tracks need
        more rows than setting_dict["fig_num_rows"]. Otherwise None, once the png has
        been written.
    '''
    print("make_png")
    #Check the parameters are well-formed.
    list_param_l = [transcript_l, title_l, track_l, sample_ll, utr_file_l, exon_coord_file_l,
                    cov_file_l, variant_file_l, protein_domain_file_l]
    if len(set(len(list_param) for list_param in list_param_l)) != 1:
        print("ERROR: Parameters of make_png function which are lists must all be the same length.\n")
        return False
    #Check that the maximum number of tracks is not exceeded.
    #A gap is only counted between two tracks, and its size depends on the track above it.
    #NOTE: the rows used by title tracks (t_track_rows) are not part of this estimate.
    num_rows, previous_track = 0, None
    track_gap_dict = {"c": setting_dict["c_track_gap_rows"],
                      "v": setting_dict["v_track_gap_rows"],
                      "pd": setting_dict["pd_track_gap_rows"],
                      None: 0}
    for track_s in track_l:
        if track_s[0] == "1":
            num_rows += track_gap_dict[previous_track] + setting_dict["c_track_rows"]
            previous_track = "c"
        if track_s[1] == "1":
            num_rows += (track_gap_dict[previous_track] + setting_dict["v_anns_top_rows"] + setting_dict["v_track_rows"]
                         + setting_dict["v_anns_bot_rows"] + setting_dict["v_key_rows"])
            previous_track = "v"
        if track_s[2] == "1":
            num_rows += setting_dict["pd_track_rows"] + track_gap_dict[previous_track]
            previous_track = "pd"
    if num_rows > setting_dict["fig_num_rows"]:
        print("ERROR: PNG figure requires {0} rows but there are only {1}\n".format(num_rows,setting_dict["fig_num_rows"]))
        return False
    #Initialise the figure.
    fig = plt.figure()
    plt.rc('text', usetex=True)

    def get_tp_from_exon_bp_l(bp, exon_bp_l):
        #Transcript positions are 1-based; a bp that is not exonic raises ValueError.
        return exon_bp_l.index(int(bp)) + 1

    #Make the axes.
    num_rows = setting_dict["fig_num_rows"]
    start_row = 0
    title_1_coords = None
    for i in range(len(transcript_l)):
        print("Transcript: {0}".format(transcript_l[i]))
        #Read in the exon positions and the utrs.
        exon_coord_df = pd.read_csv(exon_coord_file_l[i], index_col="exon")
        #>= rather than > so that a 1 bp exon (start_bp == end_bp) on the forward strand
        #does not cause the whole transcript to be treated as reverse.
        strand = "+" if (exon_coord_df["end_bp"] >= exon_coord_df["start_bp"]).all() else "-"
        if strand == "+":
            print("Transcription direction: forward")
        elif strand == "-":
            print("Transcript direction: reverse")
        #List every exonic base pair in transcription order.
        bp_step = 1 if strand == "+" else -1
        exon_bp_l = []
        for exon_start_bp, exon_end_bp in zip(exon_coord_df["start_bp"], exon_coord_df["end_bp"]):
            exon_bp_l.extend(range(exon_start_bp, exon_end_bp + bp_step, bp_step))
        utr_df = u.get_utr_df(utr_file_l[i], strand, transcript_l[i])
        #Add the transcript positions to utr_df.
        utr_df["start_tp"] = utr_df["start_bp"].apply(func=get_tp_from_exon_bp_l, exon_bp_l=exon_bp_l)
        utr_df["end_tp"] = utr_df["end_bp"].apply(func=get_tp_from_exon_bp_l, exon_bp_l=exon_bp_l)
        #Make the title track.
        if i == 0:
            #The first title is placed in figure coordinates and uses no grid rows.
            plt.figtext(setting_dict["title_1_fig_x"], setting_dict["title_1_fig_y"], title_l[i], fontsize=setting_dict["title_fontsize"])
            title_1_coords = fig.transFigure.transform((setting_dict["title_1_fig_x"], setting_dict["title_1_fig_y"]))
        else:
            title_track = plt.subplot2grid((num_rows,1), (start_row,0))
            title_track.set_axis_off()
            #Use the x position of the first title, converted into this axes' data coordinates.
            inv = title_track.transData.inverted()
            title_track.text(inv.transform(title_1_coords)[0], setting_dict["title_2_ax_y"], title_l[i], fontsize=setting_dict["title_fontsize"])
            start_row += setting_dict["t_track_rows"]
        #Make the coverage track.
        if track_l[i][0] == "1":
            print("Making coverage track.")
            cov_df = c.get_cov_df(cov_file_l[i], transcript_l[i], None, None, sample_ll[i])
            [bound_l, color_l, edge_color_l] = get_exon_bound_color_l(exon_coord_df, utr_df, strand)
            coverage_track = plt.subplot2grid((num_rows,1), (start_row, 0), rowspan=setting_dict["c_track_rows"])
            start_row += setting_dict["c_track_rows"]
            c.make_track(coverage_track, cov_df, bound_l, color_l, edge_color_l, setting_dict)
            start_row += setting_dict["c_track_gap_rows"]
        #Make the variants track.
        #variant_track stays None when no variants track is requested; it is still passed to pds.make_track.
        variant_track = None
        if track_l[i][1] == "1":
            print("Making variant track.")
            variant_df = v.get_variant_df(transcript_l[i], variant_file_l[i])
            variant_df.rename(columns={"pos":"bp"}, inplace=True)
            variant_df["tp"] = variant_df["bp"].apply(func=get_tp_from_exon_bp_l, exon_bp_l=exon_bp_l)
            variant_df.sort_values(by="tp", inplace=True)
            #Leave rows free above and below the track for the annotations.
            start_row += setting_dict["v_anns_top_rows"]
            variant_track = plt.subplot2grid((num_rows,1), (start_row,0))
            start_row += setting_dict["v_track_rows"]
            start_row += setting_dict["v_anns_bot_rows"]
            variant_key = plt.subplot2grid((num_rows,1), (start_row,0), rowspan=setting_dict["v_key_rows"])
            start_row += setting_dict["v_key_rows"]
            [bound_l, color_l, edge_color_l] = get_exon_bound_color_l(exon_coord_df, utr_df, strand)
            #The last bound is passed separately as well as within bound_l.
            v.make_track(variant_track, bound_l[-1], bound_l, color_l, edge_color_l, variant_df, setting_dict, variant_key)
            start_row += setting_dict["v_track_gap_rows"]
        #Make the protein domain track.
        if track_l[i][2] == "1":
            print("Making protein domain track.")
            protein_domain_df = pds.get_protein_domain_df(protein_domain_file_l[i], [transcript_l[i]], "Pfam", ["Start","End"])
            protein_domain_track = plt.subplot2grid((num_rows,1), (start_row,0))
            start_row += setting_dict["pd_track_rows"]
            protein_domain_color_df = pd.read_csv(protein_domain_color_file, header=None, names=["Domain", "Color"], index_col="Domain")
            #A color containing commas is converted to a tuple of floats; any other color is kept as a string.
            protein_domain_color_s = protein_domain_color_df["Color"].apply(
                lambda x: x if "," not in x else tuple(float(f) for f in x.split(",")))
            del protein_domain_color_df
            pds.make_track(protein_domain_track, protein_domain_df, utr_df, protein_domain_color_s, setting_dict, variant_track)
            start_row += setting_dict["pd_track_gap_rows"]
    fig.set_size_inches(setting_dict["fig_width_inches"], setting_dict["fig_height_inches"])
    plt.savefig(png_file, dpi=setting_dict["fig_dpi"])
    print("Written {0}.\n".format(png_file))
def get_exon_bound_color_l(exon_coord_df, utr_df, strand):
    '''Get the bounds, colors and edge colors required to generate a color bar that displays the utrs and exons for a transcript.

    The exon and utr rows are combined and ordered by transcript position. Where a row
    starts before the end of the row above it, its start is moved to just after that
    end, and rows with identical start and end positions are collapsed into one.
    Utr segments are colored white; the remaining segments alternate between two
    exon colors. Neither input DataFrame is modified.

    Args:
        | exon_coord_df (DataFrame): contains the exon base pair and transcript position coordinates (columns start_bp, end_bp, start_tp, end_tp).
        | utr_df (DataFrame): contains the utr base pair and transcript position coordinates (columns utr, start_bp, end_bp, start_tp, end_tp).
        | strand (str): specifies whether the transcript is on the positive or negative strand. Not used by this function.

    Returns:
        exon_bound_color_ll (list of lists of ints and strs): contains bound_l, the list of bounds, color_l, the list of colors and edge_color_l, the list of edge colors.
    '''
    bound_color_df = pd.concat([exon_coord_df, utr_df])
    bound_color_df.drop(["start_bp", "end_bp"], axis=1, inplace=True)
    #Rows without a utr label are labelled as exons. The result is assigned back to the
    #column because an in-place fillna on a column selection does not reliably update
    #the DataFrame.
    bound_color_df["utr"] = bound_color_df["utr"].fillna("exon")
    bound_color_df.sort_values(by=["start_tp", "end_tp", "utr"], inplace=True)
    bound_color_df.reset_index(drop=True, inplace=True)
    #Trim each segment that starts before the end of the segment above it.
    for i in range(1, len(bound_color_df)):
        previous_end_tp = bound_color_df.loc[i - 1, "end_tp"]
        if bound_color_df.loc[i, "end_tp"] == previous_end_tp:
            #Ends coincide: either a duplicate removed below, or a segment kept as it is.
            continue
        if bound_color_df.loc[i, "start_tp"] < previous_end_tp:
            bound_color_df.loc[i, "start_tp"] = previous_end_tp + 1
    #Of rows with the same start and end the first is kept; "3'" and "5'" sort before "exon".
    bound_color_df.drop_duplicates(["start_tp", "end_tp"], inplace=True)
    #Utrs are white; the other segments alternate between the two exon colors.
    exon_color_l = ["red", "#6E6E6E"]
    exon_color_i = 0
    color_l = []
    for utr in bound_color_df["utr"]:
        if utr == "5'" or utr == "3'":
            color_l.append("white")
        else:
            color_l.append(exon_color_l[exon_color_i])
            exon_color_i = 1 - exon_color_i
    #The bounds are the segment starts, closed off by the end of the last segment.
    bound_l = bound_color_df["start_tp"].tolist()
    transcript_len = bound_color_df["end_tp"].tolist()[-1]
    if bound_l[-1] != transcript_len:
        bound_l.append(transcript_len)
    #White segments get a black edge so that they remain visible.
    edge_color_l = ["black" if color == "white" else color for color in color_l]
    exon_bound_color_ll = [bound_l, color_l, edge_color_l]
    return exon_bound_color_ll