import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
'''
Functions specific to coverage: reading in the data and creating the coverage track.
'''
[docs]def get_cov_df(cov_file, transcript, bp_start, bp_end, sample_l):
'''Get the coverage data for a transcript. If bp_start and bp_end are None, then coverage data for the whole transcript will
be extracted. The coordinates in cov_df and exon_coords_df are 1-based.
Args:
| cov_file (str): path to file containing the coverage data.
| transcript (str): Ensembl transcript ID.
| bp_start (int): base pair start coordinate.
| bp_end (int): base pair end coordinate.
| sample_l (list of strs): list of sample IDs.
Returns:
cov_df (DataFrame): contains the coverage data.
'''
print("Reading in coverage data for {0} samples...".format(len(sample_l)))
if bp_start != None and bp_end != None:
if bp_start >= bp_end:
print("WARNING: bp_start {0} is not less than {1}".format(bp_start,bp_end))
cov_df_chunker = pd.read_csv(cov_file, chunksize=1000)
col_to_keep_l = ["chromStart","strand","position","cov","exon"]
cov_df_chunk_l = []
for cov_df_chunk in cov_df_chunker:
cov_df_chunk = cov_df_chunk[cov_df_chunk["name"].str.contains(transcript)]
cov_df_chunk["exon"] = cov_df_chunk["name"].str.split(pat=":").str.get(2)
cov_df_chunk["cov"] = cov_df_chunk[sample_l].mean(axis=1)
cov_df_chunk = cov_df_chunk[col_to_keep_l]
#print cov_df_chunk
cov_df_chunk_l.append(cov_df_chunk)
cov_df = pd.concat(cov_df_chunk_l,ignore_index=True)
del cov_df_chunk_l
#Work out the bp and check whether bp_start <= bp <= bp_end
#BED start positions are 0-based i.e. the first base in an exon is interpreted as +1 from the start position.
#The end position is 1-based.
cov_df["bp"] = cov_df["chromStart"] + cov_df["position"]
cov_df.sort_values(by="bp", inplace=True)
cov_df.index = range(len(cov_df.index))
gene_strand = cov_df.iloc[0]["strand"]
#cov_df.drop("strand", axis=1, inplace=True)
cov_df["tp"] = range(1,len(cov_df.index)+1)
if gene_strand == "-":
cov_df["tp"] = range(len(cov_df.index),0,-1)
cov_df.drop(["chromStart","position"], axis=1, inplace=True)
return cov_df
[docs]def get_exon_coord_df(cov_df):
'''Make an exon coordinate file.
Args:
cov_df (DataFrame): contains the coverage information.
Returns:
exon_coord_df (DataFrame): contains the exon base pair and transcript position coordinates.
'''
#Exons: the exon start and end positions in this list are 1-based.
#cov_df["exon"] = cov_df["name"].str.split(pat=":").str.get(2)
exon_coord_df = cov_df.groupby("exon")["bp"].agg([np.min,np.max])
exon_coord_df.columns = ["start_bp","end_bp"]
#Add the transcript positions to exon_coords_df
get_tp_from_cov_df = lambda bp, cov_df: cov_df[cov_df["bp"] == bp].iloc[0]["tp"]
exon_coord_df["start_tp"] = exon_coord_df["start_bp"].apply(func=get_tp_from_cov_df, cov_df=cov_df)
exon_coord_df["end_tp"] = exon_coord_df["end_bp"].apply(func=get_tp_from_cov_df, cov_df=cov_df)
exon_coord_df.sort_values(by="start_tp", inplace=True)
gene_strand = cov_df.iloc[0]["strand"]
if gene_strand == "-":
exon_coord_df.rename(columns={"start_bp":"end_bp", "end_bp":"start_bp", "start_tp":"end_tp", "end_tp":"start_tp"}, inplace=True)
return exon_coord_df
[docs]def make_track(track, cov_df, bound_l, color_l, edge_color_l, setting_dict):
'''Make the coverage track.
Args:
| track (matplotlib.axes.Axes): the axis for this coverage track.
| cov_df (DataFrame): contains the coverage data.
| bound_l (list of ints): bounds of the utrs and exons.
| color_l (list of strs): colors for the utrs and exons.
| edge_color_l (list of strs): edge colors for the utrs and exons.
| setting_dict (dictionary): settings for making the png.'''
for i in range(1,len(bound_l)):
cov_in_bounds_df = cov_df[(cov_df["tp"] >= bound_l[i-1]) & (cov_df["tp"] < bound_l[i])]
#print cov_in_bounds_df
track.fill_between(cov_in_bounds_df["tp"].tolist(), cov_in_bounds_df["cov"], facecolor=color_l[i-1], edgecolor=edge_color_l[i-1])
track.set_xlabel('Position')
plt.rc('text',usetex=True)
track.set_ylabel(setting_dict["c_track_y_axis_label"], rotation="horizontal", size=setting_dict["c_track_fontsize"], ha='right', va='center')
track.set_xlim((0,bound_l[-1]))
track.set_ylim([0,cov_df["cov"].max()+10])
track.grid(True)
plt.setp(track.xaxis.get_ticklabels(), size=setting_dict["c_track_fontsize"])
plt.setp(track.yaxis.get_ticklabels(), size=setting_dict["c_track_fontsize"])
plt.setp(track.xaxis.get_label(), size=setting_dict["c_track_fontsize"])
#NOTE, Above the xlim start is set to x[0] i.e. 1 so the track starts from 1, but the first xtick label in the plot is 0!
#I have tried without success to replace this 0 with a 1.