Source code for coverage

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


'''
Functions specific to coverage: reading in the data and creating the coverage track.
'''

[docs]def get_cov_df(cov_file, transcript, bp_start, bp_end, sample_l):

    '''Get the coverage data for a transcript. If bp_start and bp_end are None, then coverage data for the whole transcript will
    be extracted. The coordinates in cov_df and exon_coords_df are 1-based.
    
    Args:
        | cov_file (str): path to file containing the coverage data.
        | transcript (str): Ensembl transcript ID.
        | bp_start (int): base pair start coordinate.
        | bp_end (int): base pair end coordinate.
        | sample_l (list of strs): list of sample IDs.
    
    Returns:
        cov_df (DataFrame): contains the coverage data.
    '''    

    print("Reading in coverage data for {0} samples...".format(len(sample_l)))
    if bp_start != None and bp_end != None:
        if bp_start >= bp_end:
            print("WARNING: bp_start {0} is not less than {1}".format(bp_start,bp_end)) 
    
    cov_df_chunker = pd.read_csv(cov_file, chunksize=1000)
    col_to_keep_l = ["chromStart","strand","position","cov","exon"]
    cov_df_chunk_l = []
    for cov_df_chunk in cov_df_chunker:
        cov_df_chunk = cov_df_chunk[cov_df_chunk["name"].str.contains(transcript)]
        cov_df_chunk["exon"] = cov_df_chunk["name"].str.split(pat=":").str.get(2)
        cov_df_chunk["cov"] = cov_df_chunk[sample_l].mean(axis=1)
        cov_df_chunk = cov_df_chunk[col_to_keep_l]
        #print cov_df_chunk
        cov_df_chunk_l.append(cov_df_chunk)
    
    cov_df = pd.concat(cov_df_chunk_l,ignore_index=True)
    del cov_df_chunk_l
    
    #Work out the bp and check whether bp_start <= bp <= bp_end
    #BED start positions are 0-based i.e. the first base in an exon is interpreted as +1 from the start position.
    #The end position is 1-based.
    cov_df["bp"] = cov_df["chromStart"] + cov_df["position"]
    cov_df.sort_values(by="bp", inplace=True)
    cov_df.index = range(len(cov_df.index))
    gene_strand = cov_df.iloc[0]["strand"]
    #cov_df.drop("strand", axis=1, inplace=True)
    cov_df["tp"] = range(1,len(cov_df.index)+1)
    if gene_strand == "-":
        cov_df["tp"] = range(len(cov_df.index),0,-1)
    cov_df.drop(["chromStart","position"], axis=1, inplace=True)
    
    return cov_df


[docs]def get_exon_coord_df(cov_df):

    '''Make an exon coordinate file.
    
    Args:
        cov_df (DataFrame): contains the coverage information.
        
    Returns:
        exon_coord_df (DataFrame): contains the exon base pair and transcript position coordinates.
    '''

    #Exons: the exon start and end positions in this list are 1-based. 
    #cov_df["exon"] = cov_df["name"].str.split(pat=":").str.get(2)
    
    exon_coord_df = cov_df.groupby("exon")["bp"].agg([np.min,np.max])
    exon_coord_df.columns = ["start_bp","end_bp"] 
    
    #Add the transcript positions to exon_coords_df 
    get_tp_from_cov_df = lambda bp, cov_df: cov_df[cov_df["bp"] == bp].iloc[0]["tp"] 
    exon_coord_df["start_tp"] =  exon_coord_df["start_bp"].apply(func=get_tp_from_cov_df, cov_df=cov_df)
    exon_coord_df["end_tp"] =  exon_coord_df["end_bp"].apply(func=get_tp_from_cov_df, cov_df=cov_df)
    exon_coord_df.sort_values(by="start_tp", inplace=True)
    
    gene_strand = cov_df.iloc[0]["strand"]
    if gene_strand == "-":
        exon_coord_df.rename(columns={"start_bp":"end_bp", "end_bp":"start_bp", "start_tp":"end_tp", "end_tp":"start_tp"}, inplace=True)
    
    return exon_coord_df


[docs]def make_track(track, cov_df, bound_l, color_l, edge_color_l, setting_dict):
    
    '''Make the coverage track.
    
    Args:
        | track (matplotlib.axes.Axes): the axis for this coverage track.
        | cov_df (DataFrame): contains the coverage data.
        | bound_l (list of ints): bounds of the utrs and exons.
        | color_l (list of strs): colors for the utrs and exons.
        | edge_color_l (list of strs): edge colors for the utrs and exons.
        | setting_dict (dictionary): settings for making the png.'''
    
    for i in range(1,len(bound_l)):
        cov_in_bounds_df =  cov_df[(cov_df["tp"] >= bound_l[i-1]) & (cov_df["tp"] < bound_l[i])]
        #print cov_in_bounds_df
        track.fill_between(cov_in_bounds_df["tp"].tolist(), cov_in_bounds_df["cov"], facecolor=color_l[i-1], edgecolor=edge_color_l[i-1])

    track.set_xlabel('Position')
    plt.rc('text',usetex=True)
    track.set_ylabel(setting_dict["c_track_y_axis_label"], rotation="horizontal", size=setting_dict["c_track_fontsize"], ha='right', va='center')
    track.set_xlim((0,bound_l[-1]))
    track.set_ylim([0,cov_df["cov"].max()+10])
    track.grid(True)
    plt.setp(track.xaxis.get_ticklabels(), size=setting_dict["c_track_fontsize"])
    plt.setp(track.yaxis.get_ticklabels(), size=setting_dict["c_track_fontsize"])
    plt.setp(track.xaxis.get_label(), size=setting_dict["c_track_fontsize"])
    
    #NOTE, Above the xlim start is set to x[0] i.e. 1 so the track starts from 1, but the first xtick label in the plot is 0!
    #I have tried without success to replace this 0 with a 1.