# Source code for transplotter

import utrs as u
import variants as v
import protdomains as pds
import coverage as c
import matplotlib.pyplot as plt
import sys
import pandas as pd


def make_protein_domain_color_file(protein_domain_file, transcript_l, database, sortby_col_l, out_path):

    '''Make a protein domain color file.

    The protein domains are loaded with pds.get_protein_domain_df, a color is
    obtained for them from pds.get_protein_domain_color_s, and the result is
    written to out_path as a header-less CSV.

    Args:
        | protein_domain_file (str): path to file containing the protein domain information.
        | transcript_l (list of strs): list of Ensembl transcript IDs.
        | database (str): protein domain database.
        | sortby_col_l (list of strs): list of names of columns to sort the DataFrame by.
        | out_path (str): path to write the protein domain color file to.
    '''

    print("make_protein_domain_color_file")
    protein_domain_df = pds.get_protein_domain_df(protein_domain_file, transcript_l, database, sortby_col_l)
    protein_domain_color_s = pds.get_protein_domain_color_s(protein_domain_df)
    # make_png reads this file back with header=None, so no header row may be
    # written. The pandas default for Series.to_csv changed from header=False
    # to header=True, hence the explicit argument.
    protein_domain_color_s.to_csv(out_path, header=False)
    print("Written protein domain color file to {0}\n".format(out_path))
    

def make_exon_coord_file(cov_file, transcript, out_path):

    '''Make an exon coordinate file from a coverage file.

    The coverage for the transcript is loaded with c.get_cov_df (no samples
    selected), converted to exon coordinates with c.get_exon_coord_df and
    written to out_path as a CSV that includes the index.

    Args:
        | cov_file (str): path to coverage file.
        | transcript (str): Ensembl transcript ID.
        | out_path (str): path to write the exon coordinate file to.
    '''

    print("make_exon_coord_file")
    coverage_df = c.get_cov_df(cov_file, transcript, None, None, [])
    # The index is written out explicitly; make_png reads the file back with
    # index_col="exon".
    c.get_exon_coord_df(coverage_df).to_csv(out_path, index=True)
    print("Written exon coordinate file to {0}\n".format(out_path))
    

def make_png(transcript_l, title_l, track_l, sample_ll, utr_file_l, exon_coord_file_l, cov_file_l,
             variant_file_l, protein_domain_file_l, protein_domain_color_file, setting_dict, png_file):

    '''Make a png which contains coverage/variants/protein domain tracks for 1 or more transcripts, subject to space limitations.

    Args:
        | transcript_l (list of strs): Ensembl transcript ID(s)
        | title_l (list of strs): title to use for each transcript.
        | track_l (list of strs): list of strings of length 3 which encode whether to generate each of the 3 tracks (coverage, variants, protein domains).
        | sample_ll (list of list of strs): list of lists of sample IDs.
        | utr_file_l (list of strs): list of utr file paths.
        | exon_coord_file_l (list of strs): list of exon coordinate file paths.
        | cov_file_l (list of strs): list of coverage file paths.
        | variant_file_l (list of strs): list of variant file paths.
        | protein_domain_file_l (list of strs): list of protein domain file paths.
        | protein_domain_color_file (str): protein domain color file.
        | setting_dict (dictionary): settings for making the png.
        | png_file (str): path to write the png file to.

    Returns:
        False if the list parameters differ in length or the requested tracks need more rows than setting_dict["fig_num_rows"]; otherwise None once the png has been written.

    Raises:
        ValueError: if a utr or variant base pair position does not lie within the exons of its transcript.
    '''

    print("make_png")

    #Check the parameters are well-formed.
    list_param_l = [transcript_l, title_l, track_l, sample_ll, utr_file_l, exon_coord_file_l,
                    cov_file_l, variant_file_l, protein_domain_file_l]
    if len(set(len(list_param) for list_param in list_param_l)) != 1:
        print("ERROR: Parameters of make_png function which are lists must all be the same length.\n")
        return False

    #Check that the maximum number of tracks is not exceeded.
    #NOTE(review): the layout below also consumes setting_dict["t_track_rows"] rows for the title of every
    #transcript after the first; those rows are not included in this check - confirm whether that is intended.
    required_rows = _count_required_rows(track_l, setting_dict)
    if required_rows > setting_dict["fig_num_rows"]:
        print("ERROR: PNG figure requires {0} rows but there are only {1}\n".format(required_rows,setting_dict["fig_num_rows"]))
        return False

    #Initialise the figure.
    fig = plt.figure()
    try:
        plt.rc('text', usetex=True)

        #Make the axes.
        num_rows = setting_dict["fig_num_rows"]
        start_row = 0
        title_1_coords = None

        for i, transcript in enumerate(transcript_l):

            print("Transcript: {0}".format(transcript))

            #Read in the exon positions and the utrs.
            exon_coord_df = pd.read_csv(exon_coord_file_l[i], index_col="exon")
            #NOTE(review): an exon with start_bp == end_bp makes a forward transcript be classified as reverse - confirm.
            strand = "+" if (exon_coord_df["end_bp"] > exon_coord_df["start_bp"]).all() else "-"
            if strand == "+":
                print("Transcription direction: forward")
            else:
                print("Transcript direction: reverse")
            tp_by_bp = _build_tp_by_bp(exon_coord_df, strand)
            utr_df = u.get_utr_df(utr_file_l[i], strand, transcript)
            #Add the transcript positions to utr_df.
            utr_df["start_tp"] = utr_df["start_bp"].apply(_get_tp_from_bp, tp_by_bp=tp_by_bp)
            utr_df["end_tp"] = utr_df["end_bp"].apply(_get_tp_from_bp, tp_by_bp=tp_by_bp)

            #Make the title track. The first title is figure text; later titles get their own axes
            #and are aligned horizontally with the first title.
            if i == 0:
                plt.figtext(setting_dict["title_1_fig_x"], setting_dict["title_1_fig_y"], title_l[i], fontsize=setting_dict["title_fontsize"])
                title_1_coords = fig.transFigure.transform((setting_dict["title_1_fig_x"], setting_dict["title_1_fig_y"]))
            else:
                title_track = plt.subplot2grid((num_rows,1), (start_row,0))
                title_track.set_axis_off()
                inv = title_track.transData.inverted()
                title_track.text(inv.transform(title_1_coords)[0], setting_dict["title_2_ax_y"], title_l[i], fontsize=setting_dict["title_fontsize"])
                start_row += setting_dict["t_track_rows"]

            #Make the coverage track.
            if track_l[i][0] == "1":
                print("Making coverage track.")
                cov_df = c.get_cov_df(cov_file_l[i], transcript, None, None, sample_ll[i])
                [bound_l, color_l, edge_color_l] = get_exon_bound_color_l(exon_coord_df, utr_df, strand)
                coverage_track = plt.subplot2grid((num_rows,1), (start_row, 0), rowspan=setting_dict["c_track_rows"])
                start_row += setting_dict["c_track_rows"]
                c.make_track(coverage_track, cov_df, bound_l, color_l, edge_color_l, setting_dict)
                start_row += setting_dict["c_track_gap_rows"]

            #Make the variants track.
            variant_track = None
            if track_l[i][1] == "1":
                print("Making variant track.")
                variant_df = v.get_variant_df(transcript, variant_file_l[i])
                variant_df.rename(columns={"pos":"bp"}, inplace=True)
                variant_df["tp"] = variant_df["bp"].apply(_get_tp_from_bp, tp_by_bp=tp_by_bp)
                variant_df.sort_values(by="tp", inplace=True)
                start_row += setting_dict["v_anns_top_rows"]
                variant_track = plt.subplot2grid((num_rows,1), (start_row,0))
                start_row += setting_dict["v_track_rows"]
                start_row += setting_dict["v_anns_bot_rows"]
                variant_key = plt.subplot2grid((num_rows,1), (start_row,0), rowspan=setting_dict["v_key_rows"])
                start_row += setting_dict["v_key_rows"]
                #Recomputed rather than shared with the coverage track so each track gets its own lists.
                [bound_l, color_l, edge_color_l] = get_exon_bound_color_l(exon_coord_df, utr_df, strand)
                v.make_track(variant_track, bound_l[-1], bound_l, color_l, edge_color_l, variant_df, setting_dict, variant_key)
                start_row += setting_dict["v_track_gap_rows"]

            #Make the protein domain track.
            if track_l[i][2] == "1":
                print("Making protein domain track.")
                protein_domain_df = pds.get_protein_domain_df(protein_domain_file_l[i], [transcript], "Pfam", ["Start","End"])
                protein_domain_track = plt.subplot2grid((num_rows,1), (start_row,0))
                start_row += setting_dict["pd_track_rows"]
                protein_domain_color_df = pd.read_csv(protein_domain_color_file, header=None, names=["Domain", "Color"], index_col="Domain")
                protein_domain_color_s = pd.Series(data=protein_domain_color_df["Color"], index=protein_domain_color_df.index)
                del protein_domain_color_df
                #A color containing commas is a comma-separated tuple of floats; anything else is used as is.
                protein_domain_color_s = protein_domain_color_s.apply(lambda x: x if "," not in x else tuple([float(f) for f in x.split(",")]))
                pds.make_track(protein_domain_track, protein_domain_df, utr_df, protein_domain_color_s, setting_dict, variant_track)
                start_row += setting_dict["pd_track_gap_rows"]

        fig.set_size_inches(setting_dict["fig_width_inches"], setting_dict["fig_height_inches"])
        plt.savefig(png_file, dpi=setting_dict["fig_dpi"])
    finally:
        #pyplot keeps every figure alive until it is closed, so release it whether or not drawing succeeded.
        plt.close(fig)
    print("Written {0}.\n".format(png_file))


def _count_required_rows(track_l, setting_dict):

    '''Count the figure rows needed by the requested tracks, including the gaps between consecutive tracks.

    Args:
        | track_l (list of strs): per-transcript strings whose 3 characters switch the coverage, variants and protein domain tracks on ("1") or off.
        | setting_dict (dictionary): settings holding the row counts of each track and track gap.

    Returns:
        num_rows (int): number of rows required.
    '''

    #The gap that follows a track is only needed when another track comes after it,
    #so it is added when the next track is encountered.
    track_gap_dict = {"c": setting_dict["c_track_gap_rows"],
                      "v": setting_dict["v_track_gap_rows"],
                      "pd": setting_dict["pd_track_gap_rows"],
                      None: 0}
    num_rows, previous_track = 0, None
    for track_s in track_l:
        if track_s[0] == "1":
            num_rows += track_gap_dict[previous_track] + setting_dict["c_track_rows"]
            previous_track = "c"
        if track_s[1] == "1":
            num_rows += (track_gap_dict[previous_track] + setting_dict["v_anns_top_rows"] + setting_dict["v_track_rows"]
                         + setting_dict["v_anns_bot_rows"] + setting_dict["v_key_rows"])
            previous_track = "v"
        if track_s[2] == "1":
            num_rows += track_gap_dict[previous_track] + setting_dict["pd_track_rows"]
            previous_track = "pd"
    return num_rows


def _build_tp_by_bp(exon_coord_df, strand):

    '''Map every exonic base pair position to its 1-based transcript position.

    Args:
        | exon_coord_df (DataFrame): has a start_bp and an end_bp column, one row per exon, in transcript order.
        | strand (str): "+" to walk each exon from start_bp up to end_bp, anything else to walk down to end_bp.

    Returns:
        tp_by_bp (dictionary): transcript position keyed by base pair position.
    '''

    step = 1 if strand == "+" else -1
    tp_by_bp = {}
    tp = 0
    for start_bp, end_bp in zip(exon_coord_df["start_bp"], exon_coord_df["end_bp"]):
        for bp in range(start_bp, end_bp + step, step):
            tp += 1
            #Keep the first position if a base pair is listed more than once.
            tp_by_bp.setdefault(bp, tp)
    return tp_by_bp


def _get_tp_from_bp(bp, tp_by_bp):

    '''Return the transcript position of base pair position bp, raising ValueError if bp is not exonic.'''

    try:
        return tp_by_bp[int(bp)]
    except KeyError:
        raise ValueError("Base pair position {0} is not within the exons of the transcript.".format(int(bp)))


def get_exon_bound_color_l(exon_coord_df, utr_df, strand):

    '''Get the bounds, colors and edge colors required to generate a color bar that displays the utrs and exons for a transcript.

    Utr regions are colored white with a black edge; the remaining exon regions alternate between red and grey.

    Args:
        | exon_coord_df (DataFrame): contains the exon base pair and transcript position coordinates.
        | utr_df (DataFrame): contains the utr base pair and transcript position coordinates.
        | strand (str): specifies whether the transcript is on the positive or negative strand (not used by this function).

    Returns:
        exon_bound_color_ll (list of lists of ints and strs): contains bound_l, the list of bounds, color_l, the list of colors and edge_color_l, the list of edge colors.
    '''

    bound_color_df = pd.concat([exon_coord_df, utr_df])
    bound_color_df = bound_color_df.drop(["start_bp", "end_bp"], axis=1)
    #Rows that came from exon_coord_df have no utr label. Assign the result back: an inplace fillna
    #on a column selection does not reliably modify the DataFrame (it is a no-op under pandas Copy-on-Write).
    bound_color_df["utr"] = bound_color_df["utr"].fillna("exon")
    bound_color_df = bound_color_df.sort_values(by=["start_tp", "end_tp", "utr"])
    bound_color_df = bound_color_df.reset_index(drop=True)

    #Where a region starts before the previous region ends, move its start to just after
    #the previous end. Regions sharing an end with the previous region are left untouched.
    start_tp_l = bound_color_df["start_tp"].tolist()
    end_tp_l = bound_color_df["end_tp"].tolist()
    for i in range(1, len(end_tp_l)):
        previous_end_tp = end_tp_l[i - 1]
        if end_tp_l[i] != previous_end_tp and start_tp_l[i] < previous_end_tp:
            start_tp_l[i] = previous_end_tp + 1
    bound_color_df["start_tp"] = start_tp_l
    #A utr that covers a whole exon sorts before that exon, so the utr row is the one kept.
    bound_color_df = bound_color_df.drop_duplicates(["start_tp", "end_tp"])

    #Utrs are white; exons alternate between the two exon colors.
    exon_color_l = ["red", "#6E6E6E"]
    exon_color_i = 0
    color_l = []
    for utr in bound_color_df["utr"]:
        if utr in ("5'", "3'"):
            color_l.append("white")
        else:
            color_l.append(exon_color_l[exon_color_i])
            exon_color_i = 1 - exon_color_i

    #The bounds are the region starts, closed off by the end of the last region.
    bound_l = bound_color_df["start_tp"].tolist()
    transcript_len = bound_color_df["end_tp"].iloc[-1]
    if bound_l[-1] != transcript_len:
        bound_l.append(transcript_len)

    edge_color_l = ["black" if color == "white" else color for color in color_l]

    exon_bound_color_ll = [bound_l, color_l, edge_color_l]

    return exon_bound_color_ll