# Source code for utrs
import pandas as pd
'''
Functions for the UTRs file.
'''
def get_utr_df(utr_file: str, strand: str, transcript: str) -> pd.DataFrame:
    '''Read the UTR information for one transcript into a DataFrame.

    The file is read in chunks of 1000 rows and only the rows whose
    "Ensembl Transcript ID" equals ``transcript`` are kept. Rows that have
    neither a "5' UTR Start" nor a "3' UTR Start" value are discarded. Each
    remaining row is labelled "5'" when its "5' UTR Start" is present and
    "3'" otherwise, and its coordinates are taken from the matching pair of
    columns. Start coordinates are incremented by one (0-based to 1-based);
    end coordinates are left as read.

    Args:
    | utr_file (str): path to the comma-separated file containing the utr
    |     information. It must provide the columns "Ensembl Transcript ID",
    |     "5' UTR Start", "5' UTR End", "3' UTR Start" and "3' UTR End".
    | strand (str): whether the transcript is on the positive or negative
    |     strand. When it is "-", the "start_bp" and "end_bp" column labels
    |     are swapped (the values and the column order stay as they are).
    | transcript (str): Ensembl transcript ID.

    Returns:
    utr_df: DataFrame with the columns "utr", "start_bp" and "end_bp"
    (labelled "utr", "end_bp", "start_bp" for the "-" strand). It has no
    rows when the transcript does not occur in the file.
    '''
    id_col = "Ensembl Transcript ID"
    five_start, five_end = "5' UTR Start", "5' UTR End"
    three_start, three_end = "3' UTR Start", "3' UTR End"

    # Gather the matching rows chunk by chunk and concatenate once at the
    # end; growing a DataFrame with concat inside the loop copies all
    # previously collected rows on every iteration.
    matches = []
    template = None
    for chunk in pd.read_csv(utr_file, sep=",", chunksize=1000):
        if template is None:
            # Zero-row slice that remembers the file's columns, used when
            # the transcript is not found at all.
            template = chunk.iloc[0:0]
        hits = chunk[chunk[id_col] == transcript]
        if not hits.empty:
            matches.append(hits)

    if matches:
        utr_df = pd.concat(matches, ignore_index=True)
    elif template is not None:
        utr_df = template
    else:
        utr_df = pd.DataFrame(
            columns=[id_col, five_start, five_end, three_start, three_end]
        )

    # Keep only the rows that describe a 5' or a 3' UTR.
    utr_df = utr_df[utr_df[five_start].notnull() | utr_df[three_start].notnull()]

    # A row without a 5' start is a 3' UTR row.
    is_three_prime = utr_df[five_start].isnull()

    # Build the result from new Series instead of mutating columns of the
    # filtered frame in place: chained ``inplace=True`` calls operate on a
    # temporary under pandas copy-on-write and would silently do nothing.
    result = pd.DataFrame(
        {
            "utr": is_three_prime.map({False: "5'", True: "3'"}),
            # Convert the 0-based start coordinates to 1-based.
            "start_bp": utr_df[five_start].fillna(utr_df[three_start]) + 1,
            "end_bp": utr_df[five_end].fillna(utr_df[three_end]),
        },
        columns=["utr", "start_bp", "end_bp"],
    )

    if strand == "-":
        # On the negative strand the start/end labels are exchanged.
        result = result.rename(
            columns={"start_bp": "end_bp", "end_bp": "start_bp"}
        )
    return result