import argparse
import pandas as pd
from pathlib import Path
try:
from pylprotpredictor import alignment
from pylprotpredictor import cds
from pylprotpredictor import export
except ImportError:
from . import alignment
from . import cds
from . import export
[docs]def import_cds(cds_obj_filepath):
"""
:param cds_obj_filepath: path to JSON file with collection of CDS objects
:return: dictionary of the CDS objects
"""
cds_objects = {}
d = export.import_json(cds_obj_filepath)
for cds_id in d:
cds_obj = cds.CDS()
cds_obj.init_from_dict(d[cds_id])
cds_objects[cds_id] = cds_obj
return cds_objects
[docs]def get_cds_obj(cds_id, pred_cds):
"""Find the CDS object given an id
:param cds_id: id of the CDS to find
:param pred_cds: dictionary of the predicted CDS
:return: a CDS object
"""
if cds_id not in pred_cds:
raise ValueError("CDS not found for %s" % (cds_id))
return pred_cds[cds_id]
[docs]def parse_similarity_search_report(pot_pyl_similarity_search, pred_cds):
"""Parse the similarity search report and add information to the list of
potential PYL CDS
:param pot_pyl_similarity_search: path to similarity search report of potential PYL CDS against a reference database
:param pred_cds: dictionary of the predicted CDS
"""
similarity_search_report = pd.read_table(
pot_pyl_similarity_search,
index_col=None,
header=None)
for row in similarity_search_report.itertuples():
cds_id = row[1].split("-")[0]
cds_obj = get_cds_obj(cds_id, pred_cds)
al = alignment.Alignment()
al.init_from_search_report_row(row)
cds_obj.add_id_alignment(row[1], al)
return pred_cds
["status", "conserved_start","conserved_end","strand","conserved_stop_codon","origin_seq","original_start","original_end","original_stop_codon","rejected_start","rejected_end","matched_RefSeq90"])
[docs]def check_pyl_proteins(
pot_pyl_similarity_search, pred_cds_obj_filepath,
cons_pred_cds_seq, info_filepath):
"""
Check predicted PYL CDS:
- Get the potential PYL CDS
- Parse the similarity search report
- Identify and extract the correct CDS sequence (the one with the lowest evalue and longest alignment for potential PYL)
:param pot_pyl_similarity_search: path to similarity search report of potential PYL CDS against a reference database
:param pred_cds_obj_filepath: path to generated JSON file to store the list of predicted CDS objects
:param cons_pred_cds_seq: path to a FASTA file for the conserved CDS sequences
:param info_filepath: path to a CSV file with final information about the CDS
"""
pred_cds = import_cds(pred_cds_obj_filepath)
parse_similarity_search_report(pot_pyl_similarity_search, pred_cds)
extract_correct_cds(pred_cds, cons_pred_cds_seq, info_filepath)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Check predicted PYL CDS')
parser.add_argument('--pot_pyl_similarity_search', help='path to similarity search report of potential PYL CDS against a reference database')
parser.add_argument('--pred_cds_obj_filepath', help='path to generated JSON file to store the list of predicted CDS objects')
parser.add_argument('--cons_pred_cds_seq_filepath', help='path to a FASTA file for the conserved CDS sequences')
parser.add_argument('--info_filepath', help='path to a CSV file with final information about the CDS')
args = parser.parse_args()
check_pyl_proteins(
pot_pyl_similarity_search=Path(aargs.pot_pyl_similarity_search),
pred_cds_obj_filepath=Path(aargs.pred_cds_obj_filepath),
cons_pred_cds_seq=Path(aargs.cons_pred_cds_seq_filepath),
info_filepath=Path(aargs.info_filepath))