# Source code for pylprotpredictor.check

import argparse
import pandas as pd

from pathlib import Path

try:
    from pylprotpredictor import alignment
    from pylprotpredictor import cds
    from pylprotpredictor import export
except ImportError:
    from . import alignment
    from . import cds
    from . import export


def import_cds(cds_obj_filepath):
    """Load a collection of CDS objects from a JSON file

    The file is read with ``export.import_json``; each entry is turned into a
    ``cds.CDS`` object initialised with ``init_from_dict``.

    :param cds_obj_filepath: path to JSON file with collection of CDS objects

    :return: dictionary of the CDS objects, with the same ids as in the file
    """
    cds_objects = {}
    d = export.import_json(cds_obj_filepath)
    for cds_id in d:
        # Rebuild a CDS object from its dictionary representation
        cds_obj = cds.CDS()
        cds_obj.init_from_dict(d[cds_id])
        cds_objects[cds_id] = cds_obj
    return cds_objects


def get_cds_obj(cds_id, pred_cds):
    """Find the CDS object given an id

    :param cds_id: id of the CDS to find
    :param pred_cds: dictionary of the predicted CDS

    :return: a CDS object
    :raises ValueError: if ``cds_id`` is not a key of ``pred_cds``
    """
    if cds_id not in pred_cds:
        # One-element tuple: an id that is itself a tuple is then formatted
        # as a whole instead of being unpacked as several format arguments
        raise ValueError("CDS not found for %s" % (cds_id,))
    return pred_cds[cds_id]


def parse_similarity_search_report(pot_pyl_similarity_search, pred_cds):
    """Parse the similarity search report and add information to the list of
    potential PYL CDS

    The report is read as a header-less, tab-separated table. For each row,
    the CDS id is the part of the first column before the first "-"; an
    alignment built from the row is attached to the matching CDS object with
    ``add_id_alignment``, under the full value of the first column.

    :param pot_pyl_similarity_search: path to similarity search report of potential PYL CDS against a reference database
    :param pred_cds: dictionary of the predicted CDS

    :return: the ``pred_cds`` dictionary that was given
    :raises ValueError: if a row refers to a CDS id that is not in ``pred_cds``
    """
    try:
        similarity_search_report = pd.read_table(
            pot_pyl_similarity_search,
            index_col=None,
            header=None)
    except pd.errors.EmptyDataError:
        # An empty report (no hit at all) cannot be parsed by pandas and
        # holds no alignment to add
        return pred_cds

    for row in similarity_search_report.itertuples():
        # itertuples() puts the index at position 0, so row[1] is the first
        # column of the report
        query_id = row[1]
        cds_id = query_id.split("-")[0]
        cds_obj = get_cds_obj(cds_id, pred_cds)

        al = alignment.Alignment()
        al.init_from_search_report_row(row)
        cds_obj.add_id_alignment(query_id, al)

    return pred_cds


def extract_correct_cds(pred_cds, cons_pred_cds_seq, info_filepath):
    """Identify and extract the correct CDS sequence

    The CDS are processed in sorted id order. For a potential PYL CDS,
    ``identify_cons_rej_cds`` is called first and its result is used as the
    conserved alignment. The conserved CDS is the one returned by
    ``get_conserved_cds``, or the CDS itself when that is ``None``.

    :param pred_cds: dictionary of the predicted CDS
    :param cons_pred_cds_seq: path to a FASTA file for the conserved CDS sequences
    :param info_filepath: path to a CSV file with final information about the CDS
    """
    # Columns of the CSV report, in output order
    columns = [
        "status",
        "conserved_start",
        "conserved_end",
        "strand",
        "conserved_stop_codon",
        "origin_seq",
        "original_start",
        "original_end",
        "original_stop_codon",
        "rejected_start",
        "rejected_end",
        "matched_RefSeq90"]
    cons_cds_sequences = []
    info = {}

    for cds_id in sorted(pred_cds):
        cds_obj = pred_cds[cds_id]
        # Freshly constructed alignment, kept as is when the CDS is not a
        # potential PYL
        cons_al = alignment.Alignment()

        if cds_obj.is_potential_pyl():
            cons_al = cds_obj.identify_cons_rej_cds()

        cons_seq = cds_obj.get_conserved_cds()
        rej_seq = cds_obj.get_rejected_cds()

        # No conserved CDS recorded: fall back on the CDS itself
        if cons_seq is None:
            cons_seq = cds_obj

        cons_cds_sequences.append(cons_seq.get_seqrecord())

        info[cds_id] = {
            "status": cds_obj.get_status(),
            "conserved_start": cons_seq.get_start(),
            "conserved_end": cons_seq.get_end(),
            "strand": cds_obj.get_strand(),
            "conserved_stop_codon": cons_seq.get_stop_codon(),
            "origin_seq": cds_obj.get_origin_seq_id(),
            "original_start": cds_obj.get_start(),
            "original_end": cds_obj.get_end(),
            "original_stop_codon": cds_obj.get_stop_codon(),
            # Several CDS may be rejected: their coordinates are joined
            # with ";"
            "rejected_start": ";".join([str(s.get_start()) for s in rej_seq]),
            "rejected_end": ";".join([str(s.get_end()) for s in rej_seq]),
            "matched_RefSeq90": cons_al.get_sseqid()}

    export.export_fasta(cons_cds_sequences, cons_pred_cds_seq)
    export.export_csv(info, info_filepath, columns)


def check_pyl_proteins(
        pot_pyl_similarity_search, pred_cds_obj_filepath,
        cons_pred_cds_seq, info_filepath):
    """
    Check predicted PYL CDS:

    - Get the potential PYL CDS
    - Parse the similarity search report
    - Identify and extract the correct CDS sequence (the one with the lowest evalue and longest alignment for potential PYL)

    :param pot_pyl_similarity_search: path to similarity search report of potential PYL CDS against a reference database
    :param pred_cds_obj_filepath: path to generated JSON file to store the list of predicted CDS objects
    :param cons_pred_cds_seq: path to a FASTA file for the conserved CDS sequences
    :param info_filepath: path to a CSV file with final information about the CDS
    """
    # Rebuild the CDS objects stored in the JSON file
    pred_cds = import_cds(pred_cds_obj_filepath)
    # Attach the alignments of the report to the CDS objects
    parse_similarity_search_report(pot_pyl_similarity_search, pred_cds)
    # Write the FASTA file of conserved sequences and the CSV report
    extract_correct_cds(pred_cds, cons_pred_cds_seq, info_filepath)


# Command-line entry point: read the four paths and run the check
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Check predicted PYL CDS')
    # All paths are needed: marking them as required gives a usage error
    # instead of a failure when building a Path from None
    parser.add_argument(
        '--pot_pyl_similarity_search',
        required=True,
        help='path to similarity search report of potential PYL CDS against a reference database')
    parser.add_argument(
        '--pred_cds_obj_filepath',
        required=True,
        help='path to generated JSON file to store the list of predicted CDS objects')
    parser.add_argument(
        '--cons_pred_cds_seq_filepath',
        required=True,
        help='path to a FASTA file for the conserved CDS sequences')
    parser.add_argument(
        '--info_filepath',
        required=True,
        help='path to a CSV file with final information about the CDS')

    args = parser.parse_args()

    check_pyl_proteins(
        pot_pyl_similarity_search=Path(args.pot_pyl_similarity_search),
        pred_cds_obj_filepath=Path(args.pred_cds_obj_filepath),
        cons_pred_cds_seq=Path(args.cons_pred_cds_seq_filepath),
        info_filepath=Path(args.info_filepath))