Source code for pylprotpredictor.predict

import argparse
import logging

from Bio import SeqIO
from pathlib import Path

try:
    from pylprotpredictor import cds
    from pylprotpredictor import export
except ImportError:
    from . import cds
    from . import export


[docs]def extract_seqs(seq_filepath):
    """Extract the sequences in a fasta file

    :param seq_filepath: path to a fasta file

    :return: a dictionary with all sequences indexed by their id, their length and their complement sequence
    """
    seqs = {}
    for record in SeqIO.parse(seq_filepath.as_posix(), "fasta"):
        seqs[record.id] = record
    return seqs


[docs]def extract_predicted_cds(
        pred_cds_path, pred_cds_info_path, tag_ending_cds_info_path,
        genome_filepath):
    """Extract the list of predicted CDS and identify the CDS ending with TAG STOP codon

    :param pred_cds_path: path to the output of CDS prediction (Prodigal)
    :param pred_cds_info_path: path to a CSV file in which the information (start, end, strand, origin) are collected for each predicted CDS
    :param tag_ending_cds_info_path: path to CSV file to export the information about the TAG ending CDS
    :param genome_filepath: path to reference genome

    :return: a dictionary with the predicted CDS represented by CDS object
    """
    pred_cds = {}
    pred_cds_info = {}
    tag_ending_cds_info = {}

    origin_seqs = extract_seqs(genome_filepath)

    for record in SeqIO.parse(pred_cds_path.as_posix(), "fasta"):
        cds_obj = cds.CDS()
        cds_obj.init_from_record(record)
        cds_id = cds_obj.get_id()
        origin_seq_id = cds_obj.get_origin_seq_id()
        strand = cds_obj.get_strand()
        start = cds_obj.get_start()
        end = cds_obj.get_end()
        stop_codon = cds_obj.get_stop_codon()

        pred_cds[cds_id] = cds_obj

        pred_cds_info[cds_id] = {"start": start, "end": end, "strand": strand, "origin_seq": origin_seq_id, "stop_codon": stop_codon}

        if cds_obj.is_tag_ending_seq():
            tag_ending_cds_info[cds_id] = {"start": start, "end": end, "strand": strand, "origin_seq": origin_seq_id}
            cds_obj.set_origin_seq(origin_seqs[origin_seq_id])
            cds_obj.set_status("tag-ending")

    export.export_csv(pred_cds_info, pred_cds_info_path, ["start", "end", "strand", "origin_seq", "stop_codon"])
    export.export_csv(tag_ending_cds_info, tag_ending_cds_info_path, ["start", "end", "strand", "origin_seq"])

    logging.info("Number of predicted CDS: %s\n" % (len(pred_cds.keys())))
    logging.info("Number of TAG-ending predicted CDS: %s\n" % (len(tag_ending_cds_info.keys())))

    return pred_cds


[docs]def extract_potential_pyl_cds(
        pred_cds, pot_pyl_cds_filepath, pot_pyl_cds_info_filepath, pred_cds_obj_filepath):
    """Extract potential PYL CDS from TAG-ending CDS

    :param pred_cds: a dictionary with the predicted CDS represented as CDS objects
    :param pot_pyl_cds_filepath: path to fasta file in which the protein sequences of the potential PYL CDS are saved
    :param pot_pyl_cds_info_filepath: path to a cvs file to get information about potential PYL CDS
    :param pred_cds_obj_filepath: path to generated JSON file to store the list of predicted CDS objects
    """
    info = {}
    sequences = []
    json_pred_cds = {}
    for cds_id in pred_cds:
        cds_obj = pred_cds[cds_id]
        if cds_obj.is_tag_ending():
            logging.info("\tCDS: %s" % (cds_id))

            cds_obj.find_alternative_ends()
            cds_obj.extract_possible_alternative_seq()

            if cds_obj.has_alternative_cds():
                info[cds_id] = {
                    "start": cds_obj.get_start(),
                    "end": cds_obj.get_end(),
                    "strand": cds_obj.get_strand(),
                    "origin_seq": cds_obj.get_origin_seq_id(),
                    "alternative_start": ";".join([str(s) for s in cds_obj.get_alternative_start()]),
                    "alternative_end": ";".join([str(s) for s in cds_obj.get_alternative_end()])}
                sequences.append(cds_obj.get_translated_seq())
                sequences += cds_obj.get_translated_alternative_seq()
                cds_obj.set_status("potential pyl")

        json_pred_cds.update(pred_cds[cds_id].export_to_dict())

    logging.info("Number of potential Pyl proteins: %s\n" % (len(info.keys())))
    export.export_fasta(sequences, pot_pyl_cds_filepath)
    export.export_csv(
        info,
        pot_pyl_cds_info_filepath,
        ["start", "end", "strand", "origin_seq", "alternative_start", "alternative_end"])
    export.export_json(json_pred_cds, pred_cds_obj_filepath)


[docs]def predict_pyl_proteins(
        genome_filepath, pred_cds_filepath, pot_pyl_seq_filepath, log_filepath,
        pred_cds_info_filepath, tag_ending_cds_info_filepath,
        pot_pyl_seq_info_filepath, pred_cds_obj_filepath):
    """
    Run prediction of potentila PYL CDS:

    - Extraction of predicted CDS into a dictionary
    - Identification of TAG-ending proteins
    - Extraction of potential PYL sequences

    :param genome_filepath: path to file with genome sequence
    :param pred_cds_filepath: path to the output of CDS prediction (Prodigal)
    :param pot_pyl_seq_filepath: path to fasta file with potential PYL CDS sequence
    :param log_filepath: path to log file
    :param pred_cds_info_filepath: path to CSV file with predicted CDS info
    :param tag_ending_cds_info_filepath: path to CSV file with TAG-ending CDS info
    :param pot_pyl_seq_info_filepath: path to CSV file with potential PYL CDS info
    :param pred_cds_obj_filepath: path to generated JSON file to store the list of predicted CDS objects
    """
    logging.basicConfig(filename=log_filepath, level=logging.INFO, filemode="w")
    pred_cds = extract_predicted_cds(
        pred_cds_filepath,
        pred_cds_info_filepath,
        tag_ending_cds_info_filepath,
        genome_filepath)
    extract_potential_pyl_cds(
        pred_cds,
        pot_pyl_seq_filepath,
        pot_pyl_seq_info_filepath,
        pred_cds_obj_filepath)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Predict potential PYL CDS')
    parser.add_argument('--genome', help='path to FASTA file with genome (whole or scaffold) on which predict proteins')
    parser.add_argument('--pred_cds', help='path to FASTA file with predicted CDS of the genome')
    parser.add_argument('--pot_pyl_seq', help='path to generated FASTA file with potential PYL CDS sequence')
    parser.add_argument('--log', help='path to generated log file')
    parser.add_argument('--pred_cds_info', help='path to generated CSV file with predicted CDS info')
    parser.add_argument('--pred_cds_obj', help='path to generated JSON file to store the list of predicted CDS object')
    parser.add_argument('--tag_ending_cds_info', help='path to generated CSV file with TAG-ending CDS info')
    parser.add_argument('--pot_pyl_seq_info', help='path to generated CSV file with potential PYL CDS info')

    args = parser.parse_args()

    predict_pyl_proteins(
        genome_filepath=Path(args.genome),
        pred_cds_filepath=Path(args.pred_cds),
        pot_pyl_seq_filepath=Path(args.pot_pyl_seq),
        log_filepath=Path(args.log),
        pred_cds_info_filepath=Path(args.pred_cds_info),
        tag_ending_cds_info_filepath=Path(args.tag_ending_cds_info),
        pot_pyl_seq_info_filepath=Path(args.pot_pyl_seq_info),
        pred_cds_obj_filepath=Path(args.pred_cds_obj))