Source code for pylprotpredictor.predict
import argparse
import logging
from Bio import SeqIO
from pathlib import Path
try:
from pylprotpredictor import cds
from pylprotpredictor import export
except ImportError:
from . import cds
from . import export
[docs]def extract_seqs(seq_filepath):
"""Extract the sequences in a fasta file
:param seq_filepath: path to a fasta file
:return: a dictionary with all sequences indexed by their id, their length and their complement sequence
"""
seqs = {}
for record in SeqIO.parse(seq_filepath.as_posix(), "fasta"):
seqs[record.id] = record
return seqs
[docs]def extract_predicted_cds(
pred_cds_path, pred_cds_info_path, tag_ending_cds_info_path,
genome_filepath):
"""Extract the list of predicted CDS and identify the CDS ending with TAG STOP codon
:param pred_cds_path: path to the output of CDS prediction (Prodigal)
:param pred_cds_info_path: path to a CSV file in which the information (start, end, strand, origin) are collected for each predicted CDS
:param tag_ending_cds_info_path: path to CSV file to export the information about the TAG ending CDS
:param genome_filepath: path to reference genome
:return: a dictionary with the predicted CDS represented by CDS object
"""
pred_cds = {}
pred_cds_info = {}
tag_ending_cds_info = {}
origin_seqs = extract_seqs(genome_filepath)
for record in SeqIO.parse(pred_cds_path.as_posix(), "fasta"):
cds_obj = cds.CDS()
cds_obj.init_from_record(record)
cds_id = cds_obj.get_id()
origin_seq_id = cds_obj.get_origin_seq_id()
strand = cds_obj.get_strand()
start = cds_obj.get_start()
end = cds_obj.get_end()
stop_codon = cds_obj.get_stop_codon()
pred_cds[cds_id] = cds_obj
pred_cds_info[cds_id] = {"start": start, "end": end, "strand": strand, "origin_seq": origin_seq_id, "stop_codon": stop_codon}
if cds_obj.is_tag_ending_seq():
tag_ending_cds_info[cds_id] = {"start": start, "end": end, "strand": strand, "origin_seq": origin_seq_id}
cds_obj.set_origin_seq(origin_seqs[origin_seq_id])
cds_obj.set_status("tag-ending")
export.export_csv(pred_cds_info, pred_cds_info_path, ["start", "end", "strand", "origin_seq", "stop_codon"])
export.export_csv(tag_ending_cds_info, tag_ending_cds_info_path, ["start", "end", "strand", "origin_seq"])
logging.info("Number of predicted CDS: %s\n" % (len(pred_cds.keys())))
logging.info("Number of TAG-ending predicted CDS: %s\n" % (len(tag_ending_cds_info.keys())))
return pred_cds
[docs]def extract_potential_pyl_cds(
pred_cds, pot_pyl_cds_filepath, pot_pyl_cds_info_filepath, pred_cds_obj_filepath):
"""Extract potential PYL CDS from TAG-ending CDS
:param pred_cds: a dictionary with the predicted CDS represented as CDS objects
:param pot_pyl_cds_filepath: path to fasta file in which the protein sequences of the potential PYL CDS are saved
:param pot_pyl_cds_info_filepath: path to a cvs file to get information about potential PYL CDS
:param pred_cds_obj_filepath: path to generated JSON file to store the list of predicted CDS objects
"""
info = {}
sequences = []
json_pred_cds = {}
for cds_id in pred_cds:
cds_obj = pred_cds[cds_id]
if cds_obj.is_tag_ending():
logging.info("\tCDS: %s" % (cds_id))
cds_obj.find_alternative_ends()
cds_obj.extract_possible_alternative_seq()
if cds_obj.has_alternative_cds():
info[cds_id] = {
"start": cds_obj.get_start(),
"end": cds_obj.get_end(),
"strand": cds_obj.get_strand(),
"origin_seq": cds_obj.get_origin_seq_id(),
"alternative_start": ";".join([str(s) for s in cds_obj.get_alternative_start()]),
"alternative_end": ";".join([str(s) for s in cds_obj.get_alternative_end()])}
sequences.append(cds_obj.get_translated_seq())
sequences += cds_obj.get_translated_alternative_seq()
cds_obj.set_status("potential pyl")
json_pred_cds.update(pred_cds[cds_id].export_to_dict())
logging.info("Number of potential Pyl proteins: %s\n" % (len(info.keys())))
export.export_fasta(sequences, pot_pyl_cds_filepath)
export.export_csv(
info,
pot_pyl_cds_info_filepath,
["start", "end", "strand", "origin_seq", "alternative_start", "alternative_end"])
export.export_json(json_pred_cds, pred_cds_obj_filepath)
[docs]def predict_pyl_proteins(
genome_filepath, pred_cds_filepath, pot_pyl_seq_filepath, log_filepath,
pred_cds_info_filepath, tag_ending_cds_info_filepath,
pot_pyl_seq_info_filepath, pred_cds_obj_filepath):
"""
Run prediction of potentila PYL CDS:
- Extraction of predicted CDS into a dictionary
- Identification of TAG-ending proteins
- Extraction of potential PYL sequences
:param genome_filepath: path to file with genome sequence
:param pred_cds_filepath: path to the output of CDS prediction (Prodigal)
:param pot_pyl_seq_filepath: path to fasta file with potential PYL CDS sequence
:param log_filepath: path to log file
:param pred_cds_info_filepath: path to CSV file with predicted CDS info
:param tag_ending_cds_info_filepath: path to CSV file with TAG-ending CDS info
:param pot_pyl_seq_info_filepath: path to CSV file with potential PYL CDS info
:param pred_cds_obj_filepath: path to generated JSON file to store the list of predicted CDS objects
"""
logging.basicConfig(filename=log_filepath, level=logging.INFO, filemode="w")
pred_cds = extract_predicted_cds(
pred_cds_filepath,
pred_cds_info_filepath,
tag_ending_cds_info_filepath,
genome_filepath)
extract_potential_pyl_cds(
pred_cds,
pot_pyl_seq_filepath,
pot_pyl_seq_info_filepath,
pred_cds_obj_filepath)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Predict potential PYL CDS')
parser.add_argument('--genome', help='path to FASTA file with genome (whole or scaffold) on which predict proteins')
parser.add_argument('--pred_cds', help='path to FASTA file with predicted CDS of the genome')
parser.add_argument('--pot_pyl_seq', help='path to generated FASTA file with potential PYL CDS sequence')
parser.add_argument('--log', help='path to generated log file')
parser.add_argument('--pred_cds_info', help='path to generated CSV file with predicted CDS info')
parser.add_argument('--pred_cds_obj', help='path to generated JSON file to store the list of predicted CDS object')
parser.add_argument('--tag_ending_cds_info', help='path to generated CSV file with TAG-ending CDS info')
parser.add_argument('--pot_pyl_seq_info', help='path to generated CSV file with potential PYL CDS info')
args = parser.parse_args()
predict_pyl_proteins(
genome_filepath=Path(args.genome),
pred_cds_filepath=Path(args.pred_cds),
pot_pyl_seq_filepath=Path(args.pot_pyl_seq),
log_filepath=Path(args.log),
pred_cds_info_filepath=Path(args.pred_cds_info),
tag_ending_cds_info_filepath=Path(args.tag_ending_cds_info),
pot_pyl_seq_info_filepath=Path(args.pot_pyl_seq_info),
pred_cds_obj_filepath=Path(args.pred_cds_obj))