Skip to content

Commit

Permalink
Add option to create taxon_file.tsv (issue #35).
Browse files Browse the repository at this point in the history
Update function modifying ptools-init.dat (issue #34).
  • Loading branch information
ArnaudBelcour committed Jan 6, 2020
1 parent d7154f2 commit 2d93267
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 22 deletions.
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ mpwt can be used in a python script with an import:
patho_inference=optional_boolean,
patho_hole_filler=optional_boolean,
patho_operon_predictor=optional_boolean,
patho_citations=optional_boolean,
no_download_articles=optional_boolean,
dat_creation=optional_boolean,
dat_extraction=optional_boolean,
size_reduction=optional_boolean,
Expand All @@ -303,7 +303,7 @@ mpwt can be used in a python script with an import:
+-------------------------+------------------------------------------------+-------------------------------------------------------------------------+
| --op | patho_operon_predictor(boolean) | Launch PathoLogic Operon Predictor |
+-------------------------+------------------------------------------------+-------------------------------------------------------------------------+
| --nc | patho_citations(boolean) | Launch PathoLogic without loading PubMed citations |
| --nc | no_download_articles(boolean) | Launch PathoLogic without loading PubMed citations |
+-------------------------+------------------------------------------------+-------------------------------------------------------------------------+
| --dat | dat_creation(boolean) | Create BioPAX/attribute-value dat files |
+-------------------------+------------------------------------------------+-------------------------------------------------------------------------+
Expand Down Expand Up @@ -386,7 +386,7 @@ Convert Genbank and GFF files into PathoLogic files then create PGDBs of studied
mpwt.create_pathologic_file(input_folder='path/to/folder/input', output_folder='path/to/folder/pf')
mpwt.multiprocess_pwt(input_folder='path/to/folder/pf', patho_inference=True)
Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations:
Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon Predictor and without loading PubMed citations (requires Pathway Tools 23.5 or higher):

..
Expand All @@ -401,7 +401,7 @@ Create PGDBs of studied organisms inside ptools-local with Hole Filler, Operon P
patho_inference=True,
patho_hole_filler=True,
patho_operon_predictor=True,
patho_citations=True,
no_download_articles=True,
patho_log='path/to/folder/log')
Create PGDBs of studied organisms inside ptools-local and create dat files:
Expand Down
4 changes: 2 additions & 2 deletions mpwt/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def run_mpwt():
patho_inference = args['--patho']
patho_hole_filler = args['--hf']
patho_operon_predictor = args['--op']
patho_citations = args['--nc']
no_download_articles = args['--nc']
dat_creation = args['--dat']
move_dat = args['--md']
size_reduction = args['-r']
Expand Down Expand Up @@ -116,7 +116,7 @@ def run_mpwt():
patho_inference=patho_inference,
patho_hole_filler=patho_hole_filler,
patho_operon_predictor=patho_operon_predictor,
patho_citations=patho_citations,
no_download_articles=no_download_articles,
dat_creation=dat_creation,
dat_extraction=move_dat,
size_reduction=size_reduction,
Expand Down
36 changes: 25 additions & 11 deletions mpwt/mpwt_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
-check the results (results_check)
"""

import csv
import logging
import os
import shutil
Expand All @@ -14,7 +15,7 @@
from mpwt import utils
from mpwt.pwt_wrapper import run_pwt, run_pwt_dat, run_move_pgdb
from mpwt.results_check import check_dat, check_pwt, permission_change
from mpwt.pathologic_input import check_input_and_existing_pgdb, create_mpwt_input, pwt_input_files, create_only_dat_lisp, create_dat_creation_script
from mpwt.pathologic_input import check_input_and_existing_pgdb, create_mpwt_input, pwt_input_files, create_only_dat_lisp, create_dat_creation_script, read_taxon_id
from multiprocessing import Pool

logging.basicConfig(format='%(message)s', level=logging.CRITICAL)
Expand All @@ -23,10 +24,10 @@


def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None,
patho_hole_filler=None, patho_operon_predictor=None, patho_citations=None,
patho_hole_filler=None, patho_operon_predictor=None, no_download_articles=None,
dat_creation=None, dat_extraction=None, size_reduction=None,
number_cpu=None, patho_log=None, ignore_error=None,
taxon_file=None, turn_off_citations=None, verbose=None):
taxon_file=None, verbose=None):
"""
Function managing all the workflow (from the creation of the input files to the results).
Use it when you import mpwt in a script.
Expand All @@ -37,7 +38,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
patho_inference (bool): PathoLogic inference (True/False)
patho_hole_filler (bool): PathoLogic hole filler (True/False)
patho_operon_predictor (bool): PathoLogic operon predictor (True/False)
patho_citations (bool): turning off loading of PubMed citations (True/False)
no_download_articles (bool): turning off loading of PubMed citations (True/False)
dat_creation (bool): BioPAX/attributes-values files creation (True/False)
dat_extraction (bool): BioPAX/attributes-values files extraction (True/False)
size_reduction (bool): delete ptools-local data at the end (True/False)
Expand Down Expand Up @@ -72,16 +73,16 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
sys.exit('To use --ignore-error/ignore_error, you need to use the --patho/patho_inference argument.')

# Check if taxon_file is used with patho_inference.
if taxon_file and not patho_inference:
sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument.')
if (taxon_file and not patho_inference) and (taxon_file and not input_folder):
sys.exit('To use --taxon-file/taxon_file, you need to use the --patho/patho_inference argument. Or you can use it with the -f argument to create the taxon file from data.')

#Check if patho_operon_predictor is used with patho_inference.
if patho_operon_predictor and not patho_inference:
sys.exit('To use --op/patho_operon_predictor, you need to use the --patho/patho_inference argument.')

#Check if patho_citations is used with patho_inference.
if patho_citations and not patho_inference:
sys.exit('To use --nc/patho_citations, you need to use the --patho/patho_inference argument.')
#Check if no_download_articles is used with patho_inference.
if no_download_articles and not patho_inference:
sys.exit('To use --nc/no_download_articles, you need to use the --patho/patho_inference argument.')

# Use the number of cpu given by the user or 1 CPU.
if number_cpu:
Expand All @@ -93,8 +94,21 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
number_cpu_to_use = 1
mpwt_pool = Pool(processes=number_cpu_to_use)

# Create taxon file in the input folder.
if taxon_file and input_folder and not patho_inference:
taxon_file_pathname = input_folder + '/taxon_id.tsv'
if os.path.exists(taxon_file_pathname):
sys.exit('taxon ID file (' + taxon_file_pathname + ') already exists.')
else:
taxon_ids = read_taxon_id(input_folder)
with open(taxon_file_pathname, 'w') as taxon_id_file:
taxon_id_writer = csv.writer(taxon_id_file, delimiter='\t')
taxon_id_writer.writerow(['species', 'taxon_id'])
for species, taxon_id in taxon_ids.items():
taxon_id_writer.writerow([species, taxon_id])

# Turn off loading of pubmed entries.
if patho_citations:
if no_download_articles:
utils.pubmed_citations(activate_citations=False)

# Check input folder and create input files for PathoLogic.
Expand Down Expand Up @@ -217,7 +231,7 @@ def multiprocess_pwt(input_folder=None, output_folder=None, patho_inference=None
mpwt_pool.join()

# Turn on loading of pubmed entries.
if patho_citations:
if no_download_articles:
utils.pubmed_citations(activate_citations=True)

end_time = time.time()
Expand Down
49 changes: 49 additions & 0 deletions mpwt/pathologic_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,55 @@ def create_dats_and_lisp(run_folder, taxon_file):
return all([os.path.isfile(organism_dat), os.path.isfile(genetic_dat), check_lisp_file])


def read_taxon_id(run_folder):
    """
    Retrieve the taxon ID of every organism sub-folder of run_folder.

    Each sub-folder is scanned for an input file:
    - GenBank ('.gbk' in the file name): the taxon ID is read from the db_xref
      qualifier ("taxon:<id>") of the source feature of the first record.
    - GFF ('.gff' in the file name): the taxon ID is read from the Dbxref
      attribute ("taxon:<id>") of the first region feature.
    - PathoLogic Format ('.pf' in the file name): no taxon ID is available.

    Args:
        run_folder (str): path to the folder containing one sub-folder per organism
    Returns:
        dict: {organism folder name: taxon ID (str)}, the value is "missing" when
        no taxon ID could be found for the organism
    Raises:
        IndexError: a GFF file does not contain any region feature
        KeyError: the first region feature of a GFF file has no Dbxref attribute
    """
    taxon_ids = {}

    # Sorted so the returned mapping (and any file written from it) is reproducible.
    for input_folder in sorted(os.listdir(run_folder)):
        species_pathname = os.path.join(run_folder, input_folder)
        # Entries that are not folders (stray files next to the organism folders)
        # are not organisms: listing them would raise NotADirectoryError.
        if not os.path.isdir(species_pathname):
            continue

        # Reset for every organism: without this, a folder lacking a taxon entry
        # would either crash on an unbound name or silently inherit the ID of the
        # previously processed organism.
        taxon_id = "missing"

        for input_file in sorted(os.listdir(species_pathname)):
            input_pathname = os.path.join(species_pathname, input_file)

            if '.gbk' in input_file:
                # Take the first record of the genbank (first contig/chromosome).
                # The parser is lazy, so the record must be read while the file is open.
                with open(input_pathname, "r") as gbk:
                    first_seq_record = next(SeqIO.parse(gbk, "genbank"), None)
                if first_seq_record is None:
                    logger.info('No record in the Genbank {0}'.format(input_pathname))
                    continue

                # The source feature contains the taxon ID in the db_xref qualifier.
                for feature in first_seq_record.features:
                    if feature.type != "source":
                        continue
                    try:
                        src_dbxref_qualifiers = feature.qualifiers['db_xref']
                    except KeyError:
                        logger.info('No taxon ID in the Genbank {0} In the FEATURES source you must have: /db_xref="taxon:taxonid" Where taxonid is the Id of your organism. You can find it on the NCBI.'.format(input_pathname))
                        continue
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace('taxon:', '')

            elif '.gff' in input_file:
                # Instead of creating a database from the GFF, iterate over the file
                # and stop at the first region feature.
                region_feature = next(
                    (feature for feature in DataIterator(input_pathname)
                     if feature.featuretype == 'region'),
                    None)
                if region_feature is None:
                    raise IndexError('No region feature in the GFF file of {0}, GFF file must have region features.'.format(input_folder))

                try:
                    dbxrefs = region_feature.attributes['Dbxref']
                except KeyError:
                    raise KeyError('No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'.format(input_folder))

                for dbxref in dbxrefs:
                    # Require the full 'taxon:' prefix so that the split below
                    # always yields an ID.
                    if 'taxon:' in dbxref:
                        taxon_id = dbxref.split('taxon:')[1]

            elif '.pf' in input_file:
                logger.info('No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'.format(input_folder))
                taxon_id = "missing"

        taxon_ids[input_folder] = taxon_id

    return taxon_ids


def pwt_input_files(multiprocess_input):
"""
Check if files needed by Pathway Tools are available, if not create them.
Expand Down
2 changes: 1 addition & 1 deletion mpwt/results_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def check_pwt(multiprocess_inputs, patho_log_folder):
if patho_log_folder:
patho_error_file.write(line)

if 'Build done.' in line:
if 'Build done.' in line or 'PGDB build done.' in line:
if patho_log_folder:
patho_error_file.write(line)
resume_inference_line = next(input_file)
Expand Down
9 changes: 5 additions & 4 deletions mpwt/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,17 +379,18 @@ def pubmed_citations(activate_citations):
"""
Activate or deactivate loading of PubMed citations.
TODO: update this function with the argument from the new version of Pathway Tools
Args:
activate_citations (bool): boolean indicating whether to activate the download of PubMed entries.
"""
ptools_init_filepath = find_ptools_path() + '/ptools-init.dat'
new_ptools_file = ""
with open(ptools_init_filepath, 'r') as ptools_init_file:
for line in ptools_init_file.read().split('\n'):
if '##download-pubmed-citations' in line:
if '###Batch-PathoLogic-Download-Pubmed-Entries?' in line:
if activate_citations:
line = line.replace('N', 'Y')
line = line.replace('F', 'T')
else:
line = line.replace('Y', 'N')
line = line.replace('T', 'F')
if line != '':
new_ptools_file = new_ptools_file + line + '\n'
else:
Expand Down

0 comments on commit 2d93267

Please sign in to comment.