-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #114 from iacopy/113-create-genes-queries
Create JSON queries from list of genes
- Loading branch information
Showing 2 changed files with 283 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
# pylint: disable=duplicate-code | ||
""" | ||
Generate a JSON query for each gene. | ||
Useful to obtain protein structures for a list of genes, e.g. for the GABA receptor family. | ||
Usage: | ||
$ python querygenes.py NAME GENE1 GENE2 --types experimental computational | ||
""" | ||
|
||
# Standard Library
import argparse
import json
import os
|
||
# JSON query template for experimentally determined structures | ||
# (results_content_type "experimental"). The three top-level nodes are combined | ||
# with "and": polymer type "Protein"; an experimental method of X-RAY DIFFRACTION, | ||
# SOLUTION NMR or ELECTRON MICROSCOPY (inner "or" group); and a gene name that | ||
# exactly matches the $gene_name placeholder, which main() replaces per gene. | ||
# The gene-name node is at index 2 of "nodes". | ||
TEMPLATE_EXP = """ | ||
{ | ||
"query": { | ||
"type": "group", | ||
"logical_operator": "and", | ||
"nodes": [ | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "entity_poly.rcsb_entity_polymer_type", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "Protein" | ||
} | ||
}, | ||
{ | ||
"type": "group", | ||
"nodes": [ | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "exptl.method", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "X-RAY DIFFRACTION" | ||
} | ||
}, | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "exptl.method", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "SOLUTION NMR" | ||
} | ||
}, | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "exptl.method", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "ELECTRON MICROSCOPY" | ||
} | ||
} | ||
], | ||
"logical_operator": "or" | ||
}, | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "rcsb_entity_source_organism.rcsb_gene_name.value", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "$gene_name" | ||
} | ||
} | ||
], | ||
"label": "text" | ||
}, | ||
"return_type": "entry", | ||
"request_options": { | ||
"return_all_hits": true, | ||
"results_content_type": [ | ||
"experimental" | ||
], | ||
"sort": [ | ||
{ | ||
"sort_by": "score", | ||
"direction": "desc" | ||
} | ||
], | ||
"scoring_strategy": "combined" | ||
} | ||
} | ||
""".strip() | ||
|
||
# JSON query template for computational structures | ||
# (results_content_type "computational"). Two nodes are combined with "and": | ||
# polymer type "Protein" and a gene name that exactly matches the $gene_name | ||
# placeholder, which main() replaces per gene. Unlike TEMPLATE_EXP there is no | ||
# experimental-method filter, so the gene-name node is at index 1 of "nodes". | ||
TEMPLATE_COM = """ | ||
{ | ||
"query": { | ||
"type": "group", | ||
"logical_operator": "and", | ||
"nodes": [ | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "entity_poly.rcsb_entity_polymer_type", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "Protein" | ||
} | ||
}, | ||
{ | ||
"type": "terminal", | ||
"service": "text", | ||
"parameters": { | ||
"attribute": "rcsb_entity_source_organism.rcsb_gene_name.value", | ||
"operator": "exact_match", | ||
"negation": false, | ||
"value": "$gene_name" | ||
} | ||
} | ||
], | ||
"label": "text" | ||
}, | ||
"return_type": "entry", | ||
"request_options": { | ||
"return_all_hits": true, | ||
"results_content_type": [ | ||
"computational" | ||
], | ||
"sort": [ | ||
{ | ||
"sort_by": "score", | ||
"direction": "desc" | ||
} | ||
], | ||
"scoring_strategy": "combined" | ||
} | ||
} | ||
""".strip() | ||
|
||
|
||
# Query template for each supported structure type, keyed by the type name
# that main() receives in its ``types`` argument.
TEMPLATES = {
    "experimental": TEMPLATE_EXP,
    "computational": TEMPLATE_COM,
}
|
||
|
||
def main(
    name: str,
    gene_names: list,
    output: str = "templates",
    types: tuple = ("experimental",),
) -> None:
    """
    Create one JSON query file per gene and per structure type.

    Files are written to ``{output}/{name}/{type}/queries/{gene_name}.json``.
    Missing directories are created and existing files are overwritten.

    Args:
        name (str): the name of the directory that will be created
        gene_names (list): the list of genes
        output (str): the root directory path
        types (tuple): the types of protein structures (experimental and/or computational)

    Raises:
        ValueError: if name, gene_names or types is empty
        KeyError: if one of the types has no entry in TEMPLATES
    """
    if not name:
        raise ValueError("The name of the project must be provided")
    if not gene_names:
        raise ValueError("At least one gene name must be provided")
    if not types:
        raise ValueError("At least one type of protein structure must be provided")

    # Resolve every template before touching the file system, so an unknown
    # type fails with KeyError without leaving directories or files behind.
    selected = [(type_, TEMPLATES[type_]) for type_ in types]

    project_dir = f"{output}/{name}"
    for type_, template in selected:
        sub_dir = f"{project_dir}/{type_}/queries"
        os.makedirs(sub_dir, exist_ok=True)
        for gene_name in gene_names:
            # The placeholder sits inside a JSON string literal: escape the name
            # as JSON (dropping the quotes json.dumps adds) so that a quote or a
            # backslash in it cannot produce an invalid document.
            escaped_name = json.dumps(str(gene_name), ensure_ascii=False)[1:-1]
            query = template.replace("$gene_name", escaped_name)
            file_name = f"{sub_dir}/{gene_name}.json"
            with open(file_name, "w", encoding="utf-8") as file:
                file.write(query)
            print(f"{type_} query for gene {gene_name} saved to {file_name}")
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and forward them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "name", help="Name of the project/group (protein family or similar)"
    )
    parser.add_argument("gene_names", nargs="+", help="Gene names to search for")
    parser.add_argument("--output", default="templates", help="Output directory")
    # --types takes one or more values, so it must come after the positional
    # arguments on the command line. ``choices`` makes argparse reject a type
    # that has no template with a usage error instead of a KeyError traceback.
    parser.add_argument(
        "--types",
        nargs="+",
        choices=sorted(TEMPLATES),
        default=["experimental", "computational"],
        help="Types of queries to generate",
    )
    args = parser.parse_args()

    main(args.name, args.gene_names, args.output, args.types)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
""" | ||
Test module for querygenes.py | ||
""" | ||
|
||
# Standard Library | ||
import json | ||
import os | ||
|
||
# 3rd party | ||
import pytest | ||
|
||
# My stuff | ||
from querygenes import main | ||
|
||
|
||
def load_json(file_path):
    """
    Read the UTF-8 encoded JSON file at file_path and return the parsed content
    """
    with open(file_path, encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed
|
||
|
||
def assert_file_contains_gene_query(file_path, gene_name, node_index):
    """
    Assert that the JSON file at file_path holds a query for gene_name.

    Only the gene-name node is checked: its parameters must be an exact match
    on the rcsb_gene_name attribute with gene_name as value. The other nodes of
    the query are not inspected.

    Args:
        file_path (str): the path to the file to check
        gene_name (str): the gene name to look for in the query
        node_index (int): the index of the gene-name node in the query's nodes
    """
    expected_parameters = {
        "attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
        "operator": "exact_match",
        "negation": False,
        "value": gene_name,
    }
    gene_node = load_json(file_path)["query"]["nodes"][node_index]
    assert gene_node["parameters"] == expected_parameters
|
||
|
||
def test_main_with_default_arguments(tmpdir):
    """main() should write one query file per gene for each structure type"""
    project = "test_project"
    genes = ["gene1", "gene2"]
    root = str(tmpdir)
    # Position of the gene-name node in each template's "nodes" list.
    gene_node_index = {"experimental": 2, "computational": 1}

    main(project, genes, root, list(gene_node_index))

    for structure_type, node_index in gene_node_index.items():
        for gene in genes:
            expected_file = os.path.join(
                root, project, structure_type, "queries", f"{gene}.json"
            )
            assert os.path.exists(expected_file), f"{expected_file} does not exist"
            assert_file_contains_gene_query(expected_file, gene, node_index)
|
||
|
||
def test_main_with_empty_name():
    """main() should raise a ValueError if the name of the project is empty"""
    # ``match`` pins the failure to the project-name check, so the test cannot
    # pass just because another validation raised ValueError.
    with pytest.raises(ValueError, match="name of the project"):
        main("", ["GENE1", "GENE2"])
|
||
|
||
def test_main_with_no_gene_names():
    """main() should raise a ValueError if no gene names are provided"""
    # ``match`` pins the failure to the gene-names check, so the test cannot
    # pass just because another validation raised ValueError.
    with pytest.raises(ValueError, match="At least one gene name"):
        main("project_name", [])
|
||
|
||
def test_main_with_no_types():
    """main() should raise a ValueError if no types are provided"""
    # ``match`` pins the failure to the types check, so the test cannot pass
    # just because another validation raised ValueError.
    with pytest.raises(ValueError, match="At least one type of protein structure"):
        main("project_name", ["GENE1", "GENE2"], types=[])