Skip to content

Commit

Permalink
Merge pull request #114 from iacopy/113-create-genes-queries
Browse files Browse the repository at this point in the history
Create JSON queries from list of genes
  • Loading branch information
iacopy authored Sep 28, 2024
2 parents 608f3f1 + 2da8117 commit 59e5194
Show file tree
Hide file tree
Showing 2 changed files with 283 additions and 0 deletions.
196 changes: 196 additions & 0 deletions src/querygenes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# pylint: disable=duplicate-code
"""
Generate a JSON query for each gene.
Useful to obtain protein structures for a list of genes, e.g. for the GABA receptor family.
Usage:
$ python querygenes.py NAME GENE1 GENE2 --types experimental computational
"""

# Standard Library
import argparse
import json
import os

# JSON query template for experimentally determined structures
# (results_content_type "experimental"). Three conditions are combined with
# a logical "and": the polymer type is "Protein"; the experimental method is
# X-RAY DIFFRACTION, SOLUTION NMR or ELECTRON MICROSCOPY (logical "or"); and
# the gene name exactly matches the "$gene_name" placeholder, which main()
# replaces with each requested gene name.
# NOTE(review): the attribute names suggest the RCSB PDB search API -- confirm.
TEMPLATE_EXP = """
{
"query": {
"type": "group",
"logical_operator": "and",
"nodes": [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "entity_poly.rcsb_entity_polymer_type",
"operator": "exact_match",
"negation": false,
"value": "Protein"
}
},
{
"type": "group",
"nodes": [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "exptl.method",
"operator": "exact_match",
"negation": false,
"value": "X-RAY DIFFRACTION"
}
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "exptl.method",
"operator": "exact_match",
"negation": false,
"value": "SOLUTION NMR"
}
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "exptl.method",
"operator": "exact_match",
"negation": false,
"value": "ELECTRON MICROSCOPY"
}
}
],
"logical_operator": "or"
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
"operator": "exact_match",
"negation": false,
"value": "$gene_name"
}
}
],
"label": "text"
},
"return_type": "entry",
"request_options": {
"return_all_hits": true,
"results_content_type": [
"experimental"
],
"sort": [
{
"sort_by": "score",
"direction": "desc"
}
],
"scoring_strategy": "combined"
}
}
""".strip()

# JSON query template for computational structures
# (results_content_type "computational"). Two conditions are combined with a
# logical "and": the polymer type is "Protein", and the gene name exactly
# matches the "$gene_name" placeholder, which main() replaces with each
# requested gene name. Unlike TEMPLATE_EXP, there is no filter on the
# experimental method.
TEMPLATE_COM = """
{
"query": {
"type": "group",
"logical_operator": "and",
"nodes": [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "entity_poly.rcsb_entity_polymer_type",
"operator": "exact_match",
"negation": false,
"value": "Protein"
}
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
"operator": "exact_match",
"negation": false,
"value": "$gene_name"
}
}
],
"label": "text"
},
"return_type": "entry",
"request_options": {
"return_all_hits": true,
"results_content_type": [
"computational"
],
"sort": [
{
"sort_by": "score",
"direction": "desc"
}
],
"scoring_strategy": "combined"
}
}
""".strip()


# Structure type -> query template; the keys are the values accepted in `types`.
TEMPLATES = {
    "experimental": TEMPLATE_EXP,
    "computational": TEMPLATE_COM,
}


def _json_escape(text: str) -> str:
    """Return *text* escaped for insertion inside a double-quoted JSON string."""
    if not isinstance(text, str):
        raise TypeError(f"Gene names must be strings, got {type(text).__name__}")
    # json.dumps adds the surrounding quotes; the templates already provide
    # them, so only the escaped content is kept. ensure_ascii=False leaves
    # non-ASCII characters as they are (the files are written as UTF-8).
    return json.dumps(text, ensure_ascii=False)[1:-1]


def main(
    name: str,
    gene_names: list,
    output: str = "templates",
    types: tuple = ("experimental",),
):
    """
    Create one JSON query file per gene and per structure type.

    Each file is written to ``{output}/{name}/{type}/queries/{gene_name}.json``.
    Missing directories are created and existing files are overwritten.
    A line is printed for every file saved.

    Args:
        name (str): the name of the directory that will be created
        gene_names (list): the list of genes
        output (str): the root directory path
        types (tuple): the types of protein structures (experimental and/or computational)

    Raises:
        ValueError: if ``name``, ``gene_names`` or ``types`` is empty
        KeyError: if a requested type has no entry in ``TEMPLATES``
        TypeError: if a gene name is not a string
    """
    if not name:
        raise ValueError("The name of the project must be provided")
    if not gene_names:
        raise ValueError("At least one gene name must be provided")
    if not types:
        raise ValueError("At least one type of protein structure must be provided")

    # Resolve every template up front so that an unknown type fails before
    # any directory or file has been created (no partial output).
    selected = [(type_, TEMPLATES[type_]) for type_ in types]

    project_dir = f"{output}/{name}"
    for type_, template in selected:
        sub_dir = f"{project_dir}/{type_}/queries"
        os.makedirs(sub_dir, exist_ok=True)
        for gene_name in gene_names:
            # Escape the gene name so that quotes, backslashes or control
            # characters cannot produce an invalid JSON document.
            query = template.replace("$gene_name", _json_escape(gene_name))
            # NOTE: the gene name is used verbatim as the file name.
            file_name = f"{sub_dir}/{gene_name}.json"
            with open(file_name, "w", encoding="utf-8") as file:
                file.write(query)
            print(f"{type_} query for gene {gene_name} saved to {file_name}")


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and generate the queries.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "name", help="Name of the project/group (protein family or similar)"
    )
    parser.add_argument("gene_names", nargs="+", help="Gene names to search for")
    parser.add_argument("--output", default="templates", help="Output directory")
    # --types takes one or more values, so on the command line it should be
    # placed after the positional arguments; otherwise it would also consume
    # the project name and the gene names.
    # `choices` rejects unknown types with a usage error instead of letting
    # main() fail with a KeyError traceback.
    parser.add_argument(
        "--types",
        nargs="+",
        choices=sorted(TEMPLATES),
        default=["experimental", "computational"],
        help="Types of queries to generate",
    )
    args = parser.parse_args()

    main(args.name, args.gene_names, args.output, args.types)
87 changes: 87 additions & 0 deletions tests/test_querygenes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Test module for querygenes.py
"""

# Standard Library
import json
import os

# 3rd party
import pytest

# My stuff
from querygenes import main


def load_json(file_path):
    """
    Read the JSON document stored at file_path and return its decoded content
    """
    with open(file_path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def assert_file_contains_gene_query(file_path, gene_name, node_index):
    """
    Check that the query stored in file_path searches for gene_name.

    Only the parameters of the node at node_index are compared: they must be
    an exact match on the rcsb_gene_name attribute with gene_name as value.
    The other nodes of the query are not inspected.

    Args:
        file_path (str): the path to the JSON query file to check
        gene_name (str): the gene name the query is expected to look for
        node_index (int): the position of the gene node in the query nodes
    """
    expected_parameters = {
        "attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
        "operator": "exact_match",
        "negation": False,
        "value": gene_name,
    }
    query_nodes = load_json(file_path)["query"]["nodes"]
    assert query_nodes[node_index]["parameters"] == expected_parameters


def test_main_with_default_arguments(tmpdir):
    """main() should write one query file per gene for each requested type"""
    project = "test_project"
    genes = ["gene1", "gene2"]
    root = str(tmpdir)
    # Position of the gene-name node inside the query nodes, per query type.
    gene_node_index = {"experimental": 2, "computational": 1}

    main(project, genes, root, list(gene_node_index))

    for structure_type, node_index in gene_node_index.items():
        for gene in genes:
            expected_file = os.path.join(
                root, project, structure_type, "queries", f"{gene}.json"
            )
            assert os.path.exists(expected_file), f"{expected_file} does not exist"
            assert_file_contains_gene_query(expected_file, gene, node_index)


def test_main_with_empty_name():
    """main() should raise a ValueError if the name of the project is empty"""
    # `match` ensures the error comes from the project-name check and not
    # from any other ValueError raised along the way.
    with pytest.raises(ValueError, match="name of the project must be provided"):
        main("", ["GENE1", "GENE2"])


def test_main_with_no_gene_names():
    """main() should raise a ValueError if no gene names are provided"""
    # `match` ensures the error comes from the gene-names check and not
    # from any other ValueError raised along the way.
    with pytest.raises(ValueError, match="At least one gene name must be provided"):
        main("project_name", [])


def test_main_with_no_types():
    """main() should raise a ValueError if no types are provided"""
    # `match` ensures the error comes from the types check and not from
    # any other ValueError raised along the way.
    with pytest.raises(
        ValueError, match="At least one type of protein structure must be provided"
    ):
        main("project_name", ["GENE1", "GENE2"], types=[])

0 comments on commit 59e5194

Please sign in to comment.