Skip to content

Commit

Permalink
querygenes: list of genes -> JSON queries
Browse files Browse the repository at this point in the history
Crea un JSON file per query

Issue #113
  • Loading branch information
iacopy committed Sep 28, 2024
1 parent 608f3f1 commit aeeccb7
Showing 1 changed file with 188 additions and 0 deletions.
188 changes: 188 additions & 0 deletions src/querygenes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# pylint: disable=duplicate-code
"""
Generate a JSON query for each gene.
Usage:
$ python querygenes.py NAME GENE1 GENE2 --types experimental computational
"""

# Standard Library
import argparse
import json
import os

# JSON query template for experimental structures.
# The query is an "and" group of three conditions: the polymer type is
# "Protein", the experimental method is one of X-RAY DIFFRACTION, SOLUTION NMR
# or ELECTRON MICROSCOPY ("or" sub-group), and the source-organism gene name
# exactly matches the "$gene_name" placeholder, which main() substitutes with
# the actual gene name.
# NOTE(review): the attribute names look like the RCSB PDB Search API schema;
# confirm against the service these queries are sent to.
# .strip() drops the newlines that surround the literal.
TEMPLATE_EXP = """
{
"query": {
"type": "group",
"logical_operator": "and",
"nodes": [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "entity_poly.rcsb_entity_polymer_type",
"operator": "exact_match",
"negation": false,
"value": "Protein"
}
},
{
"type": "group",
"nodes": [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "exptl.method",
"operator": "exact_match",
"negation": false,
"value": "X-RAY DIFFRACTION"
}
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "exptl.method",
"operator": "exact_match",
"negation": false,
"value": "SOLUTION NMR"
}
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "exptl.method",
"operator": "exact_match",
"negation": false,
"value": "ELECTRON MICROSCOPY"
}
}
],
"logical_operator": "or"
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
"operator": "exact_match",
"negation": false,
"value": "$gene_name"
}
}
],
"label": "text"
},
"return_type": "entry",
"request_options": {
"return_all_hits": true,
"results_content_type": [
"experimental"
],
"sort": [
{
"sort_by": "score",
"direction": "desc"
}
],
"scoring_strategy": "combined"
}
}
""".strip()

# JSON query template for computational structures.
# The query is an "and" group of two conditions: the polymer type is "Protein"
# and the source-organism gene name exactly matches the "$gene_name"
# placeholder, which main() substitutes with the actual gene name. Unlike the
# experimental template it has no filter on the experimental method, and it
# requests the "computational" results content type.
# .strip() drops the newlines that surround the literal.
TEMPLATE_COM = """
{
"query": {
"type": "group",
"logical_operator": "and",
"nodes": [
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "entity_poly.rcsb_entity_polymer_type",
"operator": "exact_match",
"negation": false,
"value": "Protein"
}
},
{
"type": "terminal",
"service": "text",
"parameters": {
"attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
"operator": "exact_match",
"negation": false,
"value": "$gene_name"
}
}
],
"label": "text"
},
"return_type": "entry",
"request_options": {
"return_all_hits": true,
"results_content_type": [
"computational"
],
"sort": [
{
"sort_by": "score",
"direction": "desc"
}
],
"scoring_strategy": "combined"
}
}
""".strip()


# Maps each supported query type to its JSON template; main() looks the
# template up by these keys and also uses the key as a directory name.
TEMPLATES = {"experimental": TEMPLATE_EXP, "computational": TEMPLATE_COM}


def main(
    name: str,
    gene_names: list,
    output: str = "templates",
    types: tuple = ("experimental",),
):
    """
    Create one JSON query file per gene and per structure type.

    Each file is written to ``{output}/{name}/{type}/queries/{gene}.json``.
    Missing directories are created and existing files are overwritten.

    Args:
        name (str): the name of the directory that will be created
        gene_names (list): the list of genes (strings)
        output (str): the root directory path
        types (tuple): the types of protein structures (experimental and/or
            computational); every entry must be a key of ``TEMPLATES``

    Raises:
        KeyError: if an entry of ``types`` is not a key of ``TEMPLATES``.
        TypeError: if a gene name is not a string.
    """
    # Resolve every template before touching the filesystem, so an unknown
    # type fails up front instead of after some files were already written.
    selected = [(type_, TEMPLATES[type_]) for type_ in types]

    project_dir = f"{output}/{name}"
    for type_, template in selected:
        sub_dir = f"{project_dir}/{type_}/queries"
        os.makedirs(sub_dir, exist_ok=True)
        for gene_name in gene_names:
            if not isinstance(gene_name, str):
                raise TypeError(
                    f"gene name must be str, not {type(gene_name).__name__}"
                )
            # The placeholder sits inside a JSON string literal, so the name
            # must be JSON-escaped (quotes, backslashes, control characters)
            # to keep the generated file valid. json.dumps() returns the
            # quoted literal; [1:-1] drops the surrounding quotes.
            escaped = json.dumps(gene_name, ensure_ascii=False)[1:-1]
            query = template.replace("$gene_name", escaped)
            file_name = f"{sub_dir}/{gene_name}.json"
            with open(file_name, "w", encoding="utf-8") as file:
                file.write(query)
            print(f"{type_} query for gene {gene_name} saved to {file_name}")


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and forward them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "name", help="Name of the project/group (protein family or similar)"
    )
    parser.add_argument("gene_names", nargs="+", help="Gene names to search for")
    parser.add_argument("--output", default="templates", help="Output directory")
    parser.add_argument(
        "--types",
        nargs="+",
        # Restrict the values to the known templates so that a typo is
        # reported as a usage error instead of a KeyError traceback.
        choices=sorted(TEMPLATES),
        default=["experimental", "computational"],
        help="Types of queries to generate",
    )
    args = parser.parse_args()

    main(args.name, args.gene_names, args.output, args.types)

0 comments on commit aeeccb7

Please sign in to comment.