-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
querygenes: list of genes -> JSON queries
Crea un JSON file per query Issue #113
- Loading branch information
Showing
1 changed file
with
188 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# pylint: disable=duplicate-code | ||
""" | ||
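Generate a JSON query file for each gene.

Usage:
    $ python querygenes.py NAME GENE1 GENE2 --types experimental computational

``--types`` accepts several values, so pass it after the positional arguments;
otherwise it consumes the gene names as well.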
""" | ||
|
||
# Standard Library | ||
import argparse
import json
import os
|
||
# Query template for experimentally determined structures.
# It matches protein entities whose method is X-ray diffraction, solution NMR
# or electron microscopy, and whose gene name equals the ``$gene_name``
# placeholder exactly. The placeholder is substituted once per gene.
TEMPLATE_EXP = """
{
  "query": {
    "type": "group",
    "logical_operator": "and",
    "nodes": [
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "entity_poly.rcsb_entity_polymer_type",
          "operator": "exact_match",
          "negation": false,
          "value": "Protein"
        }
      },
      {
        "type": "group",
        "nodes": [
          {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "exptl.method",
              "operator": "exact_match",
              "negation": false,
              "value": "X-RAY DIFFRACTION"
            }
          },
          {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "exptl.method",
              "operator": "exact_match",
              "negation": false,
              "value": "SOLUTION NMR"
            }
          },
          {
            "type": "terminal",
            "service": "text",
            "parameters": {
              "attribute": "exptl.method",
              "operator": "exact_match",
              "negation": false,
              "value": "ELECTRON MICROSCOPY"
            }
          }
        ],
        "logical_operator": "or"
      },
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
          "operator": "exact_match",
          "negation": false,
          "value": "$gene_name"
        }
      }
    ],
    "label": "text"
  },
  "return_type": "entry",
  "request_options": {
    "return_all_hits": true,
    "results_content_type": [
      "experimental"
    ],
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}
""".strip()
|
||
# Query template for computed structure models.
# It matches protein entities whose gene name equals the ``$gene_name``
# placeholder exactly, with no filter on the experimental method.
# The placeholder is substituted once per gene.
TEMPLATE_COM = """
{
  "query": {
    "type": "group",
    "logical_operator": "and",
    "nodes": [
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "entity_poly.rcsb_entity_polymer_type",
          "operator": "exact_match",
          "negation": false,
          "value": "Protein"
        }
      },
      {
        "type": "terminal",
        "service": "text",
        "parameters": {
          "attribute": "rcsb_entity_source_organism.rcsb_gene_name.value",
          "operator": "exact_match",
          "negation": false,
          "value": "$gene_name"
        }
      }
    ],
    "label": "text"
  },
  "return_type": "entry",
  "request_options": {
    "return_all_hits": true,
    "results_content_type": [
      "computational"
    ],
    "sort": [
      {
        "sort_by": "score",
        "direction": "desc"
      }
    ],
    "scoring_strategy": "combined"
  }
}
""".strip()
|
||
|
||
# Lookup table from structure type name to the query template used for it.
TEMPLATES = {
    "experimental": TEMPLATE_EXP,
    "computational": TEMPLATE_COM,
}
|
||
|
||
def main(
    name: str,
    gene_names: list,
    output: str = "templates",
    types: tuple = ("experimental",),
) -> None:
    """
    Write one JSON query file per gene and per structure type.

    Files are saved as ``<output>/<name>/<type>/queries/<gene>.json``. Missing
    directories are created and existing query files are overwritten.

    Args:
        name (str): the name of the project directory created under ``output``
        gene_names (list): the gene names to generate queries for
        output (str): the root directory path
        types (tuple): the types of protein structures (experimental and/or
            computational); each one must be a key of ``TEMPLATES``

    Raises:
        KeyError: if an entry of ``types`` is not a key of ``TEMPLATES``. This
            is raised before any directory or file is created.
    """
    # Resolve every template up front so an unknown type fails before anything
    # is written to disk.
    selected = [(type_, TEMPLATES[type_]) for type_ in types]

    project_dir = f"{output}/{name}"
    for type_, template in selected:
        sub_dir = f"{project_dir}/{type_}/queries"
        os.makedirs(sub_dir, exist_ok=True)
        for gene_name in gene_names:
            # JSON-escape the name so a quote or backslash cannot break the
            # generated document. json.dumps wraps the value in quotes, which
            # the template already provides, hence the [1:-1] slice.
            escaped_name = json.dumps(gene_name, ensure_ascii=False)[1:-1]
            query = template.replace("$gene_name", escaped_name)
            file_name = f"{sub_dir}/{gene_name}.json"
            with open(file_name, "w", encoding="utf-8") as file:
                file.write(query)
            print(f"{type_} query for gene {gene_name} saved to {file_name}")
|
||
|
||
if __name__ == "__main__":
    # Command line: NAME GENE [GENE ...] [--output DIR] [--types TYPE [TYPE ...]]
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "name", help="Name of the project/group (protein family or similar)"
    )
    parser.add_argument("gene_names", nargs="+", help="Gene names to search for")
    parser.add_argument("--output", default="templates", help="Output directory")
    parser.add_argument(
        "--types",
        nargs="+",
        # Restrict values to the known templates so a typo is reported as a
        # usage error instead of a KeyError traceback from main().
        choices=sorted(TEMPLATES),
        default=["experimental", "computational"],
        help="Types of queries to generate",
    )
    args = parser.parse_args()

    main(args.name, args.gene_names, args.output, args.types)