From eb75b15f2ea8ff0ace845c2bea854374560d454a Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Mon, 8 May 2023 22:11:00 -0400 Subject: [PATCH 1/8] fix!: use more precise name for CLI source option --- README.md | 6 ++-- .../loading_and_updating_data.rst | 7 +++-- gene/cli.py | 30 +++++++++---------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 0c457200..51b94089 100644 --- a/README.md +++ b/README.md @@ -73,15 +73,15 @@ Use the `gene_update` command in a shell to update the database. The normalizer currently pulls data from [HGNC](https://www.genenames.org/), [Ensembl](https://useast.ensembl.org/index.html), and [NCBI](https://www.ncbi.nlm.nih.gov/gene/). -To update one source, simply set `--normalizer` to the source you wish to update. The normalizer will check to see if local source data is up-to-date, acquire the most recent data if not, and use it to populate the database. +To update one source, simply set `--source` to the source you wish to update. The normalizer will check to see if local source data is up-to-date, acquire the most recent data if not, and use it to populate the database. For example, run the following to acquire the latest HGNC data if necessary, and update the HGNC gene records in the normalizer database: ```commandline -gene_update --normalizer="hgnc" +gene_update --source="hgnc" ``` -To update multiple sources, you can use the `--normalizer` option with the source names separated by spaces. +To update multiple sources, you can use the `--source` option with the source names separated by spaces. 
#### Update all sources diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index 01d59ea6..44b30dad 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -14,14 +14,15 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, use the ``--normalizer`` option along with source name(s), quoted and separated by spaces. While it is possible to update individual source data without updating the normalized record data with ``--update_merged``, the normalization query endpoints may not function properly until normalized data is refreshed again. :: +To update specific sources, use the ``--source`` option along with source name(s), quoted and separated by spaces. While it is possible to update individual source data without updating the normalized record data with ``--update_merged``, the normalization query endpoints may not function properly until normalized data is refreshed again. :: + + gene_norm_update --source="HGNC NCBI" --update_merged - gene_norm_update --normalizer="HGNC NCBI" --update_merged Check DB health --------------- -The shell command `gene_norm_check_db` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result. +The shell command `gene_norm_check_db` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result. :: % gene_norm_check_db % echo $? 
diff --git a/gene/cli.py b/gene/cli.py index b4abfa2c..611667e6 100644 --- a/gene/cli.py +++ b/gene/cli.py @@ -102,18 +102,18 @@ def dump_database(output_directory: Path, db_url: str): click.get_current_context().exit(1) -def _update_normalizers( - normalizers: Collection[SourceName], db: AbstractDatabase, update_merged: bool +def _update_normalizer( + sources: Collection[SourceName], db: AbstractDatabase, update_merged: bool ) -> None: """Update selected normalizer sources. - :param normalizers: names of sources to update + :param sources: names of sources to update :param db: database instance :param update_merged: if true, retain processed records to use in updating merged records """ processed_ids = list() - for n in normalizers: + for n in sources: delete_time = _delete_source(n, db) _load_source(n, db, delete_time, processed_ids) @@ -207,8 +207,8 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: @click.command() @click.option( - '--normalizer', - help="The normalizer(s) you wish to update separated by spaces." + '--source', + help="The source(s) you wish to update separated by spaces." ) @click.option( '--aws_instance', @@ -229,11 +229,11 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: is_flag=True, help='Update concepts for normalize endpoint from accepted sources.' ) -def update_normalizer_db(normalizer: str, aws_instance: bool, db_url: str, +def update_normalizer_db(source: str, aws_instance: bool, db_url: str, update_all: bool, update_merged: bool) -> None: """Update selected normalizer source(s) in the gene database. 
- :param normalizer: names of sources to update, comma-separated + :param source: names of sources to update, comma-separated :param aws_instance: if true, use cloud instance :param db_url: URI pointing to database :param update_all: if true, update all sources (ignore `normalizer` parameter) @@ -242,8 +242,8 @@ def update_normalizer_db(normalizer: str, aws_instance: bool, db_url: str, db = create_db(db_url, aws_instance) if update_all: - _update_normalizers(list(SourceName), db, update_merged) - elif not normalizer: + _update_normalizer(list(SourceName), db, update_merged) + elif not source: if update_merged: _load_merge(db, set()) else: @@ -252,18 +252,18 @@ def update_normalizer_db(normalizer: str, aws_instance: bool, db_url: str, click.echo(ctx.get_help()) ctx.exit() else: - normalizers = normalizer.lower().split() + sources = source.lower().split() - if len(normalizers) == 0: + if len(sources) == 0: raise Exception("Must enter a normalizer") - non_sources = set(normalizers) - set(SOURCES) + non_sources = set(sources) - set(SOURCES) if len(non_sources) != 0: raise Exception(f"Not valid source(s): {non_sources}") - sources_to_update = {SourceName(SOURCES[s]) for s in normalizers} - _update_normalizers(sources_to_update, db, update_merged) + sources_to_update = {SourceName(SOURCES[s]) for s in sources} + _update_normalizer(sources_to_update, db, update_merged) if __name__ == '__main__': From 9cd3483766cf5c1627a21530ac04dc054d4161cc Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 09:31:14 -0400 Subject: [PATCH 2/8] singular -> plural --- README.md | 6 +++--- .../loading_and_updating_data.rst | 4 ++-- gene/cli.py | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 51b94089..250b3787 100644 --- a/README.md +++ b/README.md @@ -73,15 +73,15 @@ Use the `gene_update` command in a shell to update the database. 
The normalizer currently pulls data from [HGNC](https://www.genenames.org/), [Ensembl](https://useast.ensembl.org/index.html), and [NCBI](https://www.ncbi.nlm.nih.gov/gene/). -To update one source, simply set `--source` to the source you wish to update. The normalizer will check to see if local source data is up-to-date, acquire the most recent data if not, and use it to populate the database. +To update one source, simply set `--sources` to the source you wish to update. The normalizer will check to see if local source data is up-to-date, acquire the most recent data if not, and use it to populate the database. For example, run the following to acquire the latest HGNC data if necessary, and update the HGNC gene records in the normalizer database: ```commandline -gene_update --source="hgnc" +gene_update --sources="hgnc" ``` -To update multiple sources, you can use the `--source` option with the source names separated by spaces. +To update multiple sources, you can use the `--sources` option with the source names separated by spaces. #### Update all sources diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index 44b30dad..f579df15 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -14,9 +14,9 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, use the ``--source`` option along with source name(s), quoted and separated by spaces. While it is possible to update individual source data without updating the normalized record data with ``--update_merged``, the normalization query endpoints may not function properly until normalized data is refreshed again. :: +To update specific sources, use the ``--sources`` option along with source name(s), quoted and separated by spaces. 
While it is possible to update individual source data without updating the normalized record data with ``--update_merged``, the normalization query endpoints may not function properly until normalized data is refreshed again. :: - gene_norm_update --source="HGNC NCBI" --update_merged + gene_norm_update --sources="HGNC NCBI" --update_merged Check DB health diff --git a/gene/cli.py b/gene/cli.py index 611667e6..6d4ea056 100644 --- a/gene/cli.py +++ b/gene/cli.py @@ -207,7 +207,7 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: @click.command() @click.option( - '--source', + '--sources', help="The source(s) you wish to update separated by spaces." ) @click.option( @@ -229,7 +229,7 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: is_flag=True, help='Update concepts for normalize endpoint from accepted sources.' ) -def update_normalizer_db(source: str, aws_instance: bool, db_url: str, +def update_normalizer_db(sources: str, aws_instance: bool, db_url: str, update_all: bool, update_merged: bool) -> None: """Update selected normalizer source(s) in the gene database. 
@@ -243,7 +243,7 @@ def update_normalizer_db(source: str, aws_instance: bool, db_url: str, if update_all: _update_normalizer(list(SourceName), db, update_merged) - elif not source: + elif not sources: if update_merged: _load_merge(db, set()) else: @@ -252,18 +252,18 @@ def update_normalizer_db(source: str, aws_instance: bool, db_url: str, click.echo(ctx.get_help()) ctx.exit() else: - sources = source.lower().split() + sources_split = sources.lower().split() - if len(sources) == 0: - raise Exception("Must enter a normalizer") + if len(sources_split) == 0: + raise Exception("Must enter 1 or more source names to update") - non_sources = set(sources) - set(SOURCES) + non_sources = set(sources_split) - set(SOURCES) if len(non_sources) != 0: raise Exception(f"Not valid source(s): {non_sources}") - sources_to_update = {SourceName(SOURCES[s]) for s in sources} - _update_normalizer(sources_to_update, db, update_merged) + parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split} + _update_normalizer(parsed_source_names, db, update_merged) if __name__ == '__main__': From 044135056078df916b2f08860d9872ab0374d57f Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 10:41:55 -0400 Subject: [PATCH 3/8] more fiddling --- docs/source/managing_data/loading_and_updating_data.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index f579df15..ce3918ff 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -6,7 +6,7 @@ See the :ref:`ETL API documentation` for information on programmatic ac Full load/reload ---------------- -Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database: :: 
+Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records): :: gene_norm_update --update_all --update_merged @@ -14,7 +14,7 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, use the ``--sources`` option along with source name(s), quoted and separated by spaces. While it is possible to update individual source data without updating the normalized record data with ``--update_merged``, the normalization query endpoints may not function properly until normalized data is refreshed again. :: +To update specific sources, call the ``--sources`` option with source name(s) separated by spaces, surrounded by quotes. While it is possible to update individual source data without updating the normalized record data, the normalization query endpoints may not function properly until normalized data is refreshed again, so the `--update_merged` flag is recommended. 
:: gene_norm_update --sources="HGNC NCBI" --update_merged From bddfc5d382d9ef9b457456bab9d38ae80fa9f1af Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 10:42:50 -0400 Subject: [PATCH 4/8] fiddling --- docs/source/managing_data/loading_and_updating_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index ce3918ff..82a13179 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -14,7 +14,7 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, call the ``--sources`` option with source name(s) separated by spaces, surrounded by quotes. While it is possible to update individual source data without updating the normalized record data, the normalization query endpoints may not function properly until normalized data is refreshed again, so the `--update_merged` flag is recommended. :: +To update specific sources, call the ``--sources`` option with source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the `--update_merged` flag as well. 
gene_norm_update --sources="HGNC NCBI" --update_merged From a8857793d2ac4f8de5551dee676ade665a60da9b Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 10:47:37 -0400 Subject: [PATCH 5/8] more fiddling --- docs/source/managing_data/loading_and_updating_data.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index 82a13179..5666b5ea 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -14,7 +14,7 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, call the ``--sources`` option with source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the `--update_merged` flag as well. +To update specific sources, call the ``--sources`` option with one or more source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--update_merged`` flag as well. gene_norm_update --sources="HGNC NCBI" --update_merged @@ -22,7 +22,7 @@ To update specific sources, call the ``--sources`` option with source name(s) qu Check DB health --------------- -The shell command `gene_norm_check_db` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result. 
:: +The shell command ``gene_norm_check_db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result. :: % gene_norm_check_db % echo $? From 82a4f76de407ae24332471156bb7f860155890c3 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 10:47:59 -0400 Subject: [PATCH 6/8] more fiddling --- docs/source/managing_data/loading_and_updating_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index 5666b5ea..f5c1f6ca 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -14,7 +14,7 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, call the ``--sources`` option with one or more source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--update_merged`` flag as well. +To update specific sources, call the ``--sources`` option with one or more source name(s) quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, that may affect the proper function of the normalized query endpoints, so it is recommended to include the ``--update_merged`` flag as well. 
:: gene_norm_update --sources="HGNC NCBI" --update_merged From 5c8a5e2fefad17dd513572a3675ed599ef8ab7df Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 10:49:57 -0400 Subject: [PATCH 7/8] f --- gene/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gene/cli.py b/gene/cli.py index 6d4ea056..728cfdf2 100644 --- a/gene/cli.py +++ b/gene/cli.py @@ -107,7 +107,7 @@ def _update_normalizer( ) -> None: """Update selected normalizer sources. - :param sources: names of sources to update + :param source: names of sources to update :param db: database instance :param update_merged: if true, retain processed records to use in updating merged records From 3d38effa6ed4ecf84652424e135554b27ec0d0a3 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 9 May 2023 10:52:36 -0400 Subject: [PATCH 8/8] check docstrings --- gene/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gene/cli.py b/gene/cli.py index 728cfdf2..93808832 100644 --- a/gene/cli.py +++ b/gene/cli.py @@ -107,7 +107,7 @@ def _update_normalizer( ) -> None: """Update selected normalizer sources. - :param source: names of sources to update + :param sources: names of sources to update :param db: database instance :param update_merged: if true, retain processed records to use in updating merged records @@ -233,7 +233,7 @@ def update_normalizer_db(sources: str, aws_instance: bool, db_url: str, update_all: bool, update_merged: bool) -> None: """Update selected normalizer source(s) in the gene database. - :param source: names of sources to update, comma-separated + :param sources: names of sources to update, space-separated :param aws_instance: if true, use cloud instance :param db_url: URI pointing to database :param update_all: if true, update all sources (ignore `normalizer` parameter)