diff --git a/README.md b/README.md index 0c457200..250b3787 100644 --- a/README.md +++ b/README.md @@ -73,15 +73,15 @@ Use the `gene_update` command in a shell to update the database. The normalizer currently pulls data from [HGNC](https://www.genenames.org/), [Ensembl](https://useast.ensembl.org/index.html), and [NCBI](https://www.ncbi.nlm.nih.gov/gene/). -To update one source, simply set `--normalizer` to the source you wish to update. The normalizer will check to see if local source data is up-to-date, acquire the most recent data if not, and use it to populate the database. +To update one source, simply set `--sources` to the source you wish to update. The normalizer will check to see if local source data is up-to-date, acquire the most recent data if not, and use it to populate the database. For example, run the following to acquire the latest HGNC data if necessary, and update the HGNC gene records in the normalizer database: ```commandline -gene_update --normalizer="hgnc" +gene_update --sources="hgnc" ``` -To update multiple sources, you can use the `--normalizer` option with the source names separated by spaces. +To update multiple sources, you can use the `--sources` option with the source names separated by spaces. 
#### Update all sources diff --git a/docs/source/managing_data/loading_and_updating_data.rst b/docs/source/managing_data/loading_and_updating_data.rst index 01d59ea6..f5c1f6ca 100644 --- a/docs/source/managing_data/loading_and_updating_data.rst +++ b/docs/source/managing_data/loading_and_updating_data.rst @@ -6,7 +6,7 @@ See the :ref:`ETL API documentation` for information on programmatic ac Full load/reload ---------------- -Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database: :: +Calling the Gene Normalizer update command with the ``--update_all`` and ``--update_merged`` flags will delete all existing data, fetch new source data if available, and then perform a complete reload of the database (including merged records): :: gene_norm_update --update_all --update_merged @@ -14,14 +14,15 @@ Calling the Gene Normalizer update command with the ``--update_all`` and ``--upd Reload individual source ------------------------ -To update specific sources, use the ``--normalizer`` option along with source name(s), quoted and separated by spaces. While it is possible to update individual source data without updating the normalized record data with ``--update_merged``, the normalization query endpoints may not function properly until normalized data is refreshed again. :: +To update specific sources, use the ``--sources`` option with one or more source names, quoted and separated by spaces. While it is possible to update individual source data without also updating the normalized record data, doing so may keep the normalization query endpoints from functioning properly, so it is recommended to include the ``--update_merged`` flag as well. 
:: + + gene_norm_update --sources="HGNC NCBI" --update_merged - gene_norm_update --normalizer="HGNC NCBI" --update_merged Check DB health --------------- -The shell command `gene_norm_check_db` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result. +The shell command ``gene_norm_check_db`` performs a basic check on the database status. It first confirms that the database's schema exists, and then identifies whether metadata is available for each source, and whether gene record and normalized concept tables are non-empty. Check the process's exit code for the result. :: % gene_norm_check_db % echo $? diff --git a/gene/cli.py b/gene/cli.py index b4abfa2c..93808832 100644 --- a/gene/cli.py +++ b/gene/cli.py @@ -102,18 +102,18 @@ def dump_database(output_directory: Path, db_url: str): click.get_current_context().exit(1) -def _update_normalizers( - normalizers: Collection[SourceName], db: AbstractDatabase, update_merged: bool +def _update_normalizer( + sources: Collection[SourceName], db: AbstractDatabase, update_merged: bool ) -> None: """Update selected normalizer sources. - :param normalizers: names of sources to update + :param sources: names of sources to update :param db: database instance :param update_merged: if true, retain processed records to use in updating merged records """ processed_ids = list() - for n in normalizers: + for n in sources: delete_time = _delete_source(n, db) _load_source(n, db, delete_time, processed_ids) @@ -207,8 +207,8 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: @click.command() @click.option( - '--normalizer', - help="The normalizer(s) you wish to update separated by spaces." + '--sources', + help="The source(s) you wish to update separated by spaces." 
) @click.option( '--aws_instance', @@ -229,11 +229,11 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: is_flag=True, help='Update concepts for normalize endpoint from accepted sources.' ) -def update_normalizer_db(normalizer: str, aws_instance: bool, db_url: str, +def update_normalizer_db(sources: str, aws_instance: bool, db_url: str, update_all: bool, update_merged: bool) -> None: """Update selected normalizer source(s) in the gene database. - :param normalizer: names of sources to update, comma-separated + :param sources: names of sources to update, space-separated :param aws_instance: if true, use cloud instance :param db_url: URI pointing to database :param update_all: if true, update all sources (ignore `normalizer` parameter) @@ -242,8 +242,8 @@ def update_normalizer_db(normalizer: str, aws_instance: bool, db_url: str, db = create_db(db_url, aws_instance) if update_all: - _update_normalizers(list(SourceName), db, update_merged) - elif not normalizer: + _update_normalizer(list(SourceName), db, update_merged) + elif not sources: if update_merged: _load_merge(db, set()) else: @@ -252,18 +252,18 @@ def update_normalizer_db(normalizer: str, aws_instance: bool, db_url: str, click.echo(ctx.get_help()) ctx.exit() else: - normalizers = normalizer.lower().split() + sources_split = sources.lower().split() - if len(normalizers) == 0: - raise Exception("Must enter a normalizer") + if len(sources_split) == 0: + raise Exception("Must enter 1 or more source names to update") - non_sources = set(normalizers) - set(SOURCES) + non_sources = set(sources_split) - set(SOURCES) if len(non_sources) != 0: raise Exception(f"Not valid source(s): {non_sources}") - sources_to_update = {SourceName(SOURCES[s]) for s in normalizers} - _update_normalizers(sources_to_update, db, update_merged) + parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split} + _update_normalizer(parsed_source_names, db, update_merged) if __name__ == '__main__':