diff --git a/README.md b/README.md index 4865b52..4532fee 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ![Static Badge](https://img.shields.io/badge/Elix%2C%20Inc.-%235EB6B3?style=flat) ![Static Badge](https://img.shields.io/badge/Institute%20of%20Science%20Tokyo-%231C3177?style=flat) -Welcome to the computer-assisted chemical synthesis data source project !!! +Welcome to the **computer-assisted chemical synthesis data source** project !!! Over the last decade, computer-assisted chemical synthesis has re-emerged as a heavily researched subject in Chemoinformatics. Even though the idea of utilizing computers to assist chemical synthesis has existed for nearly as @@ -222,8 +222,8 @@ The following miscellaneous chemical reaction rule data sources are supported: ## Data -The purpose of the [data](/data) directory is to archive the datasets that are hosted on [GitHub](https://github.com) -and [CodeOcean](https://codeocean.com) repositories. +The purpose of the [data](/data) directory is to archive the data sources that are hosted on +[GitHub](https://github.com) and [CodeOcean](https://codeocean.com) repositories. ## License Information diff --git a/data/compound/miscellaneous_v_moses_by_20201218_polykovskiy_d_et_al/dataset_v1.csv b/data/compound/zinc_v_moses_by_20201218_polykovskiy_d_et_al/dataset_v1.csv similarity index 100% rename from data/compound/miscellaneous_v_moses_by_20201218_polykovskiy_d_et_al/dataset_v1.csv rename to data/compound/zinc_v_moses_by_20201218_polykovskiy_d_et_al/dataset_v1.csv diff --git a/data_source/reaction/uspto/uspto.py b/data_source/reaction/uspto/uspto.py index 29958b0..138ce33 100644 --- a/data_source/reaction/uspto/uspto.py +++ b/data_source/reaction/uspto/uspto.py @@ -56,7 +56,7 @@ def get_supported_versions( "v_480k_or_mit_by_20171204_jin_w_et_al": "https://doi.org/10.48550/arXiv.1709.04555", "v_480k_or_mit_by_20180622_schwaller_p_et_al": "https://doi.org/10.1039/C8SC02339E", "v_stereo_by_20180622_schwaller_p_et_al": "https://doi.org/10.1039/C8SC02339E", - "v_lef_by_20181221_bradshaw_j_et_al": "https://openreview.net/forum?id=r1x4BnCqKX", + "v_lef_by_20181221_bradshaw_j_et_al": "https://doi.org/10.48550/arXiv.1805.10970", "v_1k_tpl_by_20210128_schwaller_p_et_al": "https://doi.org/10.1038/s42256-020-00284-w", "v_1976_to_2016_remapped_by_20210407_schwaller_p_et_al": "https://doi.org/10.1126/sciadv.abe4166", "v_1976_to_2016_remapped_by_20240313_chen_s_et_al": "https://doi.org/10.6084/m9.figshare.25046471.v1", @@ -97,9 +97,7 @@ def download( ) ) - if version in [ - "v_1976_to_2013_rsmi_by_20121009_lowe_d_m", - ]: + if version == "v_1976_to_2013_rsmi_by_20121009_lowe_d_m": USPTOReactionDatasetDownloadUtility.download_v_1976_to_2013_by_20121009_lowe_d_m( version=version, output_directory_path=output_directory_path @@ -230,9 +228,7 @@ def extract( ) ) - if version in [ - "v_1976_to_2013_rsmi_by_20121009_lowe_d_m", - ]: + if version == "v_1976_to_2013_rsmi_by_20121009_lowe_d_m": USPTOReactionDatasetExtractionUtility.extract_v_1976_to_2013_by_20121009_lowe_d_m( version=version, input_directory_path=input_directory_path, diff --git a/scripts/download_extract_and_format_data.py b/scripts/download_extract_and_format_data.py index de6f90d..9ef478f 100644 --- a/scripts/download_extract_and_format_data.py +++ b/scripts/download_extract_and_format_data.py @@ -128,78 +128,74 @@ def get_script_logger() -> Logger: if __name__ == "__main__": script_logger = get_script_logger() - try: - script_arguments = get_script_arguments() - - if script_arguments.data_source_category == "compound": - data_source = CompoundDataSource( - logger=script_logger - ) - - elif script_arguments.data_source_category == "reaction": - data_source = ReactionDataSource( - logger=script_logger - ) - - elif script_arguments.data_source_category == "reaction_rule": - data_source = ReactionRuleDataSource( - logger=script_logger + script_arguments = get_script_arguments() + + if script_arguments.data_source_category == "compound": + data_source = CompoundDataSource( + logger=script_logger + ) + + elif script_arguments.data_source_category == "reaction": + data_source = ReactionDataSource( + logger=script_logger + ) + + elif script_arguments.data_source_category == "reaction_rule": + data_source = ReactionRuleDataSource( + logger=script_logger + ) + + else: + raise ValueError( + "The data source category '{category:s}' is not supported.".format( + category=script_arguments.data_source_category ) - - else: - raise ValueError( - "The data source category '{category:s}' is not supported.".format( - category=script_arguments.data_source_category + ) + + if script_arguments.get_data_source_name_information: + print(script_arguments.data_source_category) + print(data_source.get_names_of_supported_data_sources()) + + elif script_arguments.get_data_source_version_information: + print(script_arguments.data_source_category) + print(script_arguments.data_source_name) + print(data_source.get_supported_versions( + name=script_arguments.data_source_name + )) + + else: + temporary_output_directory_path = Path( + script_arguments.output_directory_path, + "{timestamp:s}_temporary_output_directory".format( + timestamp=datetime.now().strftime( + format="%Y%m%d%H%M%S" ) ) - - if script_arguments.get_data_source_name_information: - print(script_arguments.data_source_category) - print(data_source.get_names_of_supported_data_sources()) - - elif script_arguments.get_data_source_version_information: - print(script_arguments.data_source_category) - print(script_arguments.data_source_name) - print(data_source.get_supported_versions( - name=script_arguments.data_source_name - )) - - else: - temporary_output_directory_path = Path( - script_arguments.output_directory_path, - "{timestamp:s}_temporary_output_directory".format( - timestamp=datetime.now().strftime( - format="%Y%m%d%H%M%S" - ) - ) - ) - - temporary_output_directory_path.mkdir() - - data_source.download( - name=script_arguments.data_source_name, - version=script_arguments.data_source_version, - output_directory_path=temporary_output_directory_path - ) - - data_source.extract( - name=script_arguments.data_source_name, - version=script_arguments.data_source_version, - input_directory_path=temporary_output_directory_path, - output_directory_path=temporary_output_directory_path - ) - - data_source.format( - name=script_arguments.data_source_name, - version=script_arguments.data_source_version, - input_directory_path=temporary_output_directory_path, - output_directory_path=script_arguments.output_directory_path, - number_of_processes=script_arguments.number_of_processes - ) - - rmtree( - path=temporary_output_directory_path - ) - - except: - raise + ) + + temporary_output_directory_path.mkdir() + + data_source.download( + name=script_arguments.data_source_name, + version=script_arguments.data_source_version, + output_directory_path=temporary_output_directory_path + ) + + data_source.extract( + name=script_arguments.data_source_name, + version=script_arguments.data_source_version, + input_directory_path=temporary_output_directory_path, + output_directory_path=temporary_output_directory_path + ) + + data_source.format( + name=script_arguments.data_source_name, + version=script_arguments.data_source_version, + input_directory_path=temporary_output_directory_path, + output_directory_path=script_arguments.output_directory_path, + number_of_processes=script_arguments.number_of_processes + ) + + rmtree( + path=temporary_output_directory_path + ) diff --git a/setup.cfg b/setup.cfg index 34264c3..ba06543 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,7 +21,7 @@ keywords = retro-rules, rhea, uspto, - zinc20 + zinc [options] packages = find: