From 594a5ac0c64e594b874da1beee8dd2482480f52e Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Tue, 15 Oct 2024 14:22:45 +0100 Subject: [PATCH 01/36] check for invalid language and data type QIDs --- .../check/check_query_identifiers.py | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 src/scribe_data/check/check_query_identifiers.py diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py new file mode 100644 index 000000000..b379a5c86 --- /dev/null +++ b/src/scribe_data/check/check_query_identifiers.py @@ -0,0 +1,99 @@ +import re +from pathlib import Path + +from scribe_data.cli.cli_utils import ( + LANGUAGE_DATA_EXTRACTION_DIR, + language_metadata, + data_type_metadata, +) + + +def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: + """ + Extract the QID based on the pattern provided (either language or data type). + """ + try: + with open(file_path, "r", encoding="utf-8") as file: + content = file.read() + match = re.search(pattern, content) + if match: + return match.group(0).replace("wd:", "") + except Exception as e: + print(f"Error reading {file_path}: {e}") + return None + + +def check_queries(): + language_pattern = r"\?lexeme dct:language wd:Q\d+" + data_type_pattern = r"wikibase:lexicalCategory wd:Q\d+" + incorrect_languages = [] + incorrect_data_types = [] + + language_extraction_dir = LANGUAGE_DATA_EXTRACTION_DIR + for query_file in language_extraction_dir.glob("**/*.sparql"): + lang_qid = extract_qid_from_sparql(query_file, language_pattern) + data_type_qid = extract_qid_from_sparql(query_file, data_type_pattern) + + # Validate language QID and data type QID + if not is_valid_language(query_file, lang_qid): + incorrect_languages.append(query_file) + if not is_valid_data_type(query_file, data_type_qid): + incorrect_data_types.append(query_file) + + if incorrect_languages: + print("Queries with incorrect languages QIDs are:") + for file in 
incorrect_languages: + print(f"- {file}") + + if incorrect_data_types: + print("Queries with incorrect data type QIDs are:") + for file in incorrect_data_types: + print(f"- {file}") + + +def is_valid_language(query_file, lang_qid): + lang_directory_name = query_file.parent.parent.name.lower() + languages = language_metadata.get( + "languages" + ) # might not work since language_metadata file is not fully updated + language_entry = next( + (lang for lang in languages if lang["language"] == lang_directory_name), None + ) + + if not language_entry: + print( + f"Warning: Language '{lang_directory_name}' not found in language_metadata.json." + ) + return False + + expected_language_qid = language_entry["qid"] + print("Expected language QID:", expected_language_qid) + + if lang_qid != expected_language_qid: + print( + f"Incorrect language QID in {lang_directory_name}. " + f"Found: {lang_qid}, Expected: {expected_language_qid}" + ) + return False + return True + + +def is_valid_data_type(query_file, data_type_qid): + directory_name = query_file.parent.name # e.g., "nouns" or "verbs" + expected_data_type_qid = data_type_metadata.get(directory_name) + + if data_type_qid != expected_data_type_qid: + print( + f"Warning: Incorrect data type QID in {query_file}. 
Found: {data_type_qid}, Expected: {expected_data_type_qid}" + ) + return False + return True + + +# Examples: + +# file_path = Path("French/verbs/query_verbs.sparql") +# print(is_valid_data_type(file_path, "QW24907")) # check for data type +# print(is_valid_language(file_path, "Q150")) # check for if valid language + +check_queries() From defab4d33374bf47ee3b63ce335f14d29c06f5bc Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Tue, 15 Oct 2024 12:54:33 +0100 Subject: [PATCH 02/36] Create query_adverbs.sparql adverbs for yoruba --- .../Yoruba/adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql new file mode 100644 index 000000000..38387bde2 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Yoruba (Q34311) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT DISTINCT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q34311 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From 662a0f6f4be9a33d433a964d375cd4b11b7f70cc Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 18:16:01 +0200 Subject: [PATCH 03/36] Remove select distinct from all queries --- .../Arabic/adjectives/query_adjectives.sparql | 2 +- .../language_data_extraction/Arabic/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Basque/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Basque/verbs/query_verbs.sparql | 2 +- .../language_data_extraction/Bengali/nouns/query_nouns.sparql | 2 +- .../Chinese/Mandarin/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Czech/nouns/query_nouns.sparql | 2 +- .../Danish/adjectives/query_adjectives_1.sparql | 2 +- .../Danish/adjectives/query_adjectives_2.sparql | 2 +- .../Danish/adjectives/query_adjectives_3.sparql | 2 +- .../Danish/adverbs/query_adverbs.sparql | 2 +- .../language_data_extraction/Danish/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/English/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/English/verbs/query_verbs.sparql | 2 +- .../Esperanto/adjectives/query_adjectives.sparql | 2 +- .../Esperanto/adverbs/query_adverbs.sparql | 2 +- .../language_data_extraction/Esperanto/nouns/query_nouns.sparql | 2 +- .../Esperanto/personal_pronouns/query_personal_pronouns.sparql | 2 +- .../language_data_extraction/Esperanto/verbs/query_verbs.sparql | 2 +- .../Estonian/adverbs/query_adverbs_1.sparql | 2 +- .../Estonian/adverbs/query_adverbs_2.sparql | 2 +- .../language_data_extraction/Estonian/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Estonian/verbs/query_verbs.sparql | 2 +- .../language_data_extraction/Finnish/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/French/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/German/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/German/verbs/query_verbs_1.sparql | 2 +- .../language_data_extraction/German/verbs/query_verbs_2.sparql | 2 +- 
.../language_data_extraction/Greek/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Greek/verbs/query_verbs.sparql | 2 +- .../language_data_extraction/Hausa/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Hebrew/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Hebrew/verbs/query_verbs_1.sparql | 2 +- .../language_data_extraction/Hebrew/verbs/query_verbs_2.sparql | 2 +- .../language_data_extraction/Hebrew/verbs/query_verbs_3.sparql | 2 +- .../language_data_extraction/Hebrew/verbs/query_verbs_4.sparql | 2 +- .../Hindustani/Hindi/adjectives/query_adjectives.sparql | 2 +- .../Hindustani/Hindi/adverbs/query_adverbs.sparql | 2 +- .../Hindustani/Hindi/nouns/query_nouns.sparql | 2 +- .../Hindustani/Hindi/postpositions/query_postpositions.sparql | 2 +- .../Hindustani/Hindi/prepositions/query_prepositions.sparql | 2 +- .../Hindustani/Hindi/verbs/query_verbs.sparql | 2 +- .../Hindustani/Urdu/adjectives/query_adjectives.sparql | 2 +- .../Hindustani/Urdu/adverbs/query_adverbs.sparql | 2 +- .../Hindustani/Urdu/nouns/query_nouns.sparql | 2 +- .../Hindustani/Urdu/postpositions/query_postpositions.sparql | 2 +- .../Hindustani/Urdu/prepositions/query_prepositions.sparql | 2 +- .../Hindustani/Urdu/verbs/query_verbs.sparql | 2 +- .../Indonesian/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Italian/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Japanese/nouns/query_nouns.sparql | 2 +- .../Korean/adverbs/query_adverbs.sparql | 2 +- .../Korean/postposition/query_postpositions.sparql | 2 +- .../language_data_extraction/Kurmanji/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Malay/nouns/query_nouns.sparql | 2 +- .../Malayalam/adjectives/query_adjectives.sparql | 2 +- .../Malayalam/adverbs/query_adverbs.sparql | 2 +- .../language_data_extraction/Malayalam/nouns/query_nouns.sparql | 2 +- .../Malayalam/prepositions/query_prepositions.sparql | 2 +- .../language_data_extraction/Malayalam/verbs/query_verbs.sparql | 2 
+- .../Norwegian/Bokm\303\245l/nouns/query_nouns.sparql" | 2 +- .../Norwegian/Bokm\303\245l/verbs/query_verbs.sparql" | 2 +- .../Norwegian/Nynorsk/nouns/query_nouns.sparql | 2 +- .../Norwegian/Nynorsk/verbs/query_verbs.sparql | 2 +- .../Pidgin/Nigerian/adverbs/query_adverbs.sparql | 2 +- .../Pidgin/Nigerian/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Polish/nouns/query_nouns.sparql | 2 +- .../Portuguese/nouns/query_nouns.sparql | 2 +- .../Portuguese/verbs/query_verbs.sparql | 2 +- .../Punjabi/Gurmukhi/nouns/query_nouns.sparql | 2 +- .../Punjabi/Gurmukhi/verbs/query_verbs.sparql | 2 +- .../Punjabi/Shahmukhi/nouns/query_nouns.sparql | 2 +- .../Punjabi/Shahmukhi/verbs/query_verbs.sparql | 2 +- .../language_data_extraction/Russian/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Russian/verbs/query_verbs.sparql | 2 +- .../Slovak/adverbs/query_adverbs.sparql | 2 +- .../language_data_extraction/Slovak/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Spanish/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Swahili/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Swedish/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Tajik/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Tamil/nouns/query_nouns.sparql | 2 +- .../language_data_extraction/Ukrainian/nouns/query_nouns.sparql | 2 +- .../Yoruba/adverbs/query_adverbs.sparql | 2 +- .../language_data_extraction/Yoruba/nouns/query_nouns.sparql | 2 +- 85 files changed, 85 insertions(+), 85 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Arabic/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Arabic/adjectives/query_adjectives.sparql index ae7b2b1a4..0f9851c8d 100644 --- a/src/scribe_data/language_data_extraction/Arabic/adjectives/query_adjectives.sparql +++ b/src/scribe_data/language_data_extraction/Arabic/adjectives/query_adjectives.sparql @@ -2,7 +2,7 @@ # All Arabic (Q13955) adjectives. 
# Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective ?femSingularNominativeIndef diff --git a/src/scribe_data/language_data_extraction/Arabic/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Arabic/nouns/query_nouns.sparql index e18ebcd83..56e8b42c1 100644 --- a/src/scribe_data/language_data_extraction/Arabic/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Arabic/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Arabic (Q13955) nouns. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun diff --git a/src/scribe_data/language_data_extraction/Basque/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Basque/nouns/query_nouns.sparql index 198959aec..19314c81a 100644 --- a/src/scribe_data/language_data_extraction/Basque/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Basque/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Basque (Q8752) nouns and all implemented singular and plural forms. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?absIndefinite ?absSingular diff --git a/src/scribe_data/language_data_extraction/Basque/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Basque/verbs/query_verbs.sparql index 5c699ce2c..4bb8792b2 100644 --- a/src/scribe_data/language_data_extraction/Basque/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Basque/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Basque (Q8752) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?future diff --git a/src/scribe_data/language_data_extraction/Bengali/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Bengali/nouns/query_nouns.sparql index c8a1a1dd5..dc36759e7 100644 --- a/src/scribe_data/language_data_extraction/Bengali/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Bengali/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Bengali (Bangla Q9610) nouns and their forms in the various cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nominative ?genitive diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/nouns/query_nouns.sparql index 1d672c3c2..f3badc28e 100644 --- a/src/scribe_data/language_data_extraction/Chinese/Mandarin/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Standard Mandarin Chinese (Q727694) nouns. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun diff --git a/src/scribe_data/language_data_extraction/Czech/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Czech/nouns/query_nouns.sparql index d8456e98b..77cbb579d 100644 --- a/src/scribe_data/language_data_extraction/Czech/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Czech/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Czeck (Q9056) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_1.sparql b/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_1.sparql index f08070804..e310ea383 100644 --- a/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_1.sparql +++ b/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_1.sparql @@ -2,7 +2,7 @@ # All Danish (Q9035) adjectives and some of the available forms. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective ?commonSingularIndefinite diff --git a/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_2.sparql b/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_2.sparql index b4eb71462..508b65120 100644 --- a/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_2.sparql +++ b/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_2.sparql @@ -2,7 +2,7 @@ # All Danish (Q9035) adjectives and some of the available forms. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective ?pluralPositive diff --git a/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_3.sparql b/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_3.sparql index 6d283ead8..eddb0dacb 100644 --- a/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_3.sparql +++ b/src/scribe_data/language_data_extraction/Danish/adjectives/query_adjectives_3.sparql @@ -2,7 +2,7 @@ # All Danish (Q9035) adjectives and some of the available forms. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective ?indefiniteSuperlative diff --git a/src/scribe_data/language_data_extraction/Danish/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Danish/adverbs/query_adverbs.sparql index 51bb8caae..177735c7a 100644 --- a/src/scribe_data/language_data_extraction/Danish/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Danish/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Danish (Q9035) adverbs. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Danish/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Danish/nouns/query_nouns.sparql index f93c9715f..dae3b9b66 100644 --- a/src/scribe_data/language_data_extraction/Danish/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Danish/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Danish (Q9035) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/English/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/English/nouns/query_nouns.sparql index a740d1d8f..e60883fbe 100644 --- a/src/scribe_data/language_data_extraction/English/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/English/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All English (Q1860) nouns and their plural. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/English/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/English/verbs/query_verbs.sparql index 15228d792..ee09c6f00 100644 --- a/src/scribe_data/language_data_extraction/English/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/English/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All English (Q1860) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?presSimp diff --git a/src/scribe_data/language_data_extraction/Esperanto/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Esperanto/adjectives/query_adjectives.sparql index ec51925c5..e85a304cb 100644 --- a/src/scribe_data/language_data_extraction/Esperanto/adjectives/query_adjectives.sparql +++ b/src/scribe_data/language_data_extraction/Esperanto/adjectives/query_adjectives.sparql @@ -2,7 +2,7 @@ # All Esperanto (Q143) adjectives. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective diff --git a/src/scribe_data/language_data_extraction/Esperanto/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Esperanto/adverbs/query_adverbs.sparql index 27892bf6b..41367afa6 100644 --- a/src/scribe_data/language_data_extraction/Esperanto/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Esperanto/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Esperanto (Q143) adverbs. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Esperanto/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Esperanto/nouns/query_nouns.sparql index 4074b4280..9271cdfbe 100644 --- a/src/scribe_data/language_data_extraction/Esperanto/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Esperanto/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Esperanto (Q143) nouns and their plurals for the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?accSingular diff --git a/src/scribe_data/language_data_extraction/Esperanto/personal_pronouns/query_personal_pronouns.sparql b/src/scribe_data/language_data_extraction/Esperanto/personal_pronouns/query_personal_pronouns.sparql index 007f374ad..e4d9281f7 100644 --- a/src/scribe_data/language_data_extraction/Esperanto/personal_pronouns/query_personal_pronouns.sparql +++ b/src/scribe_data/language_data_extraction/Esperanto/personal_pronouns/query_personal_pronouns.sparql @@ -2,7 +2,7 @@ # All Esperanto (Q143) personal pronouns. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?personalPronouns diff --git a/src/scribe_data/language_data_extraction/Esperanto/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Esperanto/verbs/query_verbs.sparql index e08cf8748..074006a84 100644 --- a/src/scribe_data/language_data_extraction/Esperanto/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Esperanto/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Esperanto (Q143) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?presIndicative diff --git a/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_1.sparql b/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_1.sparql index 905f25945..a251d58c1 100644 --- a/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_1.sparql +++ b/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_1.sparql @@ -2,7 +2,7 @@ # All Estonian (Q380057) adverbs and the corresponding forms per case. # Enter this query at https://query.wikidata.org/ -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb ?nominativeSingular diff --git a/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_2.sparql b/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_2.sparql index 4aa682c1e..48119a3b5 100644 --- a/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_2.sparql +++ b/src/scribe_data/language_data_extraction/Estonian/adverbs/query_adverbs_2.sparql @@ -2,7 +2,7 @@ # All Estonian (Q380057) adverbs and the corresponding forms per case. # Enter this query at https://query.wikidata.org/ -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb ?adessiveSingular diff --git a/src/scribe_data/language_data_extraction/Estonian/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Estonian/nouns/query_nouns.sparql index e46a8e378..011f0b946 100644 --- a/src/scribe_data/language_data_extraction/Estonian/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Estonian/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Estonian (Q9072) nouns and their plural. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Estonian/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Estonian/verbs/query_verbs.sparql index 7db9134ab..933685fc2 100644 --- a/src/scribe_data/language_data_extraction/Estonian/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Estonian/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Estonian (Q9072) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/ -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?verb diff --git a/src/scribe_data/language_data_extraction/Finnish/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Finnish/nouns/query_nouns.sparql index 4e781d3b4..f11c4a097 100644 --- a/src/scribe_data/language_data_extraction/Finnish/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Finnish/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Finnish (Q1412) nouns and their plural for the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/French/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/French/nouns/query_nouns.sparql index 12a992c69..32653659a 100644 --- a/src/scribe_data/language_data_extraction/French/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/French/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All French (Q150) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/German/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/German/nouns/query_nouns.sparql index 9c835843d..bda5d2b30 100644 --- a/src/scribe_data/language_data_extraction/German/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/German/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All German (Q188) nouns, their plurals and their genders in the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/German/verbs/query_verbs_1.sparql b/src/scribe_data/language_data_extraction/German/verbs/query_verbs_1.sparql index c8b64df6b..e255fb7bc 100644 --- a/src/scribe_data/language_data_extraction/German/verbs/query_verbs_1.sparql +++ b/src/scribe_data/language_data_extraction/German/verbs/query_verbs_1.sparql @@ -2,7 +2,7 @@ # All German (Q188) verbs and a portion of the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. -# Not SELECT DISTINCT as we want to get verbs with both sein and haben as auxiliaries +# Not SELECT DISTINCT as we want to get verbs with both sein and haben as auxiliaries SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive diff --git a/src/scribe_data/language_data_extraction/German/verbs/query_verbs_2.sparql b/src/scribe_data/language_data_extraction/German/verbs/query_verbs_2.sparql index 98d6b718b..e209dcc48 100644 --- a/src/scribe_data/language_data_extraction/German/verbs/query_verbs_2.sparql +++ b/src/scribe_data/language_data_extraction/German/verbs/query_verbs_2.sparql @@ -2,7 +2,7 @@ # All German (Q188) verbs and a portion of the currently implemented tenses for each.
# Enter this query at https://query.wikidata.org/. -# Not SELECT DISTINCT as we want to get verbs with both sein and haben as auxiliaries +# Not SELECT DISTINCT as we want to get verbs with both sein and haben as auxiliaries SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?pastParticiple ?auxiliaryVerb diff --git a/src/scribe_data/language_data_extraction/Greek/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Greek/nouns/query_nouns.sparql index f1e2d8a01..ca48a52ff 100644 --- a/src/scribe_data/language_data_extraction/Greek/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Greek/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Greek (Q36510) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Greek/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Greek/verbs/query_verbs.sparql index 51811421a..ea669818d 100644 --- a/src/scribe_data/language_data_extraction/Greek/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Greek/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Greek (Q36510) verb snd the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?presFPS ?presSPS ?presTPS diff --git a/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql index 6734402bd..84800a22e 100644 --- a/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Hausa (Q56475) nouns and their gender.
# Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Hebrew/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Hebrew/nouns/query_nouns.sparql index 41773856c..093cea32a 100644 --- a/src/scribe_data/language_data_extraction/Hebrew/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Hebrew/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Hebrew (Q9288) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun ?plural diff --git a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_1.sparql b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_1.sparql index d922b978a..14c361444 100644 --- a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_1.sparql +++ b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_1.sparql @@ -2,7 +2,7 @@ # All Hebrew (Q9288) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?presSF ?presSM ?presPF ?presPM diff --git a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_2.sparql b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_2.sparql index bb4793004..fefb8547d 100644 --- a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_2.sparql +++ b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_2.sparql @@ -2,7 +2,7 @@ # All Hebrew (Q9288) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?impSPSF ?impSPSM ?impSPPF ?impSPPM diff --git a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_3.sparql b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_3.sparql index b39eea963..e38e2025d 100644 --- a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_3.sparql +++ b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_3.sparql @@ -2,7 +2,7 @@ # All Hebrew (Q9288) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?pastFPS ?pastSPSF ?pastSPSM ?pastTPSF ?pastTPSM ?pastFPP ?pastSPPF ?pastSPPM ?pastTPPF ?pastTPPM diff --git a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_4.sparql b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_4.sparql index c17d4198a..02c3d9ad4 100644 --- a/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_4.sparql +++ b/src/scribe_data/language_data_extraction/Hebrew/verbs/query_verbs_4.sparql @@ -2,7 +2,7 @@ # All Hebrew (Q9288) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?futFPS ?futSPSF ?futSPSM ?futTPSF ?futTPSM ?futFPP ?futSPPF ?futSPPM ?futTPPF ?futTPPM diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/adjectives/query_adjectives.sparql index 166de38d2..b1bba3c61 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/adjectives/query_adjectives.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/adjectives/query_adjectives.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective ?singulativeNumeral diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/adverbs/query_adverbs.sparql index 68fc55632..7e8ec4c66 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/adverbs/query_adverbs.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/nouns/query_nouns.sparql index de8590d05..5d315392b 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/nouns/query_nouns.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/postpositions/query_postpositions.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/postpositions/query_postpositions.sparql index dde9fb0ac..e026332f1 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/postpositions/query_postpositions.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/postpositions/query_postpositions.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. -SELECT DISTINCT +SELECT ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?postposition diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/prepositions/query_prepositions.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/prepositions/query_prepositions.sparql index 3951f263c..d0e47bb32 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/prepositions/query_prepositions.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/prepositions/query_prepositions.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. -SELECT DISTINCT +SELECT ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/verbs/query_verbs.sparql index 984121e97..1a9b4f58c 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Hindi/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Hindi/verbs/query_verbs.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "hi" to remove Urdu (ur) words. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?directCase diff --git a/src/scribe_data/language_data_extraction/Hindustani/Urdu/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Hindustani/Urdu/adjectives/query_adjectives.sparql index 01aa22aa2..a4f18e40f 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Urdu/adjectives/query_adjectives.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Urdu/adjectives/query_adjectives.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective ?singulativeNumeral diff --git a/src/scribe_data/language_data_extraction/Hindustani/Urdu/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Hindustani/Urdu/adverbs/query_adverbs.sparql index 09a8d7ca7..53c4bdfc9 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Urdu/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Urdu/adverbs/query_adverbs.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Hindustani/Urdu/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Hindustani/Urdu/nouns/query_nouns.sparql index e3b70b995..66efb97c6 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Urdu/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Urdu/nouns/query_nouns.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Hindustani/Urdu/postpositions/query_postpositions.sparql b/src/scribe_data/language_data_extraction/Hindustani/Urdu/postpositions/query_postpositions.sparql index d64490145..10c9a36f7 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Urdu/postpositions/query_postpositions.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Urdu/postpositions/query_postpositions.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. -SELECT DISTINCT +SELECT ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?postposition diff --git a/src/scribe_data/language_data_extraction/Hindustani/Urdu/prepositions/query_prepositions.sparql b/src/scribe_data/language_data_extraction/Hindustani/Urdu/prepositions/query_prepositions.sparql index 6ed3f531c..6feddaa71 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Urdu/prepositions/query_prepositions.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Urdu/prepositions/query_prepositions.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "ur" to remove Hindi (hi) words. -SELECT DISTINCT +SELECT ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition diff --git a/src/scribe_data/language_data_extraction/Hindustani/Urdu/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Hindustani/Urdu/verbs/query_verbs.sparql index bf1d8b1fd..e6df3771c 100644 --- a/src/scribe_data/language_data_extraction/Hindustani/Urdu/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Hindustani/Urdu/verbs/query_verbs.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "ur" to remove Hindustani (hi) words. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?directCase diff --git a/src/scribe_data/language_data_extraction/Indonesian/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Indonesian/nouns/query_nouns.sparql index 084a67768..b26a0059d 100644 --- a/src/scribe_data/language_data_extraction/Indonesian/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Indonesian/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Indonesian (Q9240) nouns. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun diff --git a/src/scribe_data/language_data_extraction/Italian/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Italian/nouns/query_nouns.sparql index 2f85a9453..fbbd08aaa 100644 --- a/src/scribe_data/language_data_extraction/Italian/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Italian/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Italian (Q652) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Japanese/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Japanese/nouns/query_nouns.sparql index 0dde5908a..098661648 100644 --- a/src/scribe_data/language_data_extraction/Japanese/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Japanese/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Japanese (Q5287) nouns. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun diff --git a/src/scribe_data/language_data_extraction/Korean/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Korean/adverbs/query_adverbs.sparql index 020073b13..a76b657f0 100644 --- a/src/scribe_data/language_data_extraction/Korean/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Korean/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Korean (Q9176) adverbs. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Korean/postposition/query_postpositions.sparql b/src/scribe_data/language_data_extraction/Korean/postposition/query_postpositions.sparql index a1a8cb473..5a6cb2d44 100644 --- a/src/scribe_data/language_data_extraction/Korean/postposition/query_postpositions.sparql +++ b/src/scribe_data/language_data_extraction/Korean/postposition/query_postpositions.sparql @@ -2,7 +2,7 @@ # All Korean (Q9176) postpositions. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?postposition diff --git a/src/scribe_data/language_data_extraction/Kurmanji/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Kurmanji/nouns/query_nouns.sparql index 822d09e61..a6839c2e0 100644 --- a/src/scribe_data/language_data_extraction/Kurmanji/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Kurmanji/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Kurmanji (Q36163) nouns and their gender. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?directDefSingular ?gender diff --git a/src/scribe_data/language_data_extraction/Malay/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Malay/nouns/query_nouns.sparql index 4002b553d..1da57f106 100644 --- a/src/scribe_data/language_data_extraction/Malay/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Malay/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Malay (Q9237) nouns. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun diff --git a/src/scribe_data/language_data_extraction/Malayalam/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Malayalam/adjectives/query_adjectives.sparql index 6e666edae..2bff79f3c 100644 --- a/src/scribe_data/language_data_extraction/Malayalam/adjectives/query_adjectives.sparql +++ b/src/scribe_data/language_data_extraction/Malayalam/adjectives/query_adjectives.sparql @@ -2,7 +2,7 @@ # All Malayalam (Q36236) adjectives. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adjective diff --git a/src/scribe_data/language_data_extraction/Malayalam/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Malayalam/adverbs/query_adverbs.sparql index a0b17ddd7..e1a0af8d5 100644 --- a/src/scribe_data/language_data_extraction/Malayalam/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Malayalam/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Malayalam (Q36236) adverbs. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Malayalam/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Malayalam/nouns/query_nouns.sparql index be39d953e..d1402399b 100644 --- a/src/scribe_data/language_data_extraction/Malayalam/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Malayalam/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Malayalam (Q36236) nouns and their plurals in the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?gender diff --git a/src/scribe_data/language_data_extraction/Malayalam/prepositions/query_prepositions.sparql b/src/scribe_data/language_data_extraction/Malayalam/prepositions/query_prepositions.sparql index 0e2487fce..59b1c4075 100644 --- a/src/scribe_data/language_data_extraction/Malayalam/prepositions/query_prepositions.sparql +++ b/src/scribe_data/language_data_extraction/Malayalam/prepositions/query_prepositions.sparql @@ -2,7 +2,7 @@ # All Malayalam (Q36236) prepositions. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?preposition diff --git a/src/scribe_data/language_data_extraction/Malayalam/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Malayalam/verbs/query_verbs.sparql index 351b7af31..0db34c67c 100644 --- a/src/scribe_data/language_data_extraction/Malayalam/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Malayalam/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Malayalam (Q36236) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?verb ?presentInfinitive diff --git "a/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/nouns/query_nouns.sparql" "b/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/nouns/query_nouns.sparql" index 1e8779c90..4f505b678 100644 --- "a/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/nouns/query_nouns.sparql" +++ "b/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/nouns/query_nouns.sparql" @@ -3,7 +3,7 @@ # Enter this query at https://query.wikidata.org/. # Note: This query is for Bokmål (Q25167) rather than Nynorsk (Q25164). -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?indefSingular ?defSingular diff --git "a/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/verbs/query_verbs.sparql" "b/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/verbs/query_verbs.sparql" index 1292041c3..e4cd7bef0 100644 --- "a/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/verbs/query_verbs.sparql" +++ "b/src/scribe_data/language_data_extraction/Norwegian/Bokm\303\245l/verbs/query_verbs.sparql" @@ -3,7 +3,7 @@ # Enter this query at https://query.wikidata.org/. # Note: This query is for Bokmål (Q25167) rather than Nynorsk (Q25164). -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?present diff --git a/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/nouns/query_nouns.sparql index d2cb20182..60384065f 100644 --- a/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/nouns/query_nouns.sparql @@ -3,7 +3,7 @@ # Enter this query at https://query.wikidata.org/. 
# Note: This query is for Nynorsk (Q25164) rather than Bokmål (Q25167). -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?indefSingular ?defSingular diff --git a/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/verbs/query_verbs.sparql index c18c6d3c9..2cbc7d65f 100644 --- a/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Norwegian/Nynorsk/verbs/query_verbs.sparql @@ -3,7 +3,7 @@ # Enter this query at https://query.wikidata.org/. # Note: This query is for Nynorsk (Q25164) rather than Bokmål (Q25167). -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?aInfinitiveActive diff --git a/src/scribe_data/language_data_extraction/Pidgin/Nigerian/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Pidgin/Nigerian/adverbs/query_adverbs.sparql index d65394d09..be6e4810a 100644 --- a/src/scribe_data/language_data_extraction/Pidgin/Nigerian/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Pidgin/Nigerian/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Nigerian Pidgin (Q33655) adverbs. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Pidgin/Nigerian/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Pidgin/Nigerian/nouns/query_nouns.sparql index 3af46c7af..a22b1e059 100644 --- a/src/scribe_data/language_data_extraction/Pidgin/Nigerian/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Pidgin/Nigerian/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Nigerian Pidgin (Q33655) nouns, their plurals and their genders. 
# Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Polish/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Polish/nouns/query_nouns.sparql index 420f8e1b2..5bd7e4fd4 100644 --- a/src/scribe_data/language_data_extraction/Polish/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Polish/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Polish (Q809) nouns, their plurals and their genders in the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/Portuguese/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Portuguese/nouns/query_nouns.sparql index e4d95e96a..705ae0cb0 100644 --- a/src/scribe_data/language_data_extraction/Portuguese/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Portuguese/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Portuguese (Q5146) nouns, their plurals and their genders. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Portuguese/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Portuguese/verbs/query_verbs.sparql index 7ce7c48d9..79abc949d 100644 --- a/src/scribe_data/language_data_extraction/Portuguese/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Portuguese/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Portuguese (Q5146) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?presFPS ?presSPS ?presTPS diff --git a/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/nouns/query_nouns.sparql index 011a9df9d..3fa164731 100644 --- a/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/nouns/query_nouns.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "pa" to select Gurmukhi words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/verbs/query_verbs.sparql index 72558a266..99999e0b0 100644 --- a/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Punjabi/Gurmukhi/verbs/query_verbs.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "pa" to select Gurmukhi words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?verb diff --git a/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/nouns/query_nouns.sparql index 2ba573bfe..a930fb16f 100644 --- a/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/nouns/query_nouns.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "pnb" to select Shahmukhi words. 
-SELECT DISTINCT +SELECT ?lexeme (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular diff --git a/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/verbs/query_verbs.sparql index 221016f93..4d7de132f 100644 --- a/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Punjabi/Shahmukhi/verbs/query_verbs.sparql @@ -4,7 +4,7 @@ # Note: We need to filter for "pnb" to select Shahmukhi words. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?verb diff --git a/src/scribe_data/language_data_extraction/Russian/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Russian/nouns/query_nouns.sparql index da6685228..25abb07a9 100644 --- a/src/scribe_data/language_data_extraction/Russian/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Russian/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Russian (Q7737) nouns, their plurals and their genders in the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/Russian/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Russian/verbs/query_verbs.sparql index 7b7aaf4fa..501d23e1c 100644 --- a/src/scribe_data/language_data_extraction/Russian/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Russian/verbs/query_verbs.sparql @@ -2,7 +2,7 @@ # All Russian (Q7737) verbs and the currently implemented tenses for each. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?infinitive ?presFPS ?presSPS ?presTPS diff --git a/src/scribe_data/language_data_extraction/Slovak/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Slovak/adverbs/query_adverbs.sparql index d80e628fc..e123b7cc9 100644 --- a/src/scribe_data/language_data_extraction/Slovak/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Slovak/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Slovak (Q9058) adverbs. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Slovak/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Slovak/nouns/query_nouns.sparql index b10482aac..9bafa552e 100644 --- a/src/scribe_data/language_data_extraction/Slovak/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Slovak/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Slovak (Q9058) nouns, their plurals and their genders for the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/Spanish/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Spanish/nouns/query_nouns.sparql index a88e8faae..dd0b54d87 100644 --- a/src/scribe_data/language_data_extraction/Spanish/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Spanish/nouns/query_nouns.sparql @@ -3,7 +3,7 @@ # Enter this query at https://query.wikidata.org/. # Note: Spansih sometimes has masculine and feminine versions on a single lexeme. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Swahili/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Swahili/nouns/query_nouns.sparql index 547430d04..fb7055fb0 100644 --- a/src/scribe_data/language_data_extraction/Swahili/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Swahili/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Swahili (Q7838) nouns and their plurals. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?plural diff --git a/src/scribe_data/language_data_extraction/Swedish/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Swedish/nouns/query_nouns.sparql index 720fb5b23..243733b0b 100644 --- a/src/scribe_data/language_data_extraction/Swedish/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Swedish/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Swedish (Q9027) nouns with their plural, gender and genitive forms. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomIndefSingular ?nomIndefPlural diff --git a/src/scribe_data/language_data_extraction/Tajik/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Tajik/nouns/query_nouns.sparql index 11ffb5718..44b5f0aae 100644 --- a/src/scribe_data/language_data_extraction/Tajik/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Tajik/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Tajik (Q9260) nouns. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun diff --git a/src/scribe_data/language_data_extraction/Tamil/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Tamil/nouns/query_nouns.sparql index 16ceb45f0..ae10914e1 100644 --- a/src/scribe_data/language_data_extraction/Tamil/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Tamil/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Tamil (Q5885) nouns and their plurals for the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/Ukrainian/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Ukrainian/nouns/query_nouns.sparql index 30450c04f..1548b4c46 100644 --- a/src/scribe_data/language_data_extraction/Ukrainian/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Ukrainian/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Ukrainian (Q8798) nouns, their plurals and their genders for the given cases. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?nomSingular ?nomPlural diff --git a/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql index 38387bde2..ad6db4eff 100644 --- a/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Yoruba/adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Yoruba (Q34311) adverbs. # Enter this query at https://query.wikidata.org/. 
-SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb diff --git a/src/scribe_data/language_data_extraction/Yoruba/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Yoruba/nouns/query_nouns.sparql index 47c83c80d..44a8f48fa 100644 --- a/src/scribe_data/language_data_extraction/Yoruba/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Yoruba/nouns/query_nouns.sparql @@ -2,7 +2,7 @@ # All Yoruba (Q34311) nouns. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?noun From b5fecce762438a8c97c97a6e5eb4e526d01ecb2f Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:07:15 +0100 Subject: [PATCH 04/36] Create query_adverbs.sparql adverb for chinese/mandarin --- .../Chinese/Mandarin/Adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql new file mode 100644 index 000000000..46251a815 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Standard Mandarin Chinese (Q727694) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT DISTINCT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q727694 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From ae15e7772597dade1259d200ea441b43075256de Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 18:21:12 +0200 Subject: [PATCH 05/36] Add filter for language --- .../Chinese/Mandarin/Adverbs/query_adverbs.sparql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql index 46251a815..8633280f4 100644 --- a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql @@ -2,7 +2,7 @@ # All Standard Mandarin Chinese (Q727694) adverbs. # Enter this query at https://query.wikidata.org/. -SELECT DISTINCT +SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb @@ -10,4 +10,5 @@ WHERE { ?lexeme dct:language wd:Q727694 ; wikibase:lexicalCategory wd:Q380057 ; wikibase:lemma ?adverb . + FILTER(LANG(?adverb) = "zh") . } From f5f74049df6d915d4eae84f8ae984b388a191b99 Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:08:24 +0100 Subject: [PATCH 06/36] Create query_adverbs.sparql adverb for english --- .../English/Adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql new file mode 100644 index 000000000..cf29f5aef --- /dev/null +++ b/src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All English (Q1860) adverbs. +# Enter this query at https://query.wikidata.org/. 
+ +SELECT DISTINCT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q1860 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . +} From e250233d33cd8e4f5b362e0ee162c35e0a08aaa6 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 18:32:34 +0200 Subject: [PATCH 07/36] Remove adverb file and prepare tests --- .../English/Adverbs/query_adverbs.sparql | 13 ------------- tests/cli/test_list.py | 1 + 2 files changed, 1 insertion(+), 13 deletions(-) delete mode 100644 src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql deleted file mode 100644 index cf29f5aef..000000000 --- a/src/scribe_data/language_data_extraction/English/Adverbs/query_adverbs.sparql +++ /dev/null @@ -1,13 +0,0 @@ -# tool: scribe-data -# All English (Q1860) adverbs. -# Enter this query at https://query.wikidata.org/. - -SELECT DISTINCT - (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) - ?adverb - -WHERE { - ?lexeme dct:language wd:Q1860 ; - wikibase:lexicalCategory wd:Q380057 ; - wikibase:lemma ?adverb . 
-} diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 1ec2ec1e4..def230511 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -80,6 +80,7 @@ def test_list_data_types_specific_language(self, mock_print): call("Available data types: English"), call("-----------------------------"), call("adjectives"), + call("adverbs"), call("emoji-keywords"), call("nouns"), call("verbs"), From 52dca1911b453bcf7e9c8d531e03b65fba77cea1 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 18:33:04 +0200 Subject: [PATCH 08/36] Re-add English adverbs --- .../English/adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/English/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/English/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/English/adverbs/query_adverbs.sparql new file mode 100644 index 000000000..cf29f5aef --- /dev/null +++ b/src/scribe_data/language_data_extraction/English/adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All English (Q1860) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT DISTINCT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q1860 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From 7dbf7b018e088571206a0f5eec39190cfdca7cbc Mon Sep 17 00:00:00 2001 From: Veronica Waiganjo Date: Tue, 15 Oct 2024 15:06:47 +0300 Subject: [PATCH 09/36] Add Chinese Mndarin adverbs,prepositions,adjectives and emoji keywords --- .../Prepositions/query_prepositions.sparql | 13 ++++++ .../adjectives/query_adjectives.sparql | 13 ++++++ .../Mandarin/adverbs/query_adverbs.sparql | 13 ++++++ .../Mandarin/emoji_keywords/__init__.py | 0 .../emoji_keywords/generate_emoji_keywords.py | 46 +++++++++++++++++++ 5 files changed, 85 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql create mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/adjectives/query_adjectives.sparql create mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql create mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/__init__.py create mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/generate_emoji_keywords.py diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql new file mode 100644 index 000000000..f34db8f8b --- /dev/null +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Standard Mandarin Chinese (Q1412) prepositions. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?preposition + +WHERE { + ?lexeme dct:language wd:Q727694 ; + wikibase:lexicalCategory wd:Q4833830 ; + wikibase:lemma ?preposition . 
+} diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/adjectives/query_adjectives.sparql new file mode 100644 index 000000000..75f5f6df3 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/adjectives/query_adjectives.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Mandarin Chinese (Q727694) adjectives. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective + +WHERE { + ?lexeme dct:language wd:Q727694 ; + wikibase:lexicalCategory wd:Q34698 ; + wikibase:lemma ?adjective . +} diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql new file mode 100644 index 000000000..c22972442 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Mandarin Chinese (Q727694) adverb. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q727694 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb. 
+} diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/__init__.py b/src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/generate_emoji_keywords.py new file mode 100644 index 000000000..fb1e509b1 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/emoji_keywords/generate_emoji_keywords.py @@ -0,0 +1,46 @@ +""" +Generates keyword-emoji relationships from a selection of Mandarin Chinese words. + +.. raw:: html + +""" + +import argparse + +from scribe_data.unicode.process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data + +LANGUAGE = "Standard Mandarin" +DATA_TYPE = "emoji-keywords" +emojis_per_keyword = 3 + +parser = argparse.ArgumentParser() +parser.add_argument("--file-path") +args = parser.parse_args() + +if emoji_keywords_dict := gen_emoji_lexicon( + language=LANGUAGE, + emojis_per_keyword=emojis_per_keyword, +): + export_formatted_data( + file_path=args.file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=LANGUAGE, + data_type=DATA_TYPE, + ) From 5a383f2b9a1853b60eb758274660aaffd13df8f5 Mon Sep 17 00:00:00 2001 From: Veronica Waiganjo Date: Tue, 15 Oct 2024 15:25:29 +0300 Subject: [PATCH 10/36] Update Mandarin prepositions query --- .../Chinese/Mandarin/Prepositions/query_prepositions.sparql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql index f34db8f8b..4757b637f 100644 --- 
a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql +++ b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Prepositions/query_prepositions.sparql @@ -1,5 +1,5 @@ # tool: scribe-data -# All Standard Mandarin Chinese (Q1412) prepositions. +# All Standard Mandarin Chinese (Q727694) prepositions. # Enter this query at https://query.wikidata.org/. SELECT From 1942d0989fe9a53593bb7e34dcd3b451563f9e68 Mon Sep 17 00:00:00 2001 From: Veronica Waiganjo Date: Tue, 15 Oct 2024 16:32:05 +0300 Subject: [PATCH 11/36] Remove Mandarin Adverbs directory --- .../Chinese/Mandarin/adverbs/query_adverbs.sparql | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql deleted file mode 100644 index c22972442..000000000 --- a/src/scribe_data/language_data_extraction/Chinese/Mandarin/adverbs/query_adverbs.sparql +++ /dev/null @@ -1,13 +0,0 @@ -# tool: scribe-data -# All Mandarin Chinese (Q727694) adverb. -# Enter this query at https://query.wikidata.org/. - -SELECT - (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) - ?adverb - -WHERE { - ?lexeme dct:language wd:Q727694 ; - wikibase:lexicalCategory wd:Q380057 ; - wikibase:lemma ?adverb. 
-} From 3d505a76c082943195fef21cf10fa064eafd6907 Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:39:56 +0100 Subject: [PATCH 12/36] Create query_adverbs.sparql adverb for Tajik --- .../Tajik/Adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Tajik/Adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Tajik/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Tajik/Adverbs/query_adverbs.sparql new file mode 100644 index 000000000..b0d714b01 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Tajik/Adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Tajik (Q9260) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT DISTINCT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q9260 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From a871de31d150fdde8d66c18b087d43a05e91d886 Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:26:34 +0100 Subject: [PATCH 13/36] Create generate_emoji_keywords.py Creating emoji_keywords --- .../emoji_keywords/generate_emoji_keywords.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Slovak/emoji_keywords/generate_emoji_keywords.py diff --git a/src/scribe_data/language_data_extraction/Slovak/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Slovak/emoji_keywords/generate_emoji_keywords.py new file mode 100644 index 000000000..2b0baa7d3 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Slovak/emoji_keywords/generate_emoji_keywords.py @@ -0,0 +1,46 @@ +""" +Generates keyword-emoji relationships from a selection of Slovak words. + +.. raw:: html + +""" + +import argparse + +from scribe_data.unicode.process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data + +LANGUAGE = "Slovak" +DATA_TYPE = "emoji-keywords" +emojis_per_keyword = 3 + +parser = argparse.ArgumentParser() +parser.add_argument("--file-path") +args = parser.parse_args() + +if emoji_keywords_dict := gen_emoji_lexicon( + language=LANGUAGE, + emojis_per_keyword=emojis_per_keyword, +): + export_formatted_data( + file_path=args.file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=LANGUAGE, + data_type=DATA_TYPE, + ) From 318cceb757718e00cde73ed5a681c265f53a0852 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 18:41:57 +0200 Subject: [PATCH 14/36] Add missing init file --- .../language_data_extraction/Slovak/emoji_keywords/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/scribe_data/language_data_extraction/Slovak/emoji_keywords/__init__.py diff --git 
a/src/scribe_data/language_data_extraction/Slovak/emoji_keywords/__init__.py b/src/scribe_data/language_data_extraction/Slovak/emoji_keywords/__init__.py new file mode 100644 index 000000000..e69de29bb From 52b74268244dc978c78ffe93aebea3f2d907a37d Mon Sep 17 00:00:00 2001 From: Angel osim <69635048+Otom-obhazi@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:03:04 +0100 Subject: [PATCH 15/36] Create query_adverbs.sparql Adverb for Basque --- .../Basque/Adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Basque/Adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Basque/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Basque/Adverbs/query_adverbs.sparql new file mode 100644 index 000000000..1cc1a63c9 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Basque/Adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Basque (Q8752) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q8752; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From e16dc242b6454b2e3a0fd1c932a8e9ec9447b23c Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 18:46:46 +0200 Subject: [PATCH 16/36] Rename adverb directory --- .../Basque/{Adverbs => adverbs}/query_adverbs.sparql | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/scribe_data/language_data_extraction/Basque/{Adverbs => adverbs}/query_adverbs.sparql (100%) diff --git a/src/scribe_data/language_data_extraction/Basque/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Basque/adverbs/query_adverbs.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Basque/Adverbs/query_adverbs.sparql rename to src/scribe_data/language_data_extraction/Basque/adverbs/query_adverbs.sparql From e0f0598096652dbedc0fb1d35b74228fbf74d6de Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:48:58 +0530 Subject: [PATCH 17/36] Create query_adjectives_1.sparql --- .../adjectives/query_adjectives_1.sparql | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_1.sparql diff --git a/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_1.sparql b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_1.sparql new file mode 100644 index 000000000..1eba99f95 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_1.sparql @@ -0,0 +1,22 @@ +# tool: scribe-data +# All Czech (Q9056) adjectives in the given cases. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective + ?nominative + +WHERE { + ?lexeme dct:language wd:Q9056 ; + wikibase:lexicalCategory wd:Q34698 ; + wikibase:lemma ?adjective . 
+ + # MARK: Nominative + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?nominativeForm . + ?nominativeForm ontolex:representation ?nominative ; + wikibase:grammaticalFeature wd:Q131105 . + } . +} From 51d1f1d171b1e4c8cef805a77a431d782ac4776e Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Tue, 15 Oct 2024 22:51:23 +0530 Subject: [PATCH 18/36] Create query_adjective_2.sparql --- .../Czech/adjectives/query_adjective_2.sparql | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Czech/adjectives/query_adjective_2.sparql diff --git a/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjective_2.sparql b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjective_2.sparql new file mode 100644 index 000000000..43e34962f --- /dev/null +++ b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjective_2.sparql @@ -0,0 +1,22 @@ +# tool: scribe-data +# All Czech (Q9056) adjectives in the given cases. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective + ?genitive + +WHERE { + ?lexeme dct:language wd:Q9056 ; + wikibase:lexicalCategory wd:Q34698 ; + wikibase:lemma ?adjective . + + # MARK: Genitive + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?genitiveForm . + ?genitiveForm ontolex:representation ?genitive ; + wikibase:grammaticalFeature wd:Q146233 . + } . 
+} From cc7b9e67559fa1c1e69002941141ad8f3ebb892c Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:05:29 +0530 Subject: [PATCH 19/36] Create query_adjectives_3.sparql --- .../adjectives/query_adjectives_3.sparql | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_3.sparql diff --git a/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_3.sparql b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_3.sparql new file mode 100644 index 000000000..3be851852 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_3.sparql @@ -0,0 +1,22 @@ +# tool: scribe-data +# All Czech (Q9056) adjectives in the given cases. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adjective + ?locative + +WHERE { + ?lexeme dct:language wd:Q9056 ; + wikibase:lexicalCategory wd:Q34698 ; + wikibase:lemma ?adjective . + + # MARK: Locative + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?locativeForm . + ?locativeForm ontolex:representation ?locative ; + wikibase:grammaticalFeature wd:Q202142 . + } . 
+} From 2fc8ed778bafeb7516880713bb1b1fdbb28207fe Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:05:50 +0530 Subject: [PATCH 20/36] Rename query_adjective_2.sparql to query_adjectives_2.sparql --- .../{query_adjective_2.sparql => query_adjectives_2.sparql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/scribe_data/language_data_extraction/Czech/adjectives/{query_adjective_2.sparql => query_adjectives_2.sparql} (100%) diff --git a/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjective_2.sparql b/src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_2.sparql similarity index 100% rename from src/scribe_data/language_data_extraction/Czech/adjectives/query_adjective_2.sparql rename to src/scribe_data/language_data_extraction/Czech/adjectives/query_adjectives_2.sparql From 0bd670eb2de1fef13836fc0967f67561f8658306 Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:10:01 +0530 Subject: [PATCH 21/36] Create query_adverbs.sparql --- .../Czech/adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql new file mode 100644 index 000000000..913ebbc3e --- /dev/null +++ b/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Czech (Q9056) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q9056 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From f276d16e24c2f8ea73f764ede84cb533c7158d75 Mon Sep 17 00:00:00 2001 From: Arpita kesharwani <107834813+KesharwaniArpita@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:23:00 +0530 Subject: [PATCH 22/36] Create generate_emoji_keywords.py --- .../emoji_keywords/generate_emoji_keywords.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 src/scribe_data/language_data_extraction/Czech/emoji_keywords/generate_emoji_keywords.py diff --git a/src/scribe_data/language_data_extraction/Czech/emoji_keywords/generate_emoji_keywords.py b/src/scribe_data/language_data_extraction/Czech/emoji_keywords/generate_emoji_keywords.py new file mode 100644 index 000000000..0723195e4 --- /dev/null +++ b/src/scribe_data/language_data_extraction/Czech/emoji_keywords/generate_emoji_keywords.py @@ -0,0 +1,47 @@ + +""" +Generates keyword-emoji relationships from a selection of Czech words. + +.. raw:: html + +""" + +import argparse + +from scribe_data.unicode.process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data + +LANGUAGE = "Czech" +DATA_TYPE = "emoji-keywords" +emojis_per_keyword = 3 + +parser = argparse.ArgumentParser() +parser.add_argument("--file-path") +args = parser.parse_args() + +if emoji_keywords_dict := gen_emoji_lexicon( + language=LANGUAGE, + emojis_per_keyword=emojis_per_keyword, +): + export_formatted_data( + file_path=args.file_path, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=LANGUAGE, + data_type=DATA_TYPE, + ) From a5779515dbf3fb85804712fc5996c338eb90b9b8 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 21:36:01 +0200 Subject: [PATCH 23/36] Add forms to adjectives query --- .../Czech/adverbs/query_adverbs.sparql | 20 ++++++++++++++++++- .../Czech/emoji_keywords/__init__.py | 0 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 src/scribe_data/language_data_extraction/Czech/emoji_keywords/__init__.py diff --git 
a/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql index 913ebbc3e..693955f2b 100644 --- a/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Czech/adverbs/query_adverbs.sparql @@ -1,13 +1,31 @@ # tool: scribe-data -# All Czech (Q9056) adverbs. +# All Czech (Q9056) adverbs in the given cases. # Enter this query at https://query.wikidata.org/. SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb + ?compararive + ?superlative WHERE { ?lexeme dct:language wd:Q9056 ; wikibase:lexicalCategory wd:Q380057 ; wikibase:lemma ?adverb . + + # MARK: Comparative + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?comparariveForm . + ?comparariveForm ontolex:representation ?compararive ; + wikibase:grammaticalFeature wd:Q14169499 . + } + + # MARK: Superlative + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?superlativeForm . + ?superlativeForm ontolex:representation ?superlative ; + wikibase:grammaticalFeature wd:Q1817208 . 
+ } } diff --git a/src/scribe_data/language_data_extraction/Czech/emoji_keywords/__init__.py b/src/scribe_data/language_data_extraction/Czech/emoji_keywords/__init__.py new file mode 100644 index 000000000..e69de29bb From adc061f1550009ee422ea9470603cd3045d4253d Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Tue, 15 Oct 2024 20:30:54 +0300 Subject: [PATCH 24/36] adding a sparql file in Tamil/adverbs for Tamil adverbs --- .../language_data_extraction/Tamil/adverbs/query_adverbs.sparql | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql diff --git a/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql new file mode 100644 index 000000000..e69de29bb From 7d0195bc55b31a79e64755c9b4f905414189b4c7 Mon Sep 17 00:00:00 2001 From: Omar Agiez Date: Tue, 15 Oct 2024 20:32:51 +0300 Subject: [PATCH 25/36] simple sparql query for fetching Tamil adverbs from wikidata --- .../Tamil/adverbs/query_adverbs.sparql | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql index e69de29bb..86a7a8eb9 100644 --- a/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql @@ -0,0 +1,13 @@ +# tool: scribe-data +# All Tamil (Q5885) adverbs. +# Enter this query at https://query.wikidata.org/. + +SELECT + (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) + ?adverb + +WHERE { + ?lexeme dct:language wd:Q5885 ; + wikibase:lexicalCategory wd:Q380057 ; + wikibase:lemma ?adverb . 
+} From 7c3b037ff4c3612910752c854dcd9de996dc5eff Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 21:40:03 +0200 Subject: [PATCH 26/36] Add vocative --- .../Tamil/adverbs/query_adverbs.sparql | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql index 86a7a8eb9..72e2a4a96 100644 --- a/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql +++ b/src/scribe_data/language_data_extraction/Tamil/adverbs/query_adverbs.sparql @@ -1,13 +1,22 @@ # tool: scribe-data -# All Tamil (Q5885) adverbs. +# All Tamil (Q5885) adverbs in the given case. # Enter this query at https://query.wikidata.org/. SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?adverb + ?vocative WHERE { ?lexeme dct:language wd:Q5885 ; wikibase:lexicalCategory wd:Q380057 ; wikibase:lemma ?adverb . + + # MARK: Vocative + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?vocativeForm . + ?vocativeForm ontolex:representation ?vocative ; + wikibase:grammaticalFeature wd:Q185077 . 
+ } } From ae2e662873e923aa10cb8c6f372d19f307a8b262 Mon Sep 17 00:00:00 2001 From: axif Date: Tue, 15 Oct 2024 23:06:57 +0600 Subject: [PATCH 27/36] fix lists of arguments to be validated --- src/scribe_data/cli/cli_utils.py | 142 +++++++++++++++++-------------- src/scribe_data/cli/main.py | 13 ++- tests/cli/test_utils.py | 42 ++++++++- 3 files changed, 127 insertions(+), 70 deletions(-) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index e3e62485c..8de5c7dec 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -23,7 +23,7 @@ import difflib import json from pathlib import Path -from typing import Union +from typing import Union, List from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR @@ -155,79 +155,91 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None: # MARK: Validate -def validate_language_and_data_type(language: str, data_type: str): +def validate_language_and_data_type( + language: Union[str, List[str], bool, None], + data_type: Union[str, List[str], bool, None], +): """ Validates that the language and data type QIDs are not None. Parameters ---------- - language : str - The language to validate. - - data_type : str - The data type to validate. + language : str or list + The language(s) to validate. + data_type : str or list + The data type(s) to validate. Raises ------ - ValueError - If either the language or data type is invalid (None). + ValueError + If any of the languages or data types is invalid, with all errors reported together. """ - # Not functional for lists of arguments yet. 
- if isinstance(language, list) or isinstance(data_type, list): - return - - language_is_valid = True - data_type_is_valid = True - - value_error = "" - closest_language_match_string = "" - closest_data_type_match_string = "" - - if ( - isinstance(language, str) - and language.lower() not in language_to_qid.keys() - and not language.startswith("Q") - and not language[1:].isdigit() - ): - language_is_valid = False - if closest_language_match := difflib.get_close_matches( - language, language_map.keys(), n=1 - ): - closest_language_match_cap = closest_language_match[0].capitalize() - closest_language_match_string = ( - f" The closest matching language is {closest_language_match_cap}." - ) - - if ( - isinstance(data_type, str) - and data_type not in data_type_metadata.keys() - and not data_type.startswith("Q") - and not data_type[1:].isdigit() - ): - data_type_is_valid = False - if closest_data_type_match := difflib.get_close_matches( - data_type, data_type_metadata.keys(), n=1 + def validate_single_item(item, valid_options, item_type): + """ + Validates a single item against a list of valid options, providing error messages and suggestions. + + Parameters + ---------- + item : str + The item to validate. + valid_options : list + A list of valid options against which the item will be validated. + item_type : str + A description of the item type (e.g., "language", "data-type") used in error messages. + + Returns + ------- + str or None + Returns an error message if the item is invalid, or None if the item is valid. + """ + if ( + isinstance(item, str) + and item.lower().strip() not in valid_options + and not item.startswith("Q") + and not item[1:].isdigit() ): - closest_data_type_match_string = ( - f" The closest matching data-type is {closest_data_type_match[0]}." 
+ closest_match = difflib.get_close_matches(item, valid_options, n=1) + closest_match_str = ( + f" The closest matching {item_type} is {closest_match[0]}" + if closest_match + else "" ) - - if not language_is_valid and data_type_is_valid: - value_error = ( - f"Invalid language {language} passed.{closest_language_match_string}" - ) - - raise ValueError(value_error) - - elif language_is_valid and not data_type_is_valid: - value_error = ( - f"Invalid data-type {data_type} passed.{closest_data_type_match_string}" - ) - - raise ValueError(value_error) - - elif not language_is_valid and not data_type_is_valid: - value_error = f"Invalid language {language} and data-type {data_type} passed.{closest_language_match_string}{closest_data_type_match_string}" - - raise ValueError(value_error) + return f"Invalid {item_type} {item}{closest_match_str}" + return None + + errors = [] + + # Handle language validation + if language is None or isinstance(language, bool): + pass + elif isinstance(language, str): + language = [language] + elif not isinstance(language, list): + errors.append("Language must be a string or a list of strings.") + + if language is not None and isinstance(language, list): + for lang in language: + error = validate_single_item(lang, language_to_qid.keys(), "language") + if error: + errors.append(error) + + # Handle data type validation + if data_type is None or isinstance(data_type, bool): + pass + elif isinstance(data_type, str): + data_type = [data_type] + elif not isinstance(data_type, list): + errors.append("Data type must be a string or a list of strings.") + + if data_type is not None and isinstance(data_type, list): + for dt in data_type: + error = validate_single_item(dt, data_type_metadata.keys(), "data-type") + if error: + errors.append(error) + + # Raise ValueError with the combined error message + if errors: + raise ValueError(" and ".join(errors) + " passed.") + else: + return True diff --git a/src/scribe_data/cli/main.py 
b/src/scribe_data/cli/main.py index 7c88485a2..1cf4758a0 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -201,10 +201,15 @@ def main() -> None: # MARK: Setup CLI args = parser.parse_args() - if args.language or args.data_type: - validate_language_and_data_type( - language=args.language, data_type=args.data_type - ) + + try: + if args.language or args.data_type: + validate_language_and_data_type( + language=args.language, data_type=args.data_type + ) + except ValueError as e: + print(e) + return if args.upgrade: upgrade_cli() diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index 149716c2d..32ab82262 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -216,5 +216,45 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): self.assertEqual( str(context.exception), - "Invalid language InvalidLanguage and data-type InvalidDataType passed.", + "Invalid language InvalidLanguage and Invalid data-type InvalidDataType passed.", ) + + def test_validate_language_and_data_type_with_list(self): + """Test validation with lists of languages and data types.""" + languages = ["English", "Spanish"] + data_types = ["nouns", "verbs"] + try: + validate_language_and_data_type(languages, data_types) + except ValueError: + self.fail( + "validate_language_and_data_type raised ValueError unexpectedly with valid lists!" + ) + + def test_validate_language_and_data_type_with_qids(self): + """Test validation directly with QIDs.""" + language_qid = "Q1860" # QID for English + data_type_qid = "Q1084" # QID for nouns + try: + validate_language_and_data_type(language_qid, data_type_qid) + except ValueError: + self.fail( + "validate_language_and_data_type raised ValueError unexpectedly with valid QIDs!" 
+ ) + + def test_validate_language_and_data_type_invalid_list(self): + """Test validation with invalid lists.""" + languages = ["English", "Klingon"] + data_types = ["nouns", "alienverbs"] + with self.assertRaises(ValueError) as context: + validate_language_and_data_type(languages, data_types) + self.assertIn("Invalid language Klingon", str(context.exception)) + self.assertIn("Invalid data-type alienverbs", str(context.exception)) + + def test_validate_language_and_data_type_mixed_validity_in_lists(self): + """Test validation with mixed valid and invalid entries in lists.""" + languages = ["English", "InvalidLanguage"] + data_types = ["nouns", "InvalidDataType"] + with self.assertRaises(ValueError) as context: + validate_language_and_data_type(languages, data_types) + self.assertIn("Invalid language InvalidLanguage", str(context.exception)) + self.assertIn("Invalid data-type InvalidDataType", str(context.exception)) From 3e6835c2e5b98a454516e2876e490d541b5e9dc1 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 22:06:17 +0200 Subject: [PATCH 28/36] Minor formatting and edits to outputs --- src/scribe_data/cli/cli_utils.py | 52 +++++++++++++++++++------------- src/scribe_data/cli/main.py | 3 +- tests/cli/test_utils.py | 24 +++++---------- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 8de5c7dec..4f59a65ef 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -23,7 +23,7 @@ import difflib import json from pathlib import Path -from typing import Union, List +from typing import List, Union from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR @@ -164,15 +164,16 @@ def validate_language_and_data_type( Parameters ---------- - language : str or list - The language(s) to validate. - data_type : str or list - The data type(s) to validate. + language : str or list + The language(s) to validate. 
+ + data_type : str or list + The data type(s) to validate. Raises ------ - ValueError - If any of the languages or data types is invalid, with all errors reported together. + ValueError + If any of the languages or data types is invalid, with all errors reported together. """ def validate_single_item(item, valid_options, item_type): @@ -181,17 +182,17 @@ def validate_single_item(item, valid_options, item_type): Parameters ---------- - item : str - The item to validate. - valid_options : list - A list of valid options against which the item will be validated. - item_type : str - A description of the item type (e.g., "language", "data-type") used in error messages. + item : str + The item to validate. + valid_options : list + A list of valid options against which the item will be validated. + item_type : str + A description of the item type (e.g., "language", "data-type") used in error messages. Returns ------- - str or None - Returns an error message if the item is invalid, or None if the item is valid. + str or None + Returns an error message if the item is invalid, or None if the item is valid. """ if ( isinstance(item, str) @@ -201,45 +202,54 @@ def validate_single_item(item, valid_options, item_type): ): closest_match = difflib.get_close_matches(item, valid_options, n=1) closest_match_str = ( - f" The closest matching {item_type} is {closest_match[0]}" + f" The closest matching {item_type} is {closest_match[0]}." if closest_match else "" ) - return f"Invalid {item_type} {item}{closest_match_str}" + + return f"Invalid {item_type} {item}.{closest_match_str}" + return None errors = [] - # Handle language validation + # Handle language validation. 
if language is None or isinstance(language, bool): pass + elif isinstance(language, str): language = [language] + elif not isinstance(language, list): errors.append("Language must be a string or a list of strings.") if language is not None and isinstance(language, list): for lang in language: error = validate_single_item(lang, language_to_qid.keys(), "language") + if error: errors.append(error) - # Handle data type validation + # Handle data type validation. if data_type is None or isinstance(data_type, bool): pass + elif isinstance(data_type, str): data_type = [data_type] + elif not isinstance(data_type, list): errors.append("Data type must be a string or a list of strings.") if data_type is not None and isinstance(data_type, list): for dt in data_type: error = validate_single_item(dt, data_type_metadata.keys(), "data-type") + if error: errors.append(error) - # Raise ValueError with the combined error message + # Raise ValueError with the combined error message. if errors: - raise ValueError(" and ".join(errors) + " passed.") + raise ValueError("\n".join(errors)) + else: return True diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 1cf4758a0..506bbcdd1 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -207,8 +207,9 @@ def main() -> None: validate_language_and_data_type( language=args.language, data_type=args.data_type ) + except ValueError as e: - print(e) + print(f"Input validation failed with error: {e}") return if args.upgrade: diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index 32ab82262..a827666a2 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -29,6 +29,8 @@ validate_language_and_data_type, ) +# MARK: Utils + class TestCLIUtils(unittest.TestCase): def test_correct_data_type(self): @@ -145,6 +147,9 @@ def test_print_formatted_data_unknown_type(self): mock_print.assert_called_once_with("unknown data type") +# MARK: Validate + + class 
TestValidateLanguageAndDataType(unittest.TestCase): def setUp(self): self.qid_mapping = { @@ -182,9 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual( - str(context.exception), "Invalid language InvalidLanguage passed." - ) + self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): @@ -198,9 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual( - str(context.exception), "Invalid data-type InvalidDataType passed." - ) + self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): @@ -216,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): self.assertEqual( str(context.exception), - "Invalid language InvalidLanguage and Invalid data-type InvalidDataType passed.", + "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.", ) def test_validate_language_and_data_type_with_list(self): @@ -241,15 +242,6 @@ def test_validate_language_and_data_type_with_qids(self): "validate_language_and_data_type raised ValueError unexpectedly with valid QIDs!" 
) - def test_validate_language_and_data_type_invalid_list(self): - """Test validation with invalid lists.""" - languages = ["English", "Klingon"] - data_types = ["nouns", "alienverbs"] - with self.assertRaises(ValueError) as context: - validate_language_and_data_type(languages, data_types) - self.assertIn("Invalid language Klingon", str(context.exception)) - self.assertIn("Invalid data-type alienverbs", str(context.exception)) - def test_validate_language_and_data_type_mixed_validity_in_lists(self): """Test validation with mixed valid and invalid entries in lists.""" languages = ["English", "InvalidLanguage"] From 343ffdb5e7cc2d7e7ee25ab505b2bc3ded41565f Mon Sep 17 00:00:00 2001 From: Purnama S Rahayu Date: Mon, 14 Oct 2024 19:49:38 +0700 Subject: [PATCH 29/36] add workflow check_query_identifiers and dummy script #339 --- .../workflows/check_query_identifiers.yaml | 43 +++++++++++++++++++ .../Mandarin/Adverbs/query_adverbs.sparql | 14 ------ 2 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/check_query_identifiers.yaml delete mode 100644 src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml new file mode 100644 index 000000000..99300015d --- /dev/null +++ b/.github/workflows/check_query_identifiers.yaml @@ -0,0 +1,43 @@ +name: check_query_identifiers.yaml +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + format_check: + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + python-version: + - "3.9" + + runs-on: ${{ matrix.os }} + + name: Run Check Query Identifiers + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade uv + uv venv 
+ uv pip install -r requirements.txt + + - name: Activate virtualenv + run: | + . .venv/bin/activate + echo PATH=$PATH >> $GITHUB_ENV + + - name: Run Python script + run: python src/scribe_data/check/check_query_identifiers.py diff --git a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql b/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql deleted file mode 100644 index 8633280f4..000000000 --- a/src/scribe_data/language_data_extraction/Chinese/Mandarin/Adverbs/query_adverbs.sparql +++ /dev/null @@ -1,14 +0,0 @@ -# tool: scribe-data -# All Standard Mandarin Chinese (Q727694) adverbs. -# Enter this query at https://query.wikidata.org/. - -SELECT - (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) - ?adverb - -WHERE { - ?lexeme dct:language wd:Q727694 ; - wikibase:lexicalCategory wd:Q380057 ; - wikibase:lemma ?adverb . - FILTER(LANG(?adverb) = "zh") . -} From 230fa58f00a0762e8411291e9b8922f51ad72f7d Mon Sep 17 00:00:00 2001 From: Purnama S Rahayu Date: Tue, 15 Oct 2024 08:53:30 +0700 Subject: [PATCH 30/36] Update workflow to trigger on future commits --- .github/workflows/check_query_identifiers.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 99300015d..45b8d7e0a 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -3,7 +3,9 @@ on: push: branches: [main] pull_request: - branches: [main] + branches: + - main + types: [opened, reopened, synchronize] jobs: format_check: From 408abc932b75aab0ac7830f284fd3455472267a5 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 22:26:21 +0200 Subject: [PATCH 31/36] Deactivate workflow so it can be brought into other PRs --- .../workflows/check_query_identifiers.yaml | 36 +++++++++---------- 1 file changed, 18 
insertions(+), 18 deletions(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 45b8d7e0a..739c5fec3 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -25,21 +25,21 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade uv - uv venv - uv pip install -r requirements.txt - - - name: Activate virtualenv - run: | - . .venv/bin/activate - echo PATH=$PATH >> $GITHUB_ENV - - - name: Run Python script - run: python src/scribe_data/check/check_query_identifiers.py + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v4 + # with: + # python-version: ${{ matrix.python-version }} + + # - name: Install dependencies + # run: | + # python -m pip install --upgrade uv + # uv venv + # uv pip install -r requirements.txt + + # - name: Activate virtualenv + # run: | + # . 
.venv/bin/activate + # echo PATH=$PATH >> $GITHUB_ENV + + # - name: Run Python script + # run: python src/scribe_data/check/check_query_identifiers.py From bf02ac8595b56d95c39394110993ee22089ebc38 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Tue, 15 Oct 2024 22:27:39 +0200 Subject: [PATCH 32/36] Remove yaml from workflow name --- .github/workflows/check_query_identifiers.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_query_identifiers.yaml b/.github/workflows/check_query_identifiers.yaml index 739c5fec3..780da47da 100644 --- a/.github/workflows/check_query_identifiers.yaml +++ b/.github/workflows/check_query_identifiers.yaml @@ -1,4 +1,4 @@ -name: check_query_identifiers.yaml +name: check_query_identifiers on: push: branches: [main] From 08f6ed117b142032209fafb5f55e91c82086ca75 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Wed, 16 Oct 2024 12:55:16 +0200 Subject: [PATCH 33/36] Update unicode docs --- src/scribe_data/cli/get.py | 2 +- .../unicode/{UNICODE.md => UNICODE_INSTALLTION.md} | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) rename src/scribe_data/unicode/{UNICODE.md => UNICODE_INSTALLTION.md} (52%) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index c3d5eecc9..3cbea6980 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -154,5 +154,5 @@ def get_data( "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." 
) print( - "Please check the installation steps at https://gitlab.pyicu.org/main/pyicu for more information.\n" + "Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n" ) diff --git a/src/scribe_data/unicode/UNICODE.md b/src/scribe_data/unicode/UNICODE_INSTALLTION.md similarity index 52% rename from src/scribe_data/unicode/UNICODE.md rename to src/scribe_data/unicode/UNICODE_INSTALLTION.md index 2d15a7a7d..e8f493163 100644 --- a/src/scribe_data/unicode/UNICODE.md +++ b/src/scribe_data/unicode/UNICODE_INSTALLTION.md @@ -1,5 +1,17 @@ -# scribe_data.unicode +# Scribe-Data Unicode Functionality Installation The Scribe-Data Unicode process is powered by [cldr-json](https://github.com/unicode-org/cldr-json) data from the [Unicode Consortium](https://home.unicode.org/) and [PyICU](https://gitlab.pyicu.org/main/pyicu), a Python extension that wraps the Unicode Consortium's [International Components for Unicode (ICU)](https://github.com/unicode-org/icu) C++ project. Please see the [installation guide for PyICU](https://gitlab.pyicu.org/main/pyicu#installing-pyicu) as the extension must be linked to ICU on your machine to work properly. + +Note that some of the commands may be incorrect. 
On macOS you may need to do the following: + +```bash +# Instead of: +export PATH="$(brew --prefix)/opt/icu4c/bin:$(brew --prefix)/opt/icu4c/sbin:$PATH" +export PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$(brew --prefix)/opt/icu4c/lib/pkgconfig" + +# Run: +echo "/opt/homebrew/opt/icu4c/bin:/opt/homebrew/opt/icu4c/sbin:$PATH" +echo "PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/opt/homebrew/opt/icu4c/lib/pkgconfig" +``` From 5fba72fbb2bec2247f7da8ce6a8d869cf64dad7e Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Wed, 16 Oct 2024 13:53:36 +0200 Subject: [PATCH 34/36] Update Sphynx RTD theme for docs --- docs/source/conf.py | 4 ++-- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 8d6e22d30..0c9e706d5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -63,7 +63,7 @@ "pytest-cov", "ruff", "SPARQLWrapper", - "tqdm" + "tqdm", ] # Add any paths that contain templates here, relative to this directory. @@ -91,7 +91,7 @@ html_theme = "sphinx_rtd_theme" -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +html_theme_path = [sphinx_rtd_theme] # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the diff --git a/requirements.txt b/requirements.txt index 16c262084..abbd5e443 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,5 +16,5 @@ regex>=2023.3.23 rich>=10.0.0 ruff>=0.3.3 SPARQLWrapper>=2.0.0 -sphinx-rtd-theme>=2.0.0 +sphinx-rtd-theme>=3.0.0 tqdm==4.66.4 From d37872c109464d3e7e666d000f11eadebab88d43 Mon Sep 17 00:00:00 2001 From: Akindele Michael Date: Wed, 16 Oct 2024 13:22:36 +0100 Subject: [PATCH 35/36] Cleanup query validation logic: update data_type_pattern and clean up print statements --- .../check/check_query_identifiers.py | 86 +++++++++++++------ 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index b379a5c86..52d9fe158 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -10,22 +10,41 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str: """ - Extract the QID based on the pattern provided (either language or data type). + Extracts the QID from a SPARQL query file based on the provided pattern. + + Parameters + ---------- + file_path : Path + The path to the SPARQL query file from which to extract the QID. + pattern : str + The regex pattern used to match the QID (either for language or data type). + + Returns + ------- + str + The extracted QID if found, otherwise None. """ try: with open(file_path, "r", encoding="utf-8") as file: content = file.read() match = re.search(pattern, content) if match: - return match.group(0).replace("wd:", "") + return match.group(0).split("wd:")[1] except Exception as e: print(f"Error reading {file_path}: {e}") return None def check_queries(): + """ + Validates SPARQL queries in the specified directory to check for correct language + and data type QIDs. 
+ + This function scans all SPARQL query files in the LANGUAGE_DATA_EXTRACTION_DIR + and prints out any files with incorrect QIDs for both languages and data types. + """ language_pattern = r"\?lexeme dct:language wd:Q\d+" - data_type_pattern = r"wikibase:lexicalCategory wd:Q\d+" + data_type_pattern = r"wikibase:lexicalCategory\s+wd:Q\d+" incorrect_languages = [] incorrect_data_types = [] @@ -41,17 +60,34 @@ def check_queries(): incorrect_data_types.append(query_file) if incorrect_languages: - print("Queries with incorrect languages QIDs are:") + print("Incorrect Language QIDs found in the following files:") for file in incorrect_languages: print(f"- {file}") + print("\n----------------------------------------------------------------\n") if incorrect_data_types: - print("Queries with incorrect data type QIDs are:") + print("Incorrect Data Type QIDs found in the following files:") for file in incorrect_data_types: print(f"- {file}") + print("\n----------------------------------------------------------------\n") -def is_valid_language(query_file, lang_qid): +def is_valid_language(query_file: Path, lang_qid: str) -> bool: + """ + Validates the language QID against the expected QID for the directory. + + Parameters + ---------- + query_file : Path + The path to the SPARQL query file being validated. + lang_qid : str + The QID of the language extracted from the SPARQL query. + + Returns + ------- + bool + True if the language QID is valid, otherwise False. + """ lang_directory_name = query_file.parent.parent.name.lower() languages = language_metadata.get( "languages" @@ -61,39 +97,39 @@ def is_valid_language(query_file, lang_qid): ) if not language_entry: - print( - f"Warning: Language '{lang_directory_name}' not found in language_metadata.json." - ) return False expected_language_qid = language_entry["qid"] - print("Expected language QID:", expected_language_qid) if lang_qid != expected_language_qid: - print( - f"Incorrect language QID in {lang_directory_name}. 
" - f"Found: {lang_qid}, Expected: {expected_language_qid}" - ) return False return True -def is_valid_data_type(query_file, data_type_qid): +def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: + """ + Validates the data type QID against the expected QID for the directory. + + Parameters + ---------- + query_file : Path + The path to the SPARQL query file being validated. + data_type_qid : str + The QID of the data type extracted from the SPARQL query. + + Returns + ------- + bool + True if the data type QID is valid, otherwise False. + """ directory_name = query_file.parent.name # e.g., "nouns" or "verbs" expected_data_type_qid = data_type_metadata.get(directory_name) if data_type_qid != expected_data_type_qid: - print( - f"Warning: Incorrect data type QID in {query_file}. Found: {data_type_qid}, Expected: {expected_data_type_qid}" - ) return False return True -# Examples: - -# file_path = Path("French/verbs/query_verbs.sparql") -# print(is_valid_data_type(file_path, "QW24907")) # check for data type -# print(is_valid_language(file_path, "Q150")) # check for if valid language - -check_queries() +# Run the check_queries function +# MARK: TODO: Remove Call +# check_queries() From 5e8626534a36b0a36598f37930efb4828b3b8c4f Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 17 Oct 2024 00:56:24 +0200 Subject: [PATCH 36/36] Minor edits to script formatting --- .../check/check_query_identifiers.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/scribe_data/check/check_query_identifiers.py b/src/scribe_data/check/check_query_identifiers.py index 52d9fe158..5f8276e4d 100644 --- a/src/scribe_data/check/check_query_identifiers.py +++ b/src/scribe_data/check/check_query_identifiers.py @@ -3,8 +3,8 @@ from scribe_data.cli.cli_utils import ( LANGUAGE_DATA_EXTRACTION_DIR, - language_metadata, data_type_metadata, + language_metadata, ) @@ -14,24 +14,26 @@ def extract_qid_from_sparql(file_path: Path, 
pattern: str) -> str: Parameters ---------- - file_path : Path - The path to the SPARQL query file from which to extract the QID. - pattern : str - The regex pattern used to match the QID (either for language or data type). + file_path : Path + The path to the SPARQL query file from which to extract the QID. + + pattern : str + The regex pattern used to match the QID (either for language or data type). Returns ------- - str - The extracted QID if found, otherwise None. + str + The extracted QID if found, otherwise None. """ try: with open(file_path, "r", encoding="utf-8") as file: content = file.read() - match = re.search(pattern, content) - if match: - return match.group(0).split("wd:")[1] + if match := re.search(pattern, content): + return match[0].split("wd:")[1] + except Exception as e: print(f"Error reading {file_path}: {e}") + return None @@ -63,12 +65,14 @@ def check_queries(): print("Incorrect Language QIDs found in the following files:") for file in incorrect_languages: print(f"- {file}") + print("\n----------------------------------------------------------------\n") if incorrect_data_types: print("Incorrect Data Type QIDs found in the following files:") for file in incorrect_data_types: print(f"- {file}") + print("\n----------------------------------------------------------------\n") @@ -103,6 +107,7 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool: if lang_qid != expected_language_qid: return False + return True @@ -125,9 +130,7 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool: directory_name = query_file.parent.name # e.g., "nouns" or "verbs" expected_data_type_qid = data_type_metadata.get(directory_name) - if data_type_qid != expected_data_type_qid: - return False - return True + return data_type_qid == expected_data_type_qid # Run the check_queries function