diff --git a/README.md b/README.md index 2d266a7..c5b0560 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Please note that KLPT is under development and some of the functionalities will - preprocess + preprocess normalization ✓ (v0.1.0) ✓ (v0.1.0) @@ -86,6 +86,11 @@ Please note that KLPT is under development and some of the functionalities will ✓ (v0.1.0) ✓ (v0.1.0) + + stopwords 🆕 + ✓ (v0.1.4) + ✓ (v0.1.4) + tokenize word tokenization
@@ -198,6 +203,16 @@ It is recommended that the output of this module be used as the input of subsequ 'hêvîya' ``` +In addition, it is possible to remove Kurdish [stopwords](https://en.wikipedia.org/wiki/Stop_word) using the `stopwords` variable. You can define a function like the following to do so: + +```python +from klpt.preprocess import Preprocess + +def remove_stopwords(text, dialect, script): + p = Preprocess(dialect, script) + return [token for token in text.split() if token not in p.stopwords] +``` + ### Tokenization This module focuses on the tokenization of both Kurmanji and Sorani dialects of Kurdish with the following functions: diff --git a/dist/klpt-0.1.4-py3-none-any.whl b/dist/klpt-0.1.4-py3-none-any.whl new file mode 100644 index 0000000..4624c3f Binary files /dev/null and b/dist/klpt-0.1.4-py3-none-any.whl differ diff --git a/dist/klpt-0.1.4.tar.gz b/dist/klpt-0.1.4.tar.gz new file mode 100644 index 0000000..da5da47 Binary files /dev/null and b/dist/klpt-0.1.4.tar.gz differ diff --git a/klpt/__init__.py b/klpt/__init__.py index 319c238..f607575 100644 --- a/klpt/__init__.py +++ b/klpt/__init__.py @@ -12,14 +12,6 @@ __author__ = __maintainer__ __author_email__ = __maintainer_email__ -# with open("data/stopwords.json") as f: -# stopwords = json.load(f)[self.dialect][self.script] -# if __name__ == "__main__": - -# def remove_stopwords(self, text): -# """remove stopwords""" -# return " ".join([token for token in text.split() if token not in self.stopwords]) - _ROOT = os.path.abspath(os.path.dirname(__file__)) def get_data(path): return os.path.join(_ROOT, '', path) @@ -41,6 +33,6 @@ def get_data(path): "analyser": { "Sorani": get_data("data/ckb-analyser.att"), "Kurmanji": get_data("data/kmr-analyser.att") - } -} - + }, + "stopwords": get_data("data/stopwords.json") +} \ No newline at end of file diff --git a/klpt/data/preprocess_map.json b/klpt/data/preprocess_map.json index 2d0b610..1608d22 100644 --- a/klpt/data/preprocess_map.json +++ b/klpt/data/preprocess_map.json @@ -141,7 +141,7 @@ "Arabic": {}, "Latin": { "iy": "îy", - "istan ": "îstan ", + "îstan ": "istan ", "(\\d)'(an)": "\\1\\2", "(\\d)-(an)": "\\1\\2", "(\\d)`(an)": "\\1\\2", diff --git a/klpt/data/test_cases.json b/klpt/data/test_cases.json index 5701755..6fea7cd 100644 --- a/klpt/data/test_cases.json +++ b/klpt/data/test_cases.json @@ -39,7 +39,7 @@ "Arabic": {}, "Latin": { "Kurdiya me": "Kurdîya me", - "Ermenistan": "Ermenîstan", + "Ermenîstan": "Ermenistan", "2018-an": "2018an", "di sala 2018-an": "di sala 2018an", "2018-An": "2018An", @@ -88,5 +88,19 @@ "": {"έτσι": ""}, "�": {"έτσι": "����"} } - } + }, + "stopwords": { + "Sorani": { + "Arabic": { + "وێبگەڕەکانی ئەمڕۆ جگە لە توانای نمایشکردن و گەڕۆکی لە پەڕەکانی وێبدا": ["وێبگەڕەکانی", "توانای", "نمایشکردن", "گەڕۆکی", "پەڕەکانی", "وێبدا"] + }, + "Latin": {} + }, + "Kurmanji": { + "Arabic": {}, + "Latin": { + "Gelek bername û malper agahiyên xwe li ser danegehê vedişêrin": ["Gelek", "bername", "malper", "agahiyên", "danegehê", "vedişêrin"] + } + } + } } diff --git a/klpt/preprocess.py b/klpt/preprocess.py index dd270f0..271ebbb 100644 --- a/klpt/preprocess.py +++ b/klpt/preprocess.py @@ -26,6 +26,8 @@ class Preprocess: - `standardize`: given a normalized text, it returns standardized text based on the Kurdish orthographies following recommendations for [Kurmanji](https://books.google.ie/books?id=Z7lDnwEACAAJ) and [Sorani](http://yageyziman.com/Renusi_Kurdi.htm) - `unify_numerals`: conversion of the various types of numerals used in Kurdish texts + In addition, it is possible to remove stopwords using the `stopwords` variable. It is better to remove stopwords after the tokenization task. + It is recommended that the output of this module be used as the input of subsequent tasks in an NLP pipeline. Example: @@ -46,6 +48,8 @@ class Preprocess: 'di sala 2018an' >>> preprocessor_kmr.standardize("hêviya") 'hêvîya' + >>> preprocessor_kmr.stopwords[:10] + ['a', 'an', 'bareya', 'bareyê', 'barên', 'basa', 'be', 'belê', 'ber', 'bereya'] ``` The preprocessing rules are provided at [`data/preprocess_map.json`](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/preprocess_map.json). @@ -70,6 +74,9 @@ def __init__(self, dialect, script, numeral="Latin"): self.numeral = configuration.numeral # self.preprocess_map = config.preprocess_map + with open(klpt.data_directory["stopwords"], "r") as f: + self.stopwords = json.load(f)[dialect][script] + def standardize(self, text): """ Method of standardization of Kurdish orthographies @@ -137,7 +144,6 @@ def normalize(self, text): return temp_text.strip() - def unify_numerals(self, text): """ Convert numerals to the desired one diff --git a/setup.py b/setup.py index 2aebaf3..f828a0b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="klpt", - version="0.1.3", + version="0.1.4", description="Kurdish Language Processing Toolkit", long_description=long_description, long_description_content_type="text/markdown", diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 16b828c..12bc6a4 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -8,6 +8,7 @@ import klpt import json + class TestPreprocess(unittest.TestCase): """ Test unit for the Preprocess class""" def setUp(self): @@ -44,5 +45,13 @@ def test_unify_numerals(self): prep = Preprocess("Sorani", "Latin", numeral) self.assertEqual(prep.unify_numerals(case), self.test_cases["numerals"][numeral][case]) + def test_stopwords(self): + # print("stopwords") + for dialect in self.options["dialects"]: + for script in self.options["scripts"]: + for case in self.test_cases["stopwords"][dialect][script]: + prep = Preprocess(dialect, script) + self.assertEqual([token for token in case.split() if token not in prep.stopwords], self.test_cases["stopwords"][dialect][script][case]) + if __name__ == "__main__": unittest.main() \ No newline at end of file