diff --git a/README.md b/README.md
index 2d266a7..c5b0560 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ Please note that KLPT is under development and some of the functionalities will
- preprocess |
+ preprocess |
normalization |
✓ (v0.1.0) |
✓ (v0.1.0) |
@@ -86,6 +86,11 @@ Please note that KLPT is under development and some of the functionalities will
✓ (v0.1.0) |
✓ (v0.1.0) |
+
+ stopwords 🆕 |
+ ✓ (v0.1.4) |
+ ✓ (v0.1.4) |
+
tokenize |
word tokenization
|
@@ -198,6 +203,16 @@ It is recommended that the output of this module be used as the input of subsequ
'hêvîya'
```
+In addition, it is possible to remove Kurdish [stopwords](https://en.wikipedia.org/wiki/Stop_word) using the `stopwords` variable. You can define a function like the following to do so:
+
+```python
+from klpt.preprocess import Preprocess
+
+def remove_stopwords(text, dialect, script):
+ p = Preprocess(dialect, script)
+ return [token for token in text.split() if token not in p.stopwords]
+```
+
### Tokenization
This module focuses on the tokenization of both Kurmanji and Sorani dialects of Kurdish with the following functions:
diff --git a/dist/klpt-0.1.4-py3-none-any.whl b/dist/klpt-0.1.4-py3-none-any.whl
new file mode 100644
index 0000000..4624c3f
Binary files /dev/null and b/dist/klpt-0.1.4-py3-none-any.whl differ
diff --git a/dist/klpt-0.1.4.tar.gz b/dist/klpt-0.1.4.tar.gz
new file mode 100644
index 0000000..da5da47
Binary files /dev/null and b/dist/klpt-0.1.4.tar.gz differ
diff --git a/klpt/__init__.py b/klpt/__init__.py
index 319c238..f607575 100644
--- a/klpt/__init__.py
+++ b/klpt/__init__.py
@@ -12,14 +12,6 @@
__author__ = __maintainer__
__author_email__ = __maintainer_email__
-# with open("data/stopwords.json") as f:
-# stopwords = json.load(f)[self.dialect][self.script]
-# if __name__ == "__main__":
-
-# def remove_stopwords(self, text):
-# """remove stopwords"""
-# return " ".join([token for token in text.split() if token not in self.stopwords])
-
_ROOT = os.path.abspath(os.path.dirname(__file__))
def get_data(path):
return os.path.join(_ROOT, '', path)
@@ -41,6 +33,6 @@ def get_data(path):
"analyser": {
"Sorani": get_data("data/ckb-analyser.att"),
"Kurmanji": get_data("data/kmr-analyser.att")
- }
-}
-
+ },
+ "stopwords": get_data("data/stopwords.json")
+}
\ No newline at end of file
diff --git a/klpt/data/preprocess_map.json b/klpt/data/preprocess_map.json
index 2d0b610..1608d22 100644
--- a/klpt/data/preprocess_map.json
+++ b/klpt/data/preprocess_map.json
@@ -141,7 +141,7 @@
"Arabic": {},
"Latin": {
"iy": "îy",
- "istan ": "îstan ",
+ "îstan ": "istan ",
"(\\d)'(an)": "\\1\\2",
"(\\d)-(an)": "\\1\\2",
"(\\d)`(an)": "\\1\\2",
diff --git a/klpt/data/test_cases.json b/klpt/data/test_cases.json
index 5701755..6fea7cd 100644
--- a/klpt/data/test_cases.json
+++ b/klpt/data/test_cases.json
@@ -39,7 +39,7 @@
"Arabic": {},
"Latin": {
"Kurdiya me": "Kurdîya me",
- "Ermenistan": "Ermenîstan",
+ "Ermenîstan": "Ermenistan",
"2018-an": "2018an",
"di sala 2018-an": "di sala 2018an",
"2018-An": "2018An",
@@ -88,5 +88,19 @@
"": {"Îτσι": ""},
"�": {"Îτσι": "����"}
}
- }
+ },
+ "stopwords": {
+ "Sorani": {
+ "Arabic": {
+ "وێبگەڕەکانی ئەمڕۆ جگە لە توانای نمایشکردن و گەڕۆکی لە پەڕەکانی وێبدا": ["وێبگەڕەکانی", "توانای", "نمایشکردن", "گەڕۆکی", "پەڕەکانی", "وێبدا"]
+ },
+ "Latin": {}
+ },
+ "Kurmanji": {
+ "Arabic": {},
+ "Latin": {
+ "Gelek bername û malper agahiyên xwe li ser danegehê vedişêrin": ["Gelek", "bername", "malper", "agahiyên", "danegehê", "vedişêrin"]
+ }
+ }
+ }
}
diff --git a/klpt/preprocess.py b/klpt/preprocess.py
index dd270f0..271ebbb 100644
--- a/klpt/preprocess.py
+++ b/klpt/preprocess.py
@@ -26,6 +26,8 @@ class Preprocess:
- `standardize`: given a normalized text, it returns standardized text based on the Kurdish orthographies following recommendations for [Kurmanji](https://books.google.ie/books?id=Z7lDnwEACAAJ) and [Sorani](http://yageyziman.com/Renusi_Kurdi.htm)
- `unify_numerals`: conversion of the various types of numerals used in Kurdish texts
+ In addition, it is possible to remove stopwords using the `stopwords` variable. Stopword removal is best performed after tokenization.
+
It is recommended that the output of this module be used as the input of subsequent tasks in an NLP pipeline.
Example:
@@ -46,6 +48,8 @@ class Preprocess:
'di sala 2018an'
>>> preprocessor_kmr.standardize("hêviya")
'hêvîya'
+ >>> preprocessor_kmr.stopwords[:10]
+ ['a', 'an', 'bareya', 'bareyê', 'barên', 'basa', 'be', 'belê', 'ber', 'bereya']
```
The preprocessing rules are provided at [`data/preprocess_map.json`](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/preprocess_map.json).
@@ -70,6 +74,9 @@ def __init__(self, dialect, script, numeral="Latin"):
self.numeral = configuration.numeral
# self.preprocess_map = config.preprocess_map
+ with open(klpt.data_directory["stopwords"], "r", encoding="utf-8") as f:
+ self.stopwords = json.load(f)[dialect][script]
+
def standardize(self, text):
"""
Method of standardization of Kurdish orthographies
@@ -137,7 +144,6 @@ def normalize(self, text):
return temp_text.strip()
-
def unify_numerals(self, text):
"""
Convert numerals to the desired one
diff --git a/setup.py b/setup.py
index 2aebaf3..f828a0b 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
setup(
name="klpt",
- version="0.1.3",
+ version="0.1.4",
description="Kurdish Language Processing Toolkit",
long_description=long_description,
long_description_content_type="text/markdown",
diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
index 16b828c..12bc6a4 100644
--- a/tests/test_preprocess.py
+++ b/tests/test_preprocess.py
@@ -8,6 +8,7 @@
import klpt
import json
+
class TestPreprocess(unittest.TestCase):
""" Test unit for the Preprocess class"""
def setUp(self):
@@ -44,5 +45,13 @@ def test_unify_numerals(self):
prep = Preprocess("Sorani", "Latin", numeral)
self.assertEqual(prep.unify_numerals(case), self.test_cases["numerals"][numeral][case])
+ def test_stopwords(self):
+ # print("stopwords")
+ for dialect in self.options["dialects"]:
+ for script in self.options["scripts"]:
+ for case in self.test_cases["stopwords"][dialect][script]:
+ prep = Preprocess(dialect, script)
+ self.assertEqual([token for token in case.split() if token not in prep.stopwords], self.test_cases["stopwords"][dialect][script][case])
+
if __name__ == "__main__":
unittest.main()
\ No newline at end of file