Skip to content

Commit

Permalink
version 0.1.4 with stopwords
Browse files Browse the repository at this point in the history
  • Loading branch information
sinaahmadi committed Dec 1, 2021
1 parent 9c517f8 commit 5e66dfa
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 17 deletions.
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Please note that KLPT is under development and some of the functionalities will
</thead>
<tbody>
<tr>
<td rowspan="3"><code>preprocess</code></td>
<td rowspan="4"><code>preprocess</code></td>
<td>normalization</td>
<td>&#10003; (v0.1.0)</td>
<td>&#10003; (v0.1.0)</td>
Expand All @@ -86,6 +86,11 @@ Please note that KLPT is under development and some of the functionalities will
<td>&#10003; (v0.1.0)</td>
<td>&#10003; (v0.1.0)</td>
</tr>
<tr>
<td>stopwords 🆕</td>
<td>&#10003; (v0.1.4)</td>
<td>&#10003; (v0.1.4)</td>
</tr>
<tr>
<td rowspan="3"><code>tokenize</code></td>
<td>word tokenization<br></td>
Expand Down Expand Up @@ -198,6 +203,16 @@ It is recommended that the output of this module be used as the input of subsequ
'hêvîya'
```

In addition, it is possible to remove Kurdish [stopwords](https://en.wikipedia.org/wiki/Stop_word) using the `stopwords` attribute of the `Preprocess` class (it is best applied after tokenization). For example, you can define a function like the following:

```python
from klpt.preprocess import Preprocess

def remove_stopwords(text, dialect, script):
    """Return the whitespace tokens of *text* that are not stopwords.

    A `Preprocess` instance for the given dialect/script supplies the
    stopword list; tokens found in it are dropped.
    """
    preprocessor = Preprocess(dialect, script)
    kept = []
    for token in text.split():
        if token not in preprocessor.stopwords:
            kept.append(token)
    return kept
```

### Tokenization

This module focuses on the tokenization of both Kurmanji and Sorani dialects of Kurdish with the following functions:
Expand Down
Binary file added dist/klpt-0.1.4-py3-none-any.whl
Binary file not shown.
Binary file added dist/klpt-0.1.4.tar.gz
Binary file not shown.
14 changes: 3 additions & 11 deletions klpt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,6 @@
__author__ = __maintainer__
__author_email__ = __maintainer_email__

# with open("data/stopwords.json") as f:
# stopwords = json.load(f)[self.dialect][self.script]
# if __name__ == "__main__":

# def remove_stopwords(self, text):
# """remove stopwords"""
# return " ".join([token for token in text.split() if token not in self.stopwords])

_ROOT = os.path.abspath(os.path.dirname(__file__))
def get_data(path):
return os.path.join(_ROOT, '', path)
Expand All @@ -41,6 +33,6 @@ def get_data(path):
"analyser": {
"Sorani": get_data("data/ckb-analyser.att"),
"Kurmanji": get_data("data/kmr-analyser.att")
}
}

},
"stopwords": get_data("data/stopwords.json")
}
2 changes: 1 addition & 1 deletion klpt/data/preprocess_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
"Arabic": {},
"Latin": {
"iy": "îy",
"istan ": "îstan ",
"îstan ": "istan ",
"(\\d)'(an)": "\\1\\2",
"(\\d)-(an)": "\\1\\2",
"(\\d)`(an)": "\\1\\2",
Expand Down
18 changes: 16 additions & 2 deletions klpt/data/test_cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"Arabic": {},
"Latin": {
"Kurdiya me": "Kurdîya me",
"Ermenistan": "Ermenîstan",
"Ermenîstan": "Ermenistan",
"2018-an": "2018an",
"di sala 2018-an": "di sala 2018an",
"2018-An": "2018An",
Expand Down Expand Up @@ -88,5 +88,19 @@
"<U>": {"έτσι": "<U><U><U><U>"},
"�": {"έτσι": "����"}
}
}
},
"stopwords": {
"Sorani": {
"Arabic": {
"وێبگەڕەکانی ئەمڕۆ جگە لە توانای نمایشکردن و گەڕۆکی لە پەڕەکانی وێبدا": ["وێبگەڕەکانی", "توانای", "نمایشکردن", "گەڕۆکی", "پەڕەکانی", "وێبدا"]
},
"Latin": {}
},
"Kurmanji": {
"Arabic": {},
"Latin": {
"Gelek bername û malper agahiyên xwe li ser danegehê vedişêrin": ["Gelek", "bername", "malper", "agahiyên", "danegehê", "vedişêrin"]
}
}
}
}
8 changes: 7 additions & 1 deletion klpt/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class Preprocess:
- `standardize`: given a normalized text, it returns standardized text based on the Kurdish orthographies following recommendations for [Kurmanji](https://books.google.ie/books?id=Z7lDnwEACAAJ) and [Sorani](http://yageyziman.com/Renusi_Kurdi.htm)
- `unify_numerals`: conversion of the various types of numerals used in Kurdish texts
In addition, it is possible to remove stopwords using the `stopwords` variable. It is better to remove stopwords after the tokenization task.
It is recommended that the output of this module be used as the input of subsequent tasks in an NLP pipeline.
Example:
Expand All @@ -46,6 +48,8 @@ class Preprocess:
'di sala 2018an'
>>> preprocessor_kmr.standardize("hêviya")
'hêvîya'
>>> preprocessor_kmr.stopwords[:10]
['a', 'an', 'bareya', 'bareyê', 'barên', 'basa', 'be', 'belê', 'ber', 'bereya']
```
The preprocessing rules are provided at [`data/preprocess_map.json`](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/preprocess_map.json).
Expand All @@ -70,6 +74,9 @@ def __init__(self, dialect, script, numeral="Latin"):
self.numeral = configuration.numeral
# self.preprocess_map = config.preprocess_map

with open(klpt.data_directory["stopwords"], "r") as f:
self.stopwords = json.load(f)[dialect][script]

def standardize(self, text):
"""
Method of standardization of Kurdish orthographies
Expand Down Expand Up @@ -137,7 +144,6 @@ def normalize(self, text):

return temp_text.strip()


def unify_numerals(self, text):
"""
Convert numerals to the desired one
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="klpt",
version="0.1.3",
version="0.1.4",
description="Kurdish Language Processing Toolkit",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
9 changes: 9 additions & 0 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import klpt
import json


class TestPreprocess(unittest.TestCase):
""" Test unit for the Preprocess class"""
def setUp(self):
Expand Down Expand Up @@ -44,5 +45,13 @@ def test_unify_numerals(self):
prep = Preprocess("Sorani", "Latin", numeral)
self.assertEqual(prep.unify_numerals(case), self.test_cases["numerals"][numeral][case])

def test_stopwords(self):
    """Stopword removal: for each dialect/script, filtering a sample sentence
    through Preprocess.stopwords must yield the expected token list from
    test_cases.json."""
    for dialect in self.options["dialects"]:
        for script in self.options["scripts"]:
            # Construct once per dialect/script pair — the original built a
            # new Preprocess (re-reading stopwords.json) for every case.
            prep = Preprocess(dialect, script)
            for case, expected in self.test_cases["stopwords"][dialect][script].items():
                with self.subTest(dialect=dialect, script=script, case=case):
                    filtered = [token for token in case.split() if token not in prep.stopwords]
                    self.assertEqual(filtered, expected)

# Run the full test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 5e66dfa

Please sign in to comment.