Skip to content

Commit

Permalink
version 0.1.4 with stopwords
Browse files Browse the repository at this point in the history
  • Loading branch information
sinaahmadi committed Dec 1, 2021
1 parent 9c517f8 commit 5e66dfa
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 17 deletions.
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Please note that KLPT is under development and some of the functionalities will
</thead>
<tbody>
<tr>
<td rowspan="3"><code>preprocess</code></td>
<td rowspan="4"><code>preprocess</code></td>
<td>normalization</td>
<td>&#10003; (v0.1.0)</td>
<td>&#10003; (v0.1.0)</td>
Expand All @@ -86,6 +86,11 @@ Please note that KLPT is under development and some of the functionalities will
<td>&#10003; (v0.1.0)</td>
<td>&#10003; (v0.1.0)</td>
</tr>
<tr>
<td>stopwords 🆕</td>
<td>&#10003; (v0.1.4)</td>
<td>&#10003; (v0.1.4)</td>
</tr>
<tr>
<td rowspan="3"><code>tokenize</code></td>
<td>word tokenization<br></td>
Expand Down Expand Up @@ -198,6 +203,16 @@ It is recommended that the output of this module be used as the input of subsequ
'hêvîya'
```

In addition, it is possible to remove Kurdish [stopwords](https://en.wikipedia.org/wiki/Stop_word) using the `stopwords` attribute of the `Preprocess` class (it is best applied after tokenization). For example, you can define a function like the following:

```python
from klpt.preprocess import Preprocess

def remove_stopwords(text, dialect, script):
    """Return the whitespace tokens of *text* that are not stopwords.

    A `Preprocess` instance for the given dialect/script supplies the
    stopword list; tokens found in it are dropped.
    """
    preprocessor = Preprocess(dialect, script)
    kept = []
    for token in text.split():
        if token not in preprocessor.stopwords:
            kept.append(token)
    return kept
```

### Tokenization

This module focuses on the tokenization of both Kurmanji and Sorani dialects of Kurdish with the following functions:
Expand Down
Binary file added dist/klpt-0.1.4-py3-none-any.whl
Binary file not shown.
Binary file added dist/klpt-0.1.4.tar.gz
Binary file not shown.
14 changes: 3 additions & 11 deletions klpt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,6 @@
__author__ = __maintainer__
__author_email__ = __maintainer_email__

# with open("data/stopwords.json") as f:
# stopwords = json.load(f)[self.dialect][self.script]
# if __name__ == "__main__":

# def remove_stopwords(self, text):
# """remove stopwords"""
# return " ".join([token for token in text.split() if token not in self.stopwords])

_ROOT = os.path.abspath(os.path.dirname(__file__))
def get_data(path):
return os.path.join(_ROOT, '', path)
Expand All @@ -41,6 +33,6 @@ def get_data(path):
"analyser": {
"Sorani": get_data("data/ckb-analyser.att"),
"Kurmanji": get_data("data/kmr-analyser.att")
}
}

},
"stopwords": get_data("data/stopwords.json")
}
2 changes: 1 addition & 1 deletion klpt/data/preprocess_map.json
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@
"Arabic": {},
"Latin": {
"iy": "îy",
"istan ": "îstan ",
"îstan ": "istan ",
"(\\d)'(an)": "\\1\\2",
"(\\d)-(an)": "\\1\\2",
"(\\d)`(an)": "\\1\\2",
Expand Down
18 changes: 16 additions & 2 deletions klpt/data/test_cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"Arabic": {},
"Latin": {
"Kurdiya me": "Kurdîya me",
"Ermenistan": "Ermenîstan",
"Ermenîstan": "Ermenistan",
"2018-an": "2018an",
"di sala 2018-an": "di sala 2018an",
"2018-An": "2018An",
Expand Down Expand Up @@ -88,5 +88,19 @@
"<U>": {"έτσι": "<U><U><U><U>"},
"�": {"έτσι": "����"}
}
}
},
"stopwords": {
"Sorani": {
"Arabic": {
"وێبگەڕەکانی ئەمڕۆ جگە لە توانای نمایشکردن و گەڕۆکی لە پەڕەکانی وێبدا": ["وێبگەڕەکانی", "توانای", "نمایشکردن", "گەڕۆکی", "پەڕەکانی", "وێبدا"]
},
"Latin": {}
},
"Kurmanji": {
"Arabic": {},
"Latin": {
"Gelek bername û malper agahiyên xwe li ser danegehê vedişêrin": ["Gelek", "bername", "malper", "agahiyên", "danegehê", "vedişêrin"]
}
}
}
}
8 changes: 7 additions & 1 deletion klpt/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class Preprocess:
- `standardize`: given a normalized text, it returns standardized text based on the Kurdish orthographies following recommendations for [Kurmanji](https://books.google.ie/books?id=Z7lDnwEACAAJ) and [Sorani](http://yageyziman.com/Renusi_Kurdi.htm)
- `unify_numerals`: conversion of the various types of numerals used in Kurdish texts
In addition, it is possible to remove stopwords using the `stopwords` variable. It is better to remove stopwords after the tokenization task.
It is recommended that the output of this module be used as the input of subsequent tasks in an NLP pipeline.
Example:
Expand All @@ -46,6 +48,8 @@ class Preprocess:
'di sala 2018an'
>>> preprocessor_kmr.standardize("hêviya")
'hêvîya'
>>> preprocessor_kmr.stopwords[:10]
['a', 'an', 'bareya', 'bareyê', 'barên', 'basa', 'be', 'belê', 'ber', 'bereya']
```
The preprocessing rules are provided at [`data/preprocess_map.json`](https://github.com/sinaahmadi/klpt/blob/master/klpt/data/preprocess_map.json).
Expand All @@ -70,6 +74,9 @@ def __init__(self, dialect, script, numeral="Latin"):
self.numeral = configuration.numeral
# self.preprocess_map = config.preprocess_map

with open(klpt.data_directory["stopwords"], "r") as f:
self.stopwords = json.load(f)[dialect][script]

def standardize(self, text):
"""
Method of standardization of Kurdish orthographies
Expand Down Expand Up @@ -137,7 +144,6 @@ def normalize(self, text):

return temp_text.strip()


def unify_numerals(self, text):
"""
Convert numerals to the desired one
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name="klpt",
version="0.1.3",
version="0.1.4",
description="Kurdish Language Processing Toolkit",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
9 changes: 9 additions & 0 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import klpt
import json


class TestPreprocess(unittest.TestCase):
""" Test unit for the Preprocess class"""
def setUp(self):
Expand Down Expand Up @@ -44,5 +45,13 @@ def test_unify_numerals(self):
prep = Preprocess("Sorani", "Latin", numeral)
self.assertEqual(prep.unify_numerals(case), self.test_cases["numerals"][numeral][case])

def test_stopwords(self):
    """Stopword removal: for each dialect/script, filtering a sample sentence
    through Preprocess.stopwords must yield the expected token list from
    test_cases.json."""
    for dialect in self.options["dialects"]:
        for script in self.options["scripts"]:
            # Construct once per dialect/script pair — the original built a
            # new Preprocess (re-reading stopwords.json) for every case.
            prep = Preprocess(dialect, script)
            for case, expected in self.test_cases["stopwords"][dialect][script].items():
                with self.subTest(dialect=dialect, script=script, case=case):
                    filtered = [token for token in case.split() if token not in prep.stopwords]
                    self.assertEqual(filtered, expected)

# Run the full test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 5e66dfa

Please sign in to comment.