Multiprocessing tokenization #70

Closed · wants to merge 4 commits
57 changes: 49 additions & 8 deletions botok/tokenizers/tokenize.py
@@ -1,4 +1,9 @@
# coding: utf-8
from concurrent.futures import ProcessPoolExecutor
import copy
import math
import os

from .token import Token
from ..vars import NAMCHE, TSEK
from ..vars import chunk_values as u
@@ -17,11 +22,52 @@ def __init__(self, trie):
self.pre_processed = None
self.trie = trie

def tokenize(self, pre_processed, debug=False):

def parallelized_tokenize(self, pre_processed):
"""Split pre_processed.chunks into punctuation-delimited groups, distribute them over all CPU cores, and tokenize the groups concurrently."""
def __flatten(l):
return sum(l, [])

# group chunks by punctuation
chunks_grouped_by_punct = []
grouped_chunks = []
found_group = False
for i, chunk in enumerate(pre_processed.chunks):
if not chunk[0] and chunk[1][0] == 105:  # syllable-less chunk whose marker (105) denotes punctuation
found_group = True
else:
grouped_chunks.append(chunk)

if found_group:
grouped_chunks.append(chunk)
chunks_grouped_by_punct.append(grouped_chunks)
grouped_chunks = []
found_group = False
else:
if grouped_chunks:
chunks_grouped_by_punct.append(grouped_chunks)

# create one pre_processed copy per CPU, each carrying its share of the groups
grouped_pre_processed = []
n_cpu = os.cpu_count()
groups_per_cpu = math.ceil(len(chunks_grouped_by_punct) / n_cpu)
start, end = 0, groups_per_cpu
for _ in range(n_cpu):
dup_preprocess = copy.deepcopy(pre_processed)
dup_preprocess.chunks = __flatten(chunks_grouped_by_punct[start: end])
grouped_pre_processed.append(dup_preprocess)
start, end = end, end+groups_per_cpu

# do multiprocessing tokenization
with ProcessPoolExecutor() as executor:
tokenized_sents = executor.map(self.tokenize, grouped_pre_processed)
return __flatten(tokenized_sents)


def tokenize(self, pre_processed):
"""

:param pre_processed: PyBoTextChunks of the text to be tokenized
:param debug: prints debug info if True
:return: a list of Token objects
"""
self.pre_processed = pre_processed
@@ -253,9 +299,4 @@ def _has_skrt_char(self, char_groups):
A.SKRT_VOW in char_groups.values()
or A.SKRT_CONS in char_groups.values()
or A.SKRT_SUB_CONS in char_groups.values()
)

@staticmethod
def debug(debug, to_print):
if debug:
print(to_print, flush=True)
)
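For readers skimming the diff above: `parallelized_tokenize` first cuts `pre_processed.chunks` into punctuation-delimited groups, then spreads those groups over a `ProcessPoolExecutor` with one worker per CPU core. Below is an illustrative, standalone sketch of that grouping strategy, not botok's actual code; the toy chunk tuples and the `PUNCT_MARKER` name are invented for the example (the diff uses the literal `105`).

```python
# A minimal sketch of the grouping strategy, assuming chunks shaped like
# (syllable indices or None, (chunk marker, start, length)).
import math
import os

PUNCT_MARKER = 105  # assumed to be the marker the diff checks to detect punctuation chunks


def group_chunks_by_punct(chunks):
    """Split a flat chunk list into groups that each end on a punctuation chunk."""
    groups, current = [], []
    for chunk in chunks:
        current.append(chunk)
        if not chunk[0] and chunk[1][0] == PUNCT_MARKER:
            groups.append(current)
            current = []
    if current:  # trailing chunks with no closing punctuation
        groups.append(current)
    return groups


def split_for_workers(groups, n_workers=None):
    """Hand whole punctuation-delimited groups to workers in contiguous slices."""
    n_workers = n_workers or os.cpu_count()
    per_worker = max(1, math.ceil(len(groups) / n_workers))
    return [
        [chunk for group in groups[i : i + per_worker] for chunk in group]
        for i in range(0, len(groups), per_worker)
    ]


chunks = [
    ([0, 1], (100, 0, 8)),  # text chunk
    (None, (105, 8, 2)),    # punctuation chunk: closes the first group
    ([2], (100, 10, 4)),    # text chunk with no punctuation after it
]
print(split_for_workers(group_chunks_by_punct(chunks), n_workers=2))
```

One behavioural detail of the diff itself: it always makes exactly `os.cpu_count()` deep copies of `pre_processed`, so when there are fewer punctuation groups than cores, some copies end up with an empty chunk list.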
27 changes: 16 additions & 11 deletions botok/tokenizers/wordtokenizer.py
@@ -1,16 +1,16 @@
# coding: utf8
from pathlib import Path
import csv
from pathlib import Path

from .tokenize import Tokenize
from ..modifytokens.splitaffixed import split_affixed
from ..modifytokens.mergedagdra import MergeDagdra
from ..modifytokens.adjusttokens import AdjustTokens
from ..tries.trie import Trie
from ..chunks.chunks import TokChunks
from ..textunits.bosyl import BoSyl
from ..config import Config
from ..vars import TSEK, AA
from ..modifytokens.adjusttokens import AdjustTokens
from ..modifytokens.mergedagdra import MergeDagdra
from ..modifytokens.splitaffixed import split_affixed
from ..textunits.bosyl import BoSyl
from ..tries.trie import Trie
from ..vars import AA, TSEK
from .tokenize import Tokenize

part_lemmas = {}
filename = Path(__file__).parent.parent / "resources" / "particles.tsv"
@@ -66,18 +66,23 @@ def __init__(
)
self.adj = AdjustTokens(main=adj_main, custom=adj_custom)

def tokenize(self, string, split_affixes=True, spaces_as_punct=False, debug=False):
def tokenize(
self, string, split_affixes=True, spaces_as_punct=False, parallelize=False
):
"""
:param string: to be tokenized
:param split_affixes: separates the affixed particles into separate tokens if True
:param debug: print debug info while parsing
:param parallelize: tokenize with multiple processes if True
:return: list of pybo.tokenizers.Token objects
"""
preprocessed = TokChunks(
string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct
)
preprocessed.serve_syls_to_trie()
tokens = self.tok.tokenize(preprocessed, debug=debug)
if parallelize:
tokens = self.tok.parallelized_tokenize(preprocessed)
else:
tokens = self.tok.tokenize(preprocessed)

if split_affixes:
split_affixed(tokens)
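Downstream, the new `parallelize` flag is the only API change callers see. A minimal usage sketch, assuming a default `WordTokenizer()` as in the benchmark file below; the sample string is arbitrary:

```python
from botok import WordTokenizer

wt = WordTokenizer()

# Default single-process path.
tokens = wt.tokenize("བཀྲ་ཤིས་བདེ་ལེགས།")

# Opt-in path added by this PR: pre-processed chunks are grouped at
# punctuation and tokenized in a ProcessPoolExecutor.
parallel_tokens = wt.tokenize("བཀྲ་ཤིས་བདེ་ལེགས།", parallelize=True)

# The PR's tests expect both paths to yield the same tokens.
print([t.text for t in tokens] == [t.text for t in parallel_tokens])
```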
68 changes: 68 additions & 0 deletions tests/benchmark.py
@@ -0,0 +1,68 @@
import urllib.request

import pytest

from botok import WordTokenizer


@pytest.fixture(scope="module")
def testcases():
file_url = "https://raw.githubusercontent.com/OpenPecha/P000001/master/P000001.opf/base/v001.txt"
response = urllib.request.urlopen(file_url)
large_size = response.read().decode("utf-8")
medium_size = large_size[: len(large_size) // 3]
small_size = medium_size[: len(medium_size) // 5]
return large_size, medium_size, small_size


@pytest.fixture(scope="module")
def wt():
return WordTokenizer()


def test_non_parallized_tok_small(benchmark, wt, testcases):
_, _, small = testcases

@benchmark
def tok():
wt.tokenize(small)


def test_parallized_tok_small(benchmark, wt, testcases):
_, _, small = testcases

@benchmark
def tok():
wt.tokenize(small, parallelize=True)


def test_non_parallized_tok_medium(benchmark, wt, testcases):
_, medium, _ = testcases

@benchmark
def tok():
wt.tokenize(medium)


def test_parallized_tok_medium(benchmark, wt, testcases):
_, medium, _ = testcases

@benchmark
def tok():
wt.tokenize(medium, parallelize=True)


def test_non_parallized_tok_large(benchmark, wt, testcases):
large, _, _ = testcases

@benchmark
def tok():
wt.tokenize(large)


def test_parallized_tok_large(benchmark, wt, testcases):
large, _, _ = testcases

@benchmark
def tok():
wt.tokenize(large, parallelize=True)
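The `benchmark` fixture used above comes from the pytest-benchmark plugin, and the module-scoped `testcases` fixture downloads its corpus from OpenPecha once for the module. A minimal sketch for running the comparison locally, assuming pytest and pytest-benchmark are installed:

```python
# Equivalent to running `pytest tests/benchmark.py` from the repository root.
import pytest

pytest.main(["tests/benchmark.py"])
```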
10 changes: 3 additions & 7 deletions tests/test_bugs.py
@@ -106,13 +106,13 @@ def test_multiple_spaces():
def test_bug1():
tok = WordTokenizer("POS")
string = "བ་ཀུ་"
tokens = tok.tokenize(string, debug=True)
tokens = tok.tokenize(string)
assert tokens


def test_bug2():
string = "བྲ་གྲྀ་"
tokens = pos_tok.tokenize(string, debug=True)
tokens = pos_tok.tokenize(string)
assert tokens


@@ -167,8 +167,4 @@ def test_shad_in_syllable():
("TEXT", "ལེ གས"),
("PUNCT", "། "),
("TEXT", "བཀྲ་"),
]


if __name__ == "__main__":
test_syl_tokenize()
]
101 changes: 97 additions & 4 deletions tests/test_tokenize.py
@@ -1,6 +1,8 @@
# coding: utf8
from botok import *
from textwrap import dedent

from botok import *

from helpers import pos_tok


@@ -29,7 +31,7 @@ def test_tokenize():
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7

"""
)
str(tokens[0])
@@ -56,7 +58,64 @@ def test_tokenize():
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7


"""
)
assert str(tokens[2]) == expected


def test_parallized_token():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
tok = Tokenize(Trie(BoSyl, profile, main, custom))
tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500")
tok.trie.inflect_n_modify_trie("མཐའ་")
tok.trie.inflect_n_add_data("མཐའ་\tNOUN")
in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་"
preproc = TokChunks(in_str)
preproc.serve_syls_to_trie()
tokens = tok.parallelized_tokenize(preproc)
expected = dedent(
"""\
text: "བཀྲ་ཤིས"
text_cleaned: "བཀྲ་ཤིས་"
text_unaffixed: "བཀྲ་ཤིས་"
syls: ["བཀྲ", "ཤིས"]
senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False |
char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS|
chunk_type: TEXT
syls_idx: [[0, 1, 2], [4, 5, 6]]
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7

"""
)
str(tokens[0])
assert str(tokens[1]) == expected
assert tokens[2].text == "། "
assert tokens[2].chunk_type == "PUNCT"
# add sense to བཀྲ་ཤིས་
tokens = pos_tok.tokenize(in_str, parallelize=True)
expected = dedent(
"""\
text: "བཀྲ་ཤིས"
text_cleaned: "བཀྲ་ཤིས་"
text_unaffixed: "བཀྲ་ཤིས་"
syls: ["བཀྲ", "ཤིས"]
pos: NOUN
lemma: བཀྲ་ཤིས་
sense: བཀྲ་ཤིས་
senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ |
char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS|
chunk_type: TEXT
freq: 17500
syls_idx: [[0, 1, 2], [4, 5, 6]]
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7

"""
)
assert str(tokens[2]) == expected
@@ -82,6 +141,26 @@ def test_non_max2():
assert tokens[2]["senses"][0]["pos"] == "NO_POS"


def test_parallized_non_max2():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
tok = Tokenize(Trie(BoSyl, profile, main, custom))
tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN")
tok.trie.inflect_n_modify_trie(
"བཀྲ་ཤིས་བདེ་ལེགས།"
) # to ensure we're not in a maximal match
preproc = TokChunks("བཀྲ་ཤིས་བདེ་བཀྲ་")
preproc.serve_syls_to_trie()
tokens = tok.parallelized_tokenize(preproc)
assert tokens[0].text == "བཀྲ་ཤིས་"
assert tokens[0]["senses"][0]["pos"] == "NOUN"
assert tokens[1].text == "བདེ་"
assert tokens[1]["senses"][0]["pos"] == "NON_WORD"
assert tokens[2].text == "བཀྲ་"
assert tokens[2]["senses"][0]["pos"] == "NO_POS"


def test_non_max_end_of_string():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
@@ -96,5 +175,19 @@ def test_non_max_end_of_string():
assert tokens[0].text == "བཀྲ་ཤིས་"
assert tokens[1].text == "བདེ་"

def test_parallized_non_max_end_of_string():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
tok = Tokenize(Trie(BoSyl, profile, main, custom))
tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
tok.trie.inflect_n_modify_trie(
"བཀྲ་ཤིས་བདེ་ལེགས།"
) # to ensure we're not in a maximal match
preproc = TokChunks("བཀྲ་ཤིས་བདེ་")
preproc.serve_syls_to_trie()
tokens = tok.parallelized_tokenize(preproc)
assert tokens[0].text == "བཀྲ་ཤིས་"
assert tokens[1].text == "བདེ་"

if __name__ == "__main__":
test_non_max2()
test_parallized_token()