From a3ee925bdb0fe00bd3e49669dc22c8e210add080 Mon Sep 17 00:00:00 2001 From: tenzin Date: Sat, 11 Jan 2020 20:56:45 +0530 Subject: [PATCH 1/4] add mutlitprocess tokenization with tests --- botok/tokenizers/tokenize.py | 42 +++++++++++--- botok/tokenizers/wordtokenizer.py | 9 ++- tests/test_tokenize.py | 93 ++++++++++++++++++++++++++++++- 3 files changed, 131 insertions(+), 13 deletions(-) diff --git a/botok/tokenizers/tokenize.py b/botok/tokenizers/tokenize.py index 3849138..cf930a3 100644 --- a/botok/tokenizers/tokenize.py +++ b/botok/tokenizers/tokenize.py @@ -1,4 +1,7 @@ # coding: utf-8 +from concurrent.futures import ProcessPoolExecutor +import copy + from .token import Token from ..vars import NAMCHE, TSEK from ..vars import chunk_values as u @@ -17,11 +20,39 @@ def __init__(self, trie): self.pre_processed = None self.trie = trie - def tokenize(self, pre_processed, debug=False): + + def parallelized_tokenize(self, pre_processed): + grouped_pre_processed = [] + grouped_chunks = [] + found_group = False + for chunk in pre_processed.chunks: + if not chunk[0] and chunk[1][0] == 105: + found_group = True + else: + grouped_chunks.append(chunk) + + if found_group: + grouped_chunks.append(chunk) + dup_pre_processed = copy.deepcopy(pre_processed) + dup_pre_processed.chunks = grouped_chunks + grouped_pre_processed.append(dup_pre_processed) + grouped_chunks = [] + found_group = False + else: + if grouped_chunks: + dup_pre_processed = copy.deepcopy(pre_processed) + dup_pre_processed.chunks = grouped_chunks + grouped_pre_processed.append(dup_pre_processed) + + with ProcessPoolExecutor() as executor: + tokenized_sents = executor.map(self.tokenize, grouped_pre_processed) + return sum(tokenized_sents, []) + + + def tokenize(self, pre_processed): """ :param pre_processed: PyBoTextChunks of the text to be tokenized - :param debug: prints debug info in True :return: a list of Token objects """ self.pre_processed = pre_processed @@ -253,9 +284,4 @@ def _has_skrt_char(self, char_groups): A.SKRT_VOW in char_groups.values() or A.SKRT_CONS in char_groups.values() or A.SKRT_SUB_CONS in char_groups.values() - ) - - @staticmethod - def debug(debug, to_print): - if debug: - print(to_print, flush=True) + ) \ No newline at end of file diff --git a/botok/tokenizers/wordtokenizer.py b/botok/tokenizers/wordtokenizer.py index f673122..5f04fd0 100644 --- a/botok/tokenizers/wordtokenizer.py +++ b/botok/tokenizers/wordtokenizer.py @@ -66,18 +66,21 @@ def __init__( ) self.adj = AdjustTokens(main=adj_main, custom=adj_custom) - def tokenize(self, string, split_affixes=True, spaces_as_punct=False, debug=False): + def tokenize(self, string, split_affixes=True, spaces_as_punct=False, parallelize=False): """ :param string: to be tokenized :param split_affixes: separates the affixed particles into seperate tokens if True - :param debug: print debug info while parsing + :param parallelize: do multiprocessed tokenization :return: list of pybo.tokenizers.Token objects """ preprocessed = TokChunks( string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct ) preprocessed.serve_syls_to_trie() - tokens = self.tok.tokenize(preprocessed, debug=debug) + if parallelize: + tokens = self.tok.parallelized_tokenize(preprocessed) + else: + tokens = self.tok.tokenize(preprocessed) if split_affixes: split_affixed(tokens) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 65328a5..a24dbdd 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -62,6 +62,64 @@ def test_tokenize(): assert str(tokens[2]) == 
expected +def test_parallized_token(): + profile = "empty" + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") + tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") + tok.trie.inflect_n_modify_trie("མཐའ་") + tok.trie.inflect_n_add_data("མཐའ་\tNOUN") + in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་" + preproc = TokChunks(in_str) + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + expected = dedent( + """\ + text: "བཀྲ་ཤིས" + text_cleaned: "བཀྲ་ཤིས་" + text_unaffixed: "བཀྲ་ཤིས་" + syls: ["བཀྲ", "ཤིས"] + senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False | + char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| + chunk_type: TEXT + syls_idx: [[0, 1, 2], [4, 5, 6]] + syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] + start: 5 + len: 7 + + """ + ) + str(tokens[0]) + assert str(tokens[1]) == expected + assert tokens[2].text == "། " + assert tokens[2].chunk_type == "PUNCT" + # add sense to བཀྲ་ཤིས་ + pos_tok.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") + tokens = pos_tok.tokenize(in_str, parallelize=True) + expected = dedent( + """\ + text: "བཀྲ་ཤིས" + text_cleaned: "བཀྲ་ཤིས་" + text_unaffixed: "བཀྲ་ཤིས་" + syls: ["བཀྲ", "ཤིས"] + pos: NOUN + lemma: བཀྲ་ཤིས་ + sense: བཀྲ་ཤིས་ + senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ | + char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| + chunk_type: TEXT + freq: 17500 + syls_idx: [[0, 1, 2], [4, 5, 6]] + syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] + start: 5 + len: 7 + + """ + ) + assert str(tokens[2]) == expected + + def test_non_max2(): profile = "empty" main, custom = Config().get_tok_data_paths(profile) @@ -82,6 +140,26 @@ def test_non_max2(): assert tokens[2]["senses"][0]["pos"] == "NO_POS" +def test_parallized_non_max2(): + profile = "empty" + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") + tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN") + tok.trie.inflect_n_modify_trie( + "བཀྲ་ཤིས་བདེ་ལེགས།" + ) # to ensure we're not in a maximal match + preproc = TokChunks("བཀྲ་ཤིས་བདེ་བཀྲ་") + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + assert tokens[0].text == "བཀྲ་ཤིས་" + assert tokens[0]["senses"][0]["pos"] == "NOUN" + assert tokens[1].text == "བདེ་" + assert tokens[1]["senses"][0]["pos"] == "NON_WORD" + assert tokens[2].text == "བཀྲ་" + assert tokens[2]["senses"][0]["pos"] == "NO_POS" + + def test_non_max_end_of_string(): profile = "empty" main, custom = Config().get_tok_data_paths(profile) @@ -96,5 +174,16 @@ def test_non_max_end_of_string(): assert tokens[0].text == "བཀྲ་ཤིས་" assert tokens[1].text == "བདེ་" -if __name__ == "__main__": - test_non_max2() \ No newline at end of file +def test_parallized_non_max_end_of_string(): + profile = "empty" + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") + tok.trie.inflect_n_modify_trie( + "བཀྲ་ཤིས་བདེ་ལེགས།" + ) # to ensure we're not in a maximal match + preproc = TokChunks("བཀྲ་ཤིས་བདེ་") + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + assert tokens[0].text == "བཀྲ་ཤིས་" + assert tokens[1].text == "བདེ་" \ No newline at end of file From 
329ac090d60c731a2874a1aaf609e42cb213d95c Mon Sep 17 00:00:00 2001
From: tenzin
Date: Sun, 12 Jan 2020 23:27:58 +0530
Subject: [PATCH 2/4] add multiprocessing tokenizer

---
 botok/tokenizers/tokenize.py      |  35 ++++++---
 botok/tokenizers/wordtokenizer.py |   1 +
 tests/test_bugs.py                |  10 +--
 tests/test_tokenize.py            |   6 +-
 tests/test_wordtokenizer.py       | 125 ++++++++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 19 deletions(-)

diff --git a/botok/tokenizers/tokenize.py b/botok/tokenizers/tokenize.py
index cf930a3..a1ad35d 100644
--- a/botok/tokenizers/tokenize.py
+++ b/botok/tokenizers/tokenize.py
@@ -1,6 +1,8 @@
 # coding: utf-8
 from concurrent.futures import ProcessPoolExecutor
 import copy
+import math
+import os
 
 from .token import Token
 from ..vars import NAMCHE, TSEK
@@ -22,31 +24,44 @@ def __init__(self, trie):
 
 
     def parallelized_tokenize(self, pre_processed):
-        grouped_pre_processed = []
+
+        def __flatten(l):
+            return sum(l, [])
+
+        # group chunks by punctuation
+        chunks_grouped_by_punct = []
         grouped_chunks = []
         found_group = False
-        for chunk in pre_processed.chunks:
+        for i, chunk in enumerate(pre_processed.chunks):
             if not chunk[0] and chunk[1][0] == 105:
                 found_group = True
             else:
                 grouped_chunks.append(chunk)
-
+
             if found_group:
                 grouped_chunks.append(chunk)
-                dup_pre_processed = copy.deepcopy(pre_processed)
-                dup_pre_processed.chunks = grouped_chunks
-                grouped_pre_processed.append(dup_pre_processed)
+                chunks_grouped_by_punct.append(grouped_chunks)
                 grouped_chunks = []
                 found_group = False
         else:
            if grouped_chunks:
-                dup_pre_processed = copy.deepcopy(pre_processed)
-                dup_pre_processed.chunks = grouped_chunks
-                grouped_pre_processed.append(dup_pre_processed)
+                chunks_grouped_by_punct.append(grouped_chunks)
+
+        # create pre_processed objects based on the no. 
of cpu + grouped_pre_processed = [] + n_cpu = os.cpu_count() + groups_per_cpu = math.ceil(len(chunks_grouped_by_punct) / n_cpu) + start, end = 0, groups_per_cpu + for _ in range(n_cpu): + dup_preprocess = copy.deepcopy(pre_processed) + dup_preprocess.chunks = __flatten(chunks_grouped_by_punct[start: end]) + grouped_pre_processed.append(dup_preprocess) + start, end = end, end+groups_per_cpu + # do mutliprocessing tokenization with ProcessPoolExecutor() as executor: tokenized_sents = executor.map(self.tokenize, grouped_pre_processed) - return sum(tokenized_sents, []) + return __flatten(tokenized_sents) def tokenize(self, pre_processed): diff --git a/botok/tokenizers/wordtokenizer.py b/botok/tokenizers/wordtokenizer.py index 5f04fd0..361a3c0 100644 --- a/botok/tokenizers/wordtokenizer.py +++ b/botok/tokenizers/wordtokenizer.py @@ -66,6 +66,7 @@ def __init__( ) self.adj = AdjustTokens(main=adj_main, custom=adj_custom) + @profile def tokenize(self, string, split_affixes=True, spaces_as_punct=False, parallelize=False): """ :param string: to be tokenized diff --git a/tests/test_bugs.py b/tests/test_bugs.py index 0279ed2..f814894 100644 --- a/tests/test_bugs.py +++ b/tests/test_bugs.py @@ -106,13 +106,13 @@ def test_multiple_spaces(): def test_bug1(): tok = WordTokenizer("POS") string = "བ་ཀུ་" - tokens = tok.tokenize(string, debug=True) + tokens = tok.tokenize(string) assert tokens def test_bug2(): string = "བྲ་གྲྀ་" - tokens = pos_tok.tokenize(string, debug=True) + tokens = pos_tok.tokenize(string) assert tokens @@ -167,8 +167,4 @@ def test_shad_in_syllable(): ("TEXT", "ལེ གས"), ("PUNCT", "། "), ("TEXT", "བཀྲ་"), - ] - - -if __name__ == "__main__": - test_syl_tokenize() \ No newline at end of file + ] \ No newline at end of file diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index a24dbdd..96bb412 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -95,7 +95,6 @@ def test_parallized_token(): assert tokens[2].text == "། " assert tokens[2].chunk_type == "PUNCT" # add sense to བཀྲ་ཤིས་ - pos_tok.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") tokens = pos_tok.tokenize(in_str, parallelize=True) expected = dedent( """\ @@ -186,4 +185,7 @@ def test_parallized_non_max_end_of_string(): preproc.serve_syls_to_trie() tokens = tok.parallelized_tokenize(preproc) assert tokens[0].text == "བཀྲ་ཤིས་" - assert tokens[1].text == "བདེ་" \ No newline at end of file + assert tokens[1].text == "བདེ་" + +if __name__ == "__main__": + test_parallized_token() \ No newline at end of file diff --git a/tests/test_wordtokenizer.py b/tests/test_wordtokenizer.py index 59a4f6a..7c99abe 100644 --- a/tests/test_wordtokenizer.py +++ b/tests/test_wordtokenizer.py @@ -121,6 +121,121 @@ def test_get_default_lemma(): assert tokens[2].text_unaffixed == "" == tokens[2].text_cleaned +def test_parallelized_get_default_lemma(): + input_str = "བཀྲ་ཤིས་བདེ་ལེགས། མཐའི་རྒྱ་མཚོར་གནས་སོ།། །།ཀཀ" + profile = "POS" + + # reconstitute all the pieces that WordTokenizer gathers + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + preproc = TokChunks(input_str) + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + split_affixed(tokens) + + # if __get_default_lemma() is not run, only the lemmas coming from the lemma folder will be included + # in the Token objects. 
+ assert str(tokens[3]) == dedent( + """\ + text: "མཐ" + text_cleaned: "མཐ" + text_unaffixed: "མཐ" + syls: ["མཐ"] + senses: | pos: NOUN, freq: 45097, affixed: True | + char_types: |CONS|CONS| + chunk_type: TEXT + affix_host: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 0, 'end': 2}] + start: 18 + len: 2 + + """ + ) + assert "lemma" not in tokens[3]["senses"][0] + + assert str(tokens[4]) == dedent( + """\ + text: "འི་" + text_cleaned: "འི་" + text_unaffixed: "འི་" + syls: ["འི"] + pos: PART + char_types: |CONS|VOW|TSEK| + chunk_type: TEXT + affix: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 2, 'end': 5}] + start: 20 + len: 3 + + """ + ) + + # regular words also have no lemmas + assert "lemma" not in tokens[0]["senses"][0] + + # doing the same thing using WordTokenizer, which will apply its __get_default_lemma() method + # the profile is the same, so no lemma comes from the trie content files. + tokens = pos_tok.tokenize(input_str, parallelize=True) + + # the lemma is Token.text_unaffixed with an extra འ and/or a tsek where required + assert str(tokens[3]) == dedent( + """\ + text: "མཐ" + text_cleaned: "མཐ" + text_unaffixed: "མཐ" + syls: ["མཐ"] + pos: NOUN + lemma: མཐའ་ + senses: | pos: NOUN, freq: 45097, affixed: True, lemma: མཐའ་ | + char_types: |CONS|CONS| + chunk_type: TEXT + freq: 45097 + affix_host: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 0, 'end': 2}] + start: 18 + len: 2 + + """ + ) + assert tokens[3]["senses"][0]["lemma"] == "མཐའ་" + + # for particles, WordTokenizer reads the lemmas from a file and attributes them + assert str(tokens[4]) == dedent( + """\ + text: "འི་" + text_cleaned: "འི་" + text_unaffixed: "འི་" + syls: ["འི"] + pos: PART + lemma: གི་ + senses: | lemma: གི་ | + char_types: |CONS|VOW|TSEK| + chunk_type: TEXT + affix: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 2, 'end': 5}] + start: 20 + len: 3 + + """ + ) + + # for regular words, Token.text_unaffixed is simply copied + assert tokens[0]["senses"][0]["lemma"] == "བཀྲ་ཤིས་" + + # non-words do not have lemmas + assert "lemma" not in tokens[10]["senses"][0] + assert tokens[10].text_cleaned == "ཀཀ་" + assert tokens[10].text_unaffixed == "ཀཀ་" + + # Token objects whose chunk_type is not 'TEXT' will be attributed no lemma. + # text_unaffixed and text_cleaned are also empty. 
Token.text must be retrieved + assert tokens[2].text_unaffixed == "" == tokens[2].text_cleaned + + def test_spaces_as_punct(): input_str = "བ ཀྲ་ཤིས་ བདེ་ལེགས། \nམཐའི་རྒྱ་མཚོར་ག ནས་སོ།། །།ཀཀ" profile = "POS" @@ -130,3 +245,13 @@ def test_spaces_as_punct(): assert tokens[1].text == " " assert tokens[2].text == "ཀྲ་" assert tokens[8].text == " \n" + +def test_parallelized_spaces_as_punct(): + input_str = "བ ཀྲ་ཤིས་ བདེ་ལེགས། \nམཐའི་རྒྱ་མཚོར་ག ནས་སོ།། །།ཀཀ" + profile = "POS" + wt = WordTokenizer(tok_profile=profile) + tokens = wt.tokenize(input_str, spaces_as_punct=True, parallelize=True) + assert tokens[0].text == "བ" + assert tokens[1].text == " " + assert tokens[2].text == "ཀྲ་" + assert tokens[8].text == " \n" \ No newline at end of file From 092999adcf6207969e3e0dad5f737da62d687794 Mon Sep 17 00:00:00 2001 From: tenzin Date: Fri, 10 Jul 2020 19:08:06 +0530 Subject: [PATCH 3/4] add: parallel_tokenize test --- botok/tokenizers/wordtokenizer.py | 21 +++++++++++---------- tests/test_tokenize.py | 14 ++++++++------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/botok/tokenizers/wordtokenizer.py b/botok/tokenizers/wordtokenizer.py index 361a3c0..fd59950 100644 --- a/botok/tokenizers/wordtokenizer.py +++ b/botok/tokenizers/wordtokenizer.py @@ -1,16 +1,16 @@ # coding: utf8 -from pathlib import Path import csv +from pathlib import Path -from .tokenize import Tokenize -from ..modifytokens.splitaffixed import split_affixed -from ..modifytokens.mergedagdra import MergeDagdra -from ..modifytokens.adjusttokens import AdjustTokens -from ..tries.trie import Trie from ..chunks.chunks import TokChunks -from ..textunits.bosyl import BoSyl from ..config import Config -from ..vars import TSEK, AA +from ..modifytokens.adjusttokens import AdjustTokens +from ..modifytokens.mergedagdra import MergeDagdra +from ..modifytokens.splitaffixed import split_affixed +from ..textunits.bosyl import BoSyl +from ..tries.trie import Trie +from ..vars import AA, TSEK +from .tokenize import Tokenize part_lemmas = {} filename = Path(__file__).parent.parent / "resources" / "particles.tsv" @@ -66,8 +66,9 @@ def __init__( ) self.adj = AdjustTokens(main=adj_main, custom=adj_custom) - @profile - def tokenize(self, string, split_affixes=True, spaces_as_punct=False, parallelize=False): + def tokenize( + self, string, split_affixes=True, spaces_as_punct=False, parallelize=False + ): """ :param string: to be tokenized :param split_affixes: separates the affixed particles into seperate tokens if True diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 96bb412..8c4482c 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -1,6 +1,8 @@ # coding: utf8 -from botok import * from textwrap import dedent + +from botok import * + from helpers import pos_tok @@ -29,7 +31,7 @@ def test_tokenize(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) str(tokens[0]) @@ -56,7 +58,7 @@ def test_tokenize(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) assert str(tokens[2]) == expected @@ -87,7 +89,7 @@ def test_parallized_token(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) str(tokens[0]) @@ -113,7 +115,7 @@ def test_parallized_token(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) assert str(tokens[2]) == expected @@ -188,4 +190,4 @@ def test_parallized_non_max_end_of_string(): assert tokens[1].text == "བདེ་" if __name__ == 
"__main__": - test_parallized_token() \ No newline at end of file + test_parallized_token() From 6e9f35a36b92636a9f17b61a9ac969a7f2b17aab Mon Sep 17 00:00:00 2001 From: tenzin Date: Mon, 27 Jul 2020 11:38:13 +0530 Subject: [PATCH 4/4] test: add tokenize benchmarking --- tests/benchmark.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 tests/benchmark.py diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..92ccdfe --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,68 @@ +import urllib.request + +import pytest + +from botok import WordTokenizer + + +@pytest.fixture(scope="module") +def testcases(): + file_url = "https://raw.githubusercontent.com/OpenPecha/P000001/master/P000001.opf/base/v001.txt" + response = urllib.request.urlopen(file_url) + large_size = response.read().decode("utf-8") + medium_size = large_size[: len(large_size) // 3] + small_size = medium_size[: len(medium_size) // 5] + return large_size, medium_size, small_size + + +@pytest.fixture(scope="module") +def wt(): + return WordTokenizer() + + +def test_non_parallized_tok_small(benchmark, wt, testcases): + _, _, small = testcases + + @benchmark + def tok(): + wt.tokenize(small) + + +def test_parallized_tok_small(benchmark, wt, testcases): + _, _, small = testcases + + @benchmark + def tok(): + wt.tokenize(small, parallelize=True) + + +def test_non_parallized_tok_medium(benchmark, wt, testcases): + _, medium, _ = testcases + + @benchmark + def tok(): + wt.tokenize(medium) + + +def test_parallized_tok_medium(benchmark, wt, testcases): + _, medium, _ = testcases + + @benchmark + def tok(): + wt.tokenize(medium, parallelize=True) + + +def test_non_parallized_tok_large(benchmark, wt, testcases): + large, _, _ = testcases + + @benchmark + def tok(): + wt.tokenize(large) + + +def test_parallized_tok_large(benchmark, wt, testcases): + large, _, _ = testcases + + @benchmark + def tok(): + wt.tokenize(large, parallelize=True)