From a3ee925bdb0fe00bd3e49669dc22c8e210add080 Mon Sep 17 00:00:00 2001 From: tenzin Date: Sat, 11 Jan 2020 20:56:45 +0530 Subject: [PATCH 1/4] add mutlitprocess tokenization with tests --- botok/tokenizers/tokenize.py | 42 +++++++++++--- botok/tokenizers/wordtokenizer.py | 9 ++- tests/test_tokenize.py | 93 ++++++++++++++++++++++++++++++- 3 files changed, 131 insertions(+), 13 deletions(-) diff --git a/botok/tokenizers/tokenize.py b/botok/tokenizers/tokenize.py index 3849138..cf930a3 100644 --- a/botok/tokenizers/tokenize.py +++ b/botok/tokenizers/tokenize.py @@ -1,4 +1,7 @@ # coding: utf-8 +from concurrent.futures import ProcessPoolExecutor +import copy + from .token import Token from ..vars import NAMCHE, TSEK from ..vars import chunk_values as u @@ -17,11 +20,39 @@ def __init__(self, trie): self.pre_processed = None self.trie = trie - def tokenize(self, pre_processed, debug=False): + + def parallelized_tokenize(self, pre_processed): + grouped_pre_processed = [] + grouped_chunks = [] + found_group = False + for chunk in pre_processed.chunks: + if not chunk[0] and chunk[1][0] == 105: + found_group = True + else: + grouped_chunks.append(chunk) + + if found_group: + grouped_chunks.append(chunk) + dup_pre_processed = copy.deepcopy(pre_processed) + dup_pre_processed.chunks = grouped_chunks + grouped_pre_processed.append(dup_pre_processed) + grouped_chunks = [] + found_group = False + else: + if grouped_chunks: + dup_pre_processed = copy.deepcopy(pre_processed) + dup_pre_processed.chunks = grouped_chunks + grouped_pre_processed.append(dup_pre_processed) + + with ProcessPoolExecutor() as executor: + tokenized_sents = executor.map(self.tokenize, grouped_pre_processed) + return sum(tokenized_sents, []) + + + def tokenize(self, pre_processed): """ :param pre_processed: PyBoTextChunks of the text to be tokenized - :param debug: prints debug info in True :return: a list of Token objects """ self.pre_processed = pre_processed @@ -253,9 +284,4 @@ def _has_skrt_char(self, char_groups): A.SKRT_VOW in char_groups.values() or A.SKRT_CONS in char_groups.values() or A.SKRT_SUB_CONS in char_groups.values() - ) - - @staticmethod - def debug(debug, to_print): - if debug: - print(to_print, flush=True) + ) \ No newline at end of file diff --git a/botok/tokenizers/wordtokenizer.py b/botok/tokenizers/wordtokenizer.py index f673122..5f04fd0 100644 --- a/botok/tokenizers/wordtokenizer.py +++ b/botok/tokenizers/wordtokenizer.py @@ -66,18 +66,21 @@ def __init__( ) self.adj = AdjustTokens(main=adj_main, custom=adj_custom) - def tokenize(self, string, split_affixes=True, spaces_as_punct=False, debug=False): + def tokenize(self, string, split_affixes=True, spaces_as_punct=False, parallelize=False): """ :param string: to be tokenized :param split_affixes: separates the affixed particles into seperate tokens if True - :param debug: print debug info while parsing + :param parallelize: do multiprocessed tokenization :return: list of pybo.tokenizers.Token objects """ preprocessed = TokChunks( string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct ) preprocessed.serve_syls_to_trie() - tokens = self.tok.tokenize(preprocessed, debug=debug) + if parallelize: + tokens = self.tok.parallelized_tokenize(preprocessed) + else: + tokens = self.tok.tokenize(preprocessed) if split_affixes: split_affixed(tokens) diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 65328a5..a24dbdd 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -62,6 +62,64 @@ def test_tokenize(): assert str(tokens[2]) == 
expected +def test_parallized_token(): + profile = "empty" + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") + tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") + tok.trie.inflect_n_modify_trie("མཐའ་") + tok.trie.inflect_n_add_data("མཐའ་\tNOUN") + in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་" + preproc = TokChunks(in_str) + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + expected = dedent( + """\ + text: "བཀྲ་ཤིས" + text_cleaned: "བཀྲ་ཤིས་" + text_unaffixed: "བཀྲ་ཤིས་" + syls: ["བཀྲ", "ཤིས"] + senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False | + char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| + chunk_type: TEXT + syls_idx: [[0, 1, 2], [4, 5, 6]] + syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] + start: 5 + len: 7 + + """ + ) + str(tokens[0]) + assert str(tokens[1]) == expected + assert tokens[2].text == "། " + assert tokens[2].chunk_type == "PUNCT" + # add sense to བཀྲ་ཤིས་ + pos_tok.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") + tokens = pos_tok.tokenize(in_str, parallelize=True) + expected = dedent( + """\ + text: "བཀྲ་ཤིས" + text_cleaned: "བཀྲ་ཤིས་" + text_unaffixed: "བཀྲ་ཤིས་" + syls: ["བཀྲ", "ཤིས"] + pos: NOUN + lemma: བཀྲ་ཤིས་ + sense: བཀྲ་ཤིས་ + senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ | + char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| + chunk_type: TEXT + freq: 17500 + syls_idx: [[0, 1, 2], [4, 5, 6]] + syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] + start: 5 + len: 7 + + """ + ) + assert str(tokens[2]) == expected + + def test_non_max2(): profile = "empty" main, custom = Config().get_tok_data_paths(profile) @@ -82,6 +140,26 @@ def test_non_max2(): assert tokens[2]["senses"][0]["pos"] == "NO_POS" +def test_parallized_non_max2(): + profile = "empty" + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") + tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN") + tok.trie.inflect_n_modify_trie( + "བཀྲ་ཤིས་བདེ་ལེགས།" + ) # to ensure we're not in a maximal match + preproc = TokChunks("བཀྲ་ཤིས་བདེ་བཀྲ་") + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + assert tokens[0].text == "བཀྲ་ཤིས་" + assert tokens[0]["senses"][0]["pos"] == "NOUN" + assert tokens[1].text == "བདེ་" + assert tokens[1]["senses"][0]["pos"] == "NON_WORD" + assert tokens[2].text == "བཀྲ་" + assert tokens[2]["senses"][0]["pos"] == "NO_POS" + + def test_non_max_end_of_string(): profile = "empty" main, custom = Config().get_tok_data_paths(profile) @@ -96,5 +174,16 @@ def test_non_max_end_of_string(): assert tokens[0].text == "བཀྲ་ཤིས་" assert tokens[1].text == "བདེ་" -if __name__ == "__main__": - test_non_max2() \ No newline at end of file +def test_parallized_non_max_end_of_string(): + profile = "empty" + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") + tok.trie.inflect_n_modify_trie( + "བཀྲ་ཤིས་བདེ་ལེགས།" + ) # to ensure we're not in a maximal match + preproc = TokChunks("བཀྲ་ཤིས་བདེ་") + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + assert tokens[0].text == "བཀྲ་ཤིས་" + assert tokens[1].text == "བདེ་" \ No newline at end of file From 
329ac090d60c731a2874a1aaf609e42cb213d95c Mon Sep 17 00:00:00 2001
From: tenzin
Date: Sun, 12 Jan 2020 23:27:58 +0530
Subject: [PATCH 2/4] add multiprocessing tokenizer

---
 botok/tokenizers/tokenize.py      |  35 ++++++---
 botok/tokenizers/wordtokenizer.py |   1 +
 tests/test_bugs.py                |  10 +--
 tests/test_tokenize.py            |   6 +-
 tests/test_wordtokenizer.py       | 125 ++++++++++++++++++++++++++++++
 5 files changed, 158 insertions(+), 19 deletions(-)

diff --git a/botok/tokenizers/tokenize.py b/botok/tokenizers/tokenize.py
index cf930a3..a1ad35d 100644
--- a/botok/tokenizers/tokenize.py
+++ b/botok/tokenizers/tokenize.py
@@ -1,6 +1,8 @@
 # coding: utf-8
 from concurrent.futures import ProcessPoolExecutor
 import copy
+import math
+import os
 
 from .token import Token
 from ..vars import NAMCHE, TSEK
@@ -22,31 +24,44 @@ def __init__(self, trie):
 
 
     def parallelized_tokenize(self, pre_processed):
-        grouped_pre_processed = []
+
+        def __flatten(l):
+            return sum(l, [])
+
+        # group chunks by punctuation
+        chunks_grouped_by_punct = []
         grouped_chunks = []
         found_group = False
-        for chunk in pre_processed.chunks:
+        for i, chunk in enumerate(pre_processed.chunks):
             if not chunk[0] and chunk[1][0] == 105:
                 found_group = True
             else:
                 grouped_chunks.append(chunk)
-
+
             if found_group:
                 grouped_chunks.append(chunk)
-                dup_pre_processed = copy.deepcopy(pre_processed)
-                dup_pre_processed.chunks = grouped_chunks
-                grouped_pre_processed.append(dup_pre_processed)
+                chunks_grouped_by_punct.append(grouped_chunks)
                 grouped_chunks = []
                 found_group = False
         else:
            if grouped_chunks:
-                dup_pre_processed = copy.deepcopy(pre_processed)
-                dup_pre_processed.chunks = grouped_chunks
-                grouped_pre_processed.append(dup_pre_processed)
+                chunks_grouped_by_punct.append(grouped_chunks)
+
+        # create pre_processed objects based on the no. 
of cpu + grouped_pre_processed = [] + n_cpu = os.cpu_count() + groups_per_cpu = math.ceil(len(chunks_grouped_by_punct) / n_cpu) + start, end = 0, groups_per_cpu + for _ in range(n_cpu): + dup_preprocess = copy.deepcopy(pre_processed) + dup_preprocess.chunks = __flatten(chunks_grouped_by_punct[start: end]) + grouped_pre_processed.append(dup_preprocess) + start, end = end, end+groups_per_cpu + # do mutliprocessing tokenization with ProcessPoolExecutor() as executor: tokenized_sents = executor.map(self.tokenize, grouped_pre_processed) - return sum(tokenized_sents, []) + return __flatten(tokenized_sents) def tokenize(self, pre_processed): diff --git a/botok/tokenizers/wordtokenizer.py b/botok/tokenizers/wordtokenizer.py index 5f04fd0..361a3c0 100644 --- a/botok/tokenizers/wordtokenizer.py +++ b/botok/tokenizers/wordtokenizer.py @@ -66,6 +66,7 @@ def __init__( ) self.adj = AdjustTokens(main=adj_main, custom=adj_custom) + @profile def tokenize(self, string, split_affixes=True, spaces_as_punct=False, parallelize=False): """ :param string: to be tokenized diff --git a/tests/test_bugs.py b/tests/test_bugs.py index 0279ed2..f814894 100644 --- a/tests/test_bugs.py +++ b/tests/test_bugs.py @@ -106,13 +106,13 @@ def test_multiple_spaces(): def test_bug1(): tok = WordTokenizer("POS") string = "བ་ཀུ་" - tokens = tok.tokenize(string, debug=True) + tokens = tok.tokenize(string) assert tokens def test_bug2(): string = "བྲ་གྲྀ་" - tokens = pos_tok.tokenize(string, debug=True) + tokens = pos_tok.tokenize(string) assert tokens @@ -167,8 +167,4 @@ def test_shad_in_syllable(): ("TEXT", "ལེ གས"), ("PUNCT", "། "), ("TEXT", "བཀྲ་"), - ] - - -if __name__ == "__main__": - test_syl_tokenize() \ No newline at end of file + ] \ No newline at end of file diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index a24dbdd..96bb412 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -95,7 +95,6 @@ def test_parallized_token(): assert tokens[2].text == "། " assert tokens[2].chunk_type == "PUNCT" # add sense to བཀྲ་ཤིས་ - pos_tok.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") tokens = pos_tok.tokenize(in_str, parallelize=True) expected = dedent( """\ @@ -186,4 +185,7 @@ def test_parallized_non_max_end_of_string(): preproc.serve_syls_to_trie() tokens = tok.parallelized_tokenize(preproc) assert tokens[0].text == "བཀྲ་ཤིས་" - assert tokens[1].text == "བདེ་" \ No newline at end of file + assert tokens[1].text == "བདེ་" + +if __name__ == "__main__": + test_parallized_token() \ No newline at end of file diff --git a/tests/test_wordtokenizer.py b/tests/test_wordtokenizer.py index 59a4f6a..7c99abe 100644 --- a/tests/test_wordtokenizer.py +++ b/tests/test_wordtokenizer.py @@ -121,6 +121,121 @@ def test_get_default_lemma(): assert tokens[2].text_unaffixed == "" == tokens[2].text_cleaned +def test_parallelized_get_default_lemma(): + input_str = "བཀྲ་ཤིས་བདེ་ལེགས། མཐའི་རྒྱ་མཚོར་གནས་སོ།། །།ཀཀ" + profile = "POS" + + # reconstitute all the pieces that WordTokenizer gathers + main, custom = Config().get_tok_data_paths(profile) + tok = Tokenize(Trie(BoSyl, profile, main, custom)) + preproc = TokChunks(input_str) + preproc.serve_syls_to_trie() + tokens = tok.parallelized_tokenize(preproc) + split_affixed(tokens) + + # if __get_default_lemma() is not run, only the lemmas coming from the lemma folder will be included + # in the Token objects. 
+ assert str(tokens[3]) == dedent( + """\ + text: "མཐ" + text_cleaned: "མཐ" + text_unaffixed: "མཐ" + syls: ["མཐ"] + senses: | pos: NOUN, freq: 45097, affixed: True | + char_types: |CONS|CONS| + chunk_type: TEXT + affix_host: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 0, 'end': 2}] + start: 18 + len: 2 + + """ + ) + assert "lemma" not in tokens[3]["senses"][0] + + assert str(tokens[4]) == dedent( + """\ + text: "འི་" + text_cleaned: "འི་" + text_unaffixed: "འི་" + syls: ["འི"] + pos: PART + char_types: |CONS|VOW|TSEK| + chunk_type: TEXT + affix: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 2, 'end': 5}] + start: 20 + len: 3 + + """ + ) + + # regular words also have no lemmas + assert "lemma" not in tokens[0]["senses"][0] + + # doing the same thing using WordTokenizer, which will apply its __get_default_lemma() method + # the profile is the same, so no lemma comes from the trie content files. + tokens = pos_tok.tokenize(input_str, parallelize=True) + + # the lemma is Token.text_unaffixed with an extra འ and/or a tsek where required + assert str(tokens[3]) == dedent( + """\ + text: "མཐ" + text_cleaned: "མཐ" + text_unaffixed: "མཐ" + syls: ["མཐ"] + pos: NOUN + lemma: མཐའ་ + senses: | pos: NOUN, freq: 45097, affixed: True, lemma: མཐའ་ | + char_types: |CONS|CONS| + chunk_type: TEXT + freq: 45097 + affix_host: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 0, 'end': 2}] + start: 18 + len: 2 + + """ + ) + assert tokens[3]["senses"][0]["lemma"] == "མཐའ་" + + # for particles, WordTokenizer reads the lemmas from a file and attributes them + assert str(tokens[4]) == dedent( + """\ + text: "འི་" + text_cleaned: "འི་" + text_unaffixed: "འི་" + syls: ["འི"] + pos: PART + lemma: གི་ + senses: | lemma: གི་ | + char_types: |CONS|VOW|TSEK| + chunk_type: TEXT + affix: True + syls_idx: [[0, 1]] + syls_start_end: [{'start': 2, 'end': 5}] + start: 20 + len: 3 + + """ + ) + + # for regular words, Token.text_unaffixed is simply copied + assert tokens[0]["senses"][0]["lemma"] == "བཀྲ་ཤིས་" + + # non-words do not have lemmas + assert "lemma" not in tokens[10]["senses"][0] + assert tokens[10].text_cleaned == "ཀཀ་" + assert tokens[10].text_unaffixed == "ཀཀ་" + + # Token objects whose chunk_type is not 'TEXT' will be attributed no lemma. + # text_unaffixed and text_cleaned are also empty. 
Token.text must be retrieved + assert tokens[2].text_unaffixed == "" == tokens[2].text_cleaned + + def test_spaces_as_punct(): input_str = "བ ཀྲ་ཤིས་ བདེ་ལེགས། \nམཐའི་རྒྱ་མཚོར་ག ནས་སོ།། །།ཀཀ" profile = "POS" @@ -130,3 +245,13 @@ def test_spaces_as_punct(): assert tokens[1].text == " " assert tokens[2].text == "ཀྲ་" assert tokens[8].text == " \n" + +def test_parallelized_spaces_as_punct(): + input_str = "བ ཀྲ་ཤིས་ བདེ་ལེགས། \nམཐའི་རྒྱ་མཚོར་ག ནས་སོ།། །།ཀཀ" + profile = "POS" + wt = WordTokenizer(tok_profile=profile) + tokens = wt.tokenize(input_str, spaces_as_punct=True, parallelize=True) + assert tokens[0].text == "བ" + assert tokens[1].text == " " + assert tokens[2].text == "ཀྲ་" + assert tokens[8].text == " \n" \ No newline at end of file From 092999adcf6207969e3e0dad5f737da62d687794 Mon Sep 17 00:00:00 2001 From: tenzin Date: Fri, 10 Jul 2020 19:08:06 +0530 Subject: [PATCH 3/4] add: parallel_tokenize test --- botok/tokenizers/wordtokenizer.py | 21 +++++++++++---------- tests/test_tokenize.py | 14 ++++++++------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/botok/tokenizers/wordtokenizer.py b/botok/tokenizers/wordtokenizer.py index 361a3c0..fd59950 100644 --- a/botok/tokenizers/wordtokenizer.py +++ b/botok/tokenizers/wordtokenizer.py @@ -1,16 +1,16 @@ # coding: utf8 -from pathlib import Path import csv +from pathlib import Path -from .tokenize import Tokenize -from ..modifytokens.splitaffixed import split_affixed -from ..modifytokens.mergedagdra import MergeDagdra -from ..modifytokens.adjusttokens import AdjustTokens -from ..tries.trie import Trie from ..chunks.chunks import TokChunks -from ..textunits.bosyl import BoSyl from ..config import Config -from ..vars import TSEK, AA +from ..modifytokens.adjusttokens import AdjustTokens +from ..modifytokens.mergedagdra import MergeDagdra +from ..modifytokens.splitaffixed import split_affixed +from ..textunits.bosyl import BoSyl +from ..tries.trie import Trie +from ..vars import AA, TSEK +from .tokenize import Tokenize part_lemmas = {} filename = Path(__file__).parent.parent / "resources" / "particles.tsv" @@ -66,8 +66,9 @@ def __init__( ) self.adj = AdjustTokens(main=adj_main, custom=adj_custom) - @profile - def tokenize(self, string, split_affixes=True, spaces_as_punct=False, parallelize=False): + def tokenize( + self, string, split_affixes=True, spaces_as_punct=False, parallelize=False + ): """ :param string: to be tokenized :param split_affixes: separates the affixed particles into seperate tokens if True diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 96bb412..8c4482c 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -1,6 +1,8 @@ # coding: utf8 -from botok import * from textwrap import dedent + +from botok import * + from helpers import pos_tok @@ -29,7 +31,7 @@ def test_tokenize(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) str(tokens[0]) @@ -56,7 +58,7 @@ def test_tokenize(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) assert str(tokens[2]) == expected @@ -87,7 +89,7 @@ def test_parallized_token(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) str(tokens[0]) @@ -113,7 +115,7 @@ def test_parallized_token(): syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] start: 5 len: 7 - + """ ) assert str(tokens[2]) == expected @@ -188,4 +190,4 @@ def test_parallized_non_max_end_of_string(): assert tokens[1].text == "བདེ་" if __name__ == 
"__main__": - test_parallized_token() \ No newline at end of file + test_parallized_token() From 6e9f35a36b92636a9f17b61a9ac969a7f2b17aab Mon Sep 17 00:00:00 2001 From: tenzin Date: Mon, 27 Jul 2020 11:38:13 +0530 Subject: [PATCH 4/4] test: add tokenize benchmarking --- tests/benchmark.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 tests/benchmark.py diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..92ccdfe --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,68 @@ +import urllib.request + +import pytest + +from botok import WordTokenizer + + +@pytest.fixture(scope="module") +def testcases(): + file_url = "https://raw.githubusercontent.com/OpenPecha/P000001/master/P000001.opf/base/v001.txt" + response = urllib.request.urlopen(file_url) + large_size = response.read().decode("utf-8") + medium_size = large_size[: len(large_size) // 3] + small_size = medium_size[: len(medium_size) // 5] + return large_size, medium_size, small_size + + +@pytest.fixture(scope="module") +def wt(): + return WordTokenizer() + + +def test_non_parallized_tok_small(benchmark, wt, testcases): + _, _, small = testcases + + @benchmark + def tok(): + wt.tokenize(small) + + +def test_parallized_tok_small(benchmark, wt, testcases): + _, _, small = testcases + + @benchmark + def tok(): + wt.tokenize(small, parallelize=True) + + +def test_non_parallized_tok_medium(benchmark, wt, testcases): + _, medium, _ = testcases + + @benchmark + def tok(): + wt.tokenize(medium) + + +def test_parallized_tok_medium(benchmark, wt, testcases): + _, medium, _ = testcases + + @benchmark + def tok(): + wt.tokenize(medium, parallelize=True) + + +def test_non_parallized_tok_large(benchmark, wt, testcases): + large, _, _ = testcases + + @benchmark + def tok(): + wt.tokenize(large) + + +def test_parallized_tok_large(benchmark, wt, testcases): + large, _, _ = testcases + + @benchmark + def tok(): + wt.tokenize(large, parallelize=True)