Multiprocessing tokenization #70

Closed · wants to merge 4 commits
57 changes: 49 additions & 8 deletions botok/tokenizers/tokenize.py
@@ -1,4 +1,9 @@
# coding: utf-8
from concurrent.futures import ProcessPoolExecutor
import copy
import math
import os

from .token import Token
from ..vars import NAMCHE, TSEK
from ..vars import chunk_values as u
@@ -17,11 +22,52 @@ def __init__(self, trie):
self.pre_processed = None
self.trie = trie

def tokenize(self, pre_processed, debug=False):

def parallelized_tokenize(self, pre_processed):
"""Split pre_processed.chunks into punctuation-delimited groups, distribute them over all CPU cores, and tokenize the groups concurrently."""
def __flatten(l):
return sum(l, [])

# group chunks by punctuation
chunks_grouped_by_punct = []
grouped_chunks = []
found_group = False
for i, chunk in enumerate(pre_processed.chunks):
if not chunk[0] and chunk[1][0] == 105:  # syllable-less chunk whose marker (105) denotes punctuation
found_group = True
else:
grouped_chunks.append(chunk)

if found_group:
grouped_chunks.append(chunk)
chunks_grouped_by_punct.append(grouped_chunks)
grouped_chunks = []
found_group = False
else:
if grouped_chunks:
chunks_grouped_by_punct.append(grouped_chunks)

# create one pre_processed copy per CPU, each carrying its share of the groups
grouped_pre_processed = []
n_cpu = os.cpu_count()
groups_per_cpu = math.ceil(len(chunks_grouped_by_punct) / n_cpu)
start, end = 0, groups_per_cpu
for _ in range(n_cpu):
dup_preprocess = copy.deepcopy(pre_processed)
dup_preprocess.chunks = __flatten(chunks_grouped_by_punct[start: end])
grouped_pre_processed.append(dup_preprocess)
start, end = end, end+groups_per_cpu

# do multiprocessing tokenization
with ProcessPoolExecutor() as executor:
tokenized_sents = executor.map(self.tokenize, grouped_pre_processed)
return __flatten(tokenized_sents)


def tokenize(self, pre_processed):
"""

:param pre_processed: PyBoTextChunks of the text to be tokenized
:param debug: prints debug info if True
:return: a list of Token objects
"""
self.pre_processed = pre_processed
@@ -253,9 +299,4 @@ def _has_skrt_char(self, char_groups):
A.SKRT_VOW in char_groups.values()
or A.SKRT_CONS in char_groups.values()
or A.SKRT_SUB_CONS in char_groups.values()
)

@staticmethod
def debug(debug, to_print):
if debug:
print(to_print, flush=True)
)
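For readers skimming the diff above: `parallelized_tokenize` first cuts `pre_processed.chunks` into punctuation-delimited groups, then spreads those groups over a `ProcessPoolExecutor` with one worker per CPU core. Below is an illustrative, standalone sketch of that grouping strategy, not botok's actual code; the toy chunk tuples and the `PUNCT_MARKER` name are invented for the example (the diff uses the literal `105`).

```python
# A minimal sketch of the grouping strategy, assuming chunks shaped like
# (syllable indices or None, (chunk marker, start, length)).
import math
import os

PUNCT_MARKER = 105  # assumed to be the marker the diff checks to detect punctuation chunks


def group_chunks_by_punct(chunks):
    """Split a flat chunk list into groups that each end on a punctuation chunk."""
    groups, current = [], []
    for chunk in chunks:
        current.append(chunk)
        if not chunk[0] and chunk[1][0] == PUNCT_MARKER:
            groups.append(current)
            current = []
    if current:  # trailing chunks with no closing punctuation
        groups.append(current)
    return groups


def split_for_workers(groups, n_workers=None):
    """Hand whole punctuation-delimited groups to workers in contiguous slices."""
    n_workers = n_workers or os.cpu_count()
    per_worker = max(1, math.ceil(len(groups) / n_workers))
    return [
        [chunk for group in groups[i : i + per_worker] for chunk in group]
        for i in range(0, len(groups), per_worker)
    ]


chunks = [
    ([0, 1], (100, 0, 8)),  # text chunk
    (None, (105, 8, 2)),    # punctuation chunk: closes the first group
    ([2], (100, 10, 4)),    # text chunk with no punctuation after it
]
print(split_for_workers(group_chunks_by_punct(chunks), n_workers=2))
```

One behavioural detail of the diff itself: it always makes exactly `os.cpu_count()` deep copies of `pre_processed`, so when there are fewer punctuation groups than cores, some copies end up with an empty chunk list.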
27 changes: 16 additions & 11 deletions botok/tokenizers/wordtokenizer.py
@@ -1,16 +1,16 @@
# coding: utf8
from pathlib import Path
import csv
from pathlib import Path

from .tokenize import Tokenize
from ..modifytokens.splitaffixed import split_affixed
from ..modifytokens.mergedagdra import MergeDagdra
from ..modifytokens.adjusttokens import AdjustTokens
from ..tries.trie import Trie
from ..chunks.chunks import TokChunks
from ..textunits.bosyl import BoSyl
from ..config import Config
from ..vars import TSEK, AA
from ..modifytokens.adjusttokens import AdjustTokens
from ..modifytokens.mergedagdra import MergeDagdra
from ..modifytokens.splitaffixed import split_affixed
from ..textunits.bosyl import BoSyl
from ..tries.trie import Trie
from ..vars import AA, TSEK
from .tokenize import Tokenize

part_lemmas = {}
filename = Path(__file__).parent.parent / "resources" / "particles.tsv"
@@ -66,18 +66,23 @@ def __init__(
)
self.adj = AdjustTokens(main=adj_main, custom=adj_custom)

def tokenize(self, string, split_affixes=True, spaces_as_punct=False, debug=False):
def tokenize(
self, string, split_affixes=True, spaces_as_punct=False, parallelize=False
):
"""
:param string: to be tokenized
:param split_affixes: separates the affixed particles into separate tokens if True
:param debug: print debug info while parsing
:param parallelize: tokenize with multiple processes if True
:return: list of pybo.tokenizers.Token objects
"""
preprocessed = TokChunks(
string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct
)
preprocessed.serve_syls_to_trie()
tokens = self.tok.tokenize(preprocessed, debug=debug)
if parallelize:
tokens = self.tok.parallelized_tokenize(preprocessed)
else:
tokens = self.tok.tokenize(preprocessed)

if split_affixes:
split_affixed(tokens)
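Downstream, the new `parallelize` flag is the only API change callers see. A minimal usage sketch, assuming a default `WordTokenizer()` as in the benchmark file below; the sample string is arbitrary:

```python
from botok import WordTokenizer

wt = WordTokenizer()

# Default single-process path.
tokens = wt.tokenize("བཀྲ་ཤིས་བདེ་ལེགས།")

# Opt-in path added by this PR: pre-processed chunks are grouped at
# punctuation and tokenized in a ProcessPoolExecutor.
parallel_tokens = wt.tokenize("བཀྲ་ཤིས་བདེ་ལེགས།", parallelize=True)

# The PR's tests expect both paths to yield the same tokens.
print([t.text for t in tokens] == [t.text for t in parallel_tokens])
```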
68 changes: 68 additions & 0 deletions tests/benchmark.py
@@ -0,0 +1,68 @@
import urllib.request

import pytest

from botok import WordTokenizer


@pytest.fixture(scope="module")
def testcases():
file_url = "https://raw.githubusercontent.com/OpenPecha/P000001/master/P000001.opf/base/v001.txt"
response = urllib.request.urlopen(file_url)
large_size = response.read().decode("utf-8")
medium_size = large_size[: len(large_size) // 3]
small_size = medium_size[: len(medium_size) // 5]
return large_size, medium_size, small_size


@pytest.fixture(scope="module")
def wt():
return WordTokenizer()


def test_non_parallized_tok_small(benchmark, wt, testcases):
_, _, small = testcases

@benchmark
def tok():
wt.tokenize(small)


def test_parallized_tok_small(benchmark, wt, testcases):
_, _, small = testcases

@benchmark
def tok():
wt.tokenize(small, parallelize=True)


def test_non_parallized_tok_medium(benchmark, wt, testcases):
_, medium, _ = testcases

@benchmark
def tok():
wt.tokenize(medium)


def test_parallized_tok_medium(benchmark, wt, testcases):
_, medium, _ = testcases

@benchmark
def tok():
wt.tokenize(medium, parallelize=True)


def test_non_parallized_tok_large(benchmark, wt, testcases):
large, _, _ = testcases

@benchmark
def tok():
wt.tokenize(large)


def test_parallized_tok_large(benchmark, wt, testcases):
large, _, _ = testcases

@benchmark
def tok():
wt.tokenize(large, parallelize=True)
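The `benchmark` fixture used above comes from the pytest-benchmark plugin, and the module-scoped `testcases` fixture downloads its corpus from OpenPecha once for the module. A minimal sketch for running the comparison locally, assuming pytest and pytest-benchmark are installed:

```python
# Equivalent to running `pytest tests/benchmark.py` from the repository root.
import pytest

pytest.main(["tests/benchmark.py"])
```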
10 changes: 3 additions & 7 deletions tests/test_bugs.py
@@ -106,13 +106,13 @@ def test_multiple_spaces():
def test_bug1():
tok = WordTokenizer("POS")
string = "བ་ཀུ་"
tokens = tok.tokenize(string, debug=True)
tokens = tok.tokenize(string)
assert tokens


def test_bug2():
string = "བྲ་གྲྀ་"
tokens = pos_tok.tokenize(string, debug=True)
tokens = pos_tok.tokenize(string)
assert tokens


@@ -167,8 +167,4 @@ def test_shad_in_syllable():
("TEXT", "ལེ གས"),
("PUNCT", "། "),
("TEXT", "བཀྲ་"),
]


if __name__ == "__main__":
test_syl_tokenize()
]
101 changes: 97 additions & 4 deletions tests/test_tokenize.py
@@ -1,6 +1,8 @@
# coding: utf8
from botok import *
from textwrap import dedent

from botok import *

from helpers import pos_tok


@@ -29,7 +31,7 @@ def test_tokenize():
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7

"""
)
str(tokens[0])
@@ -56,7 +58,64 @@ def test_tokenize():
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7


"""
)
assert str(tokens[2]) == expected


def test_parallized_token():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
tok = Tokenize(Trie(BoSyl, profile, main, custom))
tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500")
tok.trie.inflect_n_modify_trie("མཐའ་")
tok.trie.inflect_n_add_data("མཐའ་\tNOUN")
in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་"
preproc = TokChunks(in_str)
preproc.serve_syls_to_trie()
tokens = tok.parallelized_tokenize(preproc)
expected = dedent(
"""\
text: "བཀྲ་ཤིས"
text_cleaned: "བཀྲ་ཤིས་"
text_unaffixed: "བཀྲ་ཤིས་"
syls: ["བཀྲ", "ཤིས"]
senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False |
char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS|
chunk_type: TEXT
syls_idx: [[0, 1, 2], [4, 5, 6]]
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7

"""
)
str(tokens[0])
assert str(tokens[1]) == expected
assert tokens[2].text == "། "
assert tokens[2].chunk_type == "PUNCT"
# add sense to བཀྲ་ཤིས་
tokens = pos_tok.tokenize(in_str, parallelize=True)
expected = dedent(
"""\
text: "བཀྲ་ཤིས"
text_cleaned: "བཀྲ་ཤིས་"
text_unaffixed: "བཀྲ་ཤིས་"
syls: ["བཀྲ", "ཤིས"]
pos: NOUN
lemma: བཀྲ་ཤིས་
sense: བཀྲ་ཤིས་
senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ |
char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS|
chunk_type: TEXT
freq: 17500
syls_idx: [[0, 1, 2], [4, 5, 6]]
syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}]
start: 5
len: 7

"""
)
assert str(tokens[2]) == expected
@@ -82,6 +141,26 @@ def test_non_max2():
assert tokens[2]["senses"][0]["pos"] == "NO_POS"


def test_parallized_non_max2():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
tok = Tokenize(Trie(BoSyl, profile, main, custom))
tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN")
tok.trie.inflect_n_modify_trie(
"བཀྲ་ཤིས་བདེ་ལེགས།"
) # to ensure we're not in a maximal match
preproc = TokChunks("བཀྲ་ཤིས་བདེ་བཀྲ་")
preproc.serve_syls_to_trie()
tokens = tok.parallelized_tokenize(preproc)
assert tokens[0].text == "བཀྲ་ཤིས་"
assert tokens[0]["senses"][0]["pos"] == "NOUN"
assert tokens[1].text == "བདེ་"
assert tokens[1]["senses"][0]["pos"] == "NON_WORD"
assert tokens[2].text == "བཀྲ་"
assert tokens[2]["senses"][0]["pos"] == "NO_POS"


def test_non_max_end_of_string():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
@@ -96,5 +175,19 @@ def test_non_max_end_of_string():
assert tokens[0].text == "བཀྲ་ཤིས་"
assert tokens[1].text == "བདེ་"

def test_parallized_non_max_end_of_string():
profile = "empty"
main, custom = Config().get_tok_data_paths(profile)
tok = Tokenize(Trie(BoSyl, profile, main, custom))
tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་")
tok.trie.inflect_n_modify_trie(
"བཀྲ་ཤིས་བདེ་ལེགས།"
) # to ensure we're not in a maximal match
preproc = TokChunks("བཀྲ་ཤིས་བདེ་")
preproc.serve_syls_to_trie()
tokens = tok.parallelized_tokenize(preproc)
assert tokens[0].text == "བཀྲ་ཤིས་"
assert tokens[1].text == "བདེ་"

if __name__ == "__main__":
test_non_max2()
test_parallized_token()