Commit 02e2de7
pretokenization added
chandralegend committed Jul 21, 2024
1 parent e658ed5 commit 02e2de7
Showing 9 changed files with 199,407 additions and 7 deletions.
Empty file.
49,825 changes: 49,825 additions & 0 deletions multi_tokenizer/pretrained/chinese_tokenizer.json

Large diffs are not rendered by default.

49,824 changes: 49,824 additions & 0 deletions multi_tokenizer/pretrained/english_tokenizer.json

Large diffs are not rendered by default.

49,824 changes: 49,824 additions & 0 deletions multi_tokenizer/pretrained/hindi_tokenizer.json

Large diffs are not rendered by default.

49,825 changes: 49,825 additions & 0 deletions multi_tokenizer/pretrained/spanish_tokenizer.json

Large diffs are not rendered by default.

Empty file.
26 changes: 22 additions & 4 deletions multi_tokenizer/tokenizer.py
@@ -5,6 +5,7 @@
 from multi_tokenizer.language_detect import LanguageDetector
 from multi_tokenizer.pretrained import (
     LanguageSpecificTokenizer,
+    PretrainedTokenizers,
     get_tokenizer_by_language,
 )
 
@@ -14,11 +15,20 @@
 class MultiTokenizer:
     """MultiTokenizer Class."""
 
-    def __init__(self, tokenizers: list[LanguageSpecificTokenizer]) -> None:
+    def __init__(
+        self, tokenizers: list[LanguageSpecificTokenizer | PretrainedTokenizers]
+    ) -> None:
         """Initialize MultiTokenizer."""
-        self.tokenizers = tokenizers
+        self.tokenizers = [
+            (
+                tokenizer
+                if isinstance(tokenizer, LanguageSpecificTokenizer)
+                else tokenizer.value
+            )
+            for tokenizer in tokenizers
+        ]
         self.language_detector = LanguageDetector(
-            [tokenizer.language for tokenizer in tokenizers]
+            [tokenizer.language for tokenizer in self.tokenizers]
         )
 
     def pre_tokenize(self, text: str) -> list[tuple[str, tuple[int, int]]]:
@@ -34,8 +44,9 @@ def pre_tokenize(self, text: str) -> list[tuple[str, tuple[int, int]]]:
             output = (
                 [(tokenizer.language_prefix_token, (-1, 0))]
                 + output
-                + [(tokenizer.language_suffix_token, (len(text) - 1, len(text)))]
+                + [(tokenizer.language_suffix_token, (len(detected_text) - 2, len(detected_text) - 1))]
             )
+            # Offsetting the start and end indices of the tokens to match the original text
             output = [
                 (
                     token,
@@ -74,3 +85,10 @@ def get_vocab(self) -> dict[str, dict[str, int]]:
         for tokenizer in self.tokenizers:
             vocab[tokenizer.language.name] = tokenizer.get_vocab()
         return vocab
+
+    def get_vocab_size(self) -> int:
+        """Get Vocabulary Size."""
+        vocab = self.get_vocab()
+        return sum(
+            len(vocab[language]) for language in vocab
+        )  # TODO: This is probably wrong
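
For context, a minimal usage sketch of the updated constructor and the new get_vocab_size helper; it assumes only the API visible in this commit (the MultiTokenizer/PretrainedTokenizers import appears in try_multitokenizer.ipynb below), so treat it as illustrative rather than part of the diff.

# Sketch only, not part of the commit: exercises the constructor change above.
from multi_tokenizer import MultiTokenizer, PretrainedTokenizers

# Enum members and plain LanguageSpecificTokenizer objects can now be mixed;
# __init__ unwraps each enum member to its .value internally.
tokenizer = MultiTokenizer(
    [PretrainedTokenizers.ENGLISH, PretrainedTokenizers.SPANISH.value]
)

# New helper: sums the per-language vocabulary sizes returned by get_vocab();
# the in-code TODO notes this total may be an overestimate.
print(tokenizer.get_vocab_size())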
5 changes: 2 additions & 3 deletions support/lang_detection/lang_detect.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -38,7 +38,6 @@
    "sentence = \"I love NLP. 我喜欢自然语言处理. Me encanta NLP.\"\n",
    "\n",
    "for result in detector.detect_multiple_languages_of(sentence):\n",
-    "    result.\n",
     "    print(f\"{result.language}: '{sentence[result.start_index:result.end_index]}'\")"
   ]
  },
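
As a standalone companion to the cell above, a hedged sketch that builds a LanguageDetector the same way MultiTokenizer.__init__ does; the construction arguments are inferred from tokenizer.py, since the notebook's setup cells are not shown in this diff.

# Illustrative only: the detector is built from the .language of each
# pretrained LanguageSpecificTokenizer, mirroring MultiTokenizer.__init__.
from multi_tokenizer import PretrainedTokenizers
from multi_tokenizer.language_detect import LanguageDetector

languages = [
    PretrainedTokenizers.ENGLISH.value.language,
    PretrainedTokenizers.CHINESE.value.language,
    PretrainedTokenizers.SPANISH.value.language,
]
detector = LanguageDetector(languages)

sentence = "I love NLP. 我喜欢自然语言处理. Me encanta NLP."
# Each result carries a language label plus start/end character offsets.
for result in detector.detect_multiple_languages_of(sentence):
    print(f"{result.language}: '{sentence[result.start_index:result.end_index]}'")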
85 changes: 85 additions & 0 deletions support/try_multitokenizer.ipynb
@@ -0,0 +1,85 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from multi_tokenizer import MultiTokenizer, PretrainedTokenizers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "lang_tokenizers = [\n",
    "    PretrainedTokenizers.ENGLISH,\n",
    "    PretrainedTokenizers.CHINESE,\n",
    "    PretrainedTokenizers.SPANISH\n",
    "]\n",
    "tokenizer = MultiTokenizer(lang_tokenizers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('<ZH>', (0, 1)),\n",
       " ('æĪijçĪ±èĩªçĦ¶è¯Ńè¨Ģå¤ĦçIJĨ', (1, 9)),\n",
       " ('.', (9, 10)),\n",
       " ('Ġ', (10, 11)),\n",
       " ('</ZH>', (9, 10)),\n",
       " ('<ES>', (10, 11)),\n",
       " ('Me', (11, 13)),\n",
       " ('Ġencanta', (13, 21)),\n",
       " ('ĠNLP', (21, 25)),\n",
       " ('.', (25, 26)),\n",
       " ('</ES>', (24, 25))]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.pre_tokenize(\"我爱自然语言处理. Me encanta NLP.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "multi-tokenizer",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
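
To read the notebook output above: <ZH>/</ZH> and <ES>/</ES> are the language prefix/suffix markers inserted by pre_tokenize, Ġ is the byte-level pre-tokenizer's marker for a leading space, and each (start, end) pair is a character offset into the input string. A small illustrative sketch (not part of the commit) that prints each pre-token next to the span its offsets cover:

# Illustrative only; assumes the same MultiTokenizer API used in the notebook.
from multi_tokenizer import MultiTokenizer, PretrainedTokenizers

tokenizer = MultiTokenizer(
    [PretrainedTokenizers.CHINESE, PretrainedTokenizers.SPANISH]
)
text = "我爱自然语言处理. Me encanta NLP."
for token, (start, end) in tokenizer.pre_tokenize(text):
    # Wrapper tokens such as <ZH> carry no text of their own, so their spans
    # can overlap neighbouring tokens, as the notebook output shows.
    print(f"{token!r:>40} -> {text[start:end]!r}")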
