From 6533d0575df45956d9fe3e925a3ce7aa8730ea50 Mon Sep 17 00:00:00 2001
From: chandralegend
Date: Sun, 11 Aug 2024 08:44:04 +0530
Subject: [PATCH] added support for python 3.8 and up

---
 multi_tokenizer/language_detect.py     |  14 +--
 multi_tokenizer/pretrained/__init__.py |  14 +--
 multi_tokenizer/tokenizer.py           |  17 ++--
 pyproject.toml                         |   2 +-
 support/try_multitokenizer.ipynb       | 136 +++++++++++++------------
 5 files changed, 94 insertions(+), 89 deletions(-)

diff --git a/multi_tokenizer/language_detect.py b/multi_tokenizer/language_detect.py
index 1e70517..fef2ab0 100644
--- a/multi_tokenizer/language_detect.py
+++ b/multi_tokenizer/language_detect.py
@@ -1,27 +1,29 @@
 """Language Detection Module."""
 
+from typing import List
+
 from lingua import DetectionResult, Language, LanguageDetectorBuilder
 
 
 class LanguageDetector:
     """Language Detector."""
 
-    def __init__(self, languages: list[Language]) -> None:
+    def __init__(self, languages: List[Language]) -> None:
         """Initialize Language Detector."""
         self.languages = languages
         self.detector = LanguageDetectorBuilder.from_languages(*languages).build()
 
-    def detect(self, text: str) -> list[DetectionResult]:
+    def detect(self, text: str) -> List[DetectionResult]:
         """Detect Language."""
         results = self.detector.detect_multiple_languages_of(text)
         return results
 
-    def split_n_detect(self, text: str, sep: str = " ") -> list[DetectionResult]:
+    def split_n_detect(self, text: str, sep: str = " ") -> List[DetectionResult]:
         """Split Text and Detect Language."""
 
         def merge_results(
-            results: list[list[DetectionResult]],
-        ) -> list[DetectionResult]:
+            results: List[List[DetectionResult]],
+        ) -> List[DetectionResult]:
             """Merge Results. If consecutive words are detected as the same language, merge them."""
             merged_results: list[DetectionResult] = []
             for result in results:
@@ -60,7 +62,7 @@ def merge_results(
         merged_results = merge_results(results)
         return merged_results
 
-    def batch_detect(self, texts: list[str]) -> list[list[DetectionResult]]:
+    def batch_detect(self, texts: List[str]) -> List[List[DetectionResult]]:
         """Detect Language in Batch."""
         results = self.detector.detect_multiple_languages_in_parallel_of(texts)
         return results
diff --git a/multi_tokenizer/pretrained/__init__.py b/multi_tokenizer/pretrained/__init__.py
index 4d47c5a..ff1cc2f 100644
--- a/multi_tokenizer/pretrained/__init__.py
+++ b/multi_tokenizer/pretrained/__init__.py
@@ -2,7 +2,7 @@
 
 import os
 from enum import Enum
-from typing import Any
+from typing import Any, Dict, List, Tuple
 
 from lingua import Language
 
@@ -19,8 +19,8 @@ def __init__(
         self,
         tokenizer_path: str,
         language: Language,
-        language_prefix: tuple[str, int],
-        language_suffix: tuple[str, int],
+        language_prefix: Tuple[str, int],
+        language_suffix: Tuple[str, int],
     ) -> None:
         """Initialize Language Specific Tokenizer."""
         self.language = language
@@ -33,19 +33,19 @@ def pre_tokenizer(self) -> Any:
         """Get Pre Tokenizer."""
         return self.tokenizer.pre_tokenizer
 
-    def encode(self, text: str) -> list[int]:
+    def encode(self, text: str) -> List[int]:
         """Get Encoder."""
         return self.tokenizer.encode(text).ids
 
-    def tokenize(self, text: str) -> list[str]:
+    def tokenize(self, text: str) -> List[str]:
         """Tokenize Text."""
         return self.tokenizer.encode(text).tokens
 
-    def decode(self, ids: list[int]) -> str:
+    def decode(self, ids: List[int]) -> str:
         """Decode Text."""
         return self.tokenizer.decode(ids)
 
-    def get_vocab(self) -> dict[str, int]:
+    def get_vocab(self) -> Dict[str, int]:
         """Get Vocab."""
         return self.tokenizer.get_vocab()
 
diff --git 
a/multi_tokenizer/tokenizer.py b/multi_tokenizer/tokenizer.py index c824039..fd821aa 100644 --- a/multi_tokenizer/tokenizer.py +++ b/multi_tokenizer/tokenizer.py @@ -1,6 +1,7 @@ """Multi Tokenizer Module.""" import pickle +from typing import Dict, List, Tuple, Union from lingua import Language @@ -13,8 +14,8 @@ class MultiTokenizer: def __init__( self, - tokenizers: list[LanguageSpecificTokenizer | PretrainedTokenizers], - fallback_tokenizer: LanguageSpecificTokenizer | PretrainedTokenizers, + tokenizers: List[Union[LanguageSpecificTokenizer, PretrainedTokenizers]], + fallback_tokenizer: Union[LanguageSpecificTokenizer, PretrainedTokenizers], split_text: bool = False, sep: str = " ", ) -> None: @@ -41,7 +42,7 @@ def __init__( self.split_text = split_text self.sep = sep - def pre_tokenize(self, text: str) -> list[tuple[str, tuple[int, int]]]: + def pre_tokenize(self, text: str) -> List[Tuple[str, Tuple[int, int]]]: """Pre Tokenize Text.""" pre_tokenized_text = [] language_detections = ( @@ -62,7 +63,7 @@ def pre_tokenize(self, text: str) -> list[tuple[str, tuple[int, int]]]: detected_text = text[detection.start_index : detection.end_index] last_end_index = detection.end_index tokenizer = self.get_tokenizer_by_language(detection.language) - output: list[tuple[str, tuple[int, int]]] = ( + output: List[Tuple[str, Tuple[int, int]]] = ( tokenizer.pre_tokenizer.pre_tokenize_str(detected_text) ) output = ( @@ -117,7 +118,7 @@ def get_tokenizer_by_prefix_id(self, prefix_id: int) -> LanguageSpecificTokenize return tokenizer raise ValueError(f"Tokenizer for prefix ID {prefix_id} not found.") - def encode(self, text: str) -> list[int]: + def encode(self, text: str) -> List[int]: """Encode Text.""" ids = [] language_detections = ( @@ -147,7 +148,7 @@ def encode(self, text: str) -> list[int]: ids.extend(token_ids) return ids - def tokenize(self, text: str) -> list[str]: + def tokenize(self, text: str) -> List[str]: """Tokenize Text.""" tokens = [] language_detections = ( @@ -176,7 +177,7 @@ def tokenize(self, text: str) -> list[str]: tokens.extend(self.fallback_tokenizer.tokenize(text[last_end_index:])) return tokens - def decode(self, token_ids: list[int]) -> str: + def decode(self, token_ids: List[int]) -> str: """Decode Encoding.""" decoded_str = [] cur_tokenizer = None @@ -210,7 +211,7 @@ def load(path: str) -> "MultiTokenizer": with open(path, "rb") as file: return pickle.load(file) - def get_vocab(self) -> dict[str, dict[str, int]]: + def get_vocab(self) -> Dict[str, Dict[str, int]]: """Get Vocabulary.""" vocab = {} for tokenizer in self.tokenizers: diff --git a/pyproject.toml b/pyproject.toml index 71b0161..c1a8d2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ license = "MIT" readme = "README.md" [tool.poetry.dependencies] -python = "^3.12" +python = "^3.8.19" tokenizers = "^0.19.1" lingua-language-detector = "^2.0.2" diff --git a/support/try_multitokenizer.ipynb b/support/try_multitokenizer.ipynb index 529f8f3..7e20b82 100644 --- a/support/try_multitokenizer.ipynb +++ b/support/try_multitokenizer.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -27,6 +27,7 @@ " PretrainedTokenizers.ENGLISH,\n", " PretrainedTokenizers.CHINESE,\n", " PretrainedTokenizers.HINDI,\n", + " PretrainedTokenizers.SPANISH\n", "]\n", "fallback_tokenizer = PretrainedTokenizers.ENGLISH\n", "\n", @@ -35,71 +36,68 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "[('', (0, 1)),\n", - " ('The', (1, 4)),\n", - " ('Ġcat', (4, 8)),\n", - " ('Ġis', (8, 11)),\n", - " ('Ġcute', (11, 16)),\n", - " ('.', (16, 17)),\n", - " ('', (15, 16)),\n", - " (' ', (16, 17)),\n", - " ('', (17, 18)),\n", - " ('çĮ«å¾Īåı¯çĪ±', (18, 22)),\n", - " ('.', (22, 23)),\n", - " ('', (21, 22)),\n", - " (' ', (22, 23)),\n", - " ('', (23, 24)),\n", - " ('ब', (24, 25)),\n", - " ('ि', (25, 26)),\n", - " ('ल', (26, 27)),\n", - " ('à¥į', (27, 28)),\n", - " ('ल', (28, 29)),\n", - " ('à¥Ģ', (29, 30)),\n", - " ('Ġबह', (30, 33)),\n", - " ('à¥ģ', (33, 34)),\n", - " ('त', (34, 35)),\n", - " ('Ġप', (35, 37)),\n", - " ('à¥į', (37, 38)),\n", - " ('य', (38, 39)),\n", - " ('ा', (39, 40)),\n", - " ('र', (40, 41)),\n", - " ('à¥Ģ', (41, 42)),\n", - " ('Ġह', (42, 44)),\n", - " ('à¥Ī.', (44, 46)),\n", - " ('', (44, 45)),\n", - " (' නර්තනය ඉතා ආදරේ.', (45, 62))]" + " ('Translate', (1, 10)),\n", + " ('Ġthis', (10, 15)),\n", + " ('Ġhindi', (15, 21)),\n", + " ('Ġsentence', (21, 30)),\n", + " ('Ġto', (30, 33)),\n", + " ('Ġenglish', (33, 41)),\n", + " ('', (39, 40)),\n", + " (' ', (40, 41)),\n", + " ('', (41, 42)),\n", + " ('-', (42, 43)),\n", + " ('Ġब', (43, 45)),\n", + " ('ि', (45, 46)),\n", + " ('ल', (46, 47)),\n", + " ('à¥į', (47, 48)),\n", + " ('ल', (48, 49)),\n", + " ('à¥Ģ', (49, 50)),\n", + " ('Ġबह', (50, 53)),\n", + " ('à¥ģ', (53, 54)),\n", + " ('त', (54, 55)),\n", + " ('Ġप', (55, 57)),\n", + " ('à¥į', (57, 58)),\n", + " ('य', (58, 59)),\n", + " ('ा', (59, 60)),\n", + " ('र', (60, 61)),\n", + " ('à¥Ģ', (61, 62)),\n", + " ('Ġह', (62, 64)),\n", + " ('', (62, 63)),\n", + " ('ै.', (63, 65))]" ] }, - "execution_count": 17, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sentence = \"The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है.\"\n", - "# sentence = \"Translate this hindi sentence to english - बिल्ली बहुत प्यारी है.\"\n", - "sentence = \"The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.\"\n", + "sentence = \"Translate this hindi sentence to english - बिल्ली बहुत प्यारी है.\"\n", + "# sentence = \"The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.\"\n", + "# sentence = \"The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. 
El gato es lindo.\"\n", "tokenizer.pre_tokenize(sentence)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "62" + "65" ] }, - "execution_count": 18, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -110,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -119,7 +117,7 @@ "25000" ] }, - "execution_count": 19, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -130,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -140,15 +138,15 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['', 'The', 'Ġcat', 'Ġis', 'Ġcute', '.', '', 'Ġ', '', 'çĮ', '«', 'å¾Ī', 'åı¯', 'çĪ', '±', '.', '', 'Ġ', '', 'ब', 'ि', 'ल', 'à¥į', 'ल', 'à¥Ģ', 'Ġबह', 'à¥ģ', 'त', 'Ġप', 'à¥į', 'य', 'ा', 'र', 'à¥Ģ', 'Ġह', 'à¥Ī.', '', 'Ġ', 'à', '¶', '±', 'à', '¶', '»', 'à', '·', 'Ĭ', 'à', '¶', 'Ń', 'à', '¶', '±', 'à', '¶', 'º', 'Ġ', 'à', '¶', 'ī', 'à', '¶', 'Ń', 'à', '·', 'ı', 'Ġ', 'à', '¶', 'Ĩ', 'à', '¶', '¯', 'à', '¶', '»', 'à', '·', 'ļ', '.']\n", - "[3, 383, 714, 416, 2065, 24, 4, 231, 7, 1512, 115, 9849, 368, 439, 120, 24, 8, 231, 9, 329, 277, 285, 282, 285, 273, 342, 286, 283, 294, 282, 292, 270, 272, 273, 287, 919, 10, 231, 167, 125, 120, 167, 125, 130, 167, 126, 243, 167, 125, 266, 167, 125, 120, 167, 125, 129, 231, 167, 125, 242, 167, 125, 266, 167, 126, 248, 231, 167, 125, 239, 167, 125, 118, 167, 125, 130, 167, 126, 259, 24]\n" + "['', 'Tr', 'ans', 'l', 'ate', 'Ġthis', 'Ġhind', 'i', 'Ġsentence', 'Ġto', 'Ġeng', 'lish', '', 'Ġ', '', '-', 'Ġब', 'ि', 'ल', 'à¥į', 'ल', 'à¥Ģ', 'Ġबह', 'à¥ģ', 'त', 'Ġप', 'à¥į', 'य', 'ा', 'र', 'à¥Ģ', 'Ġह', '', 'à', '¥', 'Ī', '.']\n", + "[3, 7235, 6614, 86, 755, 775, 10763, 83, 19412, 276, 3602, 9113, 4, 231, 9, 23, 290, 277, 285, 282, 285, 273, 342, 286, 283, 294, 282, 292, 270, 272, 273, 287, 10, 167, 109, 241, 24]\n" ] } ], @@ -166,16 +164,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "80" + "37" ] }, - "execution_count": 22, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -186,15 +184,15 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Decoded String: The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. නර්තනය ඉතා ආදරේ.\n", - "Original String: The cat is cute. 猫很可爱. बिल्ली बहुत प्यारी है. 
නර්තනය ඉතා ආදරේ.\n" + "Decoded String: Translate this hindi sentence to english - बिल्ली बहुत प्यारी है.\n", + "Original String: Translate this hindi sentence to english - बिल्ली बहुत प्यारी है.\n" ] } ], @@ -205,14 +203,18 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'transformers'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoTokenizer\n\u001b[1;32m 3\u001b[0m aya_tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCohereForAI/aya-23-8B\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'transformers'" ] } ], @@ -224,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -244,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -253,7 +255,7 @@ "255029" ] }, - "execution_count": 26, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -264,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -282,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -301,14 +303,14 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "243 μs ± 29.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "200 μs ± 15 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -318,14 +320,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "892 μs ± 24.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + "909 μs ± 98.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -357,7 +359,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.8.19" } }, "nbformat": 4,