
Commit

added support to python 3.8 and up
chandralegend committed Aug 11, 2024
1 parent 11a9677 commit 6533d05
Showing 5 changed files with 94 additions and 89 deletions.
14 changes: 8 additions & 6 deletions multi_tokenizer/language_detect.py
@@ -1,27 +1,29 @@
"""Language Detection Module."""

from typing import List

from lingua import DetectionResult, Language, LanguageDetectorBuilder


class LanguageDetector:
"""Language Detector."""

def __init__(self, languages: list[Language]) -> None:
def __init__(self, languages: List[Language]) -> None:
"""Initialize Language Detector."""
self.languages = languages
self.detector = LanguageDetectorBuilder.from_languages(*languages).build()

def detect(self, text: str) -> list[DetectionResult]:
def detect(self, text: str) -> List[DetectionResult]:
"""Detect Language."""
results = self.detector.detect_multiple_languages_of(text)
return results

def split_n_detect(self, text: str, sep: str = " ") -> list[DetectionResult]:
def split_n_detect(self, text: str, sep: str = " ") -> List[DetectionResult]:
"""Split Text and Detect Language."""

def merge_results(
results: list[list[DetectionResult]],
) -> list[DetectionResult]:
results: List[List[DetectionResult]],
) -> List[DetectionResult]:
"""Merge Results. If consecutive words are detected as the same language, merge them."""
merged_results: list[DetectionResult] = []
for result in results:
@@ -60,7 +62,7 @@ def merge_results(
merged_results = merge_results(results)
return merged_results

def batch_detect(self, texts: list[str]) -> list[list[DetectionResult]]:
def batch_detect(self, texts: List[str]) -> List[List[DetectionResult]]:
"""Detect Language in Batch."""
results = self.detector.detect_multiple_languages_in_parallel_of(texts)
return results
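The annotation changes above are what enable Python 3.8 here: subscripting built-in containers in annotations (list[Language], list[DetectionResult]) only works at runtime from Python 3.9 (PEP 585), and because these annotations are evaluated when each method is defined, importing the module on 3.8 would raise TypeError. typing.List behaves identically on 3.8 and newer. A minimal usage sketch of the updated class, assuming lingua-language-detector is installed; the sample sentence is purely illustrative:

from lingua import Language

from multi_tokenizer.language_detect import LanguageDetector

detector = LanguageDetector([Language.ENGLISH, Language.FRENCH])
for result in detector.detect("Good morning. Bonjour tout le monde."):
    # Each lingua DetectionResult carries the detected language and its character span.
    print(result.language, result.start_index, result.end_index)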
14 changes: 7 additions & 7 deletions multi_tokenizer/pretrained/__init__.py
@@ -2,7 +2,7 @@

import os
from enum import Enum
from typing import Any
from typing import Any, Dict, List, Tuple

from lingua import Language

@@ -19,8 +19,8 @@ def __init__(
self,
tokenizer_path: str,
language: Language,
language_prefix: tuple[str, int],
language_suffix: tuple[str, int],
language_prefix: Tuple[str, int],
language_suffix: Tuple[str, int],
) -> None:
"""Initialize Language Specific Tokenizer."""
self.language = language
@@ -33,19 +33,19 @@ def pre_tokenizer(self) -> Any:
"""Get Pre Tokenizer."""
return self.tokenizer.pre_tokenizer

def encode(self, text: str) -> list[int]:
def encode(self, text: str) -> List[int]:
"""Get Encoder."""
return self.tokenizer.encode(text).ids

def tokenize(self, text: str) -> list[str]:
def tokenize(self, text: str) -> List[str]:
"""Tokenize Text."""
return self.tokenizer.encode(text).tokens

def decode(self, ids: list[int]) -> str:
def decode(self, ids: List[int]) -> str:
"""Decode Text."""
return self.tokenizer.decode(ids)

def get_vocab(self) -> dict[str, int]:
def get_vocab(self) -> Dict[str, int]:
"""Get Vocab."""
return self.tokenizer.get_vocab()

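The same substitution covers tuple[str, int] and dict[str, int] here, which become typing.Tuple and typing.Dict. A hypothetical sketch of the wrapper interface touched above; the tokenizer file path and the prefix/suffix (token, id) pairs are made-up placeholders, not values taken from the repository:

from lingua import Language

from multi_tokenizer.pretrained import LanguageSpecificTokenizer

tokenizer = LanguageSpecificTokenizer(
    tokenizer_path="tokenizers/english_tokenizer.json",  # placeholder path
    language=Language.ENGLISH,
    language_prefix=("<EN>", 3),   # assumed (token, id) pair
    language_suffix=("</EN>", 4),  # assumed (token, id) pair
)
ids = tokenizer.encode("hello world")      # List[int]
print(tokenizer.tokenize("hello world"))   # List[str]
print(tokenizer.decode(ids))               # back to text
print(len(tokenizer.get_vocab()))          # vocabulary size from Dict[str, int]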
17 changes: 9 additions & 8 deletions multi_tokenizer/tokenizer.py
@@ -1,6 +1,7 @@
"""Multi Tokenizer Module."""

import pickle
from typing import Dict, List, Tuple, Union

from lingua import Language

@@ -13,8 +14,8 @@ class MultiTokenizer:

def __init__(
self,
tokenizers: list[LanguageSpecificTokenizer | PretrainedTokenizers],
fallback_tokenizer: LanguageSpecificTokenizer | PretrainedTokenizers,
tokenizers: List[Union[LanguageSpecificTokenizer, PretrainedTokenizers]],
fallback_tokenizer: Union[LanguageSpecificTokenizer, PretrainedTokenizers],
split_text: bool = False,
sep: str = " ",
) -> None:
@@ -41,7 +42,7 @@ def __init__(
self.split_text = split_text
self.sep = sep

def pre_tokenize(self, text: str) -> list[tuple[str, tuple[int, int]]]:
def pre_tokenize(self, text: str) -> List[Tuple[str, Tuple[int, int]]]:
"""Pre Tokenize Text."""
pre_tokenized_text = []
language_detections = (
@@ -62,7 +63,7 @@ def pre_tokenize(self, text: str) -> list[tuple[str, tuple[int, int]]]:
detected_text = text[detection.start_index : detection.end_index]
last_end_index = detection.end_index
tokenizer = self.get_tokenizer_by_language(detection.language)
output: list[tuple[str, tuple[int, int]]] = (
output: List[Tuple[str, Tuple[int, int]]] = (
tokenizer.pre_tokenizer.pre_tokenize_str(detected_text)
)
output = (
@@ -117,7 +118,7 @@ def get_tokenizer_by_prefix_id(self, prefix_id: int) -> LanguageSpecificTokenize
return tokenizer
raise ValueError(f"Tokenizer for prefix ID {prefix_id} not found.")

def encode(self, text: str) -> list[int]:
def encode(self, text: str) -> List[int]:
"""Encode Text."""
ids = []
language_detections = (
@@ -147,7 +148,7 @@ def encode(self, text: str) -> list[int]:
ids.extend(token_ids)
return ids

def tokenize(self, text: str) -> list[str]:
def tokenize(self, text: str) -> List[str]:
"""Tokenize Text."""
tokens = []
language_detections = (
@@ -176,7 +177,7 @@ def tokenize(self, text: str) -> list[str]:
tokens.extend(self.fallback_tokenizer.tokenize(text[last_end_index:]))
return tokens

def decode(self, token_ids: list[int]) -> str:
def decode(self, token_ids: List[int]) -> str:
"""Decode Encoding."""
decoded_str = []
cur_tokenizer = None
@@ -210,7 +211,7 @@ def load(path: str) -> "MultiTokenizer":
with open(path, "rb") as file:
return pickle.load(file)

def get_vocab(self) -> dict[str, dict[str, int]]:
def get_vocab(self) -> Dict[str, Dict[str, int]]:
"""Get Vocabulary."""
vocab = {}
for tokenizer in self.tokenizers:
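Besides swapping in List, Tuple, and Dict, this file also needed Union: the constructor annotated its parameters with LanguageSpecificTokenizer | PretrainedTokenizers, and the | union syntax in evaluated annotations requires Python 3.10. A hedged usage sketch of the class; the PretrainedTokenizers members named below (ENGLISH, SPANISH) are assumptions about that enum, not verified against the repository:

from multi_tokenizer.pretrained import PretrainedTokenizers
from multi_tokenizer.tokenizer import MultiTokenizer

tokenizer = MultiTokenizer(
    tokenizers=[PretrainedTokenizers.ENGLISH, PretrainedTokenizers.SPANISH],  # assumed members
    fallback_tokenizer=PretrainedTokenizers.ENGLISH,
    split_text=False,
)
ids = tokenizer.encode("Hello there. Hola, como estas?")  # routed to per-language tokenizers
print(tokenizer.decode(ids))
# A pickled instance can be restored with the static loader shown in the diff:
# restored = MultiTokenizer.load("tokenizer.pkl")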
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -7,7 +7,7 @@ license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.12"
python = "^3.8.19"
tokenizers = "^0.19.1"
lingua-language-detector = "^2.0.2"

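Poetry's caret constraint ^3.8.19 means >=3.8.19, <4.0.0, so the project now declares a 3.8 floor rather than requiring 3.12. A quick smoke check, not part of the commit, that the changed modules import on the oldest allowed interpreter; the imports would fail on 3.8 if any builtin-generic annotations remained, because those annotations are evaluated at import time:

import sys

# Mirror the declared floor from pyproject.toml before importing the package.
assert sys.version_info >= (3, 8, 19), "interpreter is older than the declared floor"

import multi_tokenizer.language_detect
import multi_tokenizer.tokenizer

print("imports OK on Python", sys.version.split()[0])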