Skip to content

Commit

Permalink
wip(language): language models from Hugging Face
Browse files Browse the repository at this point in the history
  • Loading branch information
bouassaba committed Nov 15, 2024
1 parent 1953d06 commit d2b0931
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 8 deletions.
12 changes: 4 additions & 8 deletions language/api/routers/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@
# by the GNU Affero General Public License v3.0 only, included in the file
# licenses/AGPL.txt.

from typing import TypedDict
from flask import Blueprint, request, jsonify
import spacy.cli
from ..services.models import nlp
from ..services.entities import EntityExtractor

# Blueprint that groups the entity-extraction HTTP routes.
#
# No spaCy pipeline is created in this module: the per-language ``nlp``
# mapping is imported from ``..services.models``. Rebinding ``nlp`` here (as
# the former single multi-language pipeline did) would shadow that mapping
# and break the ``nlp[language]`` lookup performed by the route.
bp = Blueprint("entities", __name__)


@bp.route("/v3/entities", methods=["POST"])
def get_entities():
    """Extract entities from the posted text using a per-language pipeline.

    Expects a JSON object body with a ``text`` field and a ``language``
    field; ``language`` is used as the key into the module-level ``nlp``
    mapping to pick the pipeline handed to ``EntityExtractor``.

    Returns:
        The JSON-serialised result of ``EntityExtractor.run(text)``, or a
        400 response carrying an ``error`` message when the body is not a
        JSON object, a required field is missing, or no pipeline is
        registered for the requested language.
    """
    # silent=True yields None instead of raising on a missing/invalid body,
    # so every malformed request takes the same explicit 400 path below.
    content = request.get_json(silent=True)
    if not isinstance(content, dict):
        return jsonify({"error": "request body must be a JSON object"}), 400

    missing = [field for field in ("text", "language") if field not in content]
    if missing:
        return jsonify({"error": f"missing field(s): {', '.join(missing)}"}), 400

    text = content["text"]
    language = content["language"]

    # A bare nlp[language] would surface an unknown (or unhashable) language
    # value as an unhandled exception, i.e. an HTTP 500.
    try:
        pipeline = nlp[language]
    except (KeyError, TypeError):
        return jsonify({"error": f"unsupported language: {language!r}"}), 400

    entity_extractor = EntityExtractor(pipeline)
    dtos = entity_extractor.run(text)

    return jsonify(dtos)
88 changes: 88 additions & 0 deletions language/api/services/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import importlib
import subprocess
import sys

import pip
import pkg_resources
import spacy.cli

# Maps a three-letter language code to the spaCy model used for it.
# Each entry holds:
#   "package": name of the installable spaCy model package
#   "url":     wheel location on Hugging Face used to install that package
# Entries that are commented out are currently disabled.
models = {
    "eng": {
        "package": "en_core_web_sm",
        "url": "https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl",
    },
    "deu": {
        "package": "de_core_news_sm",
        "url": "https://huggingface.co/spacy/de_core_news_sm/resolve/main/de_core_news_sm-any-py3-none-any.whl",
    },
    # French uses the "fr" model; this entry previously pointed at the
    # Finnish ("fi") package and wheel.
    "fra": {
        "package": "fr_core_news_sm",
        "url": "https://huggingface.co/spacy/fr_core_news_sm/resolve/main/fr_core_news_sm-any-py3-none-any.whl",
    },
    "ita": {
        "package": "it_core_news_sm",
        "url": "https://huggingface.co/spacy/it_core_news_sm/resolve/main/it_core_news_sm-any-py3-none-any.whl",
    },
    "jpn": {
        "package": "ja_core_news_sm",
        "url": "https://huggingface.co/spacy/ja_core_news_sm/resolve/main/ja_core_news_sm-any-py3-none-any.whl",
    },
    "nld": {
        "package": "nl_core_news_sm",
        "url": "https://huggingface.co/spacy/nl_core_news_sm/resolve/main/nl_core_news_sm-any-py3-none-any.whl",
    },
    # "por": {
    #     "package": "pt_core_news_sm",
    #     "url": "https://huggingface.co/spacy/pt_core_news_sm/resolve/main/pt_core_news_sm-any-py3-none-any.whl",
    # },
    # "spa": {
    #     "package": "es_core_news_sm",
    #     "url": "https://huggingface.co/spacy/es_core_news_sm/resolve/main/es_core_news_sm-any-py3-none-any.whl",
    # },
    # "swe": {
    #     "package": "sv_core_news_sm",
    #     "url": "https://huggingface.co/spacy/sv_core_news_sm/resolve/main/sv_core_news_sm-any-py3-none-any.whl",
    # },
    # "nor": {
    #     "package": "nb_core_news_sm",
    #     "url": "https://huggingface.co/spacy/nb_core_news_sm/resolve/main/nb_core_news_sm-any-py3-none-any.whl",
    # },
    "fin": {
        "package": "fi_core_news_sm",
        "url": "https://huggingface.co/spacy/fi_core_news_sm/resolve/main/fi_core_news_sm-any-py3-none-any.whl",
    },
    # "dan": {
    #     "package": "da_core_news_sm",
    #     "url": "https://huggingface.co/spacy/da_core_news_sm/resolve/main/da_core_news_sm-any-py3-none-any.whl",
    # },
    # "chi_sim": {
    #     "package": "zh_core_web_sm",
    #     "url": "https://huggingface.co/spacy/zh_core_web_sm/resolve/main/zh_core_web_sm-any-py3-none-any.whl",
    # },
    # "chi_tra": {
    #     "package": "zh_core_web_sm",
    #     "url": "https://huggingface.co/spacy/zh_core_web_sm/resolve/main/zh_core_web_sm-any-py3-none-any.whl",
    # },
    # "rus": {
    #     "package": "ru_core_news_sm",
    #     "url": "https://huggingface.co/spacy/ru_core_news_sm/resolve/main/ru_core_news_sm-any-py3-none-any.whl",
    # },
    # Hindi and Arabic both map to the shared "xx" (multi-language) model.
    "hin": {
        "package": "xx_ent_wiki_sm",
        "url": "https://huggingface.co/spacy/xx_ent_wiki_sm/resolve/main/xx_ent_wiki_sm-any-py3-none-any.whl",
    },
    "ara": {
        "package": "xx_ent_wiki_sm",
        "url": "https://huggingface.co/spacy/xx_ent_wiki_sm/resolve/main/xx_ent_wiki_sm-any-py3-none-any.whl",
    },
}


# Build ``nlp``: language code -> loaded spaCy pipeline with a "sentencizer"
# component added. Model packages that are not installed yet are installed
# from their wheel URL before being loaded.
nlp = {}
package_max_length = max(len(model["package"]) for model in models.values())

# Snapshot the installed distributions once instead of rebuilding the list on
# every iteration. NOTE(review): pkg_resources keys are lower-cased and
# typically use "-" where the model names use "_", so both sides are
# normalised to "-" before comparing -- confirm against the deployed
# setuptools version.
installed_packages = {
    pkg.key.replace("_", "-") for pkg in pkg_resources.working_set
}

# Several language codes can share one package; load each package once and
# reuse the pipeline rather than holding duplicate copies in memory.
loaded_pipelines = {}

for key, model in models.items():
    package = model["package"]
    url = model["url"]

    if package not in loaded_pipelines:
        if package.replace("_", "-").lower() not in installed_packages:
            # Install only when the package is missing (the previous check
            # was inverted). pip has no supported in-process API, so run it
            # as a subprocess of the current interpreter; a failed install
            # raises CalledProcessError instead of being ignored.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", f"{package} @ {url}"]
            )
            # Make the freshly installed package visible to the import
            # system of this already-running process.
            importlib.invalidate_caches()
        else:
            highlighted_package = f"\033[1m{package.ljust(package_max_length)}\033[0m"
            print(f"🧠 Model {highlighted_package} is already installed.")

        pipeline = spacy.load(package)
        pipeline.add_pipe("sentencizer")
        loaded_pipelines[package] = pipeline

    nlp[key] = loaded_pipelines[package]

0 comments on commit d2b0931

Please sign in to comment.