-
Notifications
You must be signed in to change notification settings - Fork 7
/
Models.py
67 lines (55 loc) · 2.6 KB
/
Models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec
import nltk
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import streamlit as st
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
@st.cache_resource
def get_HF_embeddings(sentences):
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt', max_length=512)
# Compute token embeddings
with torch.no_grad():
model_output = model(**encoded_input)
# Perform pooling. In this case, max pooling.
embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
# print("Sentence embeddings:")
# print(embeddings)
return embeddings
@st.cache_data
def get_doc2vec_embeddings(JD, text_resume):
nltk.download("punkt")
data = [JD]
resume_embeddings = []
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
#print (tagged_data)
model = gensim.models.doc2vec.Doc2Vec(vector_size=512, min_count=3, epochs=80)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
JD_embeddings = np.transpose(model.docvecs['0'].reshape(-1,1))
for i in text_resume:
text = word_tokenize(i.lower())
embeddings = model.infer_vector(text)
resume_embeddings.append(np.transpose(embeddings.reshape(-1,1)))
return (JD_embeddings, resume_embeddings)
def cosine(embeddings1, embeddings2):
# get the match percentage
score_list = []
for i in embeddings1:
matchPercentage = cosine_similarity(np.array(i), np.array(embeddings2))
matchPercentage = np.round(matchPercentage, 4)*100 # round to two decimal
print("Your resume matches about" + str(matchPercentage[0])+ "% of the job description.")
score_list.append(str(matchPercentage[0][0]))
return score_list