-
Notifications
You must be signed in to change notification settings - Fork 1
/
company_name_similarity.py
92 lines (82 loc) · 4.12 KB
/
company_name_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import csv
import os
from collections import defaultdict
import sys
import difflib
import string
from time import time
import random
import re
import datetime
import pickle
import decimal
from company_score_tfidf import keyword_score_map
class CompanyNameSimilarity:
    """Token-based similarity scoring between two company names.

    Typical flow: clean each name with ``normalize_company_name``, build the
    set of its whitespace-separated tokens, then call ``match_score`` with
    both the strings and the token sets.
    """

    # Characters that ``preprocess`` deletes outright.
    _DROPPED_CHARS = ('\n', '\r', '|', ',')
    # Characters that ``preprocess`` turns into a single space.
    _SPACED_CHARS = ("'", '/', '&', '-', '.', '@', '#')

    # Generic legal-form / organisational words dropped during normalisation.
    _STOP_WORDS = frozenset([
        "organisation", "org", "inc", "ltd", "labs", "lab", "llc",
        "llp", "corporation", "corp", "fed", "plc", "co", "svc",
        "services", "service", "company", "dept", "department",
        "assoc", "association", "limited", "incorporation",
    ])

    # Short forms expanded during normalisation.
    _ABBREVIATIONS = {
        "cu": "credit union",
    }

    # Minimum average overlap ratio for two names to be compared at all.
    _APPROX_CONTAINED_THRESHOLD = 0.3

    # Minimum difflib ratio for two unmatched words to count as a fuzzy match.
    _CLOSE_MATCH_CUTOFF = 0.6

    def preprocess(self, field):
        """Lower-case ``field`` and neutralise punctuation.

        Newlines, carriage returns, ``|`` and ``,`` are removed; the
        characters ``' / & - . @ #`` are each replaced by a space.  The
        result is NOT stripped, so it may carry leading/trailing spaces.
        """
        cleaned = field.lower()
        for char in self._DROPPED_CHARS:
            cleaned = cleaned.replace(char, '')
        for char in self._SPACED_CHARS:
            cleaned = cleaned.replace(char, ' ')
        return cleaned

    def normalize_company_name(self, company_name):
        """Return ``company_name`` cleaned, without stop words, abbreviations expanded.

        The name is split on whitespace; each word is preprocessed and
        stripped once.  Words that reduce to nothing (pure punctuation) or to
        a stop word are dropped; known abbreviations are replaced by their
        expansion.  Surviving words are joined with single spaces.
        """
        kept = []
        for raw_word in company_name.split():
            token = self.preprocess(raw_word).strip()
            # Skip punctuation-only words so the result has no empty slots.
            if not token or token in self._STOP_WORDS:
                continue
            # Look up the *stripped* token so that "CU." expands like "CU".
            kept.append(self._ABBREVIATIONS.get(token, token))
        return " ".join(kept)

    def is_company_approx_contained(self, str1, str2, str1_set, str2_set, str12_set):
        """Tell whether the two names share enough tokens to be worth scoring.

        Args:
            str1: first name; an empty value short-circuits to ``False``.
            str2: second name (not inspected here; kept for the interface).
            str1_set: token set of the first name.
            str2_set: token set of the second name.
            str12_set: tokens common to both sets.

        Returns:
            ``True`` when the mean of ``|common| / |set1|`` and
            ``|common| / |set2|`` is strictly above the threshold (0.3).
        """
        if not str1 or not str1_set or not str2_set:
            return False
        shared = float(len(str12_set))  # float() keeps the ratios fractional
        mean_overlap = (shared / len(str1_set) + shared / len(str2_set)) / 2
        return mean_overlap > self._APPROX_CONTAINED_THRESHOLD

    def match_score(self, str1, str2, str1_set, str2_set, mode='reflex'):
        """Score how well ``str1`` matches ``str2``.

        Exact token overlap counts one point per shared token; tokens that
        are not shared contribute a fuzzy partial score (see
        ``compute_partial_match_score``).

        Args:
            str1: first (already normalised) name.
            str2: second (already normalised) name.
            str1_set: token set of ``str1``.
            str2_set: token set of ``str2``.
            mode: ``'non-reflex'`` scores only ``str1`` against ``str2``,
                normalised by ``len(str1_set)``; ``'reflex'`` (default)
                averages that with the reverse direction.

        Returns:
            ``0`` when either input is empty or the names do not overlap
            enough; otherwise the score as a float.  It can be negative when
            unmatched words are penalised heavily.

        Raises:
            ValueError: if ``mode`` is neither ``'reflex'`` nor
                ``'non-reflex'`` (reached only once both names pass the
                overlap gate).
        """
        if not str1 or not str1_set or not str2_set:
            return 0

        common = str1_set & str2_set
        if not self.is_company_approx_contained(str1, str2, str1_set, str2_set, common):
            return 0

        unmatched_1 = [word for word in str1.split() if word not in common]
        unmatched_2 = [word for word in str2.split() if word not in common]
        shared = len(common)

        forward = (
            shared + self.compute_partial_match_score(unmatched_1, unmatched_2)
        ) / float(len(str1_set))
        if mode == 'non-reflex':
            return forward

        if mode == 'reflex':
            backward = (
                shared + self.compute_partial_match_score(unmatched_2, unmatched_1)
            ) / float(len(str2_set))
            # Average BOTH directions; dividing only the second term by two
            # would let identical names score above 1.
            return (forward + backward) / 2

        raise ValueError(
            "mode must be 'reflex' or 'non-reflex', got %r" % (mode,)
        )

    def compute_partial_match_score(self, set_str1, set_str2):
        """Fuzzy-score the words of ``set_str1`` against those of ``set_str2``.

        Each word in ``set_str1`` is paired with its closest not-yet-used
        word in ``set_str2`` (difflib ratio >= 0.6).  A paired word adds the
        similarity ratio; an unpaired word subtracts its weight as given by
        ``score_company_name``.

        ``set_str2`` is copied before use, so the caller's sequence is left
        untouched.

        Returns:
            ``0`` when ``set_str1`` is empty, otherwise the accumulated score.
        """
        if not set_str1:
            return 0

        # Work on a copy: used candidates are removed as we go, and the
        # caller may reuse its list for the reverse-direction computation.
        candidates = list(set_str2)
        score = 0
        for word in set_str1:
            closest = difflib.get_close_matches(
                word, candidates, n=1, cutoff=self._CLOSE_MATCH_CUTOFF
            )
            if closest:
                best = closest[0]
                score += difflib.SequenceMatcher(None, word, best).ratio()
                # Each candidate may be paired with only one word.
                candidates.remove(best)
            else:
                score -= float(self.score_company_name({word}))
        return score

    def score_company_name(self, word_set):
        """Sum the weights of the words in ``word_set``.

        A word found in ``keyword_score_map`` (imported from
        ``company_score_tfidf``) contributes its mapped value converted to
        float; any other word contributes ``1.0``.
        """
        score = 0
        for word in word_set:
            if word in keyword_score_map:
                score += float(keyword_score_map[word])
            else:
                score += 1.0
        return score