Skip to content

Commit

Permalink
Merge pull request #55 from ybiquitous/add-similarity-methods
Browse files Browse the repository at this point in the history
fix: add "similarity" methods and deprecate "distance" methods
  • Loading branch information
tonytonyjan authored May 16, 2024
2 parents 1327330 + 0fd7b5d commit d662816
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 78 deletions.
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
![test](https://github.com/tonytonyjan/jaro_winkler/actions/workflows/test.yml/badge.svg)

[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm which is written in C extension and will fallback to pure Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. **Both of C and Ruby implementation support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.**
[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of the [Jaro-Winkler similarity](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It is written as a C extension and falls back to a pure Ruby version on platforms other than MRI/KRI, such as JRuby or Rubinius. **Both the C and Ruby implementations support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.**

# Installation

Expand All @@ -13,30 +13,30 @@ gem install jaro_winkler
```ruby
require 'jaro_winkler'

# Jaro Winkler Distance
# Jaro Winkler Similarity

JaroWinkler.distance "MARTHA", "MARHTA"
JaroWinkler.similarity "MARTHA", "MARHTA"
# => 0.9611
JaroWinkler.distance "MARTHA", "marhta", ignore_case: true
JaroWinkler.similarity "MARTHA", "marhta", ignore_case: true
# => 0.9611
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
JaroWinkler.similarity "MARTHA", "MARHTA", weight: 0.2
# => 0.9778

# Jaro Distance
# Jaro Similarity

JaroWinkler.jaro_distance "MARTHA", "MARHTA"
JaroWinkler.jaro_similarity "MARTHA", "MARHTA"
# => 0.9444444444444445
```

There is no `JaroWinkler.jaro_winkler_distance`, it's tediously long.
There is no `JaroWinkler.jaro_winkler_similarity`, because that name is tediously long.

## Options

Name | Type | Default | Note
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro similarity above the threshold.
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".

# Adjusting Table
Expand Down
16 changes: 16 additions & 0 deletions ext/jaro_winkler/jaro_winkler.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;

VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self);
VALUE distance(int argc, VALUE *argv, VALUE self,
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2,
Expand All @@ -25,6 +27,10 @@ void Init_jaro_winkler_ext(void) {
rb_jaro_winkler_distance, -1);
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
-1);
rb_define_singleton_method(rb_mJaroWinkler, "similarity",
rb_jaro_winkler_similarity, -1);
rb_define_singleton_method(rb_mJaroWinkler, "jaro_similarity", rb_jaro_similarity,
-1);
}

VALUE distance(int argc, VALUE *argv, VALUE self,
Expand Down Expand Up @@ -69,9 +75,19 @@ VALUE distance(int argc, VALUE *argv, VALUE self,
}

/*
 * Deprecated entry point bound to JaroWinkler.jaro_distance.
 *
 * Emits a warning through rb_warn and then hands the unmodified argument
 * vector to rb_jaro_similarity, returning whatever that call returns.
 */
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self) {
  VALUE result;

  rb_warn("JaroWinkler.jaro_distance is deprecated. Use JaroWinkler.jaro_similarity instead.");
  result = rb_jaro_similarity(argc, argv, self);
  return result;
}

/*
 * Entry point bound to JaroWinkler.jaro_similarity.
 *
 * Delegates to the shared distance() driver, selecting
 * jaro_distance_from_codes as the scoring callback.
 */
VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self) {
  VALUE score = distance(argc, argv, self, jaro_distance_from_codes);
  return score;
}

/*
 * Deprecated entry point; the warning text refers to it as
 * JaroWinkler.distance.
 *
 * Emits a warning through rb_warn and then hands the unmodified argument
 * vector to rb_jaro_winkler_similarity, returning whatever that call returns.
 */
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self) {
  VALUE result;

  rb_warn("JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead.");
  result = rb_jaro_winkler_similarity(argc, argv, self);
  return result;
}

/*
 * Entry point bound to JaroWinkler.similarity.
 *
 * Delegates to the shared distance() driver, selecting
 * jaro_winkler_distance_from_codes as the scoring callback.
 */
VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self) {
  VALUE score = distance(argc, argv, self, jaro_winkler_distance_from_codes);
  return score;
}
10 changes: 10 additions & 0 deletions lib/jaro_winkler/jaro_winkler_pure.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,21 @@ class InvalidWeightError < Error; end

class << self
# Deprecated name for +similarity+.
#
# Prints a deprecation warning and then forwards all three arguments,
# unchanged, to +similarity+, returning its result.
def distance(str1, str2, options = {})
  warn 'JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead.'
  similarity str1, str2, options
end

# Scores two strings through +_distance+.
#
# The arguments are first passed to +validate!+ (defined elsewhere in this
# file); each string is then converted to an array of codepoints and handed,
# together with +options+, to +_distance+, whose result is returned.
def similarity(str1, str2, options = {})
  validate!(str1, str2)
  codes1, codes2 = [str1, str2].map { |text| text.codepoints.to_a }
  _distance(codes1, codes2, options)
end

# Deprecated name for +jaro_similarity+.
#
# Prints a deprecation warning and then forwards all three arguments,
# unchanged, to +jaro_similarity+, returning its result.
def jaro_distance(str1, str2, options = {})
  warn 'JaroWinkler.jaro_distance is deprecated. Use JaroWinkler.jaro_similarity instead.'
  jaro_similarity str1, str2, options
end

# Scores two strings through +_jaro_distance+.
#
# The arguments are first passed to +validate!+ (defined elsewhere in this
# file); each string is then converted to an array of codepoints and handed,
# together with +options+, to +_jaro_distance+, whose result is returned.
def jaro_similarity(str1, str2, options = {})
  validate!(str1, str2)
  codes1, codes2 = [str1, str2].map { |text| text.codepoints.to_a }
  _jaro_distance(codes1, codes2, options)
end
Expand Down
154 changes: 85 additions & 69 deletions test/tests.rb
Original file line number Diff line number Diff line change
@@ -1,110 +1,118 @@
# encoding: utf-8
module Tests
# Runs assert_similarity over a fixed table of
# [expected score, first string, second string] rows, in order.
def test_similarity
  expectations = [
    [0.9667, 'henka', 'henkan'],
    [1.0, 'al', 'al'],
    [0.9611, 'martha', 'marhta'],
    [0.8324, 'jones', 'johnson'],
    [0.9583, 'abcvwxyz', 'cabvwxyz'],
    [0.84, 'dwayne', 'duane'],
    [0.8133, 'dixon', 'dicksonx'],
    [0.0, 'fvie', 'ten'],
    [1.0, 'tony', 'tony'],
    [1.0, 'tonytonyjan', 'tonytonyjan'],
    [1.0, 'x', 'x'],
    [0.0, '', ''],
    [0.0, 'tony', ''],
    [0.0, '', 'tony'],
    [0.8727, 'tonytonyjan', 'tony'],
    [0.8727, 'tony', 'tonytonyjan'],
    [0.9407, 'necessary', 'nessecary'],
    [0.9067, 'does_exist', 'doesnt_exist'],
    [0.975, '12345678', '12345687'],
    [0.975, '12345678', '12345867'],
    [0.95, '12345678', '12348567']
  ]

  expectations.each do |score, left, right|
    assert_similarity score, left, right
  end
end

# Runs assert_jaro_similarity over a fixed table of
# [expected score, first string, second string] rows, in order.
def test_jaro_similarity
  expectations = [
    [0.9444, 'henka', 'henkan'],
    [1.0, 'al', 'al'],
    [0.9444, 'martha', 'marhta'],
    [0.7905, 'jones', 'johnson'],
    [0.9583, 'abcvwxyz', 'cabvwxyz'],
    [0.8222, 'dwayne', 'duane'],
    [0.7667, 'dixon', 'dicksonx'],
    [0.0, 'fvie', 'ten'],
    [1.0, 'tony', 'tony'],
    [1.0, 'tonytonyjan', 'tonytonyjan'],
    [1.0, 'x', 'x'],
    [0.0, '', ''],
    [0.0, 'tony', ''],
    [0.0, '', 'tony'],
    [0.7879, 'tonytonyjan', 'tony'],
    [0.7879, 'tony', 'tonytonyjan'],
    [0.9259, 'necessary', 'nessecary'],
    [0.8444, 'does_exist', 'doesnt_exist'],
    [0.9583, '12345678', '12345687'],
    [0.9583, '12345678', '12345867'],
    [0.9167, '12345678', '12348567'],
    [0.604, 'tonytonyjan', 'janjantony']
  ]

  expectations.each do |score, left, right|
    assert_jaro_similarity score, left, right
  end
end

def test_distance
assert_distance 0.9667, 'henka', 'henkan'
assert_distance 1.0, 'al', 'al'
assert_distance 0.9611, 'martha', 'marhta'
assert_distance 0.8324, 'jones', 'johnson'
assert_distance 0.9583, 'abcvwxyz', 'cabvwxyz'
assert_distance 0.84, 'dwayne', 'duane'
assert_distance 0.8133, 'dixon', 'dicksonx'
assert_distance 0.0, 'fvie', 'ten'
assert_distance 1.0, 'tony', 'tony'
assert_distance 1.0, 'tonytonyjan', 'tonytonyjan'
assert_distance 1.0, 'x', 'x'
assert_distance 0.0, '', ''
assert_distance 0.0, 'tony', ''
assert_distance 0.0, '', 'tony'
assert_distance 0.8727, 'tonytonyjan', 'tony'
assert_distance 0.8727, 'tony', 'tonytonyjan'
assert_distance 0.9407, 'necessary', 'nessecary'
assert_distance 0.9067, 'does_exist', 'doesnt_exist'
assert_distance 0.975, '12345678', '12345687'
assert_distance 0.975, '12345678', '12345867'
assert_distance 0.95, '12345678', '12348567'
assert_distance 0.9667, 'henka', 'henkan'
end

def test_jaro_distance
assert_jaro_distance 0.9444, 'henka', 'henkan'
assert_jaro_distance 1.0, 'al', 'al'
assert_jaro_distance 0.9444, 'martha', 'marhta'
assert_jaro_distance 0.7905, 'jones', 'johnson'
assert_jaro_distance 0.9583, 'abcvwxyz', 'cabvwxyz'
assert_jaro_distance 0.8222, 'dwayne', 'duane'
assert_jaro_distance 0.7667, 'dixon', 'dicksonx'
assert_jaro_distance 0.0, 'fvie', 'ten'
assert_jaro_distance 1.0, 'tony', 'tony'
assert_jaro_distance 1.0, 'tonytonyjan', 'tonytonyjan'
assert_jaro_distance 1.0, 'x', 'x'
assert_jaro_distance 0.0, '', ''
assert_jaro_distance 0.0, 'tony', ''
assert_jaro_distance 0.0, '', 'tony'
assert_jaro_distance 0.7879, 'tonytonyjan', 'tony'
assert_jaro_distance 0.7879, 'tony', 'tonytonyjan'
assert_jaro_distance 0.9259, 'necessary', 'nessecary'
assert_jaro_distance 0.8444, 'does_exist', 'doesnt_exist'
assert_jaro_distance 0.9583, '12345678', '12345687'
assert_jaro_distance 0.9583, '12345678', '12345867'
assert_jaro_distance 0.9167, '12345678', '12348567'
assert_jaro_distance 0.604, 'tonytonyjan', 'janjantony'
assert_jaro_distance 0.9444, 'henka', 'henkan'
end

def test_unicode
assert_distance 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生'
assert_distance 0.8222, '連勝文', '連勝丼'
assert_distance 0.8222, '馬英九', '馬英丸'
assert_distance 0.6667, '良い', 'いい'
assert_similarity 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生'
assert_similarity 0.8222, '連勝文', '連勝丼'
assert_similarity 0.8222, '馬英九', '馬英丸'
assert_similarity 0.6667, '良い', 'いい'
end

def test_ignore_case
assert_distance 0.9611, 'MARTHA', 'marhta', ignore_case: true
assert_similarity 0.9611, 'MARTHA', 'marhta', ignore_case: true
end

def test_weight
assert_distance 0.9778, 'MARTHA', 'MARHTA', weight: 0.2
assert_similarity 0.9778, 'MARTHA', 'MARHTA', weight: 0.2
end

def test_threshold
assert_distance 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99
assert_similarity 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99
end


def test_adjusting_table
assert_distance 0.9667, 'HENKA', 'HENKAN', adj_table: true
assert_distance 1.0, 'AL', 'AL', adj_table: true
assert_distance 0.9611, 'MARTHA', 'MARHTA', adj_table: true
assert_distance 0.8598, 'JONES', 'JOHNSON', adj_table: true
assert_distance 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true
assert_distance 0.8730, 'DWAYNE', 'DUANE', adj_table: true
assert_distance 0.8393, 'DIXON', 'DICKSONX', adj_table: true
assert_distance 0.0, 'FVIE', 'TEN', adj_table: true
assert_similarity 0.9667, 'HENKA', 'HENKAN', adj_table: true
assert_similarity 1.0, 'AL', 'AL', adj_table: true
assert_similarity 0.9611, 'MARTHA', 'MARHTA', adj_table: true
assert_similarity 0.8598, 'JONES', 'JOHNSON', adj_table: true
assert_similarity 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true
assert_similarity 0.8730, 'DWAYNE', 'DUANE', adj_table: true
assert_similarity 0.8393, 'DIXON', 'DICKSONX', adj_table: true
assert_similarity 0.0, 'FVIE', 'TEN', adj_table: true
end

def test_error
assert_raises JaroWinkler::InvalidWeightError do
JaroWinkler.distance 'MARTHA', 'MARHTA', weight: 0.26
JaroWinkler.similarity 'MARTHA', 'MARHTA', weight: 0.26
end
end

def test_long_string
JaroWinkler.distance 'haisai' * 20, 'haisai' * 20
JaroWinkler.similarity 'haisai' * 20, 'haisai' * 20
end

def test_encoding
assert_encoding '焦玟綾', '焦紋綾', Encoding::Big5
assert_encoding '簡煒航', '簡偉航', Encoding::Big5_HKSCS
assert_encoding '西島之', '西鳥志', Encoding::EUCJP
assert_encoding '松本行弘', '枩本行弘', Encoding::Shift_JIS
assert_distance 1.0, "\xe8".force_encoding('iso8859-1'), 'è'
assert_similarity 1.0, "\xe8".force_encoding('iso8859-1'), 'è'
end

def test_raises_type_error
assert_raises(TypeError){ JaroWinkler.distance 'MARTHA', nil }
assert_raises(TypeError){ JaroWinkler.distance nil, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.distance nil, nil }
assert_raises(TypeError){ JaroWinkler.distance 'MARTHA', :non_string }
assert_raises(TypeError){ JaroWinkler.distance :non_string, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.distance :non_string, :non_string }
assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', nil }
assert_raises(TypeError){ JaroWinkler.similarity nil, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.similarity nil, nil }
assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', :non_string }
assert_raises(TypeError){ JaroWinkler.similarity :non_string, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.similarity :non_string, :non_string }
end

private
Expand All @@ -114,10 +122,18 @@ def assert_distance score, str1, str2, **options
end

# Asserts that two strings score the same after being transcoded to
# +encoding+ as they do in their original encoding.
#
# The reference score is computed with JaroWinkler.similarity rather than the
# deprecated JaroWinkler.distance, so the encoding tests no longer emit
# deprecation warnings. Any +options+ are applied to both the reference
# computation and the assertion.
def assert_encoding str1, str2, encoding, **options
  expected = JaroWinkler.similarity(str1, str2, **options)
  assert_similarity expected, str1.encode(encoding), str2.encode(encoding), **options
end

# Computes JaroWinkler.jaro_distance for the two strings (forwarding any
# options) and checks the result against +score+ with assert_in_delta.
def assert_jaro_distance score, str1, str2, **options
  actual = JaroWinkler.jaro_distance(str1, str2, **options)
  assert_in_delta score, actual
end
end

# Computes JaroWinkler.similarity for the two strings (forwarding any
# options) and checks the result against +score+ with assert_in_delta.
def assert_similarity score, str1, str2, **options
  actual = JaroWinkler.similarity(str1, str2, **options)
  assert_in_delta score, actual
end

# Computes JaroWinkler.jaro_similarity for the two strings (forwarding any
# options) and checks the result against +score+ with assert_in_delta.
def assert_jaro_similarity score, str1, str2, **options
  actual = JaroWinkler.jaro_similarity(str1, str2, **options)
  assert_in_delta score, actual
end
end

0 comments on commit d662816

Please sign in to comment.