1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """A class to calculate a similarity based on the Levenshtein
22 distance. See http://en.wikipedia.org/wiki/Levenshtein_distance.
23
24 If available, the python-Levenshtein package will be used which will provide
25 better performance as it is implemented natively. See
26 http://trific.ath.cx/python/levenshtein/
27 """
28
29 import math
30
31
33 """Calculates the distance for use in similarity calculation. Python
34 version."""
35 l1 = len(a)
36 l2 = len(b)
37 if stopvalue == -1:
38 stopvalue = l2
39 current = range(l1 + 1)
40 for i in range(1, l2 + 1):
41 previous, current = current, [i] + [0] * l1
42 least = l2
43 for j in range(1, l1 + 1):
44 change = previous[j-1]
45 if a[j-1] != b[i-1]:
46 change = change + 1
47 insert = previous[j] + 1
48 delete = current[j-1] + 1
49 current[j] = min(insert, delete, change)
50 if least > current[j]:
51 least = current[j]
52
53
54 if least > stopvalue:
55 return least
56
57 return current[l1]
58
59
61 """Same as python_distance in functionality. This uses the fast C
62 version if we detected it earlier.
63
64 Note that this does not support arbitrary sequence types, but only
65 string types."""
66 return Levenshtein.distance(a, b)
67
68 try:
69 import Levenshtein as Levenshtein
70 distance = native_distance
71 except ImportError:
72 import logging
73 logging.warning("Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching.")
74 distance = python_distance
75
76
78
80 self.MAX_LEN = max_len
81
101
103 """Returns the similarity between a and b based on Levenshtein distance. It
104 can stop prematurely as soon as it sees that a and b will be no simmilar than
105 the percentage specified in stoppercentage.
106
107 The Levenshtein distance is calculated, but the following should be noted:
108 - Only the first MAX_LEN characters are considered. Long strings differing
109 at the end will therefore seem to match better than they should. See the
110 use of the variable penalty to lessen the effect of this.
111 - Strings with widely different lengths give the opportunity for shortcut.
112 This is by definition of the Levenshtein distance: the distance will be
113 at least as much as the difference in string length.
114 - Calculation is stopped as soon as a similarity of stoppercentage becomes
115 unattainable. See the use of the variable stopvalue.
116 - Implementation uses memory O(min(len(a), len(b))
117 - Excecution time is O(len(a)*len(b))
118 """
119 l1, l2 = len(a), len(b)
120 if l1 == 0 or l2 == 0:
121 return 0
122
123 if l1 > l2:
124 l1, l2 = l2, l1
125 a, b = b, a
126
127
128
129 maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2
130 if maxsimilarity < stoppercentage:
131 return maxsimilarity * 1.0
132
133
134 penalty = 0
135 if l2 > self.MAX_LEN:
136 b = b[:self.MAX_LEN]
137 l2 = self.MAX_LEN
138 penalty += 7
139 if l1 > self.MAX_LEN:
140 a = a[:self.MAX_LEN]
141 l1 = self.MAX_LEN
142 penalty += 7
143
144
145 stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2)
146 dist = distance(a, b, stopvalue)
147 if dist > stopvalue:
148 return stoppercentage - 1.0
149
150
151
152 if dist != 0:
153 penalty = 0
154 return 100 - (dist*1.0/l2)*100 - penalty
155
156
157 if __name__ == "__main__":
158 from sys import argv
159 comparer = LevenshteinComparer()
160 print "Similarity:\n%s" % comparer.similarity(argv[1], argv[2], 50)
161