Revision 27057 branches/v2_0_0_prep/libraries/libGeocoding/src/org/gvsig/geocoding/distance/LevenshteinDistance.java
LevenshteinDistance.java | ||
---|---|---|
29 | 29 |
|
30 | 30 |
/** |
31 | 31 |
* Levenshtein Distance |
32 |
* |
|
32 | 33 |
* @author vsanjaime |
33 |
* |
|
34 |
*
|
|
34 | 35 |
*/ |
35 | 36 |
public class LevenshteinDistance { |
36 | 37 |
|
37 |
|
|
38 | 38 |
/** |
39 | 39 |
* calculate levenshtein distance between two strings |
40 |
* |
|
40 | 41 |
* @param str1 |
41 | 42 |
* @param str2 |
42 | 43 |
* @return |
... | ... | |
46 | 47 |
.toCharArray()); |
47 | 48 |
} |
48 | 49 |
|
49 |
|
|
50 | 50 |
/** |
51 | 51 |
* get levenshtein distance between two strings |
52 |
* |
|
52 | 53 |
* @param s |
53 | 54 |
* @param t |
54 | 55 |
* @return |
55 | 56 |
*/ |
56 |
public static int getLevenshteinDistance (String s, String t) { |
|
57 |
if (s == null || t == null) { |
|
58 |
throw new IllegalArgumentException("Strings must not be null"); |
|
59 |
} |
|
60 |
|
|
61 |
/* |
|
62 |
The difference between this impl. and the previous is that, rather |
|
63 |
than creating and retaining a matrix of size s.length()+1 by t.length()+1, |
|
64 |
we maintain two single-dimensional arrays of length s.length()+1. The first, d, |
|
65 |
is the 'current working' distance array that maintains the newest distance cost |
|
66 |
counts as we iterate through the characters of String s. Each time we increment |
|
67 |
the index of String t we are comparing, d is copied to p, the second int[]. Doing so |
|
68 |
allows us to retain the previous cost counts as required by the algorithm (taking |
|
69 |
the minimum of the cost count to the left, up one, and diagonally up and to the left |
|
70 |
of the current cost count being calculated). (Note that the arrays aren't really |
|
71 |
copied anymore, just switched...this is clearly much better than cloning an array |
|
72 |
or doing a System.arraycopy() each time through the outer loop.) |
|
57 |
public static int getLevenshteinDistance(String s, String t) { |
|
58 |
if (s == null || t == null) { |
|
59 |
throw new IllegalArgumentException("Strings must not be null"); |
|
60 |
} |
|
73 | 61 |
|
74 |
Effectively, the difference between the two implementations is this one does not |
|
75 |
cause an out of memory condition when calculating the LD over two very large strings. |
|
76 |
*/ |
|
77 |
|
|
78 |
int n = s.length(); // length of s |
|
79 |
int m = t.length(); // length of t |
|
80 |
|
|
81 |
if (n == 0) { |
|
82 |
return m; |
|
83 |
} else if (m == 0) { |
|
84 |
return n; |
|
85 |
} |
|
62 |
/* |
|
63 |
* The difference between this impl. and the previous is that, rather |
|
64 |
* than creating and retaining a matrix of size s.length()+1 by |
|
65 |
* t.length()+1, we maintain two single-dimensional arrays of length |
|
66 |
* s.length()+1. The first, d, is the 'current working' distance array |
|
67 |
* that maintains the newest distance cost counts as we iterate through |
|
68 |
* the characters of String s. Each time we increment the index of |
|
69 |
* String t we are comparing, d is copied to p, the second int[]. Doing |
|
70 |
* so allows us to retain the previous cost counts as required by the |
|
71 |
* algorithm (taking the minimum of the cost count to the left, up one, |
|
72 |
* and diagonally up and to the left of the current cost count being |
|
73 |
* calculated). (Note that the arrays aren't really copied anymore, just |
|
74 |
* switched...this is clearly much better than cloning an array or doing |
|
75 |
* a System.arraycopy() each time through the outer loop.) |
|
76 |
* |
|
77 |
* Effectively, the difference between the two implementations is this |
|
78 |
* one does not cause an out of memory condition when calculating the LD |
|
79 |
* over two very large strings. |
|
80 |
*/ |
|
86 | 81 |
|
87 |
int p[] = new int[n+1]; //'previous' cost array, horizontally |
|
88 |
int d[] = new int[n+1]; // cost array, horizontally |
|
89 |
int _d[]; //placeholder to assist in swapping p and d |
|
82 |
int n = s.length(); // length of s |
|
83 |
int m = t.length(); // length of t |
|
90 | 84 |
|
91 |
// indexes into strings s and t |
|
92 |
int i; // iterates through s |
|
93 |
int j; // iterates through t |
|
85 |
if (n == 0) { |
|
86 |
return m; |
|
87 |
} else if (m == 0) { |
|
88 |
return n; |
|
89 |
} |
|
94 | 90 |
|
95 |
char t_j; // jth character of t |
|
91 |
int p[] = new int[n + 1]; // 'previous' cost array, horizontally |
|
92 |
int d[] = new int[n + 1]; // cost array, horizontally |
|
93 |
int _d[]; // placeholder to assist in swapping p and d |
|
96 | 94 |
|
97 |
int cost; // cost |
|
95 |
// indexes into strings s and t |
|
96 |
int i; // iterates through s |
|
97 |
int j; // iterates through t |
|
98 | 98 |
|
99 |
for (i = 0; i<=n; i++) { |
|
100 |
p[i] = i; |
|
101 |
} |
|
102 |
|
|
103 |
for (j = 1; j<=m; j++) { |
|
104 |
t_j = t.charAt(j-1); |
|
105 |
d[0] = j; |
|
106 |
|
|
107 |
for (i=1; i<=n; i++) { |
|
108 |
cost = s.charAt(i-1)==t_j ? 0 : 1; |
|
109 |
// minimum of cell to the left+1, to the top+1, diagonally left and up +cost |
|
110 |
d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); |
|
111 |
} |
|
99 |
char t_j; // jth character of t |
|
112 | 100 |
|
113 |
// copy current distance counts to 'previous row' distance counts |
|
114 |
_d = p; |
|
115 |
p = d; |
|
116 |
d = _d; |
|
117 |
} |
|
118 |
|
|
119 |
// our last action in the above loop was to switch d and p, so p now |
|
120 |
// actually has the most recent cost counts |
|
121 |
return p[n]; |
|
101 |
int cost; // cost |
|
102 |
|
|
103 |
for (i = 0; i <= n; i++) { |
|
104 |
p[i] = i; |
|
122 | 105 |
} |
123 |
|
|
106 |
|
|
107 |
for (j = 1; j <= m; j++) { |
|
108 |
t_j = t.charAt(j - 1); |
|
109 |
d[0] = j; |
|
110 |
|
|
111 |
for (i = 1; i <= n; i++) { |
|
112 |
cost = s.charAt(i - 1) == t_j ? 0 : 1; |
|
113 |
// minimum of cell to the left+1, to the top+1, diagonally left |
|
114 |
// and up +cost |
|
115 |
d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] |
|
116 |
+ cost); |
|
117 |
} |
|
118 |
|
|
119 |
// copy current distance counts to 'previous row' distance counts |
|
120 |
_d = p; |
|
121 |
p = d; |
|
122 |
d = _d; |
|
123 |
} |
|
124 |
|
|
125 |
// our last action in the above loop was to switch d and p, so p now |
|
126 |
// actually has the most recent cost counts |
|
127 |
return p[n]; |
|
128 |
} |
|
129 |
|
|
124 | 130 |
/** |
125 | 131 |
* |
126 | 132 |
* @param a |
... | ... | |
135 | 141 |
return b; |
136 | 142 |
return c; |
137 | 143 |
} |
138 |
|
|
144 |
|
|
139 | 145 |
/** |
140 | 146 |
* calculate levenshtein distance between two chars |
147 |
* |
|
141 | 148 |
* @param str1 |
142 | 149 |
* @param str2 |
143 | 150 |
* @return |
Also available in: Unified diff