Revision 27057 branches/v2_0_0_prep/libraries/libGeocoding/src/org/gvsig/geocoding/distance/LevenshteinDistance.java

View differences:

LevenshteinDistance.java
29 29

  
30 30
/**
31 31
 * Levenshtein Distance
32
 * 
32 33
 * @author vsanjaime
33
 *
34
 * 
34 35
 */
35 36
public class LevenshteinDistance {
36 37

  
37

  
38 38
	/**
39 39
	 * calculate levenshtein distance between two strings
40
	 * 
40 41
	 * @param str1
41 42
	 * @param str2
42 43
	 * @return
......
46 47
				.toCharArray());
47 48
	}
48 49

  
49
	
50 50
	/**
51 51
	 * get levenshtein distance between two strings
52
	 * 
52 53
	 * @param s
53 54
	 * @param t
54 55
	 * @return
55 56
	 */
56
	public static int getLevenshteinDistance (String s, String t) {
57
		  if (s == null || t == null) {
58
		    throw new IllegalArgumentException("Strings must not be null");
59
		  }
60
				
61
		  /*
62
		    The difference between this impl. and the previous is that, rather 
63
		     than creating and retaining a matrix of size s.length()+1 by t.length()+1, 
64
		     we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
65
		     is the 'current working' distance array that maintains the newest distance cost
66
		     counts as we iterate through the characters of String s.  Each time we increment
67
		     the index of String t we are comparing, d is copied to p, the second int[].  Doing so
68
		     allows us to retain the previous cost counts as required by the algorithm (taking 
69
		     the minimum of the cost count to the left, up one, and diagonally up and to the left
70
		     of the current cost count being calculated).  (Note that the arrays aren't really 
71
		     copied anymore, just switched...this is clearly much better than cloning an array 
72
		     or doing a System.arraycopy() each time  through the outer loop.)
57
	public static int getLevenshteinDistance(String s, String t) {
58
		if (s == null || t == null) {
59
			throw new IllegalArgumentException("Strings must not be null");
60
		}
73 61

  
74
		     Effectively, the difference between the two implementations is this one does not 
75
		     cause an out of memory condition when calculating the LD over two very large strings.  		
76
		  */		
77
				
78
		  int n = s.length(); // length of s
79
		  int m = t.length(); // length of t
80
				
81
		  if (n == 0) {
82
		    return m;
83
		  } else if (m == 0) {
84
		    return n;
85
		  }
62
		/*
63
		 * The difference between this impl. and the previous is that, rather
64
		 * than creating and retaining a matrix of size s.length()+1 by
65
		 * t.length()+1, we maintain two single-dimensional arrays of length
66
		 * s.length()+1. The first, d, is the 'current working' distance array
67
		 * that maintains the newest distance cost counts as we iterate through
68
		 * the characters of String s. Each time we increment the index of
69
		 * String t we are comparing, d is copied to p, the second int[]. Doing
70
		 * so allows us to retain the previous cost counts as required by the
71
		 * algorithm (taking the minimum of the cost count to the left, up one,
72
		 * and diagonally up and to the left of the current cost count being
73
		 * calculated). (Note that the arrays aren't really copied anymore, just
74
		 * switched...this is clearly much better than cloning an array or doing
75
		 * a System.arraycopy() each time through the outer loop.)
76
		 * 
77
		 * Effectively, the difference between the two implementations is this
78
		 * one does not cause an out of memory condition when calculating the LD
79
		 * over two very large strings.
80
		 */
86 81

  
87
		  int p[] = new int[n+1]; //'previous' cost array, horizontally
88
		  int d[] = new int[n+1]; // cost array, horizontally
89
		  int _d[]; //placeholder to assist in swapping p and d
82
		int n = s.length(); // length of s
83
		int m = t.length(); // length of t
90 84

  
91
		  // indexes into strings s and t
92
		  int i; // iterates through s
93
		  int j; // iterates through t
85
		if (n == 0) {
86
			return m;
87
		} else if (m == 0) {
88
			return n;
89
		}
94 90

  
95
		  char t_j; // jth character of t
91
		int p[] = new int[n + 1]; // 'previous' cost array, horizontally
92
		int d[] = new int[n + 1]; // cost array, horizontally
93
		int _d[]; // placeholder to assist in swapping p and d
96 94

  
97
		  int cost; // cost
95
		// indexes into strings s and t
96
		int i; // iterates through s
97
		int j; // iterates through t
98 98

  
99
		  for (i = 0; i<=n; i++) {
100
		     p[i] = i;
101
		  }
102
				
103
		  for (j = 1; j<=m; j++) {
104
		     t_j = t.charAt(j-1);
105
		     d[0] = j;
106
				
107
		     for (i=1; i<=n; i++) {
108
		        cost = s.charAt(i-1)==t_j ? 0 : 1;
109
		        // minimum of cell to the left+1, to the top+1, diagonally left and up +cost				
110
		        d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);  
111
		     }
99
		char t_j; // jth character of t
112 100

  
113
		     // copy current distance counts to 'previous row' distance counts
114
		     _d = p;
115
		     p = d;
116
		     d = _d;
117
		  } 
118
				
119
		  // our last action in the above loop was to switch d and p, so p now 
120
		  // actually has the most recent cost counts
121
		  return p[n];
101
		int cost; // cost
102

  
103
		for (i = 0; i <= n; i++) {
104
			p[i] = i;
122 105
		}
123
	
106

  
107
		for (j = 1; j <= m; j++) {
108
			t_j = t.charAt(j - 1);
109
			d[0] = j;
110

  
111
			for (i = 1; i <= n; i++) {
112
				cost = s.charAt(i - 1) == t_j ? 0 : 1;
113
				// minimum of cell to the left+1, to the top+1, diagonally left
114
				// and up +cost
115
				d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1]
116
						+ cost);
117
			}
118

  
119
			// copy current distance counts to 'previous row' distance counts
120
			_d = p;
121
			p = d;
122
			d = _d;
123
		}
124

  
125
		// our last action in the above loop was to switch d and p, so p now
126
		// actually has the most recent cost counts
127
		return p[n];
128
	}
129

  
124 130
	/**
125 131
	 * 
126 132
	 * @param a
......
135 141
			return b;
136 142
		return c;
137 143
	}
138
	
144

  
139 145
	/**
140 146
	 * calculate levenshtein distance between two chars
147
	 * 
141 148
	 * @param str1
142 149
	 * @param str2
143 150
	 * @return

Also available in: Unified diff