root / org.gvsig.toolbox / trunk / org.gvsig.toolbox / org.gvsig.toolbox.algorithm / src / main / java / es / unex / sextante / tables / normalityTest / SWilk.java @ 59
History | View | Annotate | Download (16.5 KB)
1 |
package es.unex.sextante.tables.normalityTest; |
---|---|
2 |
|
3 |
|
4 |
/**
|
5 |
* Calculates the Shapiro-Wilk W test and its significance level.
|
6 |
*
|
7 |
* Ported from original FORTRAN 77 code from the journal Applied Statistics published by the Royal Statistical Society and
|
8 |
* distributed by Carnegie Mellon University at http://lib.stat.cmu.edu/apstat/.
|
9 |
*
|
10 |
* To help facilitate debugging and maintenance, this port has been changed as little as feasible from the original FORTRAN 77
|
11 |
* code, to allow comparisons with the original. Variable names have been left alone when possible (except for capitalizing
|
12 |
* constants), and the logic flow (though translated and indented) is essentially unchanged.
|
13 |
*
|
14 |
* The original FORTRAN source for these routines has been released by the Royal Statistical Society for free public distribution,
|
15 |
* and this Java implementation is released to the public domain.
|
16 |
*
|
17 |
* This java implementation is part of the limewire project.
|
18 |
*/
|
19 |
public class SWilk { |
20 |
/*
|
21 |
* Constants and polynomial coefficients for swilk(). NOTE: FORTRAN counts the elements of the array x[length] as
|
22 |
* x[1] through x[length], not x[0] through x[length-1]. To avoid making pervasive, subtle changes to the algorithm
|
23 |
* (which would inevitably introduce pervasive, subtle bugs) the referenced arrays are padded with an unused 0th
|
24 |
* element, and the algorithm is ported so as to continue accessing from [1] through [length].
|
25 |
*/
|
26 |
private static final double C1[] = { Double.NaN, 0.0E0, 0.221157E0, -0.147981E0, -0.207119E1, 0.4434685E1, -0.2706056E1 }; |
27 |
private static final double C2[] = { Double.NaN, 0.0E0, 0.42981E-1, -0.293762E0, -0.1752461E1, 0.5682633E1, -0.3582633E1 }; |
28 |
private static final double C3[] = { Double.NaN, 0.5440E0, -0.39978E0, 0.25054E-1, -0.6714E-3 }; |
29 |
private static final double C4[] = { Double.NaN, 0.13822E1, -0.77857E0, 0.62767E-1, -0.20322E-2 }; |
30 |
private static final double C5[] = { Double.NaN, -0.15861E1, -0.31082E0, -0.83751E-1, 0.38915E-2 }; |
31 |
private static final double C6[] = { Double.NaN, -0.4803E0, -0.82676E-1, 0.30302E-2 }; |
32 |
private static final double C7[] = { Double.NaN, 0.164E0, 0.533E0 }; |
33 |
private static final double C8[] = { Double.NaN, 0.1736E0, 0.315E0 }; |
34 |
private static final double C9[] = { Double.NaN, 0.256E0, -0.635E-2 }; |
35 |
private static final double G[] = { Double.NaN, -0.2273E1, 0.459E0 }; |
36 |
private static final double Z90 = 0.12816E1, Z95 = 0.16449E1, Z99 = 0.23263E1; |
37 |
private static final double ZM = 0.17509E1, ZSS = 0.56268E0; |
38 |
private static final double BF1 = 0.8378E0, XX90 = 0.556E0, XX95 = 0.622E0; |
39 |
private static final double SQRTH = 0.70711E0, TH = 0.375E0, SMALL = 1E-19; |
40 |
private static final double PI6 = 0.1909859E1, STQR = 0.1047198E1; |
41 |
private static final boolean UPPER = true; |
42 |
|
43 |
|
44 |
/**
|
45 |
* ALGORITHM AS R94 APPL. STATIST. (1995) VOL.44, NO.4
|
46 |
*
|
47 |
* Calculates Shapiro-Wilk normality test and P-value for sample sizes 3 <= n <= 5000 . Handles censored or uncensored data.
|
48 |
* Corrects AS 181, which was found to be inaccurate for n > 50.
|
49 |
*
|
50 |
* NOTE: Semi-strange porting kludge alert. FORTRAN allows subroutine arguments to be modified by the called routine (passed by
|
51 |
* reference, not value), and the original code for this routine makes use of that feature to return multiple results. To avoid
|
52 |
* changing the code any more than necessary, I've used Java arrays to simulate this pass-by-reference feature. Specifically,
|
53 |
* in the original code w, pw, and ifault are output results, not input parameters. Pass in double[1] arrays for w and pw, and
|
54 |
* an int[1] array for ifault, and extract the computed values from the [0] element on return. The argument init is both input
|
55 |
* and output; use a boolean[1] array and initialize [0] to false before the first call. The routine will update the value to
|
56 |
* true to record that initialization has been performed, to speed up subsequent calls on the same data set. Note that although
|
57 |
* the contents of a[] will be computed by the routine on the first call, the caller must still allocate the array space and
|
58 |
* pass the unfilled array in to the subroutine. The routine will set the contents but not allocate the space.
|
59 |
*
|
60 |
* As described above with the constants, the data arrays x[] and a[] are referenced with a base element of 1 (like FORTRAN)
|
61 |
* instead of 0 (like Java) to avoid screwing up the algorithm. To pass in 100 data points, declare x[101] and fill elements
|
62 |
* x[1] through x[100] with data. x[0] will be ignored.
|
63 |
*
|
64 |
* You might want to eliminate the ifault parameter completely, and throw Java exceptions instead. I didn't want to change the
|
65 |
* code that much.
|
66 |
*
|
67 |
* @param init
|
68 |
* Input & output; pass in boolean[1], initialize to false before first call, routine will set to true
|
69 |
* @param x
|
70 |
* Input; Data set to analyze; 100 points go in x[101] array from x[1] through x[100]
|
71 |
* @param n
|
72 |
* Input; Number of data points in x
|
73 |
* @param n1
|
74 |
* Input; dunno
|
75 |
* @param n2
|
76 |
* Input; dunno either
|
77 |
* @param a
|
78 |
* Output when init[0] == false, Input when init[0] == true; holds computed test coefficients
|
79 |
* @param w
|
80 |
* Output; pass in double[1], will contain result in w[0] on return
|
81 |
* @param pw
|
82 |
* Output; pass in double[1], will contain result in pw[0] on return
|
83 |
* @param ifault
|
84 |
* Output; pass in int[1], will contain error code (0 == good) in ifault[0] on return
|
85 |
*/
|
86 |
public static void swilk(final boolean[] init, |
87 |
final double[] x, |
88 |
final int n, |
89 |
final int n1, |
90 |
final int n2, |
91 |
final double[] a, |
92 |
final double[] w, |
93 |
final double[] pw, |
94 |
final int[] ifault) { |
95 |
|
96 |
pw[0] = 1.0; |
97 |
if (w[0] >= 0.0) { |
98 |
w[0] = 1.0; |
99 |
} |
100 |
final double an = n; |
101 |
ifault[0] = 3; |
102 |
final int nn2 = n / 2; |
103 |
if (n2 < nn2) {
|
104 |
return;
|
105 |
} |
106 |
ifault[0] = 1; |
107 |
if (n < 3) { |
108 |
return;
|
109 |
} |
110 |
|
111 |
// If INIT is false, calculates coefficients for the test
|
112 |
|
113 |
if (!init[0]) { |
114 |
if (n == 3) { |
115 |
a[1] = SQRTH;
|
116 |
} |
117 |
else {
|
118 |
final double an25 = an + 0.25; |
119 |
double summ2 = 0.0; |
120 |
for (int i = 1; i <= n2; ++i) { |
121 |
a[i] = ppnd((i - TH) / an25); |
122 |
summ2 += a[i] * a[i]; |
123 |
} |
124 |
summ2 *= 2.0;
|
125 |
final double ssumm2 = Math.sqrt(summ2); |
126 |
final double rsn = 1.0 / Math.sqrt(an); |
127 |
final double a1 = poly(C1, 6, rsn) - a[1] / ssumm2; |
128 |
|
129 |
// Normalize coefficients
|
130 |
|
131 |
int i1;
|
132 |
double fac;
|
133 |
if (n > 5) { |
134 |
i1 = 3;
|
135 |
final double a2 = -a[2] / ssumm2 + poly(C2, 6, rsn); |
136 |
fac = Math.sqrt((summ2 - 2.0 * a[1] * a[1] - 2.0 * a[2] * a[2]) / (1.0 - 2.0 * a1 * a1 - 2.0 * a2 * a2)); |
137 |
a[1] = a1;
|
138 |
a[2] = a2;
|
139 |
} |
140 |
else {
|
141 |
i1 = 2;
|
142 |
fac = Math.sqrt((summ2 - 2.0 * a[1] * a[1]) / (1.0 - 2.0 * a1 * a1)); |
143 |
a[1] = a1;
|
144 |
} |
145 |
for (int i = i1; i <= nn2; ++i) { |
146 |
a[i] = -a[i] / fac; |
147 |
} |
148 |
} |
149 |
init[0] = true; |
150 |
} |
151 |
if (n1 < 3) { |
152 |
return;
|
153 |
} |
154 |
final int ncens = n - n1; |
155 |
ifault[0] = 4; |
156 |
if (ncens < 0 || (ncens > 0 && n < 20)) { |
157 |
return;
|
158 |
} |
159 |
ifault[0] = 5; |
160 |
final double delta = ncens / an; |
161 |
if (delta > 0.8) { |
162 |
return;
|
163 |
} |
164 |
|
165 |
// If W input as negative, calculate significance level of -W
|
166 |
|
167 |
double w1, xx;
|
168 |
if (w[0] < 0.0) { |
169 |
w1 = 1.0 + w[0]; |
170 |
ifault[0] = 0; |
171 |
} |
172 |
else {
|
173 |
|
174 |
// Check for zero range
|
175 |
|
176 |
ifault[0] = 6; |
177 |
final double range = x[n1] - x[1]; |
178 |
if (range < SMALL) {
|
179 |
return;
|
180 |
} |
181 |
|
182 |
// Check for correct sort order on range - scaled X
|
183 |
|
184 |
ifault[0] = 7; |
185 |
xx = x[1] / range;
|
186 |
double sx = xx;
|
187 |
double sa = -a[1]; |
188 |
int j = n - 1; |
189 |
for (int i = 2; i <= n1; ++i) { |
190 |
final double xi = x[i] / range; |
191 |
// IF (XX-XI .GT. SMALL) PRINT *,' ANYTHING'
|
192 |
sx += xi; |
193 |
if (i != j) {
|
194 |
sa += sign(1, i - j) * a[Math.min(i, j)]; |
195 |
} |
196 |
xx = xi; |
197 |
--j; |
198 |
} |
199 |
ifault[0] = 0; |
200 |
if (n > 5000) { |
201 |
ifault[0] = 2; |
202 |
} |
203 |
|
204 |
// Calculate W statistic as squared correlation between data and coefficients
|
205 |
|
206 |
sa /= n1; |
207 |
sx /= n1; |
208 |
double ssa = 0.0; |
209 |
double ssx = 0.0; |
210 |
double sax = 0.0; |
211 |
j = n; |
212 |
double asa;
|
213 |
for (int i = 1; i <= n1; ++i) { |
214 |
if (i != j) {
|
215 |
asa = sign(1, i - j) * a[Math.min(i, j)] - sa; |
216 |
} |
217 |
else {
|
218 |
asa = -sa; |
219 |
} |
220 |
final double xsx = x[i] / range - sx; |
221 |
ssa += asa * asa; |
222 |
ssx += xsx * xsx; |
223 |
sax += asa * xsx; |
224 |
--j; |
225 |
} |
226 |
|
227 |
// W1 equals (1-W) calculated to avoid excessive rounding error
|
228 |
// for W very near 1 (a potential problem in very large samples)
|
229 |
|
230 |
final double ssassx = Math.sqrt(ssa * ssx); |
231 |
w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx); |
232 |
} |
233 |
w[0] = 1.0 - w1; |
234 |
|
235 |
// Calculate significance level for W (exact for N=3)
|
236 |
|
237 |
if (n == 3) { |
238 |
pw[0] = PI6 * (Math.asin(Math.sqrt(w[0])) - STQR); |
239 |
return;
|
240 |
} |
241 |
double y = Math.log(w1); |
242 |
xx = Math.log(an);
|
243 |
double m = 0.0; |
244 |
double s = 1.0; |
245 |
if (n <= 11) { |
246 |
final double gamma = poly(G, 2, an); |
247 |
if (y >= gamma) {
|
248 |
pw[0] = SMALL;
|
249 |
return;
|
250 |
} |
251 |
y = -Math.log(gamma - y);
|
252 |
m = poly(C3, 4, an);
|
253 |
s = Math.exp(poly(C4, 4, an)); |
254 |
} |
255 |
else {
|
256 |
m = poly(C5, 4, xx);
|
257 |
s = Math.exp(poly(C6, 3, xx)); |
258 |
} |
259 |
if (ncens > 0) { |
260 |
|
261 |
// Censoring by proportion NCENS/N. Calculate mean and sd of normal equivalent deviate of W.
|
262 |
|
263 |
final double ld = -Math.log(delta); |
264 |
final double bf = 1.0 + xx * BF1; |
265 |
final double z90f = Z90 + bf * Math.pow(poly(C7, 2, Math.pow(XX90, xx)), ld); |
266 |
final double z95f = Z95 + bf * Math.pow(poly(C8, 2, Math.pow(XX95, xx)), ld); |
267 |
final double z99f = Z99 + bf * Math.pow(poly(C9, 2, xx), ld); |
268 |
|
269 |
// Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
|
270 |
// pseudo-mean and pseudo-sd of z as the slope and intercept
|
271 |
|
272 |
final double zfm = (z90f + z95f + z99f) / 3.0; |
273 |
final double zsd = (Z90 * (z90f - zfm) + Z95 * (z95f - zfm) + Z99 * (z99f - zfm)) / ZSS; |
274 |
final double zbar = zfm - zsd * ZM; |
275 |
m += zbar * s; |
276 |
s *= zsd; |
277 |
} |
278 |
pw[0] = alnorm((y - m) / s, UPPER);
|
279 |
} |
280 |
|
281 |
|
282 |
/**
|
283 |
* Constructs an int with the absolute value of x and the sign of y
|
284 |
*
|
285 |
* @param x
|
286 |
* int to copy absolute value from
|
287 |
* @param y
|
288 |
* int to copy sign from
|
289 |
* @return int with absolute value of x and sign of y
|
290 |
*/
|
291 |
private static int sign(final int x, |
292 |
final int y) { |
293 |
int result = Math.abs(x); |
294 |
if (y < 0.0) { |
295 |
result = -result; |
296 |
} |
297 |
return result;
|
298 |
} |
299 |
|
300 |
// Constants & polynomial coefficients for ppnd(), slightly renamed to avoid conflicts. Could define
|
301 |
// them inside ppnd(), but static constants are more efficient.
|
302 |
|
303 |
// Coefficients for P close to 0.5
|
304 |
private static final double A0_p = 3.3871327179E+00, A1_p = 5.0434271938E+01, A2_p = 1.5929113202E+02, |
305 |
A3_p = 5.9109374720E+01, B1_p = 1.7895169469E+01, B2_p = 7.8757757664E+01, B3_p = 6.7187563600E+01; |
306 |
|
307 |
// Coefficients for P not close to 0, 0.5 or 1 (names changed to avoid conflict with swilk())
|
308 |
private static final double C0_p = 1.4234372777E+00, C1_p = 2.7568153900E+00, C2_p = 1.3067284816E+00, |
309 |
C3_p = 1.7023821103E-01, D1_p = 7.3700164250E-01, D2_p = 1.2021132975E-01; |
310 |
|
311 |
// Coefficients for P near 0 or 1.
|
312 |
private static final double E0_p = 6.6579051150E+00, E1_p = 3.0812263860E+00, E2_p = 4.2868294337E-01, |
313 |
E3_p = 1.7337203997E-02, F1_p = 2.4197894225E-01, F2_p = 1.2258202635E-02; |
314 |
|
315 |
private static final double SPLIT1 = 0.425, SPLIT2 = 5.0, CONST1 = 0.180625, CONST2 = 1.6; |
316 |
|
317 |
|
318 |
/**
|
319 |
* ALGORITHM AS 241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477-484.
|
320 |
*
|
321 |
* Produces the normal deviate Z corresponding to a given lower tail area of P; Z is accurate to about 1 part in 10**7.
|
322 |
*
|
323 |
* @param p
|
324 |
* @return
|
325 |
*/
|
326 |
private static double ppnd(final double p) { |
327 |
final double q = p - 0.5; |
328 |
double r;
|
329 |
if (Math.abs(q) <= SPLIT1) { |
330 |
r = CONST1 - q * q; |
331 |
return q * (((A3_p * r + A2_p) * r + A1_p) * r + A0_p) / (((B3_p * r + B2_p) * r + B1_p) * r + 1.0); |
332 |
} |
333 |
else {
|
334 |
if (q < 0.0) { |
335 |
r = p; |
336 |
} |
337 |
else {
|
338 |
r = 1.0 - p;
|
339 |
} |
340 |
if (r <= 0.0) { |
341 |
return 0.0; |
342 |
} |
343 |
r = Math.sqrt(-Math.log(r)); |
344 |
double normal_dev;
|
345 |
if (r <= SPLIT2) {
|
346 |
r -= CONST2; |
347 |
normal_dev = (((C3_p * r + C2_p) * r + C1_p) * r + C0_p) / ((D2_p * r + D1_p) * r + 1.0);
|
348 |
} |
349 |
else {
|
350 |
r -= SPLIT2; |
351 |
normal_dev = (((E3_p * r + E2_p) * r + E1_p) * r + E0_p) / ((F2_p * r + F1_p) * r + 1.0);
|
352 |
} |
353 |
if (q < 0.0) { |
354 |
normal_dev = -normal_dev; |
355 |
} |
356 |
return normal_dev;
|
357 |
} |
358 |
} |
359 |
|
360 |
|
361 |
/**
|
362 |
* Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
|
363 |
*
|
364 |
* Calculates the algebraic polynomial of order nord-1 with array of coefficients c. Zero order coefficient is c[1]
|
365 |
*
|
366 |
* @param c
|
367 |
* @param nord
|
368 |
* @param x
|
369 |
* @return
|
370 |
*/
|
371 |
private static double poly(final double[] c, |
372 |
final int nord, |
373 |
final double x) { |
374 |
double poly = c[1]; |
375 |
if (nord == 1) { |
376 |
return poly;
|
377 |
} |
378 |
double p = x * c[nord];
|
379 |
if (nord != 2) { |
380 |
final int n2 = nord - 2; |
381 |
int j = n2 + 1; |
382 |
for (int i = 1; i <= n2; ++i) { |
383 |
p = (p + c[j]) * x; |
384 |
--j; |
385 |
} |
386 |
} |
387 |
poly += p; |
388 |
return poly;
|
389 |
} |
390 |
|
391 |
// Constants & polynomial coefficients for alnorm(), slightly renamed to avoid conflicts.
|
392 |
private static final double CON_a = 1.28, LTONE_a = 7.0, UTZERO_a = 18.66; |
393 |
private static final double P_a = 0.398942280444, Q_a = 0.39990348504, R_a = 0.398942280385, A1_a = 5.75885480458, |
394 |
A2_a = 2.62433121679, A3_a = 5.92885724438, B1_a = -29.8213557807, B2_a = 48.6959930692, C1_a = -3.8052E-8, |
395 |
C2_a = 3.98064794E-4, C3_a = -0.151679116635, C4_a = 4.8385912808, C5_a = 0.742380924027, C6_a = 3.99019417011, |
396 |
D1_a = 1.00000615302, D2_a = 1.98615381364, D3_a = 5.29330324926, D4_a = -15.1508972451, D5_a = 30.789933034; |
397 |
|
398 |
|
399 |
/**
|
400 |
* Algorithm AS66 Applied Statistics (1973) vol.22, no.3
|
401 |
*
|
402 |
* Evaluates the tail area of the standardised normal curve from x to infinity if upper is true or from minus infinity to x if
|
403 |
* upper is false.
|
404 |
*
|
405 |
* @param x
|
406 |
* @param upper
|
407 |
* @return
|
408 |
*/
|
409 |
private static double alnorm(final double x, |
410 |
final boolean upper) { |
411 |
boolean up = upper;
|
412 |
double z = x;
|
413 |
if (z < 0.0) { |
414 |
up = !up; |
415 |
z = -z; |
416 |
} |
417 |
double fn_val;
|
418 |
if (z > LTONE_a && (!up || z > UTZERO_a)) {
|
419 |
fn_val = 0.0;
|
420 |
} |
421 |
else {
|
422 |
double y = 0.5 * z * z; |
423 |
if (z <= CON_a) {
|
424 |
fn_val = 0.5 - z * (P_a - Q_a * y / (y + A1_a + B1_a / (y + A2_a + B2_a / (y + A3_a))));
|
425 |
} |
426 |
else {
|
427 |
fn_val = R_a |
428 |
* Math.exp(-y)
|
429 |
/ (z + C1_a + D1_a |
430 |
/ (z + C2_a + D2_a / (z + C3_a + D3_a / (z + C4_a + D4_a / (z + C5_a + D5_a / (z + C6_a)))))); |
431 |
} |
432 |
} |
433 |
if (!up) {
|
434 |
fn_val = 1.0 - fn_val;
|
435 |
} |
436 |
return fn_val;
|
437 |
} |
438 |
} |