Statistics
| Revision:

root / org.gvsig.toolbox / trunk / org.gvsig.toolbox / org.gvsig.toolbox.algorithm / src / main / java / es / unex / sextante / tables / normalityTest / SWilk.java @ 59

History | View | Annotate | Download (16.5 KB)

1
package es.unex.sextante.tables.normalityTest;
2

    
3

    
4
/**
5
 * Calculates the Shapiro-Wilk W test and its significance level.
6
 *
7
 * Ported from original FORTRAN 77 code from the journal Applied Statistics published by the Royal Statistical Society and
8
 * distributed by Carnegie Mellon University at http://lib.stat.cmu.edu/apstat/.
9
 *
10
 * To help facilitate debugging and maintenance, this port has been changed as little as feasible from the original FORTRAN 77
11
 * code, to allow comparisons with the original. Variable names have been left alone when possible (except for capitalizing
12
 * constants), and the logic flow (though translated and indented) is essentially unchanged.
13
 *
14
 * The original FORTRAN source for these routines has been released by the Royal Statistical Society for free public distribution,
15
 * and this Java implementation is released to the public domain.
16
 *
17
 * This java implementation is part of the limewire project.
18
 */
19
public class SWilk {
20
   /*
21
    * Constants and polynomial coefficients for swilk(). NOTE: FORTRAN counts the elements of the array x[length] as
22
    * x[1] through x[length], not x[0] through x[length-1]. To avoid making pervasive, subtle changes to the algorithm
23
    * (which would inevitably introduce pervasive, subtle bugs) the referenced arrays are padded with an unused 0th
24
    * element, and the algorithm is ported so as to continue accessing from [1] through [length].
25
    */
26
   private static final double  C1[]  = { Double.NaN, 0.0E0, 0.221157E0, -0.147981E0, -0.207119E1, 0.4434685E1, -0.2706056E1 };
27
   private static final double  C2[]  = { Double.NaN, 0.0E0, 0.42981E-1, -0.293762E0, -0.1752461E1, 0.5682633E1, -0.3582633E1 };
28
   private static final double  C3[]  = { Double.NaN, 0.5440E0, -0.39978E0, 0.25054E-1, -0.6714E-3 };
29
   private static final double  C4[]  = { Double.NaN, 0.13822E1, -0.77857E0, 0.62767E-1, -0.20322E-2 };
30
   private static final double  C5[]  = { Double.NaN, -0.15861E1, -0.31082E0, -0.83751E-1, 0.38915E-2 };
31
   private static final double  C6[]  = { Double.NaN, -0.4803E0, -0.82676E-1, 0.30302E-2 };
32
   private static final double  C7[]  = { Double.NaN, 0.164E0, 0.533E0 };
33
   private static final double  C8[]  = { Double.NaN, 0.1736E0, 0.315E0 };
34
   private static final double  C9[]  = { Double.NaN, 0.256E0, -0.635E-2 };
35
   private static final double  G[]   = { Double.NaN, -0.2273E1, 0.459E0 };
36
   private static final double  Z90   = 0.12816E1, Z95 = 0.16449E1, Z99 = 0.23263E1;
37
   private static final double  ZM    = 0.17509E1, ZSS = 0.56268E0;
38
   private static final double  BF1   = 0.8378E0, XX90 = 0.556E0, XX95 = 0.622E0;
39
   private static final double  SQRTH = 0.70711E0, TH = 0.375E0, SMALL = 1E-19;
40
   private static final double  PI6   = 0.1909859E1, STQR = 0.1047198E1;
41
   private static final boolean UPPER = true;
42

    
43

    
44
   /**
45
    * ALGORITHM AS R94 APPL. STATIST. (1995) VOL.44, NO.4
46
    *
47
    * Calculates Shapiro-Wilk normality test and P-value for sample sizes 3 <= n <= 5000 . Handles censored or uncensored data.
48
    * Corrects AS 181, which was found to be inaccurate for n > 50.
49
    *
50
    * NOTE: Semi-strange porting kludge alert. FORTRAN allows subroutine arguments to be modified by the called routine (passed by
51
    * reference, not value), and the original code for this routine makes use of that feature to return multiple results. To avoid
52
    * changing the code any more than necessary, I've used Java arrays to simulate this pass-by-reference feature. Specifically,
53
    * in the original code w, pw, and ifault are output results, not input parameters. Pass in double[1] arrays for w and pw, and
54
    * an int[1] array for ifault, and extract the computed values from the [0] element on return. The argument init is both input
55
    * and output; use a boolean[1] array and initialize [0] to false before the first call. The routine will update the value to
56
    * true to record that initialization has been performed, to speed up subsequent calls on the same data set. Note that although
57
    * the contents of a[] will be computed by the routine on the first call, the caller must still allocate the array space and
58
    * pass the unfilled array in to the subroutine. The routine will set the contents but not allocate the space.
59
    *
60
    * As described above with the constants, the data arrays x[] and a[] are referenced with a base element of 1 (like FORTRAN)
61
    * instead of 0 (like Java) to avoid screwing up the algorithm. To pass in 100 data points, declare x[101] and fill elements
62
    * x[1] through x[100] with data. x[0] will be ignored.
63
    *
64
    * You might want to eliminate the ifault parameter completely, and throw Java exceptions instead. I didn't want to change the
65
    * code that much.
66
    *
67
    * @param init
68
    *                Input & output; pass in boolean[1], initialize to false before first call, routine will set to true
69
    * @param x
70
    *                Input; Data set to analyze; 100 points go in x[101] array from x[1] through x[100]
71
    * @param n
72
    *                Input; Number of data points in x
73
    * @param n1
74
    *                Input; dunno
75
    * @param n2
76
    *                Input; dunno either
77
    * @param a
78
    *                Output when init[0] == false, Input when init[0] == true; holds computed test coefficients
79
    * @param w
80
    *                Output; pass in double[1], will contain result in w[0] on return
81
    * @param pw
82
    *                Output; pass in double[1], will contain result in pw[0] on return
83
    * @param ifault
84
    *                Output; pass in int[1], will contain error code (0 == good) in ifault[0] on return
85
    */
86
   public static void swilk(final boolean[] init,
87
                            final double[] x,
88
                            final int n,
89
                            final int n1,
90
                            final int n2,
91
                            final double[] a,
92
                            final double[] w,
93
                            final double[] pw,
94
                            final int[] ifault) {
95

    
96
      pw[0] = 1.0;
97
      if (w[0] >= 0.0) {
98
         w[0] = 1.0;
99
      }
100
      final double an = n;
101
      ifault[0] = 3;
102
      final int nn2 = n / 2;
103
      if (n2 < nn2) {
104
         return;
105
      }
106
      ifault[0] = 1;
107
      if (n < 3) {
108
         return;
109
      }
110

    
111
      // If INIT is false, calculates coefficients for the test
112

    
113
      if (!init[0]) {
114
         if (n == 3) {
115
            a[1] = SQRTH;
116
         }
117
         else {
118
            final double an25 = an + 0.25;
119
            double summ2 = 0.0;
120
            for (int i = 1; i <= n2; ++i) {
121
               a[i] = ppnd((i - TH) / an25);
122
               summ2 += a[i] * a[i];
123
            }
124
            summ2 *= 2.0;
125
            final double ssumm2 = Math.sqrt(summ2);
126
            final double rsn = 1.0 / Math.sqrt(an);
127
            final double a1 = poly(C1, 6, rsn) - a[1] / ssumm2;
128

    
129
            // Normalize coefficients
130

    
131
            int i1;
132
            double fac;
133
            if (n > 5) {
134
               i1 = 3;
135
               final double a2 = -a[2] / ssumm2 + poly(C2, 6, rsn);
136
               fac = Math.sqrt((summ2 - 2.0 * a[1] * a[1] - 2.0 * a[2] * a[2]) / (1.0 - 2.0 * a1 * a1 - 2.0 * a2 * a2));
137
               a[1] = a1;
138
               a[2] = a2;
139
            }
140
            else {
141
               i1 = 2;
142
               fac = Math.sqrt((summ2 - 2.0 * a[1] * a[1]) / (1.0 - 2.0 * a1 * a1));
143
               a[1] = a1;
144
            }
145
            for (int i = i1; i <= nn2; ++i) {
146
               a[i] = -a[i] / fac;
147
            }
148
         }
149
         init[0] = true;
150
      }
151
      if (n1 < 3) {
152
         return;
153
      }
154
      final int ncens = n - n1;
155
      ifault[0] = 4;
156
      if (ncens < 0 || (ncens > 0 && n < 20)) {
157
         return;
158
      }
159
      ifault[0] = 5;
160
      final double delta = ncens / an;
161
      if (delta > 0.8) {
162
         return;
163
      }
164

    
165
      // If W input as negative, calculate significance level of -W
166

    
167
      double w1, xx;
168
      if (w[0] < 0.0) {
169
         w1 = 1.0 + w[0];
170
         ifault[0] = 0;
171
      }
172
      else {
173

    
174
         // Check for zero range
175

    
176
         ifault[0] = 6;
177
         final double range = x[n1] - x[1];
178
         if (range < SMALL) {
179
            return;
180
         }
181

    
182
         // Check for correct sort order on range - scaled X
183

    
184
         ifault[0] = 7;
185
         xx = x[1] / range;
186
         double sx = xx;
187
         double sa = -a[1];
188
         int j = n - 1;
189
         for (int i = 2; i <= n1; ++i) {
190
            final double xi = x[i] / range;
191
            // IF (XX-XI .GT. SMALL) PRINT *,' ANYTHING'
192
            sx += xi;
193
            if (i != j) {
194
               sa += sign(1, i - j) * a[Math.min(i, j)];
195
            }
196
            xx = xi;
197
            --j;
198
         }
199
         ifault[0] = 0;
200
         if (n > 5000) {
201
            ifault[0] = 2;
202
         }
203

    
204
         // Calculate W statistic as squared correlation between data and coefficients
205

    
206
         sa /= n1;
207
         sx /= n1;
208
         double ssa = 0.0;
209
         double ssx = 0.0;
210
         double sax = 0.0;
211
         j = n;
212
         double asa;
213
         for (int i = 1; i <= n1; ++i) {
214
            if (i != j) {
215
               asa = sign(1, i - j) * a[Math.min(i, j)] - sa;
216
            }
217
            else {
218
               asa = -sa;
219
            }
220
            final double xsx = x[i] / range - sx;
221
            ssa += asa * asa;
222
            ssx += xsx * xsx;
223
            sax += asa * xsx;
224
            --j;
225
         }
226

    
227
         // W1 equals (1-W) calculated to avoid excessive rounding error
228
         // for W very near 1 (a potential problem in very large samples)
229

    
230
         final double ssassx = Math.sqrt(ssa * ssx);
231
         w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
232
      }
233
      w[0] = 1.0 - w1;
234

    
235
      // Calculate significance level for W (exact for N=3)
236

    
237
      if (n == 3) {
238
         pw[0] = PI6 * (Math.asin(Math.sqrt(w[0])) - STQR);
239
         return;
240
      }
241
      double y = Math.log(w1);
242
      xx = Math.log(an);
243
      double m = 0.0;
244
      double s = 1.0;
245
      if (n <= 11) {
246
         final double gamma = poly(G, 2, an);
247
         if (y >= gamma) {
248
            pw[0] = SMALL;
249
            return;
250
         }
251
         y = -Math.log(gamma - y);
252
         m = poly(C3, 4, an);
253
         s = Math.exp(poly(C4, 4, an));
254
      }
255
      else {
256
         m = poly(C5, 4, xx);
257
         s = Math.exp(poly(C6, 3, xx));
258
      }
259
      if (ncens > 0) {
260

    
261
         // Censoring by proportion NCENS/N. Calculate mean and sd of normal equivalent deviate of W.
262

    
263
         final double ld = -Math.log(delta);
264
         final double bf = 1.0 + xx * BF1;
265
         final double z90f = Z90 + bf * Math.pow(poly(C7, 2, Math.pow(XX90, xx)), ld);
266
         final double z95f = Z95 + bf * Math.pow(poly(C8, 2, Math.pow(XX95, xx)), ld);
267
         final double z99f = Z99 + bf * Math.pow(poly(C9, 2, xx), ld);
268

    
269
         // Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
270
         // pseudo-mean and pseudo-sd of z as the slope and intercept
271

    
272
         final double zfm = (z90f + z95f + z99f) / 3.0;
273
         final double zsd = (Z90 * (z90f - zfm) + Z95 * (z95f - zfm) + Z99 * (z99f - zfm)) / ZSS;
274
         final double zbar = zfm - zsd * ZM;
275
         m += zbar * s;
276
         s *= zsd;
277
      }
278
      pw[0] = alnorm((y - m) / s, UPPER);
279
   }
280

    
281

    
282
   /**
283
    * Constructs an int with the absolute value of x and the sign of y
284
    *
285
    * @param x
286
    *                int to copy absolute value from
287
    * @param y
288
    *                int to copy sign from
289
    * @return int with absolute value of x and sign of y
290
    */
291
   private static int sign(final int x,
292
                           final int y) {
293
      int result = Math.abs(x);
294
      if (y < 0.0) {
295
         result = -result;
296
      }
297
      return result;
298
   }
299

    
300
   // Constants & polynomial coefficients for ppnd(), slightly renamed to avoid conflicts. Could define
301
   // them inside ppnd(), but static constants are more efficient.
302

    
303
   // Coefficients for P close to 0.5
304
   private static final double A0_p = 3.3871327179E+00, A1_p = 5.0434271938E+01, A2_p = 1.5929113202E+02,
305
            A3_p = 5.9109374720E+01, B1_p = 1.7895169469E+01, B2_p = 7.8757757664E+01, B3_p = 6.7187563600E+01;
306

    
307
   // Coefficients for P not close to 0, 0.5 or 1 (names changed to avoid conflict with swilk())
308
   private static final double C0_p = 1.4234372777E+00, C1_p = 2.7568153900E+00, C2_p = 1.3067284816E+00,
309
            C3_p = 1.7023821103E-01, D1_p = 7.3700164250E-01, D2_p = 1.2021132975E-01;
310

    
311
   // Coefficients for P near 0 or 1.
312
   private static final double E0_p = 6.6579051150E+00, E1_p = 3.0812263860E+00, E2_p = 4.2868294337E-01,
313
            E3_p = 1.7337203997E-02, F1_p = 2.4197894225E-01, F2_p = 1.2258202635E-02;
314

    
315
   private static final double SPLIT1 = 0.425, SPLIT2 = 5.0, CONST1 = 0.180625, CONST2 = 1.6;
316

    
317

    
318
   /**
319
    * ALGORITHM AS 241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477-484.
320
    *
321
    * Produces the normal deviate Z corresponding to a given lower tail area of P; Z is accurate to about 1 part in 10**7.
322
    *
323
    * @param p
324
    * @return
325
    */
326
   private static double ppnd(final double p) {
327
      final double q = p - 0.5;
328
      double r;
329
      if (Math.abs(q) <= SPLIT1) {
330
         r = CONST1 - q * q;
331
         return q * (((A3_p * r + A2_p) * r + A1_p) * r + A0_p) / (((B3_p * r + B2_p) * r + B1_p) * r + 1.0);
332
      }
333
      else {
334
         if (q < 0.0) {
335
            r = p;
336
         }
337
         else {
338
            r = 1.0 - p;
339
         }
340
         if (r <= 0.0) {
341
            return 0.0;
342
         }
343
         r = Math.sqrt(-Math.log(r));
344
         double normal_dev;
345
         if (r <= SPLIT2) {
346
            r -= CONST2;
347
            normal_dev = (((C3_p * r + C2_p) * r + C1_p) * r + C0_p) / ((D2_p * r + D1_p) * r + 1.0);
348
         }
349
         else {
350
            r -= SPLIT2;
351
            normal_dev = (((E3_p * r + E2_p) * r + E1_p) * r + E0_p) / ((F2_p * r + F1_p) * r + 1.0);
352
         }
353
         if (q < 0.0) {
354
            normal_dev = -normal_dev;
355
         }
356
         return normal_dev;
357
      }
358
   }
359

    
360

    
361
   /**
362
    * Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
363
    *
364
    * Calculates the algebraic polynomial of order nord-1 with array of coefficients c. Zero order coefficient is c[1]
365
    *
366
    * @param c
367
    * @param nord
368
    * @param x
369
    * @return
370
    */
371
   private static double poly(final double[] c,
372
                              final int nord,
373
                              final double x) {
374
      double poly = c[1];
375
      if (nord == 1) {
376
         return poly;
377
      }
378
      double p = x * c[nord];
379
      if (nord != 2) {
380
         final int n2 = nord - 2;
381
         int j = n2 + 1;
382
         for (int i = 1; i <= n2; ++i) {
383
            p = (p + c[j]) * x;
384
            --j;
385
         }
386
      }
387
      poly += p;
388
      return poly;
389
   }
390

    
391
   // Constants & polynomial coefficients for alnorm(), slightly renamed to avoid conflicts.
392
   private static final double CON_a = 1.28, LTONE_a = 7.0, UTZERO_a = 18.66;
393
   private static final double P_a   = 0.398942280444, Q_a = 0.39990348504, R_a = 0.398942280385, A1_a = 5.75885480458,
394
            A2_a = 2.62433121679, A3_a = 5.92885724438, B1_a = -29.8213557807, B2_a = 48.6959930692, C1_a = -3.8052E-8,
395
            C2_a = 3.98064794E-4, C3_a = -0.151679116635, C4_a = 4.8385912808, C5_a = 0.742380924027, C6_a = 3.99019417011,
396
            D1_a = 1.00000615302, D2_a = 1.98615381364, D3_a = 5.29330324926, D4_a = -15.1508972451, D5_a = 30.789933034;
397

    
398

    
399
   /**
400
    * Algorithm AS66 Applied Statistics (1973) vol.22, no.3
401
    *
402
    * Evaluates the tail area of the standardised normal curve from x to infinity if upper is true or from minus infinity to x if
403
    * upper is false.
404
    *
405
    * @param x
406
    * @param upper
407
    * @return
408
    */
409
   private static double alnorm(final double x,
410
                                final boolean upper) {
411
      boolean up = upper;
412
      double z = x;
413
      if (z < 0.0) {
414
         up = !up;
415
         z = -z;
416
      }
417
      double fn_val;
418
      if (z > LTONE_a && (!up || z > UTZERO_a)) {
419
         fn_val = 0.0;
420
      }
421
      else {
422
         double y = 0.5 * z * z;
423
         if (z <= CON_a) {
424
            fn_val = 0.5 - z * (P_a - Q_a * y / (y + A1_a + B1_a / (y + A2_a + B2_a / (y + A3_a))));
425
         }
426
         else {
427
            fn_val = R_a
428
                     * Math.exp(-y)
429
                     / (z + C1_a + D1_a
430
                                   / (z + C2_a + D2_a / (z + C3_a + D3_a / (z + C4_a + D4_a / (z + C5_a + D5_a / (z + C6_a))))));
431
         }
432
      }
433
      if (!up) {
434
         fn_val = 1.0 - fn_val;
435
      }
436
      return fn_val;
437
   }
438
}