% Anderson-Darling Tests
%
% The k-sample test was implemented from the equations found in
% Scholz F.W. and Stephens M.A., "K-Sample Anderson-Darling Tests",
% Journal of the American Statistical Association, Vol. 82, No. 399 (1987)
%
% Return the sorted unique values of the array `a' together with the
% number of times each value occurs.
private define get_unique_and_tied (a)
{
   variable n = length (a);
   variable i = array_sort (a);
   variable z = a[__tmp(i)];

   if (z[0] == z[-1])
      return [z[0]], [n];

   variable j, k;
   % Algorithm examples:
   %  z                 = [1, 2, 2, 3, 4, 4, 4];   [1,2]
   %  shift(z,-1)       = [4, 1, 2, 2, 3, 4, 4];   [2,1]
   %  z != shift(z,-1)  = [1, 1, 0, 1, 1, 0, 0];   [1,1]
   %  j = where(...,&k) = [0,1,3,4];               [0,1]
   %  k                 = [2,5,6];                 []
   %  k-1               = [1,4,5];                 []
   j = where (shift (z,-1) != z, &k);

   variable multiplicity = Int_Type[n] + 1;
   variable nk = length(k);
   i = 0;
   while (i < nk)
   {
      variable k0 = k[i], k1 = k0;
      while ((i < nk) && (k[i] == k1))
      {
         k1++;
         i++;
      }
      multiplicity[k0-1] += k1-k0;
   }
   return z[j], multiplicity[j];
}
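
% Illustrative example of what this helper returns (input values taken
% from the algorithm comment above, in unsorted order):
%   (u, m) = get_unique_and_tied ([2, 1, 2, 4, 3, 4, 4]);
% yields u = [1, 2, 3, 4] and m = [1, 2, 1, 3].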

% Convert a standardized k-sample statistic t into an approximate p-value.
% Here m = k-1, where k is the number of samples.
define ad_ktest_pval (t, m)
{
   variable alphas = [0.25, 0.1, 0.05, 0.025, 0.01];
   % S&S suggest the parametrization
   %   t_m(alpha) = b0(alpha) + b1(alpha)/sqrt(m) + b2(alpha)/m
   variable b0_alpha = [0.675, 1.281, 1.645, 1.960, 2.326];
   variable b1_alpha = [-0.245, 0.250, 0.678, 1.149, 1.822];
   variable b2_alpha = [-0.105, -0.305, -0.362, -0.391, -0.396];

   % The algorithm:
   %  1. Use the b0/b1/b2 coefficients to compute tm_alpha for the given m.
   %  2. Linearly interpolate log(alpha/(1-alpha)) vs tm_alpha.
   %  3. ==> exp(v) = alpha/(1-alpha)
   %     ==> alpha = exp(v)/(1+exp(v)), where v is the interpolated value.

   variable i, n = length (alphas);
   variable logodd = log (alphas/(1-alphas));
   variable tm_alpha = b0_alpha + b1_alpha/sqrt(m) + b2_alpha/m;
   i = wherelast (t >= tm_alpha);
   if (i == NULL)
      i = 0;
   else if (i == n - 1)
      i = n - 2;

   % Linear interpolation of the log-odds:
   %   v = v0 + (t-t0)/(t1-t0)*(v1-v0)
   variable v0 = logodd[i], v1 = logodd[i+1], t0 = tm_alpha[i], t1 = tm_alpha[i+1];
   variable v = v0 + (v1-v0)*((t-t0)/(t1-t0));

   variable alpha = exp(v)/(1+exp(v));
   if (alpha < 0) alpha = 0.0;
   else if (alpha > 1.0) alpha = 1.0;
   return alpha;
}
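
% Illustrative check of the parametrization: for m = 2 (three samples),
% t_m(0.05) = 1.645 + 0.678/sqrt(2) - 0.362/2 ~= 1.943, so
%   ad_ktest_pval (1.943, 2);
% returns a p-value of approximately 0.05.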

define ad_ktest ()
{
   variable tref = NULL;
   variable arg, nargs = _NARGS;
   if (nargs > 1)
   {
      arg = ();
      if (_typeof (arg) == Ref_Type)
      {
         tref = arg;
         nargs--;
      }
      else arg; % leave it on stack
   }

   if (nargs == 0)
      usage ("\
pval = ad_ktest ({X1,X2,...Xn} [,&statistic]; qualifiers);\n\
Qualifiers:\n\
  pval2=&pval2    P-value corresponding to continuous case (no ties)\n\
  stat2=&stat2    Statistic for the continuous case\n\
"
            );

   variable datasets;
   if (nargs == 1)
      datasets = ();
   else
      datasets = __pop_list (nargs);

   if ((_typeof (datasets) != Array_Type)
       && (typeof (datasets) != List_Type))
      throw InvalidParmError, "Expecting a list of arrays or an array of arrays";

   variable i, j, k;
   variable z_i, zstar, n;

   k = length (datasets);
   if (k < 2)
      throw InvalidParmError, "ad_ktest requires at least 2 datasets";

   zstar = datasets[0];

   _for i (1, k-1, 1)
      zstar = [zstar, datasets[i]];

   n = length (zstar);

   variable multiplicity;
   (zstar, multiplicity) = get_unique_and_tied (zstar);

   variable cap_l = length(zstar);

   % s accumulates the A2_kN statistic (continuous, no-ties form);
   % s_a accumulates the tie-adjusted A2_akN form used for the returned p-value.
   variable s = 0.0, s_a = 0.0;
   variable mj = Double_Type[k];
   variable cap_Bj = 0.0, cap_Bj_a = 0.0;

   % kmult[j,i] will hold the number of elements of datasets[i] that equal zstar[j]
   variable kmult = Int_Type[cap_l, k];
   variable zstar_j;
   cap_l--;
#ifexists wherefirst_ge
   _for i (0, k-1, 1)
   {
      z_i = datasets[i];
      z_i = z_i[array_sort (z_i)];
      % exploit the fact that both z_i and zstar are sorted.
      variable i0 = 0;
      _for j (0, cap_l, 1)
      {
         variable i1;
         i1 = wherefirst_ge (z_i, zstar[j], i0);
         if (i1 == NULL)
            continue;
         % This point is reached about a quarter of the time.
         i0 = i1;
         i1 = wherefirst_ne (z_i, zstar[j], i0);
         if (i1 == NULL)
         {
            % end of array
            if (zstar[j] == z_i[i0])
               kmult[j,i] = length(z_i)-i0;
            break;
         }

         kmult[j,i] = i1-i0;
         i0 = i1;
      }
   }
#endif
   _for j (0, cap_l, 1)
   {
      zstar_j = zstar[j];
      variable l_j = multiplicity[j];
      cap_Bj += l_j;
      cap_Bj_a += 0.5*l_j;
      variable ds = 0.0, ds_a = 0.0;
      _for i (0, k-1, 1)
      {
         z_i = datasets[i];
         variable n_i = length (z_i);
#ifexists wherefirst_ge
         variable dmij = 0.5*kmult[j,i];
#else
         variable dmij = 0.5*length (where (z_i == zstar_j));
#endif
         variable mij = mj[i] + dmij;
         variable top = (n*mij - n_i*cap_Bj_a);
         ds_a += top*top/n_i;

         mij += dmij;
         top = (n*mij - n_i*cap_Bj);
         ds += top*top/n_i;
         mj[i] = mij;
      }
      s_a += l_j*ds_a/(cap_Bj_a*(n-cap_Bj_a) - 0.25*l_j*n)/n;
      if (j != cap_l)
         s += l_j*ds/(cap_Bj * (n-cap_Bj))/n;
      cap_Bj_a = cap_Bj;
   }
   s_a *= (n-1.0)/n;

   variable h = sum (1.0/[1:n-1]);
   variable g = sum (cumsum (1.0/([n-1:2:-1]))/[2:n-1]);
   variable cap_h = 0.0;

   _for i (0, k-1, 1)
      cap_h += 1.0/length(datasets[i]);

   variable
      k2 = k*k, hk = h*k, g4m6 = 4*g-6,
      a = g4m6*(k-1)+(10-6*g)*cap_h,
      b = (2*g-4)*k2 + 8*hk + (2*g-14*h-4)*cap_h - 8*h + g4m6,
      c = (6*h+2*g-2)*k2 + (4*h-g4m6)*k + (2*h-6)*cap_h + 4*h,
      d = (2*h+6)*k2 - 4*hk;

   variable sig = d + n*(c + n*(b + n*a));
   sig = sqrt(sig/(n-1)/(n-2)/(n-3));
   variable t = (s - (k-1))/sig;
   variable t_a = (s_a - (k-1))/sig;
   %vmessage ("s = %g, s_a = %g", s, s_a);

   variable pval = ad_ktest_pval (t_a, k-1);
   if (tref != NULL) @tref = t_a;

   variable pval2_ref = qualifier ("pval2");
   variable stat2_ref = qualifier ("stat2");

   if (typeof (pval2_ref) == Ref_Type) @pval2_ref = ad_ktest_pval (t, k-1);
   if (typeof (stat2_ref) == Ref_Type) @stat2_ref = t;

   return pval;
}
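
% Illustrative use of ad_ktest (the sample values below are arbitrary):
%   variable A2, p;
%   p = ad_ktest ({[1.1, 2.3, 0.7, 3.4, 2.9], [0.9, 2.8, 1.5, 3.1]}, &A2);
%   vmessage ("p = %g  (standardized statistic = %g)", p, A2);
% A small p-value indicates that the samples are unlikely to be drawn
% from a common parent distribution.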

% This function was derived from
% Marsaglia and Marsaglia, Evaluating the Anderson-Darling
% Distribution, Journal of Statistical Software, Vol. 9, Issue 2, Feb
% 2004.
define anderson_darling_cdf ()
{
   if (_NARGS != 2)
   {
      usage ("\
cdf = anderson_darling_cdf (A2, nsamp)\n\
%% Computes the Anderson-Darling CDF at the points A2 for sample-sizes nsamp.\n\
"
            );
   }
   variable z, ndata; (z, ndata) = ();
   variable nz = length (z);

   if (typeof (ndata) != Array_Type)
      ndata = ndata + Int_Type[nz];

   if (length (ndata) != nz)
      throw InvalidParmError, "the length of A2 and nsamp do not match";

   variable x = Double_Type[nz];

   variable i, j, a, zz;

   % Evaluate adinf(z)
   i = where (0.0 < z < 2.0, &j);
   if (length (i))
   {
      zz = z[i];
      a = [2.00012, 0.247105, -0.0649821, 0.0347962, -0.0116720, 0.00168691];
      x[i] = exp(-1.2337141/zz)*polynom(a, zz)/sqrt(zz);
   }
   if (length (j))
   {
      zz = z[j];
      a = [1.0776, -2.30695, 0.43424, -0.082433, 0.008056, -0.0003146];
      x[j] = exp(-exp(polynom (a, zz)));
   }

   % Now compute the correction for finite n (sec 3 of Marsaglia's paper)
   variable ndatainv = 1.0/ndata;
   variable c = 0.01265 + 0.1757*ndatainv;
   variable dx = Double_Type[nz];

   i = where (x < c);
   if (length (i))
   {
      zz = x[i]/c[i];
      dx[i] = ((0.0037*ndatainv[i] + 0.00078)*ndatainv[i] + 0.00006)*ndatainv[i]
         * sqrt(zz)*(1.0-zz)*(49.0*zz-102.0);
   }

   i = where (c <= x < 0.8);
   if (length (i))
   {
      variable ci = c[i];
      zz = x[i];
      a = [-.00022633, 6.54034, -14.6538, 14.458, -8.259, 1.91864];
      dx[i] = ndatainv[i]*(0.04213 + ndatainv[i]*0.01365)
         * polynom (a, (zz-ci)/(0.8-ci));
   }

   i = where (x >= 0.8);
   if (length (i))
   {
      a = [-130.2137, 745.2337, -1705.091, 1950.646, -1116.360, 255.7844];
      dx[i] = polynom (a, x[i])*ndatainv[i];
   }

   x = __tmp(x) + dx;
   ifnot (Array_Type == typeof(z))
      x = x[0];

   return x;
}
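
% Example: the one-sample p-value used by ad_test below is the upper
% tail of this CDF,
%   variable p = 1.0 - anderson_darling_cdf (A2, n);
% where A2 is the A-squared statistic computed from a sample of size n.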

define ad_test ()
{
   variable s_ref = NULL;
   if (_NARGS == 2)
      s_ref = ();
   else if (_NARGS != 1)
      usage ("\
p = ad_test (X [,&Asquared]);  %% 1-sample Anderson-Darling test\n\
Qualifiers:\n\
  ;cdf         %% The X values are the CDFs of the underlying distribution\n\
               %% and 0 <= X <= 1\n\
  ;mean=val    %% The mean of the assumed normal distribution\n\
  ;stddev=val  %% The stddev of the assumed normal distribution\n\
"
            );

   variable cdf = ();
   variable is_cdf = qualifier_exists ("cdf");

   variable n = length (cdf);
   variable factor = 1.0;

   ifnot (is_cdf)
   {
      variable mu = qualifier ("mean");
      variable sd = qualifier ("stddev");

      if (sd == NULL)
      {
         if (mu == NULL)
         {
            factor = (1.0 + (0.75 + 2.25/n)/n);
            sd = stddev (cdf);
         }
         else sd = sample_stddev (cdf);
      }
      if (mu == NULL) mu = mean (cdf);
      %vmessage ("mean=%g, stddev=%g, factor=%g", mu, sd, factor);
      cdf = normal_cdf (cdf, mu, sd);
   }

   cdf = __tmp(cdf)[array_sort(cdf)];

   variable ii = [1:2*n:2];
   variable a2 = -n - (sum(ii*log(cdf) + (2*n-ii)*log(1.0-cdf)))/n;

   if (s_ref != NULL)
      @s_ref = a2;

   if (is_cdf)
   {
      return 1.0 - anderson_darling_cdf (a2, n);
   }
   a2 = factor * a2;

   % D'Agostino & Stephens, Goodness-of-Fit Techniques, 1986
   if (a2 >= 0.6)
      return exp(1.2937 + (-5.709 + 0.0186*a2)*a2);
   if (a2 >= 0.34)
      return exp(0.9177 + (-4.279 - 1.38*a2)*a2);
   if (a2 >= 0.2)
      return 1.0 - exp(-8.318 + (42.796 - 59.938*a2)*a2);
   return 1.0 - exp(-13.436 + (101.14 - 223.73*a2)*a2);
}
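
% Illustrative use of ad_test (the data values below are arbitrary):
%   variable x = [4.1, 5.2, 3.8, 4.9, 5.5, 4.4, 5.0, 4.7];
%   variable A2;
%   variable p = ad_test (x, &A2);   % normality test; mean and stddev estimated
% To test against a fully specified distribution, pass its CDF values
% evaluated at the data and use the ;cdf qualifier, e.g.
%   p = ad_test (normal_cdf (x, 4.7, 0.5), &A2; cdf);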