LCOV - code coverage report
Current view: top level - modules/statslib - ad_test.sl (source / functions) Hit Total Coverage
Test: all.lcov Lines: 180 196 91.8 %
Date: 2022-08-02 14:41:00 Functions: 5 5 100.0 %

          Line data    Source code
       1             : % Anderson-Darling Tests
       2             : % 
       3             : % The k-sample test was implemented from the equations found in
       4             : % Scholz F.W. and Stephens M.A., "K-Sample Anderson-Darling Tests",
       5             : % Journal of the American Statistical Association, Vol 82, 399 (1987)
       6             : %
       7             : private define get_unique_and_tied (a)
       8             : {
       9           3 :    variable n = length (a);
      10           3 :    variable i = array_sort (a);
      11           3 :    variable z = a[__tmp(i)];
      12             : 
      13           3 :    if (z[0] == z[-1])
      14           0 :      return [z[0]], [n];
      15             : 
      16             :    variable j, k;
      17             :    % Algorithm examples:
      18             :    % z =           [1, 2, 2, 3, 4, 4, 4];      [1,2]
      19             :    % shift(z,-1) = [4, 1, 2, 2, 3, 4, 4];      [2,1]
      20             :    % z!=sh(z)    = [1, 1, 0, 1, 1, 0, 0];      [1,1]
      21             :    % j=where(&k) = [0,1,3,4];                  [0,1]
      22             :    % k           = [2,5,6];                    []
      23             :    % k-1         = [1,4,5];                    []
      24           3 :    j = where (shift (z,-1) != z, &k);
      25             : 
      26           3 :    variable multiplicity = Int_Type[n] + 1;
      27           3 :    variable nk = length(k);
      28           3 :    i = 0;
      29           7 :    while (i < nk)
      30             :      {
      31           4 :         variable count = 0;
      32           8 :         variable k0 = k[i], k1 = k0;
      33           8 :         while ((i < nk) && (k[i] == k1))
      34             :           {
      35           4 :              k1++;
      36           4 :              i++;
      37             :           }
      38           4 :         multiplicity[k0-1] += k1-k0;
      39             :      }
      40           3 :    return z[j], multiplicity[j];
      41             : }
      42             : 
      43             : define ad_ktest_pval (t, m)
      44             : {
      45           6 :    variable alphas = [0.25, 0.1, 0.05, 0.025, 0.01];
      46             :    % S&S suggests the parametrization
      47             :    %   t_m(alpha) = b0(alpha) + b1(alpha)/sqrt(m) + b2(alpha)/m
      48           6 :    variable b0_alpha = [0.675, 1.281, 1.645, 1.960, 2.326];
      49           6 :    variable b1_alpha = [-0.245, 0.250, 0.678, 1.149, 1.822];
      50           6 :    variable b2_alpha = [-0.105, -0.305, -0.362, -0.391, -0.396];
      51             : 
      52             :    % The algorithm:
      53             :    %  1.  Use bk_alpha to compute tm_alpha for given m.
      54             :    %  2.  Use linear interpolation of log(alpha/(1-alpha) vs tm_alpha.
      55             :    %  3.  ==> exp(v) = alpha/(1-alpha)
      56             :    %      ==> alpha = exp(v)/(1+exp(v)), where v is interpolated value.
      57             : 
      58           6 :    variable i, n = length (alphas);
      59           6 :    variable logodd = log (alphas/(1-alphas));
      60           6 :    variable tm_alpha = b0_alpha + b1_alpha/sqrt(m) + b2_alpha/m;
      61           6 :    i = wherelast (t >= tm_alpha);
      62           6 :    if (i == NULL)
      63           0 :      i = 0;
      64           6 :    else if (i == n - 1)
      65           2 :      i = n - 2;
      66             : 
      67             :    % v = p0*(1-s) + s*p1 ==> v = p0 + s*(p1-p0)
      68             :    % v0 = p0 + (t-t0)/(t1-t0)*(p1-p0)
      69          24 :    variable v0 = logodd[i], v1 = logodd[i+1], t0 = tm_alpha[i], t1 = tm_alpha[i+1];
      70           6 :    variable v = v0 + (v1-v0)*((t-t0)/(t1-t0));
      71             : 
      72           6 :    variable alpha = exp(v)/(1+exp(v));
      73           6 :    if (alpha < 0) alpha = 0.0;
      74           6 :    else if (alpha > 1.0) alpha = 1.0;
      75           6 :    return alpha;
      76             : }
      77             : 
      78             : define ad_ktest ()
      79             : {
      80           4 :    variable tref = NULL;
      81           4 :    variable arg, nargs = _NARGS;
      82           4 :    if (nargs > 1)
      83             :      {
      84           3 :         arg = ();
      85           3 :         if (_typeof (arg) == Ref_Type)
      86             :           {
      87           3 :              tref = arg;
      88           3 :              nargs--;
      89             :           }
      90           0 :         else arg;                      %  leave it on stack
      91             :      }
      92             : 
      93           4 :    if (nargs == 0)
      94           1 :      usage ("\
      95             : pval = ad_ktest ({X1,X2,...Xn} [,&statistic]; qualifiers);\n\
      96             : Qualifiers:\n\
      97             :   pval2=&pval2    P-value corresponding to continuous case (no ties)\n\
      98             :   stat2=&stat2    Statistic for the continuous case\n\
      99             : "
     100             :            );
     101             : 
     102             :    variable datasets;
     103           3 :    if (nargs == 1)
     104           2 :      datasets = ();
     105             :    else
     106           1 :      datasets = __pop_list (nargs);
     107             : 
     108           3 :    if ((_typeof (datasets) != Array_Type)
     109             :        && (typeof (datasets) != List_Type))
     110           0 :      throw InvalidParmError, "Expecting a list of arrays or an array of arrays";
     111             : 
     112             :    variable i, j, k;
     113             :    variable z_i, zstar, n;
     114             : 
     115           3 :    k = length (datasets);
     116           3 :    if (k < 2)
     117           0 :      throw InvalidParmError, "ad_ktest requires at least 2 datasets";
     118             : 
     119           3 :    zstar = datasets[0];
     120             : 
     121           3 :    _for i (1, k-1, 1)
     122           6 :      zstar = [zstar, datasets[i]];
     123             : 
     124           3 :    n = length (zstar);
     125             : 
     126             :    variable multiplicity;
     127           3 :    (zstar, multiplicity) = get_unique_and_tied (zstar);
     128             : 
     129           3 :    variable cap_l = length(zstar);
     130             : 
     131           6 :    variable s = 0.0, s_a = 0.0;
     132           3 :    variable mj = Double_Type[k];
     133           6 :    variable cap_Bj = 0.0, cap_Bj_a = 0.0;
     134             : 
     135           3 :    variable kmult = Int_Type[cap_l, k];
     136             :    variable zstar_j;
     137           3 :    cap_l--;
     138             : #ifexists wherefirst_ge
     139           3 :    _for i (0, k-1, 1)
     140             :      {
     141           9 :         z_i = datasets[i];
     142           9 :         z_i = z_i[array_sort (z_i)];
     143             :         % exploit the fact that both z_i and zstar are sorted.
     144           9 :         variable i0 = 0;
     145           9 :         _for j (0, cap_l, 1)
     146             :           {
     147             :              variable i1;
     148         144 :              i1 = wherefirst_ge (z_i, zstar[j], i0);
     149         144 :              if (i1 == NULL)
     150           0 :                continue;
     151             :              % This point is reached about a quarter of the time.
     152         144 :              i0 = i1;
     153         144 :              i1 = wherefirst_ne (z_i, zstar[j], i0);
     154         144 :              if (i1 == NULL)
     155             :                {
     156             :                   % end of array
     157           9 :                   if (zstar[j] == z_i[i0])
     158           9 :                     kmult[j,i] = length(z_i)-i0;
     159           9 :                   break;
     160             :                }
     161             : 
     162         135 :              kmult[j,i] = i1-i0;
     163         135 :              i0 = i1;
     164             :           }
     165             :      }
     166             : #endif
     167           3 :    _for j (0, cap_l, 1)
     168             :      {
     169          52 :         zstar_j = zstar[j];
     170          52 :         variable l_j = multiplicity[j];
     171          52 :         cap_Bj += l_j;
     172          52 :         cap_Bj_a += 0.5*l_j;
     173         104 :         variable ds = 0.0, ds_a = 0.0;
     174          52 :         _for i (0, k-1, 1)
     175             :           {
     176         174 :              z_i = datasets[i];
     177         174 :              variable n_i = length (z_i);
     178             : #ifexists wherefirst_ne
     179         174 :              variable dmij = 0.5*kmult[j,i];
     180             : #else
     181             :              variable dmij = 0.5*length (where (z_i == zstar_j));
     182             : #endif
     183         174 :              variable mij = mj[i] + dmij;
     184         174 :              variable top = (n*mij - n_i*cap_Bj_a);
     185         174 :              ds_a += top*top/n_i;
     186             : 
     187         174 :              mij += dmij;
     188         174 :              top = (n*mij - n_i*cap_Bj);
     189         174 :              ds += top*top/n_i;
     190         174 :              mj[i] = mij;
     191             :           }
     192          52 :         s_a += l_j*ds_a/(cap_Bj_a*(n-cap_Bj_a) - 0.25*l_j*n)/n;
     193          52 :         if (j != cap_l)
     194          49 :           s += l_j*ds/(cap_Bj * (n-cap_Bj))/n;
     195          52 :         cap_Bj_a = cap_Bj;
     196             :      }
     197           3 :    s_a *= (n-1.0)/n;
     198             : 
     199           3 :    variable h = sum (1.0/[1:n-1]);
     200           3 :    variable g = sum (cumsum (1.0/([n-1:2:-1]))/[2:n-1]);
     201           3 :    variable cap_h = 0.0;
     202             : 
     203           3 :    _for i (0, k-1, 1)
     204           9 :      cap_h += 1.0/length(datasets[i]);
     205             : 
     206             :    variable
     207           9 :      k2 = k*k, hk = h*k, g4m6 = 4*g-6,
     208           3 :      a = g4m6*(k-1)+(10-6*g)*cap_h,
     209           3 :      b = (2*g-4)*k2 + 8*hk + (2*g-14*h-4)*cap_h - 8*h + g4m6,
     210           3 :      c = (6*h+2*g-2)*k2 + (4*h-g4m6)*k + (2*h-6)*cap_h + 4*h,
     211           3 :      d = (2*h+6)*k2 - 4*hk;
     212             : 
     213           3 :    variable sig = d + n*(c + n*(b + n*a));
     214           3 :    sig = sqrt(sig/(n-1)/(n-2)/(n-3));
     215           3 :    variable t = (s - (k-1))/sig;
     216           3 :    variable t_a = (s_a - (k-1))/sig;
     217             :    %vmessage ("s = %g, s_a = %g", s, s_a);
     218             : 
     219           3 :    variable pval = ad_ktest_pval (t_a, k-1);
     220           6 :    if (tref != NULL) @tref = t_a;
     221             : 
     222           3 :    variable pval2_ref = qualifier ("pval2");
     223           3 :    variable stat2_ref = qualifier ("stat2");
     224             : 
     225           6 :    if (typeof (pval2_ref) == Ref_Type) @pval2_ref = ad_ktest_pval (t, k-1);
     226           6 :    if (typeof (stat2_ref) == Ref_Type) @stat2_ref = t;
     227             : 
     228           3 :    return pval;
     229             : }
     230             : 
     231             : % This function was derived from
     232             : %  Marsaglia and Marsaglia, Evaluating the Anderson-Darling
     233             : %  Distribution, Journal of Statistical Software, Vol. 9, Issue 2, Feb
     234             : %  2004.
     235             : define anderson_darling_cdf ()
     236             : {
     237           2 :    if (_NARGS != 2)
     238             :      {
     239           0 :         usage ("\
     240             : cdf = anderson_darling_cdf (A2, nsamp)\n\
     241             : %% Computes the Anderson-Darling CDF at the points A2 for sample-sizes nsamp.\n\
     242             : "
     243             :               );
     244             :      }
     245           2 :    variable z, ndata; (z, ndata) = ();
     246           2 :    variable nz = length (z);
     247             : 
     248           2 :    if (typeof (ndata) != Array_Type)
     249           2 :      ndata = ndata + Int_Type[nz];
     250             : 
     251           2 :    if (length (ndata) != nz)
     252           0 :      throw InvalidParmError, "the length of A2 and nsamp do not match";
     253             : 
     254           2 :    variable x = Double_Type[nz];
     255             : 
     256             :    variable i, j, a, zz;
     257             : 
     258             :    % Evaluate adinf(z)
     259           2 :    i = where (0.0 < z < 2.0, &j);
     260           2 :    if (length (i))
     261             :      {
     262           1 :         zz = z[i];
     263           1 :         a = [2.00012, 0.247105, -0.0649821, 0.0347962, -0.0116720, 0.00168691];
     264           1 :         x[i] = exp(-1.2337141/zz)*polynom(a, zz)/sqrt(zz);
     265             :      }
     266           2 :    if (length (j))
     267             :      {
     268           1 :         zz = z[j];
     269           1 :         a = [1.0776, -2.30695, 0.43424, -0.082433, 0.008056, -0.0003146];
     270           1 :         x[j] = exp(-exp(polynom (a, zz)));
     271             :      }
     272             : 
     273             :    % Now compute the correction for finite n (sec 3 of Marsaglia's paper)
     274           2 :    variable ndatainv = 1.0/ndata;
     275           2 :    variable c = 0.01265 + 0.1757*ndatainv;
     276           2 :    variable dx = Double_Type[nz];
     277             : 
     278           2 :    i = where (x < c);
     279           2 :    if (length (i))
     280             :      {
     281           0 :         zz = x[i]/c[i];
     282           0 :         dx[i] = ((0.0037*ndatainv + 0.00078)*ndatainv + 0.00006)*ndatainv
     283             :           * sqrt(zz)*(1.0-zz)*(49.0*zz-102.0);
     284             :      }
     285             : 
     286           2 :    i = where (c <= x < 0.8);
     287           2 :    if (length (i))
     288             :      {
     289           1 :         variable ci = c[i];
     290           1 :         zz = x[i];
     291           1 :         a = [-.00022633, 6.54034, -14.6538, 14.458, -8.259, 1.91864];
     292           1 :         dx[i] = ndatainv*(0.04213 + ndatainv*0.01365)
     293             :           * polynom (a, (zz-ci)/(0.8-ci));
     294             :      }
     295             : 
     296           2 :    i = where (x >= 0.8);
     297           2 :    if (length (i))
     298             :      {
     299           1 :         a = [-130.2137, 745.2337, -1705.091, 1950.646, -1116.360, 255.7844];
     300           1 :         dx[i] = polynom (a, x[i])*ndatainv;
     301             :      }
     302             : 
     303           2 :    x = __tmp(x) + dx;
     304           2 :    ifnot (Array_Type == typeof(z))
     305           2 :      x = x[0];
     306             : 
     307           2 :    return x;
     308             : }
     309             : 
     310             : define ad_test ()
     311             : {
     312           4 :    variable s_ref = NULL;
     313           4 :    if (_NARGS == 2)
     314           3 :      s_ref = ();
     315           1 :    else if (_NARGS != 1)
     316           1 :      usage ("\
     317             : p = ad_test (X [,&Asquared]);\n\ %% 1-sample Anderson-Darling test\n\
     318             : Qualifiers:\n\
     319             :   ;cdf        %% The X values are the CDFs of the underlying distribution\n\
     320             :               %%   and 0 <= X <= 1\n\
     321             :   ;mean=val   %% The mean of the assumed normal distribution\n\
     322             :   ;stddev=val %% The stddev of the assumed normal distribution\n\
     323             : "
     324             :            );
     325             : 
     326           3 :    variable cdf = ();
     327           3 :    variable is_cdf = qualifier_exists ("cdf");
     328             : 
     329           3 :    variable n = length (cdf);
     330           3 :    variable factor = 1.0;
     331             : 
     332           3 :    ifnot (is_cdf)
     333             :      {
     334           1 :         variable mu = qualifier ("mean");
     335           1 :         variable sd = qualifier ("stddev");
     336             : 
     337           1 :         if (sd == NULL)
     338             :           {
     339           1 :              if (mu == NULL)
     340             :                {
     341           1 :                   factor = (1.0 + (0.75 + 2.25/n)/n);
     342           1 :                   sd = stddev (cdf);
     343             :                }
     344           0 :              else sd = sample_stddev (cdf);
     345             :           }
     346           2 :         if (mu == NULL) mu = mean (cdf);
     347             :         %vmessage ("mean=%g, stddev=%g, factor=%g", mu, sd, factor);
     348           1 :         cdf = normal_cdf (cdf, mu, sd);
     349             :      }
     350             : 
     351           3 :    cdf = __tmp(cdf)[array_sort(cdf)];
     352             : 
     353           3 :    variable ii = [1:2*n:2];
     354           3 :    variable a2 = -n - (sum(ii*log(cdf) + (2*n-ii)*log(1.0-cdf)))/n;
     355             : 
     356           3 :    if (s_ref != NULL)
     357           3 :      @s_ref = a2;
     358             : 
     359           3 :    if (is_cdf)
     360             :      {
     361           2 :         return 1.0 - anderson_darling_cdf (a2, n);
     362             :      }
     363           1 :    a2 = factor * a2;
     364             : 
     365             :    % Augostino & Stephens, 1986
     366           1 :    if (a2 >= 0.6)
     367           1 :      return exp(1.2937 + (-5.709 + 0.0186*a2)*a2);
     368           0 :    if (a2 >= 0.34)
     369           0 :      return exp(0.9177 + (-4.279 - 1.38*a2)*a2);
     370           0 :    if (a2 >= 0.2)
     371           0 :      return 1.0 - exp(-8.318 + (42.796 - 59.938*a2)*a2);
     372           0 :    return 1.0 - exp(-13.436 + (101.14 - 223.73*a2)*a2);
     373             : }
     374             : 

Generated by: LCOV version 1.13