-
function [pval x K P] = fexact( varargin )
%FEXACT Fisher's exact test
%
%
Fisher's
exact
test
is
a
statistical
test
used
to
compare
binary
outcomes
% between two
groups. For example, The number of positive and
negative
lab
% results in
individuals with disease (cases) compared to
healthy
% (controls). The test is
applicable most of the time a 2x2 contingency
% table can be made. This table is
referred to in the usage below
%
cases controls
% pos a c
K
% neg b d -
% total N - M
%
% This function returns a
pvalue against the hypothesis that the table
or a
% more extreme table
might have occured by chance
%
% usage
% p = fexact(X,y)
% fisher exact test for
situations in which the raw data have not
% been counted and placed. X is a
MxP matrix of dichotomous
variables
% and y is a vector of
dichotomous outcome variables
%
(disease/control). X and y must be in {0,1}
%
% p = fexact(...,
'options', values)
% options:
% tail l(eft)','r(ight)',
'b(oth)'
% left tail tests a
negative association. p( x<=a, M, K, N)
% right
tail
tests
a
positive
association.
p(
x<=b,
M,
K,
M-N)
% both
(default)
is
either
a
positive
or
negative
association.
%
this is the sum from the left and right tails of
the
% distribution
including
all
x
where
p(x|M,K,N)
is
less
than
% or equal p(a|M,K,N).
% perm Q. Repeat entire set of
tests Q*respz times with permuted
y
variables.
% The reported
p-values are corrected for multiple tests by
% interpolating
into
the
emprical
distribution
of
the
minimum
%
p-value obtained from each round
%
To use this option the X and y calling convention
must be
used
% repsz
Used
with
the
PERM
option.
specifies
the
number
of
replicates
% to
compute
at
one
time
(default=100).
The
larger
the
number
% the
faster
the
results,
but
the
more
memory
required.
This
is
% needed because permutation
takes far too much memory to
fully
% vectorize.
REPSZ
controls
the
size
that
is
vectorized
each
% loop.
%
% [p a K C] =
fexact(X,y);
% also returns
vector a. a(i) is the upper left square of the
contingency
%
corresponding to the ith column of X
crosstabulated by y. K is
a
% vector containing a+b for the
ith column of X.
% C
is
a
lookup
table
for
CDFs
that
can
be
used
in
subsequent
calls
% to fexact to further improve
performance.
% C( x(i)+1, K(i)+1)
is the cdf for the tail specified in
%
options
%
% p = fexact(
a,M,K,N, options)
% M
and
N
are
constants.
a
and
K
are
P-vectors.
No
checks
for
valid
% input are made.
%
% p = fexact( a,M,K,N,C]
% C is a lookup table that improve
performance. Intended to be
% used
with permutation testing where 1000s of calls are
made.
% the option
%
whatever tail was used to generate them.
%
% NOTES and LIMITATIONS:
% This function is extremely fast
when doing multiple tests and is
%
acceptable with a small number of tests. However,
it uses large
% tracks
of
memory.
On
my
2Gb
home
Intel
Core2
Quad
Core
Vista
machine
% this
function
does
250,000
tests
with
100
observations
each
in
0.10
%
seconds. I run out of memory when I do more
%
% example
% x
= unidrnd(2, 200, 1000)-1;
% y =
unidrnd(2,200,1)-1;
% p = fexact( x, y
); % generates p-values for 1000 tests
in
x
% p = fexact( x, y,
'perm', 10) % reports p-value relative to empirial
cdf
%
% Copyright 2009 Mike Boedigheimer
% Amgen Inc.
% Department of
Computational Biology
% mboedigh@
p = inputParser;
uired('A');
ional('y',[]);
ional('K',[]);
ional('N',[]);
ional('C',[]);
ional('tail',
'b', @(c)ismember( c, {'b','l','r'} ));
ional('perm', 0, @(x)
isnumeric(x)&isscalar(x));
ional('repsz', 100, @(x)
isnumeric(x)&isscalar(x));
(varargin{:});
P
= s.C;
if isempty(s.N) % using
fexact(X,y,options)
X = s.A;
y = s.y;
if
(~islogical(y) && ~all( ismember( y, [0 1])) ) &&
...
(~islogical(X) && ~all(
ismember( X(:), [0 1])) )
error('linstats:fexact:InvalidArgument', 'X and y
must be in
(0,1)' );
end
y = logical(y);
N =
sum(y); % number of cases
M = length(y);
K = sum(X,1)';
x = X'*sparse(y); % in unofficial
testing using timeit. This was
faster
than sum(X(y==1,:))
% and faster
than sum(bsxfun(@eq,X,y))
else % using
fexact( a,M,K,N ...)
x = s.A;
M = s.y;
N = s.N;
K = s.K;
end
switch
case 'l'; tail =
-1;
case 'r'; tail = 1;
case 'b'; tail = 2;
end
if N==0 || N==M
pval =
ones(1,length(x));
return;
end;
if
isempty(P)
P = getLookup(
M,N,tail);
end
pval = P( sub2ind(size(P), x+1, K+1));
if > 1
[fx x] = doPerm(X,y,M,K,P,,);
pval = interp1( x, fx, pval);
end
function [fx
x] = doPerm(X,y,M,K,P,nperms,repsz)
pperm = ones(repsz,nperms);
% sub2ind was previously 85% of the
execution time.
% this method computes
the index directly rather than
%
computing x and then calliung sub2ind. It is much
% faster because it uses the fast
multiple engine
% precompute K*nrows
+1, then add x after it is calculated
X
= [ K*size(P,1)+1 X' ];
B =
ones(1,repsz);
for permi =
1:nperms
%
generate
new
random
vector
y,
by
shuffling
the rows
(do
not
change
% the counts of
M,K or M)
[i i] = sort(
randn(M,repsz) );
pperm(:,permi) =
min( P( X*[B;y(i)] ));
end
[fx x] = mecdf(pperm(:));
function P = getLookup( M,
N, tail)
F = gammaln(
1:M+1); % used to compute factorials. For small
problems
% It is overkill to generate
all factorials up