In this posting I have included two sections. The first section is a script
which creates some data for a 2X2X2 factorial analysis, does the analysis,
and a post hoc test. As a newcomer to S+ I have had some difficulty in
accomplishing this seemingly simple task, in particular setting up the data
for the post hoc test. I was not able to find examples, and I eventually found
a
bug in multicomp(). I hope this example will be useful material for some
teacher,
or other new user.
The second section is my correction of the bug. The bug causes an error return
from multicomp() when using the valid.chech=F argument and a Tukey HSD test
Phil Gleason
Section 1
### create some data which will have three factors of two levels
# each, and will have a three way interaction between the factors.
str.nlvls <- 2 # number of levels for factor 1
pos.nlvls <- 2 # number of levels for factor 2
blk.nlvls <- 2 # number of levels for factor 3
nreps <- 5 # number of replications for each combination of levels
# these numbers assure a three way interaction
param.means <- array(c(100,20,85,10,99,20,83,11,101,19,86,8,75,30,87,19),
dim=c(str.nlvls,pos.nlvls,blk.nlvls))
# give names to the factors, and to the levels of each factor
dimnames(param.means) <- list(str=paste("str",1:str.nlvls,sep=""),
pos=paste("pos",1:pos.nlvls,sep=""),blk=LETTERS[1:blk.nlvls])
# create a matrix of all combinations of levels of each factor
art.design <- fac.design(c(str.nlvls,pos.nlvls,blk.nlvls),
factor.names=dimnames(param.means),replications=nreps)
# create a matrix which will hold the dependent variable
depvar <- vector(mode="numeric",length=dim(art.design)[1])
# put everything together into a data frame
art.synth <- data.frame(art.design,depvar=depvar)
# these nested for loops create data by adding a small amount of
# noise to the values specified by the variable param.means.
for (str.lvls in levels(art.synth$str)) {
for (pos.lvls in levels(art.synth$pos)) {
for (blk.lvls in levels(art.synth$blk)) {
art.synth[art.synth$str==str.lvls & art.synth$pos==pos.lvls &
art.synth$blk==blk.lvls,"depvar"] <- param.means[str.lvls, pos.lvls,blk.lvls]
+ rnorm(nreps,0,5.0)
}
}
}
# get the observed means
agg.means <- aggregate(art.synth$depvar,list(art.synth$str,art.synth$pos,
art.synth$blk),mean,na.rm=T)
# get the numbers alone
agm <- agg.means[,4]
# create composite labels so that the Tukey results will be properly labeled
temp.names <- apply(as.matrix(agg.means[,1:3]),1,paste,collapse=":")
# apply the composite labels to the means
names(agm) <- temp.names
# get the cell variance
agg.var <- aggregate(art.synth$depvar,list(art.synth$str,art.synth$pos,
art.synth$blk),var,na.method="omit",unbiased=T)
mse <- mean(agg.var[,4])
# get the cell counts. The function cell.count is included below.
agg.counts <- aggregate(art.synth$depvar,list(art.synth$str,art.synth$pos,
art.synth$blk),cell.count)
t.agg.counts <- agg.counts[,4]
# calculate the harmonic mean, in case the counts are unbalanced
lt.agg.counts <- length(t.agg.counts)
har.mean <- lt.agg.counts/sum(1/t.agg.counts)
# create the matrix which is required by multicomp.default
# see also p. 459 of the "Guide to Statistics" (v 4.5)
vmat <- mse*diag(1/rep(har.mean,lt.agg.counts))
# specify a full factorial model
fac.model <- "depvar~str*pos*blk"
# do the ANOVA
art.anova <- aov(fac.model,data=art.synth,na.action=na.omit)
art.summary <- summary(art.anova)
print(art.summary)
# do the post hoc Tukey HSD
# mcmp is used here instead of multicomp because there is a bug
# in multicomp which ignores valid.check=F for Tukey tests. Mcmp is
# the same as multicomp except for a simple correction listed below.
art.tukey <- mcmp(agm,vmat=vmat,df.residual=sum(agg.counts[,4]-1),
comparisons="mca",method="tukey",alpha=0.01,valid.check=F)
print(art.tukey)
### here's the cell.count function
cell.count <- function (x)
{
sum(is.number(x))
}
Section 2.
### Here's the section of multicomp.default in which I believe
# there is a bug. Please read the comments for an explanation of the bug.
# Tukey
if((method == "tukey") || (method == "best.fast") || (
method == "best")) {
# check validity of Tukey method using Hayter's condition
k <- 1
while(k * (k - 1) < 2 * p) k <- k + 1
### this is where tukey.valid has to be INITIALIZED
# this is the CORRECTED position.
tukey.valid<-F
if(valid.check) {
# this is the original position
# NOTE tukey.valid is initialized only if valid.check=T
tukey.valid <- F
if(k * (k - 1) == 2 * p) {
vardiffs <- diag(LpVL)
regmat <- matrix(0, p, k)
count <- 1
for(i in 1:(k - 1)) {
for(j in (i + 1):k) {
regmat[count, i] <- regmat[count,
j] <- 1
count <- count + 1
}
}
options(warn = -1)
lsout <- lsfit(regmat, vardiffs, int
= F)
options(warn = 0)
if((min(lsout$coef) > 0) & (max(abs(
residuals(lsout))) < 0.0001))
tukey.valid <- T
if(k <= 3)
tukey.valid <- T
}
if((method == "tukey") & (!tukey.valid)
)
return(
"Tukey's method cannot be shown valid using Hayter's (1989 JASA)
sufficient condition"
)
}
### here is where the bug has its effect
# if tukey.valid is NOT initialized at this point
# there is an ERROR
if((tukey.valid) | (!valid.check))
dvalue["tukey"] <- (1/sqrt(2)) * qtukey(
1 - alpha, k, df.residual)
}
-----------------------------------------------------------------------
This message was distributed by s-news@wubios.wustl.edu. To unsubscribe
send e-mail to s-news-request@wubios.wustl.edu with the BODY of the
message: unsubscribe s-news