% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/AllMethods.R, R/wmwTest.R
\name{wmwTest}
\alias{wmwTest}
\alias{wmwTest,matrix,IndexList-method}
\alias{wmwTest,numeric,IndexList-method}
\alias{wmwTest,matrix,GmtList-method}
\alias{wmwTest,eSet,GmtList-method}
\alias{wmwTest,eSet,numeric-method}
\alias{wmwTest,eSet,logical-method}
\alias{wmwTest,eSet,list-method}
\alias{wmwTest,ANY,numeric-method}
\alias{wmwTest,ANY,logical-method}
\alias{wmwTest,ANY,list-method}
\alias{wmwTest,matrix,SignedIndexList-method}
\alias{wmwTest,matrix,SignedGenesets-method}
\alias{wmwTest,numeric,SignedIndexList-method}
\alias{wmwTest,eSet,SignedIndexList-method}
\alias{wmwTest,eSet,SignedGenesets-method}
\title{Wilcoxon-Mann-Whitney rank sum test for high-throughput expression
profiling data}
\usage{
wmwTest(
  x,
  indexList,
  col = "GeneSymbol",
  valType = c("p.greater", "p.less", "p.two.sided", "U", "abs.log10p.greater",
    "log10p.less", "abs.log10p.two.sided", "Q", "r", "f", "U1", "U2"),
  simplify = TRUE
)

\S4method{wmwTest}{matrix,IndexList}(x, indexList, valType = "p.greater", simplify = TRUE)

\S4method{wmwTest}{numeric,IndexList}(x, indexList, valType = "p.greater", simplify = TRUE)

\S4method{wmwTest}{matrix,GmtList}(x, indexList, valType = "p.greater", simplify = TRUE)

\S4method{wmwTest}{eSet,GmtList}(
  x,
  indexList,
  col = "GeneSymbol",
  valType = "p.greater",
  simplify = TRUE
)

\S4method{wmwTest}{eSet,numeric}(
  x,
  indexList,
  col = "GeneSymbol",
  valType = "p.greater",
  simplify = TRUE
)

\S4method{wmwTest}{eSet,logical}(
  x,
  indexList,
  col = "GeneSymbol",
  valType = "p.greater",
  simplify = TRUE
)

\S4method{wmwTest}{eSet,list}(
  x,
  indexList,
  col = "GeneSymbol",
  valType = "p.greater",
  simplify = TRUE
)

\S4method{wmwTest}{ANY,numeric}(x, indexList, valType = "p.greater", simplify = TRUE)

\S4method{wmwTest}{ANY,logical}(x, indexList, valType = "p.greater", simplify = TRUE)

\S4method{wmwTest}{ANY,list}(x, indexList, valType = "p.greater", simplify = TRUE)

\S4method{wmwTest}{matrix,SignedIndexList}(x, indexList, valType, simplify = TRUE)

\S4method{wmwTest}{matrix,SignedGenesets}(x, indexList, valType, simplify = TRUE)

\S4method{wmwTest}{numeric,SignedIndexList}(x, indexList, valType, simplify = TRUE)

\S4method{wmwTest}{eSet,SignedIndexList}(x, indexList, valType, simplify = TRUE)

\S4method{wmwTest}{eSet,SignedGenesets}(
  x,
  indexList,
  col = "GeneSymbol",
  valType = c("p.greater", "p.less", "p.two.sided", "U", "abs.log10p.greater",
    "log10p.less", "abs.log10p.two.sided", "Q", "r", "f", "U1", "U2"),
  simplify = TRUE
)
}
\arguments{
\item{x}{A numeric matrix. All other data types (e.g. numeric vectors
or \code{ExpressionSet} objects) are coerced into matrix.}

\item{indexList}{A list of integer indices (starting from 1) indicating
signature genes. Can be of length zero. Other data types (e.g. a list
of numeric or logical vectors, or a numeric or logical vector) are
coerced into such a list. See \code{details} below for a special case
using GMT files.}

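\item{col}{A string giving the name of the column in the \code{featureData}
of an \code{eSet} that contains the gene symbols (default: \code{GeneSymbol}).
Used by methods where \code{x} is an \code{eSet}.}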

\item{valType}{The value type to be returned. Allowed values
include \code{p.greater}, \code{p.less}, \code{abs.log10p.greater} and
\code{log10p.less} (one-sided tests), \code{p.two.sided}, and the \code{U}
statistic (or more specifically, either \code{U1} or \code{U2}), and a few
other variants. See details below.}

\item{simplify}{Logical. If \code{FALSE}, the return value is in matrix
format; if \code{TRUE} (default), the results are simplified into
vectors when possible.}
}
\value{
A numeric matrix or vector containing the statistic.
}
\description{
wmwTest is a highly efficient Wilcoxon-Mann-Whitney rank sum
test for high-dimensional data, such as gene expression profiling. For datasets with
more than 100 features (genes), the function can be more than 1,000
times faster than native R implementations (\code{wilcox.test} in
\code{stats}, or \code{rankSumTestWithCorrelation} in \code{limma}).
}
\details{
The basic application of the function is to test the enrichment of
gene sets in expression profiling data or differentially expressed
data (the matrix with feature/gene in rows and samples in columns).

A special case is when \code{x} is an \code{eSet} object
(e.g. \code{ExpressionSet}), and \code{indexList} is a list returned
from the \code{readGmt} function. In this case, the only requirement is
that one column named \code{GeneSymbol} in the \code{featureData}
contains the gene symbols used in the GMT file. The same applies to signed GMT files. See the example below.

Besides the conventional value types such as \sQuote{p.greater},
\sQuote{p.less}, \sQuote{p.two.sided}, and \sQuote{U} (the U-statistic),
\code{wmwTest} (from version 0.99-1) provides further value types:
\code{abs.log10p.greater} and \code{log10p.less} perform log10
transformation on respective \emph{p}-values and give the
transformed value a proper sign (positive for greater than, and
negative for less than); \code{abs.log10p.two.sided} transforms
two-sided \emph{p}-values to non-negative values; and \code{Q} score
reports absolute log10-transformation of \emph{p}-value of the
two-sided variant, and gives a proper sign to it, depending on whether it is
greater than (positive) or less than (negative).

From version 1.19.1, the rank-biserial correlation coefficient (\sQuote{r})
and the common language effect size (\sQuote{f}) are supported value types.

Before version 1.19.3, the \sQuote{U} statistic returned is in fact
\sQuote{U2}. From version 1.19.3, \sQuote{U1} is returned when \sQuote{U} is
used, and users can specify additional parameter values \sQuote{U1} and
\sQuote{U2}. The sum of \sQuote{U1} and \sQuote{U2} is the product of the
sizes of two vectors to be compared.
}
\section{Methods (by class)}{
\itemize{
\item \code{x = matrix,indexList = IndexList}: \code{x} is a \code{matrix} and \code{indexList} is an \code{IndexList}

\item \code{x = numeric,indexList = IndexList}: \code{x} is a \code{numeric} and \code{indexList} is an \code{IndexList}

\item \code{x = matrix,indexList = GmtList}: \code{x} is a \code{matrix} and \code{indexList} is a \code{GmtList}

\item \code{x = eSet,indexList = GmtList}: \code{x} is an \code{eSet} and \code{indexList} is a \code{GmtList}

\item \code{x = eSet,indexList = numeric}: \code{x} is an \code{eSet} and \code{indexList} is a \code{numeric}

\item \code{x = eSet,indexList = logical}: \code{x} is an \code{eSet} and \code{indexList} is a \code{logical}

\item \code{x = eSet,indexList = list}: \code{x} is an \code{eSet} and \code{indexList} is a \code{list}

\item \code{x = ANY,indexList = numeric}: \code{x} is \code{ANY} and \code{indexList} is a \code{numeric}

\item \code{x = ANY,indexList = logical}: \code{x} is \code{ANY} and \code{indexList} is a \code{logical}

\item \code{x = ANY,indexList = list}: \code{x} is \code{ANY} and \code{indexList} is a \code{list}

\item \code{x = matrix,indexList = SignedIndexList}: \code{x} is a \code{matrix} and \code{indexList} is a 
\code{SignedIndexList}

\item \code{x = matrix,indexList = SignedGenesets}: \code{x} is a \code{matrix} and \code{indexList} is a
\code{SignedGenesets}

\item \code{x = numeric,indexList = SignedIndexList}: \code{x} is a \code{numeric} and \code{indexList} is a 
\code{SignedIndexList}

\item \code{x = eSet,indexList = SignedIndexList}: \code{x} is a \code{eSet} and \code{indexList} is a 
\code{SignedIndexList}

\item \code{x = eSet,indexList = SignedGenesets}: \code{x} is an \code{eSet} and \code{indexList} is a
\code{SignedGenesets}
}}

\note{
The function has been optimized for expression profiling data. It
avoids repetitive ranking of data as done by native R implementations
and uses efficient C code to increase the performance and control
memory use. Simulation studies using expression profiles of 22000
genes in 2000 samples and 200 gene sets suggested that the C
implementation can be >1000 times faster than the R
implementation. Further acceleration is possible by calling the
function in parallel with \code{mclapply} from the \code{parallel} package.
}
\examples{
## R-native data structures
set.seed(1887)
rd <- rnorm(1000)
rl <- sample(c(TRUE, FALSE), 1000, replace=TRUE)
wmwTest(rd, rl, valType="p.two.sided")
wmwTest(rd, which(rl), valType="p.two.sided")
rd1 <- rd + ifelse(rl, 0.5, 0)
wmwTest(rd1, rl, valType="p.greater")
wmwTest(rd1, rl, valType="U")
rd2 <- rd - ifelse(rl, 0.2, 0)
wmwTest(rd2, rl, valType="p.greater")
wmwTest(rd2, rl, valType="p.two.sided")
wmwTest(rd2, rl, valType="p.less")
wmwTest(rd2, rl, valType="r")
wmwTest(rd2, rl, valType="f")

## matrix forms
rmat <- matrix(c(rd, rd1, rd2), ncol=3, byrow=FALSE)
wmwTest(rmat, rl, valType="p.two.sided")
wmwTest(rmat, rl, valType="p.greater")

wmwTest(rmat, which(rl), valType="p.two.sided")
wmwTest(rmat, which(rl), valType="p.greater")

## other valTypes
wmwTest(rmat, which(rl), valType="U")
wmwTest(rmat, which(rl), valType="abs.log10p.greater")
wmwTest(rmat, which(rl), valType="log10p.less")
wmwTest(rmat, which(rl), valType="abs.log10p.two.sided")
wmwTest(rmat, which(rl), valType="Q")
wmwTest(rmat, which(rl), valType="r")
wmwTest(rmat, which(rl), valType="f")

## using ExpressionSet
data(sample.ExpressionSet)
testSet <- sample.ExpressionSet
fData(testSet)$GeneSymbol <- paste("GENE_",1:nrow(testSet), sep="")
mySig1 <- sample(c(TRUE, FALSE), nrow(testSet), prob=c(0.25, 0.75), replace=TRUE)
wmwTest(testSet, which(mySig1), valType="p.greater")

## using integer
exprs(testSet)[,1L] <- exprs(testSet)[,1L] + ifelse(mySig1, 50, 0)
wmwTest(testSet, which(mySig1), valType="p.greater")

## using lists
mySig2 <- sample(c(TRUE, FALSE), nrow(testSet), prob=c(0.6, 0.4), replace=TRUE)
wmwTest(testSet, list(first=mySig1, second=mySig2))
## using GMT file
gmt_file <- system.file("extdata/exp.tissuemark.affy.roche.symbols.gmt", package="BioQC")
gmt_list <- readGmt(gmt_file)

gss <- sample(unlist(sapply(gmt_list, function(x) x$genes)), 1000)
eset<-new("ExpressionSet",
         exprs=matrix(rnorm(10000), nrow=1000L),
         phenoData=new("AnnotatedDataFrame", data.frame(Sample=LETTERS[1:10])),
         featureData=new("AnnotatedDataFrame",data.frame(GeneSymbol=gss)))
esetWmwRes <- wmwTest(eset ,gmt_list, valType="p.greater")
summary(esetWmwRes)

## using signed GMT file
signed_gmt_file <- system.file("extdata/test.gmt", package="BioQC")
signed_gmt <- readSignedGmt(signed_gmt_file)
esetSignedWmwRes <- wmwTest(eset, signed_gmt, valType="p.greater")

esetMat <- exprs(eset); rownames(esetMat) <- fData(eset)$GeneSymbol
esetSignedWmwRes2 <- wmwTest(esetMat, signed_gmt, valType="p.greater")
}
\references{
Barry, W.T., Nobel, A.B., and Wright, F.A. (2008). A statistical framework for testing functional categories in microarray data. \emph{Annals of Applied Statistics} 2, 286-315.

Wu, D, and Smyth, GK (2012). Camera: a competitive gene set test
accounting for inter-gene correlation. \emph{Nucleic Acids Research} 40(17):e133.

Zar, JH (1999). \emph{Biostatistical Analysis 4th Edition}. Prentice-Hall International, Upper Saddle River, New Jersey.
}
\seealso{
\code{wilcox.test} in the \code{stats} package, and \code{rankSumTestWithCorrelation} in
the \code{limma} package.
}
\author{
Jitao David Zhang <jitao_david.zhang@roche.com>, with critical inputs
from Jan Aettig and Iakov Davydov about U statistics.
}
