% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/gsvaNewAPI.R
\name{readGMT}
\alias{readGMT}
\title{Import Gene Sets from a GMT File}
\usage{
readGMT(
  con,
  sep = "\\t",
  geneIdType = "auto",
  collectionType = NullCollection(),
  valueType = c("GeneSetCollection", "list"),
  deduplUse = c("first", "drop", "union", "smallest", "largest"),
  ...
)
}
\arguments{
\item{con}{A connection object or a non-empty character string of length 1
containing e.g. the filename or URL of a (possibly compressed) GMT file.}

\item{sep}{The character string separating members of each gene set in the
GMT file.}

\item{geneIdType}{By default a character vector of length 1 with the special
value \code{"auto"} or an object of a subclass of \code{GeneIdentifierType}.  If set
to \code{"auto"}, the function will try to derive the gene ID type from the
gene IDs read from the GMT file using \code{\link{guessGeneIdType}}.
Other values, including \code{NULL}, will be ignored with a warning and
\code{geneIdType=NullIdentifier()} will be used instead.
Depending on the value of argument \code{valueType}, the gene ID type of the
resulting list or of all \code{GeneSet} objects in the resulting
\code{GeneSetCollection} will be set to this value.}

\item{collectionType}{Only used when \code{valueType == "GeneSetCollection"}. See
\code{getGmt} for more information.}

\item{valueType}{A character vector of length 1 specifying the desired type
of return value.  It must be one of:
\itemize{
\item \code{GeneSetCollection} (the default): a \code{GeneSetCollection} object as defined
and described by package \code{GSEABase}.
\item \code{list}: a named list of gene sets represented as character vectors of gene IDs.
This format is much simpler and cannot store the metadata required for automatic
mapping of gene IDs.
}}

\item{deduplUse}{A character vector of length 1 specifying one of several
methods to handle duplicated gene set names.
Duplicated gene set names are explicitly forbidden by the
\href{https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}{GMT file format specification}
but can nevertheless be encountered in the wild.
The available choices are:
\itemize{
\item \code{first} (the default): drops all gene sets whose names are duplicated
according to the base R function \code{duplicated()} and retains only the first
occurrence of a gene set name.
\item \code{drop}: removes \emph{all} gene sets that have a duplicated name, including
the first occurrence of each such name.
\item \code{union}: replaces gene sets with duplicated names by a single gene set
containing the union of all their gene IDs.
\item \code{smallest}: drops gene sets with duplicated names and retains only the
smallest of them, i.e. the one with the fewest gene IDs.  If there are
several smallest gene sets, the first will be selected.
\item \code{largest}: drops gene sets with duplicated names and retains only the
largest of them, i.e. the one with the most gene IDs.  If there are
several largest gene sets, the first will be selected.
}}

\item{...}{Further arguments passed on to \code{readLines()}.}
}
\value{
The gene sets imported from the GMT file, with duplicate gene sets
resolved according to argument \code{deduplUse} and in the format determined by
argument \code{valueType}.
}
\description{
Imports a list of gene sets from a GMT (Gene Matrix Transposed)
format file, offering a choice of ways to handle duplicated gene set names.
}
\examples{
library(GSVA)
suppressPackageStartupMessages(library(GSVAdata))

fname <- file.path(system.file("extdata", package="GSVAdata"),
   "c2.subsetdups.v7.5.symbols.gmt.gz")

## by default, guess geneIdType from content and return a GeneSetCollection
genesets <- readGMT(fname)
genesets

## how to manually override the geneIdType
genesets <- readGMT(fname, geneIdType=NullIdentifier())
genesets

## how to drop *all* gene sets with duplicated names (instead of dropping
## only the repeated occurrences and keeping the first one)
genesets <- readGMT(fname, deduplUse="drop")
genesets

## return a simple list instead of a GeneSetCollection
genesets <- readGMT(fname, valueType="list")
head(genesets, 2)

## the list has a geneIdType, too
gsvaAnnotation(genesets)

}
\seealso{
\code{\link{readLines}},
\code{\link[GSEABase:GeneSetCollection-class]{GeneSetCollection}},
\code{\link[GSEABase:GeneIdentifierType-class]{GeneIdentifierType}},
\code{\link[GSEABase]{getGmt}}
}
