% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/collapseMutantsByAA.R
\name{collapseMutantsBySimilarity}
\alias{collapseMutantsBySimilarity}
\alias{collapseMutantsByAA}
\alias{collapseMutants}
\title{Collapse mutants by similarity}
\usage{
collapseMutantsBySimilarity(
  se,
  assayName,
  scoreMethod = "rowSum",
  sequenceCol = "sequence",
  collapseMaxDist = 0,
  collapseMinScore = 0,
  collapseMinRatio = 0,
  verbose = TRUE
)

collapseMutantsByAA(se)

collapseMutants(se, nameCol)
}
\arguments{
\item{se}{A \code{\link[SummarizedExperiment]{SummarizedExperiment}}
generated by \code{\link{summarizeExperiment}}}

\item{assayName}{The name of the assay that will be used to calculate a 
"score" (typically derived from the read counts) for each variant.}

\item{scoreMethod}{Character scalar giving the approach used to calculate 
ranking scores from the assay defined by \code{assayName}. Currently, 
this can be one of \code{"rowSum"} or \code{"rowMean"}. All filtering
criteria will be applied to these scores.}

\item{sequenceCol}{Character scalar giving the name of the column in 
\code{rowData(se)} that contains the nucleotide sequence of the 
variants.}

\item{collapseMaxDist}{Numeric scalar defining the tolerance for collapsing 
similar sequences. If the value is in [0, 1), it defines the maximal 
Hamming distance in terms of a fraction of sequence length:
(\code{round(collapseMaxDist * nchar(sequence))}).
A value greater than or equal to 1 is rounded and directly used as the 
maximum allowed Hamming distance. Note that sequences can only be
collapsed if they are all of the same length.}

\item{collapseMinScore}{Numeric scalar, indicating the minimum score 
required for a sequence to be considered for collapsing with similar 
sequences.}

\item{collapseMinRatio}{Numeric scalar. During collapsing of
similar sequences, a low-frequency sequence will be collapsed 
with a higher-frequency sequence only if the ratio between the 
high-frequency and the low-frequency scores is at least this 
high. The default value of 0 indicates that no such check is performed.}

\item{verbose}{Logical, whether to print progress messages.}

\item{nameCol}{A character scalar providing the column of 
\code{rowData(se)} that contains the amino acid mutant names (that will 
be the new row names).}
}
\value{
A \code{\link[SummarizedExperiment]{SummarizedExperiment}} where
    counts have been aggregated across the variants assigned to the same 
    group (e.g., by the mutated amino acid(s) for 
    \code{collapseMutantsByAA}).
}
\description{
These functions can be used to collapse variants, either by similarity or 
according to a pre-defined grouping. The functions \code{collapseMutants} 
and \code{collapseMutantsByAA} assume that a grouping variable is available 
as a column in \code{rowData(se)} (\code{collapseMutantsByAA} is a 
convenience function for the case when this column is 
\code{"mutantNameAA"}, and is provided for backwards compatibility). The 
\code{collapseMutantsBySimilarity} function will generate the grouping 
variable based on user-provided thresholds on the sequence similarity 
(defined by the Hamming distance), and subsequently collapse based on the 
derived grouping.
}
\examples{
library(SummarizedExperiment)
se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds",
                          package = "mutscan"))[1:200, ]
## The rows of this object correspond to individual codon variants
dim(se)
head(rownames(se))

## Collapse by amino acid
sec <- collapseMutantsByAA(se)
## The rows of the collapsed object correspond to amino acid variants
dim(sec)
head(rownames(sec))
## The mutantName column contains the individual codon variants that were 
## collapsed
head(rowData(sec))

## Collapse similar sequences
sec2 <- collapseMutantsBySimilarity(
    se = se, assayName = "counts", scoreMethod = "rowSum",
    sequenceCol = "sequence", collapseMaxDist = 2,
    collapseMinScore = 0, collapseMinRatio = 0)
dim(sec2)
head(rownames(sec2))
head(rowData(sec2))
## collapsed count matrix
assay(sec2, "counts")

}
\author{
Charlotte Soneson, Michael Stadler
}
