% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/AllGenerics.R
\name{demuxmix}
\alias{demuxmix}
\alias{demuxmix,matrix,missing-method}
\alias{demuxmix,matrix,numeric-method}
\alias{demuxmix,Matrix,missing-method}
\alias{demuxmix,Matrix,numeric-method}
\title{Demultiplexing using mixture models}
\usage{
demuxmix(
  hto,
  rna,
  pAcpt = 0.9^nrow(hto),
  model = "auto",
  alpha = 0.9,
  beta = 0.9,
  correctTails = TRUE,
  tol = 10^-5,
  maxIter = 100,
  k.hto = 1.5,
  k.rna = 1.5,
  clusterInit = list()
)
}
\arguments{
\item{hto}{A matrix of HTO counts where each row corresponds to a hashtag
and each column to a droplet. The matrix must have unique row names.}

\item{rna}{An optional numeric vector with the number of genes detected in
the RNA library for each droplet. Its length must equal the number of
columns in \code{hto}. If missing, parameter \code{model} must be set to
"naive".}

\item{pAcpt}{Acceptance probability that must be reached in order to
assign a droplet to a hashtag. Droplets with lower probabilities are
classified as "uncertain". This parameter can be changed after running
demuxmix by applying \code{\link{pAcpt<-}} to the returned object.}

\item{model}{A character specifying the type of mixture model to be used.
Either "naive", "regpos", "reg" or "auto". The last three options
require parameter \code{rna} to be specified. "auto" selects the best
model based on the classification error probability summed over all
droplets.}

\item{alpha}{Threshold defining the left tail of the mixture
distribution where droplets should not be classified as "positive".
Threshold must be between 0 and 1. See details.}

\item{beta}{Threshold for defining the right tail of the mixture
distribution where droplets should not be classified as "negative".
Threshold must be between 0 and 1. See details.}

\item{correctTails}{If \code{TRUE}, droplets meeting the threshold defined by
\code{alpha} (\code{beta}) are classified as "negative" ("positive") even
if the mixture model suggests a different classification. See details.}

\item{tol}{Convergence criterion for the EM algorithm used to fit the mixture
models. The algorithm stops when the relative increase of the log
likelihood is less than or equal to \code{tol}.}

\item{maxIter}{Maximum number of iterations for the EM algorithm and
for the alternating iteration process fitting the NB regression models
within each EM iteration.}

\item{k.hto}{Factor to define outliers in the HTO counts. Among droplets
positive for the hashtag based on initial clustering, HTO counts
larger than the 0.75 quantile + \code{k.hto} * IQR are considered
outliers. See details.}

\item{k.rna}{Factor to define outliers in the numbers of detected genes.
Numbers of detected genes larger than the 0.75 quantile +
\code{k.rna} * IQR are considered outliers. See details.}

\item{clusterInit}{Optional list of numeric vectors to manually specify the
droplet-to-component assignment used to initialize the EM algorithm. The
name of each list element must match a row name of \code{hto}. The length
of each element must match the number of columns of \code{hto}. Only the
values \code{1} and \code{2} are allowed, where \code{1} indicates the
respective droplet belongs to the negative component with lower mean count.}
}
\value{
\code{demuxmix} returns an object of class \code{\link{Demuxmix}}.
  Classification results can be extracted with \code{\link{dmmClassify}}.
  Various plot methods (see below) are available to assess the model fit.
}
\description{
This method uses mixture models as a probabilistic framework to assign
droplets to hashtags and to identify multiplets based on counts obtained from
a hashtag oligonucleotide (HTO) library. If the numbers of detected genes
from the corresponding RNA library are passed as the second argument,
regression mixture models may be used, which often improves the
classification accuracy by leveraging the relationship between HTO and RNA
read counts.
}
\details{
The single cell dataset should undergo basic filtering to
  remove low quality or empty droplets before calling this function,
  but the HTO counts should not be transformed or pre-processed
  otherwise. The number of detected genes passed via the optional argument
  \code{rna} is typically defined as the number of genes in the RNA library
  with at least one read.
  
  The method fits a two-component negative binomial mixture model for each
  hashtag. The type of mixture model used can be specified by \code{model}.
  "naive" fits a standard mixture model. "reg" fits a regression mixture
  model using the given number of detected genes (\code{rna}) as covariate
  in the regression model. "regpos" uses a regression model only for the
  positive but not for the negative component. If model is set to "auto",
  all three models are fitted and the model with the lowest posterior
  classification error probability summed over all droplets is selected.
  Details are stored in the slot \code{modelSelection} of the returned
  object. In most real HTO datasets, regression mixture models outperform
  the naive mixture model.
  
  The \code{demuxmix} method consists of 3 steps, which can be tuned
  by the respective parameters. The default settings work well for a wide
  range of datasets and usually do not need to be adapted unless any issues
  arise during model fitting and quality control. An exception is the
  acceptance probability \code{pAcpt}, which may be set to a smaller or
  larger value depending on the desired trade-off between the number of
  unclassified/discarded droplets and the expected error rate. Steps 1 and 2
  are executed for each HTO separately; step 3 classifies the droplets based
  on the results from all HTOs. Therefore, parameters affecting steps 1 and 2
  (incl. \code{model}) can be specified for each HTO using a vector with
  one element per HTO. Shorter vectors will be extended.
  
  \enumerate{
    \item Preprocessing (\code{k.hto}, \code{k.rna}). Droplets are clustered
    into a negative and a positive group based on the HTO counts using
    k-means. Droplets in the positive group with HTO counts larger than the
    0.75 quantile + \code{k.hto} times the IQR of the HTO counts in the
    positive group are marked as outliers. Outliers are still classified but
    will not be used to fit the mixture model for this HTO in step 2. If
    the parameter \code{rna} is given and the \code{model} is "reg" or
    "regpos", all droplets (both groups) with number of detected genes larger
    than the 0.75 quantile + \code{k.rna} times the IQR are marked as outliers, too,
    since these cells could affect the fitting of the regression model
    negatively. If more than 15\% of the cells are marked as outliers,
    a warning message is printed and larger values for \code{k.hto}
    and \code{k.rna} might be preferable. If the model fit seems to be
    affected by a few large values (very high variance of the positive
    component), smaller values should be chosen. On rare occasions, k-means
    clustering can result in inadequate clusters, and the derived
    distributional parameters are invalid. Poor clustering can be observed
    if (i) the HTO failed and the distribution is not bimodal or (ii) the
    fraction of positive cells tagged by the HTO is very small. An error
    message is displayed, and if (ii) is determined as the cause, an initial
    manual assignment can be specified by \code{clusterInit} to bypass
    the k-means clustering.
    
    \item Model fitting (\code{model}, \code{alpha}, \code{beta},
    \code{correctTails}, \code{tol}, \code{maxIter}). An EM algorithm is
    used to fit the mixture model to the HTO counts which were not marked as
    outliers in step 1. \code{maxIter} defines the maximum number of
    iterations of the EM algorithm, and, if \code{model} is "reg", "regpos"
    or "auto", it also defines the maximum number of iterations to fit the
    negative binomial regression models within each EM iteration. \code{tol}
    defines the convergence criterion for the EM algorithm. The algorithm
    stops if \eqn{\Delta LL/LL \le} \code{tol}. After the mixture model has
    been fitted, the posterior probability that the i-th droplet is positive
    for the hashtag \eqn{P(C_i = pos)} is calculated. Depending on the given
    data, these probabilities can be inaccurate at the far tails of the
    mixture distribution. Specifically, a positive component with large
    variance can have a larger probability mass close to zero than the negative
    component, if the negative component is narrow and shifted to the right
    due to background HTO reads. If \code{correctTails} is \code{TRUE}, the
    following two rules are applied to avoid false classifications at the
    far tails. First, if the i-th droplet is classified as positive based on
    the posterior probability, but the probability of detecting at least the
    observed \eqn{y_i} HTO counts in a negative droplet is
    \eqn{P(Y \ge y_i | neg)} > \code{alpha}, then \eqn{P(C_i = pos)} is set
    to 0 (left tail). Second, if the i-th droplet is classified as negative,
    but \eqn{P(Y \le y_i | pos)} > \code{beta}, \eqn{P(C_i = pos)} is set to
    1 (right tail). For most datasets, these rules will not apply and it is
    recommended not to change these values. If \code{correctTails} is
    \code{FALSE}, posterior probabilities will not be altered, but potential
    problems at the tails will still be logged in the slot
    \code{tailException} of the returned object.
    
    \item Classification (\code{pAcpt}). The posterior probabilities
    obtained from the models fitted to each HTO separately are 
    used to calculate the most likely class for each cell. The following
    classes are considered: one class for each HTO (singlets), one
    class for each possible multiplet, and a negative class representing
    droplets negative for all HTOs (i.e. empty droplets or droplets
    containing only cell debris). Each droplet is assigned to the most
    likely class unless the probability is smaller than \code{pAcpt},
    in which case the droplet is assigned to the class "uncertain".
    Classification results can be accessed by running
    \code{\link{dmmClassify}} on an object returned by \code{demuxmix}. The
    acceptance probability can be changed after running \code{demuxmix} using
    \code{\link{pAcpt<-}}.
  }
}
\examples{
# Simulate a dataset from a class matrix with two rows and 420 columns;
# columns 201-220 are TRUE in both rows.
set.seed(2642)
simdata <- dmmSimulateHto(class = rbind(c(rep(TRUE, 220), rep(FALSE, 200)),
                                        c(rep(FALSE, 200), rep(TRUE, 220))))

# Naive mixture model: no RNA information is passed.
dmm <- demuxmix(simdata$hto, model = "naive")
dmm
table(dmmClassify(dmm)$HTO, simdata$groundTruth)

# Pass the number of detected genes; model is left at its default "auto".
dmmreg <- demuxmix(simdata$hto, rna = simdata$rna)
dmmreg
table(dmmClassify(dmmreg)$HTO, simdata$groundTruth)
summary(dmmreg)

# The acceptance probability can be changed after fitting.
pAcpt(dmmreg) <- 0.5
summary(dmmreg)

# Assess the model fit.
dmmOverlap(dmmreg)
\donttest{
plotDmmHistogram(dmmreg)
plotDmmScatter(dmmreg, hto="HTO_1")}

}
\seealso{
\code{\link{dmmClassify}} to extract the classification results
  and \code{\link{summary}} to summarize the results.
  \code{\link{plotDmmHistogram}}, \code{\link{plotDmmScatter}},
  \code{\link{plotDmmPosteriorP}}, and \code{\link{dmmOverlap}} to assess
  the model fit.
}
