% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/automorphism_prob.R
\name{automorphism_prob}
\alias{automorphism_prob}
\alias{automorphism_prob,AutomorphismByCoef-method}
\alias{automorphism_prob,AutomorphismByCoefList-method}
\title{Autmorphism Probability}
\usage{
automorphism_prob(x, ...)

\S4method{automorphism_prob}{AutomorphismByCoef}(
  x,
  initial_alpha = NULL,
  method = c("L-BFGS-B", "Nelder-Mead", "BFGS", "CG", "SANN", "Brent"),
  maxit = 500,
  abstol = 10^-8,
  ...
)

\S4method{automorphism_prob}{AutomorphismByCoefList}(
  x,
  initial_alpha = NULL,
  method = c("L-BFGS-B", "Nelder-Mead", "BFGS", "CG", "SANN", "Brent"),
  maxit = 500,
  abstol = 10^-8
)
}
\arguments{
\item{x}{An AutomorphismByCoefList-class object returned by function
\link{automorphism_bycoef}.}

\item{...}{Not in use yet.}

\item{initial_alpha}{A vector of initial guess values for pseudo counts
\eqn{\alpha_i} (see Description). Default is: \eqn{\alpha_i = 1 / (n_i + 1)},
which corresponds to \eqn{initial_alpha = NULL}.}

\item{method, maxit, abstol}{Parameter values to pass into
\code{\link[stats]{optim}}.}
}
\value{
A data frame with the posterior probabilities.
}
\description{
This function applies a Dirichlet-Multinomial Modelling (in Bayesian
framework) to compute the posterior probability of each
\emph{type of mutational event}. DNA bases are classified based on the
physicochemical criteria used to ordering the set of codons: number of
hydrogen bonds (strong-weak, S-W), chemical type (purine-pyrimidine, Y-R),
and chemical groups (amino versus keto, M-K) (see reference 4). Preserved
codon positions are labeled with letter “H”.

As a result, mutational events are grouped by type of mutation covering all
the possible combinations of symbols: "Y", "R", "M", "W", "K", "S", and "H",
for example:  "YSH", "RSH", "MSH", "WSH", "KSH", "SSH", "HSH", "YHH", "RHH",
and so on. Insertion/deletion mutations are not considered.
\subsection{Maximum Likelihood Estimation (MLE) for Dirichlet Parameters}{

Given the observed data \eqn{n = (n_1, ..., n_k)}, where \eqn{n_i} is the
frequency for the mutation type \eqn{i}, we want to estimate the parameters
of the Dirichlet distribution, \eqn{\alpha = (\alpha_1, ..., \alpha_k)},
that maximize the marginal likelihood.
}

\subsection{Marginal Likelihood}{

The marginal likelihood of the data under the Dirichlet-multinomial model is
given by

\deqn{P\left(n\middle|\alpha\right) = \frac{N!}{\prod_{i=1}^{k}n_i}
\frac{\Gamma\left(\sum_{i=1}^{k}\alpha_i\right)}{\Gamma
\left(N+\sum_{i=1}^{k}\alpha_i\right)}\prod_{i=1}^{k}\frac{\Gamma
\left(n_i+\alpha_i\right)}{\Gamma\left(\alpha_i\right)}}

where \eqn{N = \sum_{i=1}^{k}n_i}.
}

\subsection{Optimization}{

To perform MLE, we maximize the log of this likelihood:
\eqn{log(P\left(n\middle|\alpha\right))}. That is, we aim to maximize this
log-likelihood with respect to \eqn{\alpha}. This is done numerically
because there's no closed-form solution. Here, we use:

\deqn{Arg\max_{\alpha} {\{log\left(P\left(n\middle|\alpha\right)\right\}}}

with initial guess set as \eqn{\alpha_i = 1 / (n_i + 1)}.
}

\subsection{Posterior Distribution}{

Let be \eqn{\theta} the vector of probabilities for each mutation type. It's
the parameter we're estimating, which represents the probabilities of
observing each mutation type in the multinomial distribution. The conjugate
distribution in this context refers to the Dirichlet distribution, which is
the prior distribution for \eqn{\theta}. When we have observed data, the
posterior distribution of \eqn{\theta} given this data is also a Dirichlet
distribution due to the conjugate property.

The prior distribution, before observing any data, our belief about the
distribution of \eqn{\theta} is given by:

\deqn{\theta\sim\ Dirichlet\left(\alpha_1,\alpha_k,\ldots,\alpha_k\right)}

where the \eqn{\alpha_i} are known as initial concentration parameters,
which represents our initial belief or assumption about the distribution
of the mutation types. These parameters control how the probability mass is
distributed among the mutation types. We might set these initial parameters
based on some prior knowledge or simply to provide a non-informative prior.

Once we have the MLE for \eqn{\alpha}, the posterior distribution of
\eqn{\theta} given the data updates these parameters to incorporate the
observed data:

\deqn{\theta\mid\ n\sim\ Dirichlet\left(\alpha_1^\ast+n1,\alpha_k^\ast+n_2,
\ldots,\alpha_k^\ast+n_k\right)}

where \eqn{\alpha^\ast_i} are the MLE estimates, i.e.,
\eqn{\alpha_i^\ast+n_i} are our updated belief about the frequencies of
mutation types in the population sample.

The expected values of \eqn{\theta_i}, given the data under the posterior
Dirichlet distribution is:

\deqn{E\left[\theta_i\right]=\frac{\alpha_i^*+n_i}
{\sum_{j=1}^{k}\left(\alpha_j^*+n_j\right)}}

These expected values directly corresponds to the "posterior probability" of
observing mutation type \eqn{i}.

This approach provides a rigorous estimation of the Dirichlet parameters
under the Dirichlet-multinomial model using MLE.
}
}
\examples{
## Load the data set
data("autby_coef", package = "GenomAutomorphism")
post_prob <- automorphism_prob(autby_coef[1:10])
head(post_prob,10)
}
\seealso{
\link{automorphism_bycoef}
}
