% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{term_sim}
\alias{term_sim}
\title{Semantic similarity}
\usage{
term_sim(dag, terms, method, control = list(), verbose = simona_opt$verbose)
}
\arguments{
\item{dag}{An \code{ontology_DAG} object.}

\item{terms}{A vector of term names.}

\item{method}{A term similarity method. All available methods are in \code{\link[=all_term_sim_methods]{all_term_sim_methods()}}.}

\item{control}{A list of parameters passing to individual methods. See the subsections.}

\item{verbose}{Whether to print messages.}
}
\value{
A numeric symmetric matrix.
}
\description{
Semantic similarity
}
\section{Methods}{

\subsection{Sim_Lin_1998}{

The similarity between two terms \code{a} and \code{b} is calculated as the IC of their MICA term \code{c} normalized by the average of the IC of the two terms:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC(c)/((IC(a) + IC(b))/2) 
    = 2*IC(c)/(IC(a) + IC(b))
}\if{html}{\out{</div>}}

Although any IC method can be used here, in most applications, it is normally used together with the \emph{IC_annotation} method.

Paper link: \doi{10.5555/645527.657297}.
}


\subsection{Sim_Resnik_1999}{

The IC method is fixed to \code{IC_annotation}.

The original Resnik similarity is the IC of the MICA term. There are three ways to normalize the Resnik similarity into the scale of \verb{[0, 1]}:
\enumerate{
\item \emph{Nunif}
}

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC(c)/log(N)
}\if{html}{\out{</div>}}

where \code{N} is the total number of items annotated to the whole DAG, i.e. number of items annotated to the root. Then the IC
of a term with only one item annotated is \code{-log(1/N) = log(N)} which is the maximal IC value in the DAG.
\enumerate{
\item \emph{Nmax}
}

\code{IC_max} is the maximal IC of all terms. If there is a term with only one item annotated, \code{Nmax} is identical to the \emph{Nunif} method.

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC(c)/IC_max
}\if{html}{\out{</div>}}
\enumerate{
\item \emph{Nunivers}
}

The IC is normalized by the maximal IC of term \code{a} and \code{b}.

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC(c)/max(IC(a), IC(b))
}\if{html}{\out{</div>}}

Paper link: \doi{10.1613/jair.514}, \doi{10.1186/1471-2105-9-S5-S4}, \doi{10.1186/1471-2105-11-562}, \doi{10.1155/2013/292063}.

The normalization method can be set with the \code{norm_method} parameter:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Resnik_1999", control = list(norm_method = "Nmax"))
}\if{html}{\out{</div>}}

Possible values for the \code{norm_method} parameter are "Nunif", "Nmax", "Nunivers" and "none".
}


\subsection{Sim_FaITH_2010}{

It is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC(c)/(IC(a) + IC(b) - IC(c))
}\if{html}{\out{</div>}}

The relation between \emph{FaITH_2010} similarity and \emph{Lin_1998} similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim_FaITH = sim_Lin/(2 - sim_Lin)
}\if{html}{\out{</div>}}

Paper link: \doi{10.1007/978-3-642-17746-0_39}.
}


\subsection{Sim_Relevance_2006}{

The IC method is fixed to \code{IC_annotation}.

If \emph{Lin_1998} is regarded as a measure of how close terms \code{a} and \code{b} are to their MICA term \code{c}, the relevance method corrects it by multiplying it by
a factor which considers the specificity of how \code{c} brings the information. The factor is calculated as \code{1-p(c)} where \code{p(c)} is the annotation-based
probability \code{p(c) = k/N} where \code{k} is the number of items annotated to \code{c} and \code{N} is the total number of items annotated to the DAG. Then
the Relevance semantic similarity is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = (1 - p(c)) * sim_Lin
    = (1 - p(c)) * 2*IC(c)/(IC(a) + IC(b))
}\if{html}{\out{</div>}}

Paper link: \doi{10.1186/1471-2105-7-302}.
}


\subsection{Sim_SimIC_2010}{

The IC method is fixed to \code{IC_annotation}.

The SimIC method is an improved correction method of the Relevance method because the latter works badly when \code{p(c)} is very small. The SimIC
correction factor for MICA term \code{c} is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{1 - 1/(1 + IC(c))
}\if{html}{\out{</div>}}

Then the similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = (1 - 1/(1 + IC(c))) * sim_Lin
    = (1 - 1/(1 + IC(c))) * 2*IC(c)/(IC(a) + IC(b))
}\if{html}{\out{</div>}}

Paper link: \doi{10.48550/arXiv.1001.0958}.
}


\subsection{Sim_XGraSM_2013}{

The IC method is fixed to \code{IC_annotation}.

Being different from the "Relevance" and "SimIC_2010" methods that only use the IC of the MICA term, the \emph{XGraSM_2013} uses IC of all common ancestor terms of \code{a} and \code{b}.
First it calculates the mean IC of all common ancestor terms with positive IC values:

\if{html}{\out{<div class="sourceCode">}}\preformatted{IC_mean = mean_t(IC(t)) where t is an ancestor of both a and b, and IC(t) > 0
}\if{html}{\out{</div>}}

then similar to the \emph{Lin_1998} method, normalize to the average IC of \code{a} and \code{b}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC_mean*2/(IC(a) + IC(b))
}\if{html}{\out{</div>}}

Paper link: \doi{10.1186/1471-2105-14-284}.
}


\subsection{Sim_EISI_2015}{

The IC method is fixed to \code{IC_annotation}.

It also selects a subset of common ancestors of terms \code{a} and \code{b}. It only selects common ancestors which can reach \code{a} or \code{b} via one of its child terms
that does not belong to the common ancestors. In other words, from the common ancestor, there exists a path where
the information is uniquely transmitted to \code{a} or \code{b}, not passing the other.

Then the mean IC of the subset of common ancestors is calculated and normalized by the \emph{Lin_1998} method.

Paper link: \doi{10.1016/j.gene.2014.12.062}.
}


\subsection{Sim_AIC_2014}{

It uses the aggregate information content from ancestors. First define the semantic weight (\code{Sw}) of a term \code{t} in the DAG:

\if{html}{\out{<div class="sourceCode">}}\preformatted{Sw = 1/(1 + exp(-1/IC(t)))
}\if{html}{\out{</div>}}

Then calculate the aggregation only in the common ancestors and the aggregation
in the ancestors of the two terms \code{a} and \code{b} separately:

\if{html}{\out{<div class="sourceCode">}}\preformatted{SV_\{common ancestors\} = sum_\{t in common ancestors\}(Sw(t))
SV_a = sum_\{a' in a's ancestors\}(Sw(a'))
SV_b = sum_\{b' in b's ancestors\}(Sw(b'))
}\if{html}{\out{</div>}}

The similarity is calculated as the ratio between the aggregation on the common ancestors and the average on \code{a}'s ancestors and \code{b}'s ancestors separately.

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 2*SV_\{common_ancestors\}/(SV_a + SV_b)
}\if{html}{\out{</div>}}

Paper link: \doi{10.1109/tcbb.2013.176}.
}


\subsection{Sim_Zhang_2006}{

It uses the \emph{IC_Zhang_2006} IC method and the \emph{Lin_1998} method to calculate similarities:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 2*IC_zhang(c)/(IC_zhang(a) + IC_zhang(b))
}\if{html}{\out{</div>}}
}


\subsection{Sim_universal}{

It uses the \emph{IC_universal} IC method and the \emph{Nunivers} method to calculate similarities:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = IC_universal(c)/max(IC_universal(a), IC_universal(b))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Wang_2007}{

First, S-value of an ancestor term \code{c} on a term \code{a} (\code{S(c->a)}) is calculated (the definition of the S-value can be found in the help page of \code{\link[=term_IC]{term_IC()}}).
Similar to the \emph{Sim_AIC_2014}, aggregation only to common ancestors, to \code{a}'s ancestors and to \code{b}'s ancestors are calculated.

\if{html}{\out{<div class="sourceCode">}}\preformatted{SV_\{common ancestors\} = sum_\{c in common ancestors\}(S(c->a) + S(c->b))
SV_a = sum_\{a' in a's ancestors\}(S(a'->a))
SV_b = sum_\{b' in b's ancestors\}(S(b'->b))
}\if{html}{\out{</div>}}

Then the similarity is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = SV_\{common_ancestors\}*2/(SV_a + SV_b)
}\if{html}{\out{</div>}}

Paper link: \doi{10.1093/bioinformatics/btm087}.

The contribution of different semantic relations can be set with the \code{contribution_factor} parameter. The value should be a named numeric
vector where names should cover the relations defined in \code{relations} set in \code{\link[=create_ontology_DAG]{create_ontology_DAG()}}. For example, if there are two relations
"relation_a" and "relation_b" set in the DAG, the value for \code{contribution_factor} can be set as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Wang_2007", 
    control = list(contribution_factor = c("relation_a" = 0.8, "relation_b" = 0.6)))
}\if{html}{\out{</div>}}
}


\subsection{Sim_GOGO_2018}{

It is very similar to \emph{Sim_Wang_2007}, but with a corrected contribution factor when calculating the S-value.
From a parent term to a child term, \emph{Sim_Wang_2007} directly uses a weight for the relation between the parent
and the child, e.g. 0.8 for "is_a" relation type and 0.6 for "part_of" relation type. In \emph{Sim_GOGO_2018}, the weight
is also scaled by the total number of children of that parent:

\if{html}{\out{<div class="sourceCode">}}\preformatted{w = 1/(c + nc) + w_0
}\if{html}{\out{</div>}}

where \code{w_0} is the original contribution factor, \code{nc} is the number of child terms of the parent, \code{c} is calculated to ensure that
maximal value of \code{w} is no larger than 1, i.e. \code{c = max(w_0)/(1 - max(w_0))}, assuming minimal value of \code{nc} is 1. By default \emph{Sim_GOGO_2018}
sets contribution factor of 0.4 for "is_a" and 0.3 for "part_of", then \code{w = 1/(2/3 + nc) + w_0}.

Paper link: \doi{10.1038/s41598-018-33219-y}.

The contribution of different semantic relations can be set with the \code{contribution_factor} parameter. The value should be a named numeric
vector where names should cover the relations defined in \code{relations} set in \code{\link[=create_ontology_DAG]{create_ontology_DAG()}}. For example, if there are two relations
"relation_a" and "relation_b" set in the DAG, the value for \code{contribution_factor} can be set as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_GOGO_2018", 
    control = list(contribution_factor = c("relation_a" = 0.4, "relation_b" = 0.3)))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Rada_1989}{

It is based on the distance between term \code{a} and \code{b}. It is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1/(1 + d(a, b))
}\if{html}{\out{</div>}}

The distance can be the shortest distance between \code{a} and \code{b} or the longest distance via the LCA term.

Paper link: \doi{10.1109/21.24528}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Rada_1989",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Resnik_edge_2005}{

It is also based on the distance between term \code{a} and \code{b}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1 - d(a, b)/2/max_depth
}\if{html}{\out{</div>}}

where \code{max_depth} is the maximal depth (maximal distance from root) in the DAG. Similarly, \code{d(a, b)} can be the shortest
distance or the longest distance via LCA.

Paper link: \doi{10.1145/1097047.1097051}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Resnik_edge_2005",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Leocock_1998}{

It is similar to the \emph{Sim_Resnik_edge_2005} method, but it applies log-transformation on the distance and the depth:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1 - log(d(a, b) + 1)/log(2*max_depth + 1)
}\if{html}{\out{</div>}}

Paper link: \doi{10.1186/1471-2105-13-261}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Leocock_1998",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_WP_1994}{

It is based on the depth of the LCA term \code{c} and the longest distance between term \code{a} and \code{b}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 2*depth(c)/(len_c(a, b) + 2*depth(c))
}\if{html}{\out{</div>}}

where \code{len_c(a, b)} is the longest distance between \code{a} and \code{b} via LCA \code{c}. The denominator in the equation can also be written as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{len_c(a, b) + 2*depth(c) = depth(c) + len(c, a) + depth(c) + len(c, b)
                         = depth_c(a) + depth_c(b)
}\if{html}{\out{</div>}}

where \code{depth_c(a)} is the longest distance from root to \code{a} passing through \code{c}.

Paper link: \doi{10.3115/981732.981751}.
}


\subsection{Sim_Slimani_2006}{

It is a correction of the \emph{Sim_WP_1994} method. The correction factor for term \code{a} and \code{b} with regard to their LCA \code{c} is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{CF(a, b) = (1-lambda)*(min(depth(a), depth(b)) - depth(c)) + 
           lambda/(1 + abs(depth(a) - depth(b)))
}\if{html}{\out{</div>}}

\code{lambda} takes value of 1 if \code{a} and \code{b} are in ancestor-offspring relation, or else it takes 0.

Paper link: \url{https://zenodo.org/record/1075130}.
}


\subsection{Sim_Shenoy_2012}{

It is a correction of the \emph{Sim_WP_1994} method. The correction factor for term \code{a} and \code{b} is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{CF(a, b) = exp(-lambda*d(a, b)/max_depth)
}\if{html}{\out{</div>}}

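\code{lambda} takes value of 1 if \code{a} and \code{b} are in ancestor-offspring relation, or else it takes 0. \code{d(a, b)} can be the shortest distance between \code{a} and \code{b} or the longest distance via the LCA term.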

Paper link: \doi{10.48550/arXiv.1211.4709}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Shenoy_2012",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Pekar_2002}{

It is very similar to the \emph{Sim_WP_1994} method:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = depth(c)/(len_c(a, b) + depth(c))
    = d(root, c)/(d(c, a) + d(c, b) + d(root, c))
}\if{html}{\out{</div>}}

where \code{d(a, b)} is the longest distance between \code{a} and \code{b}.

Paper link: \url{https://aclanthology.org/C02-1090/}.
}


\subsection{Sim_Stojanovic_2001}{

It is purely based on the depth of term \code{a}, \code{b} and their LCA \code{c}.

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = depth(c)/(depth(a) + depth(b) - depth(c))
}\if{html}{\out{</div>}}

The similarity value might be negative because there is no restriction that the path from root to \code{a} or \code{b} must pass \code{c}.

Paper link: \doi{10.1145/500737.500762}.
}


\subsection{Sim_Wang_edge_2012}{

It is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = depth(c)^2/depth_c(a)/depth_c(b)
}\if{html}{\out{</div>}}

where \code{depth_c(a)} is the longest distance from root to \code{a} passing through \code{c}.

Paper link: \doi{10.1186/1477-5956-10-s1-s18}.
}


\subsection{Sim_Zhong_2002}{

For a term \code{x}, it first calculates a "mile-stone" value as

\if{html}{\out{<div class="sourceCode">}}\preformatted{m(x) = 0.5/2^depth(x)
}\if{html}{\out{</div>}}

Then the distance between term \code{a} and \code{b} via LCA term \code{c} is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D(c, a) + D(c, b) = m(c) - m(a) + m(c) - m(b)
                  = 2*m(c) - m(a) - m(b)
                  = 1/2^depth(c) - 0.5/2^depth(a) - 0.5/2^depth(b)
}\if{html}{\out{</div>}}

We change the original \code{depth(a)} to let it go through LCA term \code{c} when calculating the depth:

\if{html}{\out{<div class="sourceCode">}}\preformatted{1/2^depth(c) - 0.5/2^depth(a) - 0.5/2^depth(b) 
    = 1/2^depth(c)- 0.5/2^(depth(c) + len(c, a)) - 0.5/2^(depth(c) + len(c, b))
    = 1/2^depth(c) * (1 - 1/2^(len(c, a) + 1) - 1/2^(len(c, b) + 1))
    = 2^-depth(c) * (1 - 2^-(len(c, a) + 1) - 2^-(len(c, b) + 1))
}\if{html}{\out{</div>}}

And the final similarity is \code{1 - distance}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1 - 2^-depth(c) * (1 - 2^-(len(c, a) + 1) - 2^-(len(c, b) + 1))
}\if{html}{\out{</div>}}

Paper link: \doi{10.1007/3-540-45483-7_8}.

There is a parameter \code{depth_via_LCA} that can be set to \code{TRUE} or \code{FALSE}. If it is set to \code{TRUE}, \code{depth(a)} is re-defined
so that the path from root to \code{a} must pass through the LCA term \code{c}. If it is \code{FALSE}, the original similarity definition in the paper is used; note that the
similarity might then be negative.

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Zhong_2002",
    control = list(depth_via_LCA = FALSE))
}\if{html}{\out{</div>}}
}


\subsection{Sim_AlMubaid_2006}{

It also takes account of the distance between term \code{a} and \code{b}, and the depth of the LCA term \code{c} in the DAG.
The distance is calculated as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D(a, b) = log(1 + d(a, b)*(max_depth - depth(c)))
}\if{html}{\out{</div>}}

Here \code{d(a, b)} can be the shortest distance between \code{a} and \code{b} or the longest distance via LCA \code{c}.

Then the distance is transformed into the similarity value scaled by the possible maximal and minimal values of \code{D(a, b)} from the DAG:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D_max = log(1 + 2*max_depth * max_depth)
}\if{html}{\out{</div>}}

And the minimal value of \code{D(a, b)} is zero when \code{a} is identical to \code{b}. Then the similarity value is scaled as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1 - D(a, b)/D_max
}\if{html}{\out{</div>}}

Paper link: \doi{10.1109/IEMBS.2006.259235}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_AlMubaid_2006",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Li_2003}{

It is similar to the \emph{Sim_AlMubaid_2006} method, but uses a non-linear form:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = exp(-0.2*d(a, b)) * atan(0.6*depth(c))
}\if{html}{\out{</div>}}

where \code{d(a, b)} can be the shortest distance or the longest distance via LCA.

Paper link: \doi{10.1109/TKDE.2003.1209005}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Li_2003",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_RSS_2013}{

The similarity is adjusted by the positions of term \code{a}, \code{b} and the LCA term \code{c} in the DAG. The similarity is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = max_depth/(max_depth + d(a, b)) * alpha/(alpha + beta)
}\if{html}{\out{</div>}}

where \code{d(a, b)} is the distance between \code{a} and \code{b} which can be the shortest distance or the longest distance via LCA.

In the tuning factor, \code{alpha} is the distance of LCA to root, which is \code{depth(c)}. \code{beta} is the distance to leaves, which
is the minimal distance (or the minimal height) of term \code{a} and \code{b}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{alpha/(alpha + beta) = depth(c)/(depth(c) + min(height(a), height(b)))
}\if{html}{\out{</div>}}

Paper link: \doi{10.1371/journal.pone.0066745}.

There is a parameter \code{distance} which takes value of "longest_distances_via_LCA" (the default) or "shortest_distances_via_NCA":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_RSS_2013",
    control = list(distance = "shortest_distances_via_NCA"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_HRSS_2013}{

It is similar to the \emph{Sim_RSS_2013} method, but it uses information content instead of the distance to adjust the similarity.

It first defines the semantic distance between term \code{a} and \code{b} as the sum of the distance to their MICA term \code{c}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D(a, b) = D(c, a) + D(c, b)
}\if{html}{\out{</div>}}

And the distance between an ancestor to a term is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D(c, a) = IC(a) - IC(c)  # if c is an ancestor of a
D(a, b) = D(c, a) + D(c, b) = IC(a) + IC(b) - 2*IC(c) # if c is the MICA of a and b
}\if{html}{\out{</div>}}

Similarly, the similarity is also corrected by the position of MICA term and \code{a} and \code{b} in the DAG:

\if{html}{\out{<div class="sourceCode">}}\preformatted{1/(1 + D(a, b)) * alpha/(alpha + beta)
}\if{html}{\out{</div>}}

Now \code{alpha} is the IC of the MICA term:

\if{html}{\out{<div class="sourceCode">}}\preformatted{alpha = IC(c)
}\if{html}{\out{</div>}}

And \code{beta} is the average of the maximal semantic distance of \code{a} and \code{b} to leaves.

\if{html}{\out{<div class="sourceCode">}}\preformatted{beta = 0.5*(IC(l_a) - IC(a) + IC(l_b) - IC(b))
}\if{html}{\out{</div>}}

where \code{l_a} is the leaf that \code{a} can reach with the highest IC (i.e. most informative leaf), and so is \code{l_b}.

Paper link: \doi{10.1371/journal.pone.0066745}.
}


\subsection{Sim_Shen_2010}{

It is based on the information content of terms on the path connecting term \code{a} and \code{b} via their MICA term \code{c}.

Denote a list of terms \verb{a, ..., c, ..., b} which are composed by the shortest path from \code{a} to \code{c} and from \code{b} to \code{c}, the difference
between \code{a} and \code{b} is the sum of \code{1/IC} of the terms on the path:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sum_\{x in the path\}(1/IC(x))
}\if{html}{\out{</div>}}

Then the distance is scaled into \verb{[0, 1]} by an arctangent transformation:

\if{html}{\out{<div class="sourceCode">}}\preformatted{atan(sum_\{x in the path\}(1/IC(x)))/(pi/2)
}\if{html}{\out{</div>}}

And finally the similarity is:

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1 - atan(sum_\{x in the path\}(1/IC(x)))/(pi/2)
}\if{html}{\out{</div>}}

Paper link: \doi{10.1109/BIBM.2010.5706623}.
}


\subsection{Sim_SSDD_2013}{

It is similar to the \emph{Sim_Shen_2010} which also sums content along the path passing through LCA term.
Instead of summing the information content, the \emph{Sim_SSDD_2013} sums up a so-called "T-value":

\if{html}{\out{<div class="sourceCode">}}\preformatted{sim = 1 - atan(sum_\{x in the path\}(T(x)))/(pi/2)
}\if{html}{\out{</div>}}

Each term has a T-value and it measures the semantic content a term averagely inherited from its parents
and distributed to its offsprings. The T-value of root is 1. Assume a term \code{t} has two parents \code{p1} and \code{p2},
the T-value for term \code{t} is averaged from its parents:

\if{html}{\out{<div class="sourceCode">}}\preformatted{(w1*T(p1) + w2*T(p2))/2
}\if{html}{\out{</div>}}

Since the parent may have other child terms, a factor \code{w1} or \code{w2} is multiplied to \code{T(p1)} and \code{T(p2)}. Taking
\code{p1} as an example, it has \code{n_p} offsprings (including itself) and \code{t} has \code{n_t} offsprings (including itself),
this means \code{n_t/n_p} of information is transmitted from \code{p1} to downstream via \code{t}, thus \code{w1} is defined as \code{n_t/n_p}.

Paper link: \doi{10.1016/j.ygeno.2013.04.010}.
}


\subsection{Sim_Jiang_1997}{

First semantic distance between term \code{a} and \code{b} via MICA term \code{c} is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{D(a, b) = IC(a) + IC(b) - 2*IC(c)
}\if{html}{\out{</div>}}

Then there are several normalization methods to change the distance to similarity and to scale it into the range of \verb{[0, 1]}.
\itemize{
\item max: \code{1 - D(a, b)/2/IC_max}
\item Couto: \code{1 - min(1, D(a, b)/IC_max)}
\item Lin: \code{1 - D(a, b)/(IC(a) + IC(b))} which is the same as the \emph{Sim_Lin_1998} method
\item Garla: \code{1 - log(D(a, b) + 1)/log(2*IC_max + 1)}
\item log-Lin: \code{1 - log(D(a, b) + 1)/log(IC(a) + IC(b) + 1)}
\item Rada: \code{1/(1 + D(a, b))}
}

Paper link: \url{https://aclanthology.org/O97-1002/}.

There is a parameter \code{norm_method} which takes value in "max", "Couto", "Lin", "Garla", "log-Lin", "Rada":

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Jiang_1997",
    control = list(norm_method = "Lin"))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Kappa}{

Denote two sets \code{A} and \code{B} as the items annotated to term \code{a} and \code{b}. The similarity value is \href{https://en.wikipedia.org/wiki/Cohen\%27s_kappa}{the kappa coefficient}
of the two sets.

The universe or the background can be set via parameter \code{anno_universe}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Kappa",
    control = list(anno_universe = ...))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Jaccard}{

Denote two sets \code{A} and \code{B} as the items annotated to term \code{a} and \code{b}. The similarity value is the Jaccard coefficient
of the two sets, defined as \code{length(intersect(A, B))/length(union(A, B))}.

The universe or the background can be set via parameter \code{anno_universe}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Jaccard",
    control = list(anno_universe = ...))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Dice}{

Denote two sets \code{A} and \code{B} as the items annotated to term \code{a} and \code{b}. The similarity value is the Dice coefficient
of the two sets, defined as \code{2*length(intersect(A, B))/(length(A) + length(B))}.

The universe or the background can be set via parameter \code{anno_universe}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Dice",
    control = list(anno_universe = ...))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Overlap}{

Denote two sets \code{A} and \code{B} as the items annotated to term \code{a} and \code{b}. The similarity value is the overlap coefficient
of the two sets, defined as \code{length(intersect(A, B))/min(length(A), length(B))}.

The universe or the background can be set via parameter \code{anno_universe}:

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Overlap",
    control = list(anno_universe = ...))
}\if{html}{\out{</div>}}
}


\subsection{Sim_Ancestor}{

Denote \code{S_a} and \code{S_b} as the two sets of ancestor terms of term \code{a} and \code{b} (including \code{a} and \code{b}), the
semantic similarity is defined as:

\if{html}{\out{<div class="sourceCode">}}\preformatted{length(intersect(S_a, S_b))/length(union(S_a, S_b))
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{term_sim(dag, terms, method = "Sim_Ancestor")
}\if{html}{\out{</div>}}
}
}

\examples{
parents  = c("a", "a", "b", "b", "c", "d")
children = c("b", "c", "c", "d", "e", "f")
annotation = list(
    "a" = 1:3,
    "b" = 3:4,
    "c" = 5,
    "d" = 7,
    "e" = 4:7,
    "f" = 8
)
dag = create_ontology_DAG(parents, children, annotation = annotation)
term_sim(dag, dag_all_terms(dag), method = "Sim_Lin_1998")
}
