% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sort_clusters.R
\name{sort_clusters}
\alias{sort_clusters}
\title{Sort clusters by frequency}
\usage{
sort_clusters(clusters, map_subset = NULL)
}
\arguments{
\item{clusters}{A vector with cluster labels.}

\item{map_subset}{A logical vector with the same length as \code{clusters}
specifying which elements of \code{clusters} are used to determine the ranking
of the clusters. The default is \code{NULL}.}
}
\value{
A \code{factor()} version of \code{clusters} where the levels are ordered by
decreasing frequency, so the most frequent cluster is the first level.
}
\description{
This function takes a vector with cluster labels, recasts it as a \code{factor()},
and sorts the \code{factor()} levels by frequency such that the most frequent
cluster is the first level and so on.
}
\examples{

## Build an initial set of cluster labels
clus <- letters[unlist(lapply(4:1, function(x) rep(x, x)))]

## In this case, it's a character vector
class(clus)

## We see that we have 10 elements in this vector, which is
## an unnamed character vector
clus

## letter 'd' is the most frequent
table(clus)

## Sort them and obtain a factor. Notice that it's a named
## factor, and the names correspond to the original values
## in the character vector.
sort_clusters(clus)

## Since 'd' was the most frequent, it gets assigned to the first level
## in the factor variable.
table(sort_clusters(clus))

## If we skip the first 3 values of clus (which are all 'd'), we can
## change the most frequent cluster, and thus the ordering of the
## factor levels.
sort_clusters(clus, map_subset = seq_len(length(clus)) > 3)

## Let's try with a factor variable
clus_factor <- factor(clus)
## sort_clusters() returns an identical result in this case
stopifnot(identical(sort_clusters(clus), sort_clusters(clus_factor)))

## What happens if you have a logical variable with NAs?
set.seed(20240712)
log_var <- sample(c(TRUE, FALSE, NA),
    1000,
    replace = TRUE,
    prob = c(0.3, 0.15, 0.55)
)
## Here, the NAs are the most frequent group.
table(log_var, useNA = "ifany")

## The NAs are not used for sorting. Since we have more 'TRUE' than 'FALSE',
## 'TRUE' becomes the first level.
table(sort_clusters(log_var), useNA = "ifany")
}
