% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{run_kmer_tsma}
\alias{run_kmer_tsma}
\title{\emph{k}-mer-based Transcript Set Motif Analysis}
\usage{
run_kmer_tsma(
  foreground_sets,
  background_set,
  motifs = NULL,
  k = 6,
  fg_permutations = 5000,
  kmer_significance_threshold = 0.01,
  produce_plot = TRUE,
  p_adjust_method = "BH",
  p_combining_method = "fisher",
  n_cores = 1
)
}
\arguments{
\item{foreground_sets}{list of foreground sets; a foreground set is a
character vector of
DNA or RNA sequences (not both) and a strict subset of the
\code{background_set}}

\item{background_set}{character vector of DNA or RNA sequences that
constitute the
background set}

\item{motifs}{a list of motifs that is used to score the specified sequences.
If \code{is.null(motifs)} then all Transite motifs are used.}

\item{k}{length of \emph{k}-mer, either \code{6} for hexamers or
\code{7} for heptamers}

\item{fg_permutations}{number of foreground permutations}

\item{kmer_significance_threshold}{p-value threshold for significance,
e.g., \code{0.05} or
\code{0.01} (used for volcano plots)}

\item{produce_plot}{if \code{TRUE} volcano plots and distribution plots
are created}

\item{p_adjust_method}{see \code{\link[stats]{p.adjust}}}

\item{p_combining_method}{one of the following: Fisher (1932)
(\code{"fisher"}), Stouffer (1949) and
Liptak (1958) (\code{"SL"}), Mudholkar and George (1979)
(\code{"MG"}), or Tippett (1931)
(\code{"tippett"}) (see \code{\link{p_combine}})}

\item{n_cores}{number of computing cores to use}
}
\value{
A list of lists (one for each transcript set) with the
following components:
\tabular{rl}{
  \code{enrichment_df} \tab the result of
  \code{\link{compute_kmer_enrichment}} \cr
  \code{motif_df} \tab \cr
  \code{motif_kmers_dfs} \tab \cr
  \code{volcano_plots} \tab volcano plots for each
  motif (see \code{\link{draw_volcano_plot}}) \cr
  \code{perm_test_plots} \tab plots of the empirical distribution of
  \emph{k}-mer enrichment values for each motif \cr
  \code{enriched_kmers_combined_p_values} \tab \cr
  \code{depleted_kmers_combined_p_values} \tab
}
}
\description{
Calculates the enrichment of putative binding sites in foreground sets
versus a background set
using \emph{k}-mers to identify putative binding sites
}
\details{
Transcript set motif analysis can be used to identify RNA-binding
proteins whose targets are
significantly overrepresented or underrepresented in certain sets
of transcripts.

The aim of Transcript Set Motif Analysis (TSMA) is to identify the
overrepresentation
and underrepresentation of potential RBP targets (binding sites)
in a set (or sets) of
sequences, i.e., the foreground set, relative to the entire population
of sequences.
The latter is called the background set, which can be composed of all
sequences of the genes
of a microarray platform or all sequences of an organism or any
other meaningful
superset of the foreground sets.

The \emph{k}-mer-based approach breaks the sequences of foreground
and background sets into
\emph{k}-mers and calculates the enrichment on a \emph{k}-mer level.
In this case, motifs are
not represented as position weight matrices, but as lists of \emph{k}-mers.

Statistically significantly enriched or depleted \emph{k}-mers
are then used to
calculate a score for each RNA-binding protein, which quantifies its
target overrepresentation.
}
\examples{
# define simple sequence sets for foreground and background
foreground_set1 <- c(
  "CAACAGCCUUAAUU", "CAGUCAAGACUCC", "CUUUGGGGAAU",
  "UCAUUUUAUUAAA", "AAUUGGUGUCUGGAUACUUCCCUGUACAU",
  "AUCAAAUUA", "AGAU", "GACACUUAAAGAUCCU",
  "UAGCAUUAACUUAAUG", "AUGGA", "GAAGAGUGCUCA",
  "AUAGAC", "AGUUC", "CCAGUAA"
)
foreground_set2 <- c("UUAUUUA", "AUCCUUUACA", "UUUUUUU", "UUUCAUCAUU")
foreground_sets <- list(foreground_set1, foreground_set2)
# the background set is a superset of all foreground sequences
background_set <- unique(c(foreground_set1, foreground_set2, c(
  "CCACACAC", "CUCAUUGGAG", "ACUUUGGGACA", "CAGGUCAGCA",
  "CCACACCGG", "GUCAUCAGU", "GUCAGUCC", "CAGGUCAGGGGCA"
)))

# run k-mer-based TSMA with all Transite motifs (recommended):
# results <- run_kmer_tsma(foreground_sets, background_set)

# run k-mer-based TSMA with one motif:
motif_db <- get_motif_by_id("M178_0.6")
results <- run_kmer_tsma(foreground_sets, background_set, motifs = motif_db)
\dontrun{
# define example sequence sets for foreground and background
# (sequences are converted from DNA to RNA by replacing T with U)
foreground_set1 <- gsub("T", "U", transite:::ge$foreground1_df$seq)
foreground_set2 <- gsub("T", "U", transite:::ge$foreground2_df$seq)
foreground_sets <- list(foreground_set1, foreground_set2)
background_set <- gsub("T", "U", transite:::ge$background_df$seq)

# run k-mer-based TSMA with all Transite motifs
results <- run_kmer_tsma(foreground_sets, background_set)

# run k-mer-based TSMA with a subset of Transite motifs
results <- run_kmer_tsma(foreground_sets, background_set,
  motifs = get_motif_by_rbp("ELAVL1"))

# run k-mer-based TSMA with a user-defined k-mer motif
toy_motif <- create_kmer_motif(
  "toy_motif", "example RBP",
  c("AACCGG", "AAAACG", "AACACG"), "example type", "example species", "user"
)
results <- run_kmer_tsma(foreground_sets, background_set,
  motifs = list(toy_motif))
}

}
\seealso{
Other TSMA functions: 
\code{\link{draw_volcano_plot}()},
\code{\link{run_matrix_tsma}()}

Other \emph{k}-mer functions: 
\code{\link{calculate_kmer_enrichment}()},
\code{\link{check_kmers}()},
\code{\link{compute_kmer_enrichment}()},
\code{\link{count_homopolymer_corrected_kmers}()},
\code{\link{create_kmer_origin_list}()},
\code{\link{draw_volcano_plot}()},
\code{\link{estimate_significance}()},
\code{\link{estimate_significance_core}()},
\code{\link{generate_kmers}()},
\code{\link{generate_permuted_enrichments}()},
\code{\link{run_kmer_spma}()}
}
\concept{TSMA functions}
\concept{\emph{k}-mer functions}
