% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/remp.R
\name{remp}
\alias{remp}
\title{Repetitive element methylation prediction}
\usage{
remp(
  methyDat = NULL,
  REtype = c("Alu", "L1", "ERV"),
  Seq.GR = NULL,
  parcel = NULL,
  work.dir = tempdir(),
  win = 1000,
  method = c("rf", "xgbTree", "svmLinear", "svmRadial", "naive"),
  autoTune = TRUE,
  param = NULL,
  seed = NULL,
  ncore = NULL,
  BPPARAM = NULL,
  verbose = FALSE
)
}
\arguments{
\item{methyDat}{A \code{\link{RatioSet}}, \code{\link{GenomicRatioSet}}, \code{\link{DataFrame}},
\code{data.table}, \code{data.frame}, or \code{matrix} of Illumina BeadChip methylation data
(450k or EPIC array) or Illumina methylation percentage estimates by sequencing. See Details. 
Alternatively, user can also specify a pre-built data template (see \code{\link{rempTemplate}})
for \code{remp} to carry out the prediction. With template specified, \code{methyDat},
\code{REtype}, \code{parcel}, and \code{work.dir} can be skipped.}

\item{REtype}{Type of RE. Currently \code{"Alu"}, \code{"L1"}, and \code{"ERV"} are supported. If \code{NULL}, 
the type of RE will be extracted from \code{parcel}.}

\item{Seq.GR}{A \code{\link{GRanges}} object containing genomic locations of the CpGs profiled by sequencing
platforms. This parameter should not be \code{NULL} if the input methylation data \code{methyDat} are
obtained by sequencing. Note that the genomic location can be in either hg19 or hg38 build. 
See details in \code{\link{initREMP}}.}

\item{parcel}{An \code{\link{REMParcel}} object containing necessary data to carry out the
prediction. If \code{NULL}, \code{REtype} must specify a type of RE so that the function can search the
\code{.rds} data file in \code{work.dir} exported by \code{\link{initREMP}} (with \code{export = TRUE})
or \code{\link{saveParcel}}.}

\item{work.dir}{Path to the directory where the annotation data generated by \code{\link{initREMP}}
are saved. Valid when the argument \code{parcel} is missing. If not specified, temporary directory
\code{tempdir()} will be used. If specified, the directory path has to be the same as the
one specified in \code{\link{initREMP}} or in \code{\link{saveParcel}}.}

\item{win}{An integer specifying window size to confine the upstream and downstream flanking
region centered on the predicted CpG in RE for prediction. Default = \code{1000}. See Details.}

\item{method}{Name of model/approach for prediction. Currently \code{"rf"} (Random Forest),
\code{"xgbTree"} (Extreme Gradient Boosting), \code{"svmLinear"} (SVM with linear kernel), \code{"svmRadial"}
(SVM with radial kernel), and \code{"naive"} (carrying over methylation values of the closest
CpG site) are available. Default = \code{"rf"} (Random Forest). See Details.}

\item{autoTune}{Logical parameter. If \code{TRUE}, a 3-time repeated 5-fold cross validation
will be performed to determine the best model parameter. If \code{FALSE}, the \code{param} option
(see below) must be specified. Default = \code{TRUE}. Auto-tune will be disabled when using Random Forest.
See Details.}

\item{param}{A list specifying fixed model tuning parameter(s) (not applicable for Random Forest, see Details).
For Extreme Gradient Boosting, \code{param} list must contain '\code{$nrounds}', '\code{$max_depth}',
'\code{$eta}', '\code{$gamma}', '\code{$colsample_bytree}', '\code{$min_child_weight}', and '\code{$subsample}'.
See \code{xgbTree} in package \code{caret}. For SVM, \code{param} list must contain '\code{$C}' (cost) for linear kernel
or '\code{$sigma}' and '\code{$C}' for radial basis function kernel. This parameter is valid only
when \code{autoTune = FALSE}.}

\item{seed}{Random seed for Random Forest model for reproducible prediction results.
Default is \code{NULL}, which generates a seed.}

\item{ncore}{Number of cores used for parallel computing. By default, max number of cores available
in the machine will be utilized. If \code{ncore = 1}, no parallel computing is allowed.}

\item{BPPARAM}{An optional \code{\link{BiocParallelParam}} instance determining the parallel back-end to
be used during evaluation. If not specified, default back-end in the machine will be used.}

\item{verbose}{Logical parameter. Should the function be verbose?}
}
\value{
A \code{\link{REMProduct}} object containing predicted RE methylation results.
}
\description{
\code{remp} is used to predict genomewide methylation levels of locus-specific repetitive elements (RE).
Three major RE types in human, Alu element (Alu), LINE-1 (L1), and endogenous retrovirus (ERV), are available.
}
\details{
Before running \code{remp}, user should make sure the methylation data have gone through
proper quality control, background correction, and normalization procedures. Both beta value
and M value are allowed. Rows represent probes and columns represent samples. For array data,
please make sure to have row names that specify the Illumina probe ID (e.g. cg00000029). For sequencing
data, please provide the genomic location of CpGs in a \code{\link{GRanges}} object and
specify it using \code{Seq.GR} parameter. \code{win = 1000} is based on previous findings showing that
neighboring CpGs are more likely to be co-modified within 1000 bp. User can specify narrower window size
for slight improvement of prediction accuracy at the cost of fewer predicted RE. Window size greater than 1000 is not
recommended as the machine learning models would not be able to learn much useful information
for prediction but would instead introduce noise. Random Forest model (\code{method = "rf"}) is recommended
as it offers more accurate prediction and it also enables prediction reliability functionality.
Prediction reliability is estimated by conditional standard deviation using Quantile Regression Forest.
Please note that if parallel computing is allowed, parallel Random Forest
(powered by package \code{\link{ranger}}) will be used automatically. The performance of
Random Forest model is often relatively insensitive to the choice of \code{mtry}.
Therefore, auto-tune will be turned off when using Random Forest and \code{mtry} will be set to one third
of the total number of predictors. For SVM, if \code{autoTune = TRUE}, the preset tuning parameter
search grid can be accessed and modified using \code{\link{remp_options}}.
}
\examples{
# Obtain example Illumina data (450k)
if (!exists("GM12878_450k")) 
  GM12878_450k <- getGM12878("450k")

# Make sure you have run 'initREMP' first. See ?initREMP.
if (!exists("remparcel")) {
  data(Alu.hg19.demo)
  remparcel <- initREMP(arrayType = "450k",
                        REtype = "Alu",
                        annotation.source = "AH",
                        genome = "hg19",
                        RE = Alu.hg19.demo,
                        ncore = 1,
                        verbose = TRUE)
}

# With data template pre-built. See ?rempTemplate.
if (!exists("template")) 
  template <- rempTemplate(GM12878_450k, 
                           parcel = remparcel, 
                           win = 1000, 
                           verbose = TRUE)

# Run remp with pre-built template:
remp.res <- remp(template, ncore = 1)

# Or run remp without pre-built template (identical results):
\dontrun{
  remp.res <- remp(GM12878_450k, 
                   REtype = "Alu", 
                   parcel = remparcel, 
                   ncore = 1,
                   verbose = TRUE)
}

remp.res
details(remp.res)
rempB(remp.res) # Methylation data (beta value)

# Extract CpG location information.
# (This accessor is inherited from class 'RangedSummarizedExperiment'.)
rowRanges(remp.res)

# RE annotation information
rempAnnot(remp.res)

# Add gene annotation
remp.res <- decodeAnnot(remp.res, type = "symbol")
rempAnnot(remp.res)

# (Recommended) Trim off less reliable prediction
remp.res <- rempTrim(remp.res)

# Obtain RE-level methylation (aggregate by mean)
remp.res <- rempAggregate(remp.res)
rempB(remp.res) # Methylation data (beta value)

# Extract RE location information
rowRanges(remp.res)

# Density plot across predicted RE
remplot(remp.res)

}
\seealso{
See \code{\link{initREMP}} to prepare necessary annotation database before running \code{remp}.
}
