% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PAC_gtf.R
\name{PAC_gtf}
\alias{PAC_gtf}
\title{Annotate against a GTF file}
\usage{
PAC_gtf(
  PAC,
  genome = NULL,
  mismatches = 3,
  return = "simplify",
  stranded = FALSE,
  gtf = NULL,
  targets = NULL,
  threads = 1
)
}
\arguments{
\item{PAC}{PAC-list object. The Anno object needs to contain genome
coordinates in the following columns: "chr","start","end", and "strand".
These coordinates are eaisly obtained by using \code{PAC_genome}}

\item{genome}{Character indicating the path to a reference genome file in
fasta format. Note that a bowtie index, created by for example
Rbowtie::bowtie-build,  must be contained in the same folder and having the
same basename as the fasta.}

\item{mismatches}{Integer indicating the number of allowed mismatches for the
genome mapping.}

\item{return}{Character indicating what information to return. 

   If return="simplify" (default), a table with one unique
   sequence per row will be returned. Multiple hits between genomic
   coordinates and gtf coordinates will be merged and only unique annotations
   will be reported.
   
   If return="full", a list will be returned containing 1 table reporting all
   annotations for each genomic coordinate of each unique sequence.
   
   If return="all", both a simplified table and a full annotation list will
   be returned as a list.
   
   If return="merge", a simplified table will be merged with the Anno table
   of the provided PAC object, and the updated PAC object containing the new
   annotations will be returned.}

\item{stranded}{Logical whether mapping should be strand specific. If
stranded=TRUE, then hits between feature and PAC read sequence will not be
reported if the feature is located on opposite strand. If
stranded=FALSE (default), then both sense and anti-sense overlaps will be
reported. Note, stranded=FALSE is recommended since PAC_gtf will report on
which strand each feature/sequence is mapping. Thus, strand specific
analysis can be done in hindsight.}

\item{gtf}{Named list of characters, indicating file path(s) to other
gtf files with differing formats. Can also directly be provided as a tibble
dataframe in a named list.}

\item{targets}{Named list of character vectors indicating target columns
in the listed gtf files in \emph{gtf_other}. Important, the listed objects
must have the same length and names as in \emph{gtf}. The vector
indicates the column names as if imported by rtracklayer::readGFF.}

\item{threads}{Integer indicating the number of parallel processes.}
}
\value{
List, tibble dataframe or updated PAC object. See option
  \emph{return} for more information.
}
\description{
This function will annotate a PAC object using input from a GTF/GFF file.
}
\details{
Given a PAC object and a gtf formatted annotation file(s), this function will
attempt to annotate sequences mapped to a reference genome against genomic
coordinates in the gtf file(s). In case no genomic mapping coordinates are
available in the PAC object, the function provides a backdoor into the
\emph{PAC reannotation} workflow, where genome mapping is performed using
Bowtie.
}
\examples{

# Load PAC
load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                 package = "seqpac", mustWork = TRUE))

# Create a gtf file from PAC coordinates
anno <- anno(pac)
anno <- anno[!grepl("Warning", anno$mis0_chromosomes_genome),]
anno <- anno[!is.na(anno$mis0_chromosomes_genome),]
coord <- anno$mis0_chromosomes_genome
coord <- suppressWarnings(do.call("rbind", strsplit(coord, "\\\\|"))[,1])
coord <- suppressWarnings(do.call("rbind", strsplit(coord, "\\\\;start=|;")))
gr <- GenomicRanges::GRanges(seqnames=coord[,1], 
                             IRanges::IRanges(as.numeric(coord[,2]), 
                                              as.numeric(coord[,2])+
                                              anno$Size ), 
                             strand=coord[,3])

GenomicRanges::mcols(gr) <- data.frame(biotype=anno$Biotypes_mis3, 
                                       bio_zero=as.character(anno$mis0_bio))
spl <- sample(1:length(gr), round(length(gr)*0.3), replace=FALSE)
gr1 <- gr[spl]
gr2 <- gr[!1:length(gr) \%in\% spl]

# Prepare temp folder and save artifical gtf
out1 <- paste0(tempdir(), "/temp1.gtf")
out2 <- paste0(tempdir(), "/temp2.gtf")

rtracklayer::export(gr1, out1, format="gtf")
rtracklayer::export(gr2, out2, format="gtf")

# Make sure PAC contains full genome coordinates
# (In the add_reanno function you may choose how many coordinates to report)
# (If there are more, a 'Warning' will be added to the annotation)
# (Here we remove those to avoid problems)

new_anno <-  anno(pac) [, grepl("chromosomes_genome|Size", 
                               colnames(anno(pac)))]
test <- new_anno[, 3:ncol(new_anno)]
test <- apply(test, 2, function(x){substr(x, 1, 7)})
test <- apply(test, 1, function(x){paste(x, collapse = "")})
new_anno$temp <- ifelse(grepl("Warning",  test), "rm", "keep")
anno(pac) <- new_anno
pac_small <- PAC_filter(pac, subset_only=TRUE, nucleotide_range = c(20,22),
                  anno_target=list("temp", "keep"))

#  Run PAC_gtf
gtf <- list(gtf1=out1, gtf2=out2)
target <- list(gtf1=c("biotype","bio_zero"), gtf2=c("biotype","bio_zero"))

pac_merge <- PAC_gtf(pac_small, mismatches=0, return="merge", 
                    gtf=gtf, target=target, threads=2)


}
\seealso{
\url{https://github.com/OestLab/seqpac} for updates.

Other PAC analysis: 
\code{\link{PAC_covplot}()},
\code{\link{PAC_deseq}()},
\code{\link{PAC_filter}()},
\code{\link{PAC_filtsep}()},
\code{\link{PAC_jitter}()},
\code{\link{PAC_mapper}()},
\code{\link{PAC_nbias}()},
\code{\link{PAC_norm}()},
\code{\link{PAC_pca}()},
\code{\link{PAC_pie}()},
\code{\link{PAC_saturation}()},
\code{\link{PAC_sizedist}()},
\code{\link{PAC_stackbar}()},
\code{\link{PAC_summary}()},
\code{\link{PAC_trna}()},
\code{\link{as.PAC}()},
\code{\link{filtsep_bin}()},
\code{\link{map_rangetype}()},
\code{\link{tRNA_class}()}
}
\concept{PAC analysis}
