% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/streaming_framework.R
\name{streaming_demultiplex}
\alias{streaming_demultiplex}
\title{Demultiplexing with streaming}
\usage{
streaming_demultiplex(
  state_init,
  loader,
  archiver,
  barcodes,
  allowed_mismatches,
  segments,
  segment_lengths
)
}
\arguments{
\item{state_init}{The initial state to pass into \code{loader}.}

\item{loader}{Function loading the reads. It has the signature
\code{f(state)}, where \code{state} is a user-defined object
which is initialized to be \code{state_init} and for the subsequent
iterations taken as the \code{state} field of the output of \code{archiver}.
Its return value is a list with the following fields:
\itemize{
\item \code{state}: The state to be passed into \code{archiver}.
\item \code{sequences}: A
\code{\link[Biostrings:XStringSet-class]{XStringSet}} object, the sequences
to be demultiplexed in the current chunk.
\item \code{should_terminate}: A scalar logical. If \code{TRUE},
the demultiplexing process terminates and the final results are returned.
Notice that this termination happens before the sequences of the final
call to \code{loader} are demultiplexed.
}}

\item{archiver}{Function taking care of archiving the demultiplexed results.
Its arguments are:
\itemize{
\item \code{state}: The state of the process returned by \code{loader}.
\item \code{filtered_res}: The output from running
\code{\link[=combinatorial_demultiplex]{combinatorial_demultiplex()}} and
\code{\link[=filter_demultiplex_res]{filter_demultiplex_res()}} on the data, except that the
field \code{summary_res} is missing.
}
Its output is a state object fed into the next call to \code{loader}.}

\item{barcodes}{A list of
\code{\link[Biostrings:XStringSet-class]{XStringSet}}
objects in the same order they appear in the \code{sequences},
the barcodes to be used for demultiplexing.
All of the barcodes in each
\code{\link[Biostrings:XStringSet-class]{XStringSet}} must
have the same length as specified by the \code{segment_lengths} argument
and be named.
For computational reasons,
the maximum possible length of an individual barcode is 127.}

\item{allowed_mismatches}{Integer vector of length one or the same length
as the number of barcode segments; the threshold Hamming distance. All reads
having a number of mismatches above this number in any of the barcodes will
be filtered away.}

\item{segments}{Character vector showing the segments of the
sequences from 5' end to 3' end. The code applied is as follows:
\itemize{
\item \code{'A'}: Adapter (often referred to as linker),
is trimmed and ignored
\item \code{'B'}: Barcode, used for demultiplexing
\item \code{'P'}: Payload, sequence to be kept after trimming
and demultiplexing (e.g. cDNA or UMI).
}
If this vector is named, this will determine the names of the payload sets.
Names of the barcode sets will be determined by the names of the argument
\code{barcodes} (if any).}

\item{segment_lengths}{Integer vector with the same length
as \code{segments}, lengths of the segments provided in the same order as in
\code{segments}.
At most one of the non-barcode segments can have its length
set to \code{NA}, which means
it is considered a variable-length segment.}
}
\value{
A list with three elements:
\itemize{
\item \code{freq_table}: The frequency table for all reads,
akin to the output of
\code{\link[=create_freq_table]{create_freq_table()}}.
\item \code{summary_res}: The summary result of match filtering of all reads
per \code{\link[=create_summary_res]{create_summary_res()}}.
\item \code{state_final}: The final state object returned from \code{loader}.

}
}
\description{
This function provides an interface to
\code{\link[=combinatorial_demultiplex]{combinatorial_demultiplex()}} and
\code{\link[=filter_demultiplex_res]{filter_demultiplex_res()}} such that reads are streamed in chunks
instead of having to load everything at once, hence reducing memory consumption.
It accepts two functions which are called once per chunk:
A data loader function for producing the sequences of the chunk and an
archiver writing the results to file.
}
\details{
The data loader decides the size of each chunk.
While this framework does not provide any restriction on the \code{state}
object, the loader and archiver must be written such that the state objects
they return are compatible.
Since the data loader alone decides when to terminate,
bad termination criteria can cause a runaway loop.
Usually, it will be useful to have a progress tracker of how many reads
are demultiplexed. The framework itself does not implement this, so
it is typically implemented into the archiver or loader.

For technical reasons, it is not possible to do streaming when the number of
possible barcode combinations exceeds \eqn{2^{31}-1\approx 2.1\cdot 10^{9}}.
}
\examples{
library(purrr)
library(Biostrings)

# Example reads shipped with the package, and a temporary file which is
# handed to the callbacks as the output barcode table
input_fastq <- system.file(
    "extdata", "PETRI-seq_forward_reads.fq.gz", package = "posDemux")
output_barcode_table <- tempfile(pattern = "barcode_table", fileext = ".txt")

# Ready-made initial state, loader and archiver for the streaming framework
callbacks <- streaming_callbacks(
    input_file = input_fastq,
    output_table_file = output_barcode_table,
    chunk_size = 10000,
    verbose = TRUE
)

# One FASTA file per barcode set, labelled bc1, bc2 and bc3
barcode_files <- setNames(
    system.file(
        "extdata/PETRI-seq_barcodes",
        c(bc1 = "bc1.fa", bc2 = "bc2.fa", bc3 = "bc3.fa"),
        package = "posDemux"
    ),
    paste0("bc", 1L:3L)
)
barcode_index <- map(barcode_files, readDNAStringSet)

# The barcode sets must be listed in the order they occur in the sequences
barcodes <- barcode_index[c("bc3", "bc2", "bc1")]

# Layout of each read from the 5' end to the 3' end:
# a payload named UMI, then barcodes (B) alternating with adapters (A).
# The length of the last adapter is NA, i.e. it is not fixed.
sequence_annotation <- c(UMI = "P", "B", "A", "B", "A", "B", "A")
segment_lengths <- c(7L, 7L, 15L, 7L, 14L, 7L, NA_integer_)

# Demultiplex chunk by chunk, allowing one mismatch per barcode
streaming_summary_res <- streaming_demultiplex(
    state_init = callbacks$state_init,
    loader = callbacks$loader,
    archiver = callbacks$archiver,
    barcodes = barcodes,
    allowed_mismatches = 1L,
    segments = sequence_annotation,
    segment_lengths = segment_lengths
)
}
\seealso{
\code{\link[=filter_demultiplex_res]{filter_demultiplex_res()}}, \code{\link[=combinatorial_demultiplex]{combinatorial_demultiplex()}},
\code{\link[=create_freq_table]{create_freq_table()}}, and \code{\link[=create_summary_res]{create_summary_res()}}
for the underlying processing.
}
