## ----setup, echo=FALSE, results="hide", warning=FALSE, eval=TRUE--------------
suppressPackageStartupMessages({library(fraq)})

## ----eval=FALSE---------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# BiocManager::install("fraq")

## ----walkthrough-setup, eval=TRUE, message=FALSE, warning=FALSE---------------
set.seed(314156)
example_dir <- file.path(tempdir(), "fraq_function_examples")
dir.create(example_dir, showWarnings = FALSE, recursive = TRUE)
R1 <- file.path(example_dir, "example_R1.fastq")
R2 <- file.path(example_dir, "example_R2.fastq")
generate_random_fastq(c(R1, R2), n_reads = 2000, read_length = 150)
example_reads <- c(R1, R2)

## ----fraq_summary, eval=TRUE--------------------------------------------------
library(fraq)
# summarize quality metrics
qc <- fraq_summary(c(R1, R2))

## ----fig-qc-quicklook, eval=TRUE, fig.cap="Quick QC overview from `fraq_summary()`.", fig.width=8, fig.asp=0.6, warning=FALSE, message=FALSE, echo=FALSE, fig.path="plots/"----

# quick-look QC plots
op <- par(no.readonly = FALSE)
par(mfrow = c(2, 2), mar = c(4, 4, 2.5, 1))

insert <- qc$insert_size
if (!is.null(insert) && nrow(insert) > 0) {
    barplot(
        insert$count,
        names.arg = insert$insert_size,
        col = "#38bdf8",
        border = NA,
        xlab = "Insert size",
        ylab = "Count",
        main = "Insert-size distribution"
    )
} else {
    len <- qc$length_distribution_R1
    barplot(
        len$count,
        names.arg = len$length,
        col = "#38bdf8",
        border = NA,
        xlab = "Read length",
        ylab = "Count",
        main = "Length distribution"
    )
}

qual <- qc$per_base_quality_R1
plot(qual$position, qual$mean_q, type = "l", lwd = 2, col = "#6366f1",
        xlab = "Cycle", ylab = "Mean Phred score", main = "Mean quality by cycle")
grid(col = "#e5e7eb", lty = "dotted")

base_content <- qc$per_base_content_R1
palette <- c(
    A = "#22c55e",
    C = "#0ea5e9",
    G = "#f97316",
    T = "#ef4444",
    N = "#94a3b8"
)
plot(NA, xlim = range(base_content$position), ylim = c(0, 1),
        xlab = "Cycle", ylab = "Fraction", main = "Base composition")
present_bases <- intersect(names(palette), unique(base_content$base))
for (b in present_bases) {
    dat <- base_content[base_content$base == b, , drop = FALSE]
    dat <- dat[order(dat$position), ]
    lines(dat$position, dat$fraction, col = palette[[b]], lwd = 2)
}
if (length(present_bases)) {
    legend("topright", legend = present_bases, col = palette[present_bases],
            lwd = 2, bty = "n", cex = 0.9)
}

base_totals <- aggregate(count ~ base, base_content, sum)
cols <- palette[base_totals$base]
cols[is.na(cols)] <- "#94a3b8"
barplot(
    base_totals$count,
    names.arg = base_totals$base,
    col = cols,
    border = NA,
    xlab = "Base",
    ylab = "Total count",
    main = "Base totals"
)

par(op)

## ----walkthrough-filtering, eval=TRUE, message=FALSE, warning=FALSE-----------
filter_dir <- file.path(example_dir, "filtering")
dir.create(filter_dir, showWarnings = FALSE)

downsampled_reads <- file.path(
    filter_dir,
    c("example_R1_ds.fastq.gz", "example_R2_ds.fastq.gz")
)
fraq_downsample(example_reads, downsampled_reads, amount = 0.50, nthreads = 2L)

quality_reads <- file.path(
    filter_dir,
    c("example_R1_q20.fastq.gz", "example_R2_q20.fastq.gz")
)
fraq_quality_filter(
    input = downsampled_reads,
    output = quality_reads,
    min_mean_quality = 22,
    max_low_q_bases = 6L,
    low_q_threshold = 18L,
    nthreads = 2L
)

filtered_stats <- fraq_summary(quality_reads, nthreads = 2L)$basic_stats_R1
filtered_stats

## ----walkthrough-splitting-setup, eval=TRUE, message=FALSE, warning=FALSE-----
split_dir <- file.path(example_dir, "splitting")
if (dir.exists(split_dir)) unlink(split_dir, recursive = TRUE)
dir.create(split_dir, showWarnings = FALSE)
barcode_set <- c("ACGTAA", "TTGGCC")
demux_patterns <- file.path(
    split_dir,
    c("R1_{barcode}.fastq.gz", "R2_{barcode}.fastq.gz")
)

## ----walkthrough-demux, eval=TRUE, message=FALSE, warning=FALSE---------------

fraq_demux(
    input = example_reads,
    output_format = demux_patterns,
    barcodes = barcode_set,
    max_distance = 1L,
    nthreads = 2L
)
basename(sort(
    list.files(split_dir, pattern = "R1_.*\\.fastq.gz$", full.names = TRUE)
))

## ----walkthrough-chunk, eval=TRUE, message=FALSE, warning=FALSE---------------
chunk_prefix <- file.path(split_dir, "chunk_demo")
fraq_chunk(
    input = example_reads[1],
    output_prefix = chunk_prefix,
    output_suffix = "gz",
    chunk_size = 200,
    nthreads = 2L
)
basename(sort(
    list.files(
        split_dir,
        pattern = "chunk_demo_chunk.+\\.fastq.gz$",
        full.names = TRUE
    )
))

## ----walkthrough-barcodes, eval=TRUE, message=FALSE, warning=FALSE------------
fraq_count_barcodes(
    input = example_reads,
    barcodes = barcode_set,
    max_distance = 0L,
    allow_revcomp = FALSE,
    nthreads = 2L
)

## ----walkthrough-modification, eval=TRUE, message=FALSE, warning=FALSE--------
mod_dir <- file.path(example_dir, "modification")
if (dir.exists(mod_dir)) unlink(mod_dir, recursive = TRUE)
dir.create(mod_dir, showWarnings = FALSE)

## ----walkthrough-modification-convert, eval=TRUE, message=FALSE, warning=FALSE----
converted_fastq <- file.path(
    mod_dir,
    c("example_R1.fastq.zst", "example_R2.fastq.zst")
)
fraq_convert(example_reads, converted_fastq, nthreads = 2L)

## ----walkthrough-modification-concat, eval=TRUE, message=FALSE, warning=FALSE----
concatenated <- file.path(mod_dir, "example_all.fastq.gz")
fraq_concat(converted_fastq, concatenated, nthreads = 2L)

## ----walkthrough-modification-merge, eval=TRUE, message=FALSE, warning=FALSE----
merged_reads <- file.path(mod_dir, "example_merged.fastq")
unmerged_reads <- file.path(
    mod_dir,
    c("example_unmerged_R1.fastq", "example_unmerged_R2.fastq")
)
fraq_merge_pairs(
    input = example_reads,
    output_merged = merged_reads,
    output_unmerged = unmerged_reads,
    min_overlap = 20L,
    max_mismatch_rate = 0.05,
    nthreads = 2L
)

## ----walkthrough-modification-trim, eval=TRUE, message=FALSE, warning=FALSE----
trimmed_reads <- file.path(
    mod_dir,
    c("example_trimmed_R1.fastq", "example_trimmed_R2.fastq")
)
fraq_trim_adapters(
    input = example_reads,
    output = trimmed_reads,
    adapters = "ACTAC",
    max_distance = 1L,
    filter_untrimmed = FALSE,
    nthreads = 2L
)

## ----rcpp_example, eval=FALSE-------------------------------------------------
# input  <- c("sample_R1.fastq.gz", "sample_R2.fastq.gz")
# output <- c("filtered_R1.fastq.zst", "filtered_R2.fastq.zst")
# fraq_gc_filter(input, output, gc_min = 0.30, gc_max = 0.70)

## ----fig-streaming-example, eval=FALSE----------------------------------------
# library(fraq)
# generate_random_fastq("R1.fastq")
# generate_random_fastq("R2.fastq")
# fraq_downsample(input=c("R1.fastq", "R2.fastq"),
#                 output=c("ds_R1.fastq.fifo", "ds_R2.fastq.fifo"),
#                 amount = 0.25, nthreads = 5L)

## ----fraq_options, eval=TRUE--------------------------------------------------
fraq_options("blocksize")          # current block size (default: 65535 reads)
fraq_options("blocksize", 16384L)  # shrink batches when running small tests
fraq_options("zstd_compress_level", 6L)
fraq_options("gzip_compress_level", 4L)

## ----fig-session-info, eval=TRUE----------------------------------------------
sessionInfo()

