## ----knitr options, echo = FALSE, include = FALSE-----------------------------
plotDev <- if (requireNamespace("ragg", quietly = TRUE)) {
  "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  "cairo_png"
} else {
  "png"
}

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)

## ----preamble, message=FALSE, warning=FALSE-----------------------------------
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
prevOptState <- options(parallelly.fork.enable = TRUE)

## ----setup-and-log------------------------------------------------------------
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)

outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_DEA.log"))

## ----download-data------------------------------------------------------------
GEO <- "GSM2861514"

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")


# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE

while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L

  if (!file.exists(datasetFileName)) {
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)

  if (!ok && attempts < maxAttempts) {
    Sys.sleep(1)
  }
}

assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)

rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))

## ----create-cotan-object------------------------------------------------------
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = rawDataset)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)

## ----add-origin-to-the-cotan-object-------------------------------------------
# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)

obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

rm(vignette.cells.origin)

## ----drop-low-quality-cells---------------------------------------------------
# use previously established results to determine
# which cells were dropped in the cleaning stage
data(vignette.split.clusters)

cellsToDrop <-
  getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))]

obj <- dropGenesCells(obj, cells = cellsToDrop)

# Log the remaining number of cells

logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

table(getCondition(obj, condName = "origin"))

rm(vignette.split.clusters)

## ----calculate-and-store-coex-matrix------------------------------------------
obj <-
  proceedToCoex(
    obj,
    calcCoex = FALSE,
    optimizeForSpeed = TRUE,
    cores = 3L,
    deviceStr = "cuda"
  )

## ----load-pre-calculated-clusterizations--------------------------------------
data("vignette.split.clusters", package = "COTAN")
data("vignette.merge.clusters", package = "COTAN")

vignette.split.clusters <-
  asClusterization(
    vignette.split.clusters, # a named vector/factor/data.frame
    allCells = getCells(obj) # used only to check names are coherent
  )
vignette.merge.clusters <-
  asClusterization(
    vignette.merge.clusters, # a named vector/factor/data.frame
    allCells = getCells(obj) # used only to check names are coherent
  )

# explicitly calculate the DEA of the clusterization to store it
vignette.split.coexDF <-
  DEAOnClusters(obj, clusters = vignette.split.clusters)
vignette.merge.coexDF <-
  DEAOnClusters(obj, clusters = vignette.merge.clusters)

obj <-
  addClusterization(
    obj,
    clName = "split",
    clusters = vignette.split.clusters,
    coexDF = vignette.split.coexDF,
    override = FALSE
  )

obj <-
  addClusterization(
    obj,
    clName = "merge",
    clusters = vignette.merge.clusters,
    coexDF = vignette.merge.coexDF,
    override = FALSE
  )

# these will be recovered from the COTAN obj as needed
rm(vignette.split.clusters, vignette.split.coexDF)
rm(vignette.merge.clusters, vignette.merge.coexDF)

## ----extract-clusterizations--------------------------------------------------
# COTAn always stores clusterizations as factors.
splitClusters <- getClusters(obj, clName = "split")

# Use the utility factorToVector() to properly decay any named factor
# to a named char array
# splitClusters <- factorToVector(splitClusters)

# In this case the following calls are no-op since the clusterization
# were created by COTAN and so they already made nice and reordered

splitClusters <- niceFactorLevels(splitClusters)

c(splitClusters, splitCoexDF, perm) %<-% 
  reorderClusterization(
    obj,
    clName = "split",
    reverse = FALSE, 
    coexDF = NULL,
    useDEA = TRUE, # T: Cosine dist. on DEA, F: Eucl. dist. on avg. zero/one
    distance = NULL,
    hclustMethod = "ward.D2"
  )

mergeClusters <- getClusters(obj, clName = "merge")

table(splitClusters, mergeClusters)

## ----clusterization-manipulation----------------------------------------------
# this has an inverse `fromClustersList()`
splitClustersAsList <- toClustersList(splitClusters)

assertthat::assert_that(length(splitClustersAsList) == nlevels(splitClusters))

splitClustersOrigin <-
  rlang::set_names(
    rlang::rep_along(x = NA_character_, along = splitClustersAsList),
    names(splitClustersAsList)
  )

origin <- getCondition(obj, "origin")

for (clName in names(splitClustersAsList)) {
  cluster <- splitClustersAsList[[clName]]

  # assign most common origin to the cluster
  splitClustersOrigin[[clName]] <- names(which.max(table(origin[cluster])))

  # print the average non-zero expression in the cluster
  clRawData <- getRawData(obj)[, cluster, drop = FALSE]
  clRawData <- clRawData[clRawData > 0.0]

  cat(
    paste("Cluster", clName, "of", splitClustersOrigin[[clName]],
          "\torigin - average non-zero expression:", mean(clRawData)), "\n")

  rm(clRawData)
}

# If one needs to reorder the cells by cluster,
# labels are ordered as in the clusterization
orderCellsByMergeCluster <- groupByClusters(mergeClusters)

plot(getNumExpressedGenes(obj)[orderCellsByMergeCluster],
     ylab = "Num expressed genes", xlab = NA_character_)

mergeClustersAsList <- toClustersList(mergeClusters)

mergeClustersOrigin <- 
  vapply(
    mergeClustersAsList,
    \(cluster, origin) {
      names(which.max(table(origin[cluster])))
    },
    FUN.VALUE = character(1L),
    origin
  )
names(mergeClustersOrigin) <- names(mergeClustersAsList)


# It is also possible to reorder a subset of cells using
orderCellsBySomeMergeClusters <- 
  groupByClustersList(
    getCells(obj),
    mergeClustersAsList[c(2, 4, 6)]
  )

plot(getNumExpressedGenes(obj)[orderCellsBySomeMergeClusters],
     ylab = "Library Size", xlab = NA_character_)

## ----clusters-in-a-tree-------------------------------------------------------
treePlot <-
  clustersTreePlot(
    obj,
    kCuts = 2L,
    clName = "split",
    useDEA = TRUE  # T: Cosine dist. on DEA, F: Eucl. dist. on avg. zero/one
  )[["dend"]]

plot(treePlot)

# use origin to mark the clusters
dendextend::labels(treePlot) <- splitClustersOrigin[base::labels(treePlot)]

plot(treePlot)

## ----define-specific-genes-lists----------------------------------------------
# these are some genes associated to each cortical layer
layersGenes <- list(
  "L1"   = c("Reln",   "Lhx5"),
  "L2/3" = c("Cux1",   "Satb2"),
  "L4"   = c("Rorb",   "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Hes1",    "Vim")
)

neuralTypeGenes <- list(
  # Neural Progenitor Genes
  "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
  # Pan Neural Genes
  "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
  # House Keeping
  "hk"   = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
             "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
             "Tars", "Amacr")
)

## ----clusterization-data-visualization, fig.height=8, fig.width=20------------
c(splitHeatmap, splitScoreDF, splitPValueDF) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = layersGenes,
    clName = "split",
    kCuts = 2L,
    adjustmentMethod = "bonferroni",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(splitHeatmap)

c(mergeHeatmap, mergeScoreDF, mergePValueDF) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = neuralTypeGenes,
    clName = "merge",
    kCuts = 3L,
    adjustmentMethod = "bonferroni",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(mergeHeatmap)

## ----find-marge-markers-------------------------------------------------------
mergeClusterMarkers <-
  findClustersMarkers(
    obj,
    clName = "merge",
    n = 5L,
    markers = unlist(layersGenes),
    adjustmentMethod = "bonferroni"
  )

foundMarkers <- list()

# All relevant genes with strong `p-values`
geneIsEnriched <- mergeClusterMarkers[, "adjPVal"] < 1e-10

for (clName in levels(mergeClusters)) {
  geneIsInCluster <- mergeClusterMarkers[, "CL"] == clName
  foundMarkers[[clName]] <-
    mergeClusterMarkers[geneIsEnriched & geneIsInCluster, "Gene", drop = TRUE]
}

# number of genes per cluster
lengths(foundMarkers)

## ----cl-data-visualization_found-markers, fig.height=8, fig.width=20----------
c(mergeHeatmap, ..) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = foundMarkers,
    clName = "merge",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(mergeHeatmap)

## ----create-sub-object--------------------------------------------------------
# We will focus on the clusters `03` (likely part of layers 5/6) and
# `05` (likely part of layers 2/3) of the `split` clusterization

cellsToDrop <- getCells(obj)[!(splitClusters %in% c("03", "05"))]

obj2 <- dropGenesCells(obj, cells = cellsToDrop)

obj2 <- proceedToCoex(obj2, calcCoex = FALSE)

table(getClusters(obj2, clName = "split"))

## ----plot-sub-object-cluster-data, eval=FALSE, include=TRUE-------------------
# # this does not give more information than the same full-plot above
# 
# c(splitHeatmap2, ., .) %<-%
#   clustersMarkersHeatmapPlot(
#     obj2,
#     groupMarkers = layersGenes,
#     clName = "split",
#     kCuts = 2L,
#     adjustmentMethod = "bonferroni"
#   )
# 
# ComplexHeatmap::draw(splitHeatmap2)

## ----find-sub-object-markers--------------------------------------------------
deaMarkers <- findClustersMarkers(
  obj2,
  n = 10L,
  clName = "split",
  adjustmentMethod = "bonferroni",
  markers = layersGenes[c("L2/3", "L5/6")]
)

# over-expressed genes follow the under-expressed ones
deaMarkers[11:20, ]
deaMarkers[31:40, ]

## ----clean-up-----------------------------------------------------------------
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_uniform_clustering.log"))

options(prevOptState)

## ----closure------------------------------------------------------------------
Sys.time()

sessionInfo()