## ----install_github, eval=FALSE-----------------------------------------------
# # install.packages("devtools")
# #devtools::install_github("wenmm/EMTscore")
# #devtools::install_github("wenmm/EMTscoreData")

## ----install------------------------------------------------------------------
#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")

#BiocManager::install("EMTscore")
#BiocManager::install("EMTscoreData")

## -----------------------------------------------------------------------------
library(EMTscoreData)
#library(EMTscore)
library(ExperimentHub)
library(SummarizedExperiment)
library(Seurat)
library(ggplot2)

## -----------------------------------------------------------------------------
eh <- ExperimentHub()
query(eh , 'EMTscoreData')

## -----------------------------------------------------------------------------
A549_TNF <- eh[['EH10291']]
A549_EGF <- eh[['EH10292']]
A549_TGFB1 <- eh[['EH10293']]

## -----------------------------------------------------------------------------
class(A549_TNF)
A549_TNF

## -----------------------------------------------------------------------------
#List available assays (common ones: "counts", "logcounts")
assayNames(A549_TNF)

## -----------------------------------------------------------------------------
#List available per-cell metadata fields
colnames(colData(A549_TNF))

## -----------------------------------------------------------------------------
set.seed(1)

subset_cells <- function(sce, n = 500) {
n <- min(n, ncol(sce))
sce[, sample(seq_len(ncol(sce)), n)]
}

A549_TNF_small <- subset_cells(A549_TNF, n = 1500)
A549_EGF_small <- subset_cells(A549_EGF, n = 1500)
A549_TGFB1_small <- subset_cells(A549_TGFB1, n = 1500)

objects <- list(
A549_TGFB1 = A549_TGFB1_small,
A549_EGF = A549_EGF_small,
A549_TNF = A549_TNF_small
)

#Confirm the reduced size

sapply(objects, ncol)

## -----------------------------------------------------------------------------
library(BiocFileCache)
url <- "https://zenodo.org/records/18168504/files/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.v2025.1.Hs.gmt"

bfc <- BiocFileCache() # creates/uses a cache folder
gmt_file <- bfcrpath(bfc, url) # downloads once; returns the cached file path

## -----------------------------------------------------------------------------
read_gmt <- function(fname){
  #Read all lines from the GMT file
  gmt_lines <- readLines(fname)
  #Split each line by tab; each line becomes a character vector
  gmt_list <- lapply(gmt_lines, function(x) unlist(strsplit(x, split="\t")))
  #For each line, genes start at element 3 (after name and description)
  gmt_genes <- lapply(gmt_list, function(x) x[3:length(x)])
  #Get unique gene list
  genes <- unique(unlist(gmt_genes))
  #Return a data frame whose column name is `gene` and whose values are gene names.
  return(data.frame(gene = genes, stringsAsFactors = FALSE))
}

Genesets <- read_gmt(gmt_file)
Genesets

## -----------------------------------------------------------------------------
## ------------------------------------------------------------------
## Prepare input objects
## ------------------------------------------------------------------
## We first organize the datasets into a named list. Each element
## corresponds to one experimental condition and contains a Seurat
## object (or a SingleCellExperiment object that will be converted
## internally). This unified structure allows us to apply the same
## EMT scoring procedure to multiple datasets in a consistent way.

objects <- list(
A549_TGFB1 = A549_TGFB1_small,
A549_EGF   = A549_EGF_small,
A549_TNF   = A549_TNF_small
)

## EMT gene list extracted from the GMT file. These genes define
## the epithelial–mesenchymal transition (EMT) signature used
## for score calculation.
emt_genes <- Genesets$gene


## ------------------------------------------------------------------
## Function: add_EMT_score
## ------------------------------------------------------------------
## This function computes an EMT score for each cell in a collection
## of Seurat (or SingleCellExperiment) objects using a predefined
## EMT gene set.
##
## Input:
##   - objects: a named list of Seurat objects or SingleCellExperiment
##              objects. Each object represents one dataset or condition.
##   - gmt_file: path to a GMT file containing EMT-related genes.
##   - emt_name: name of the metadata column that will store the EMT score.
##
## Output:
##   - A list of Seurat objects, each containing a new metadata column
##     with one EMT score per cell.
##
## Note:
##   For demonstration purposes in this vignette, the EMT score is
##   calculated using Seurat's AddModuleScore function. More advanced
##   or alternative EMT scoring methods are implemented in the
##   companion EMTscore package.

add_EMT_score <- function(objects,
                          gmt_file,
                          emt_name = "EMT_Score") {
  
  ## Read the EMT gene set from the GMT file
  Genesets <- read_gmt(gmt_file)
  emt_genes <- Genesets$gene
  
  ## Apply EMT scoring to each object in the list
  obj_list <- lapply(names(objects), function(name) {
    
    obj <- objects[[name]]
    
    # Convert SCE → Seurat
    if (inherits(obj, "SingleCellExperiment")) {
      message("Converting SCE to Seurat: ", name)
      obj <- as.Seurat(obj, data = "logcounts")
    }
    if (!inherits(obj, "Seurat")) {
      stop("Object ", name, " is not a Seurat or SCE object.")
    }
    
    ## Update the Seurat object to the current structure if needed.
    obj <- UpdateSeuratObject(obj)
    
    # get gene expression matrix
    geneExp <- GetAssayData(obj, assay = "RNA", layer = "data")
    
    ## Compute the EMT score using the EMT gene set.
    ## AddModuleScore returns a column named '<emt_name>1', which we
    ## subsequently rename for clarity.
    obj <- AddModuleScore(obj, features = list(emt_genes), name = emt_name, ctrl = 5)
    old <- paste0(emt_name, "1")
    obj <- SeuratObject::AddMetaData(
      object   = obj,
      metadata = obj[[old]][, 1, drop = TRUE],  # 提取为向量
      col.name = emt_name
    )
    obj[[old]] <- NULL
    return(obj)
  })
  
  
  ## Preserve original dataset names
  names(obj_list) <- names(objects)
  return(obj_list)
}


## ------------------------------------------------------------------
## Function: plot_EMT_from_objects
## ------------------------------------------------------------------
## This function visualizes EMT scores as a function of pseudotime
## across multiple datasets.
##
## Input:
##   - obj_list: a list of Seurat objects produced by add_EMT_score().
##   - col_name: name of the metadata column representing pseudotime.
##   - emt_score_col: name of the metadata column storing EMT scores.
##
## Output:
##   - A ggplot object showing smoothed EMT score trends along pseudotime
##     for each experimental condition.

plot_EMT_from_objects <- function(obj_list, col_name,
                                  emt_score_col) {
  
  ## Merge metadata from all Seurat objects into a single data frame
  ## for visualization.
  plot_df <- do.call(rbind, lapply(names(obj_list), function(name) {
    obj <- obj_list[[name]]
    df <- obj[[]][, c(col_name, emt_score_col), drop = FALSE]
    colnames(df) <- c(col_name, emt_score_col)
    ## Track the experimental condition
    df$Condition <- name
    ## Order cells by pseudotime for smoother visualization
    df <- df[order(df[[col_name]]), ]  
    df
  }))
  
  # Draw smooth curves
  p <- ggplot(plot_df, aes_string(x = col_name, y = emt_score_col, 
                                  color = "Condition")) +
    geom_smooth(method = "loess", se = FALSE, linewidth = 1.2) +
    theme_classic(base_size = 14) +
    labs(x = col_name, y = emt_score_col, color = "Condition")
  
  return(p)
}

## ------------------------------------------------------------------
## Run EMT scoring and visualization
## ------------------------------------------------------------------
## Compute EMT scores for all datasets and visualize their dynamics
## along pseudotime.

seurat_objs <- add_EMT_score(objects, 
                             gmt_file = gmt_file,
                             emt_name = "EMT_score")
p_Seurat <- plot_EMT_from_objects(seurat_objs, 
                                  col_name = "Pseudotime", 
                                  emt_score_col = "EMT_score")
p_Seurat

## -----------------------------------------------------------------------------
sessionInfo()

