## ----setup,include=FALSE------------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----pkg-overview, echo=FALSE,out.width="120%",fig.align='center',fig.cap="tidyexposomics pipeline overview. QC, association testing, integration, and enrichment steps on a MultiAssayExperiment."----
knitr::include_graphics("./overview.png")

## ----install, eval=FALSE------------------------------------------------------
# # install the package
# BiocManager::install("tidyexposomics")
# 
# # load the package
# library(tidyexposomics)

## ----command-str, echo=FALSE,fig.align='center',fig.cap= "Command naming conventions used throughout `tidyexposomics`. More complex pipelines begin with the `run_` prefix, visualizations with `plot_`, and data processing with `filter_`, `transform_`, `pivot_`, or `extract_` prefixes."----

knitr::include_graphics("./command_str.png")

## ----ontology-app,eval=FALSE--------------------------------------------------
# # Launch the shiny app to annotate exposure variables
# shiny::runApp(build_ont_annot_app())

## ----ontology-app-img, echo=FALSE,out.width="150%",fig.align='center',fig.cap="Screenshot of the ontology annotation app. The sidebar has an upload button to load your exposure metadata file. The main panel displays the uploaded exposure metadata where users can select variables to annotate and categorize. The sidebar also contains buttons to apply annotations/categorizations and download the annotated file."----
knitr::include_graphics("./ont_annot.png")

## ----mae-overview,echo=FALSE,fig.align='center',fig.cap="Overview of the MultiAssayExperiment structure linking samples, assays, and metadata."----
knitr::include_graphics("./mae_rep.png")

## ----load-libraries,message=FALSE,warning=FALSE-------------------------------
# Load Libraries
library(tidyverse)
library(tidyexposomics)

## ----load-data----------------------------------------------------------------
# Load example data
data("tidyexposomics_example")

# Create exposomic set object
expom <- create_exposomicset(
    codebook = tidyexposomics_example$annotated_cb,
    exposure = tidyexposomics_example$meta,
    omics = list(
        "Gene Expression" = tidyexposomics_example$exp_filt,
        "Methylation" = tidyexposomics_example$methyl_filt
    ),
    row_data = list(
        "Gene Expression" = tidyexposomics_example$exp_fdata,
        "Methylation" = tidyexposomics_example$methyl_fdata
    )
)

## ----exp-vars-----------------------------------------------------------------
# Grab exposure variables
exp_vars <- tidyexposomics_example$annotated_cb |>
    filter(category %in% c(
        "aerosol",
        "main group molecular entity",
        "polyatomic entity"
    )) |>
    pull(variable) |>
    as.character()

## ----missing-bar,fig.height=3,fig.width=6,fig.cap="Count of features with missing data above a 0% missingness threshold by data layer. Exposure data have variables with missingness."----

# Plot the missingness summary
plot_missing(
    exposomicset = expom,
    plot_type = "summary",
    threshold = 0
)

## ----missing-bar-lollipop,fig.height=2,fig.width=4,fig.cap= "Percent missingness per exposure variable. Parity, `h_parity_None`, shows the highest missingness."----

# Plot missing variables withing exposure group
plot_missing(
    exposomicset = expom,
    plot_type = "lollipop",
    threshold = 0,
    layers = "Exposure"
)

## ----impute-missing-----------------------------------------------------------

# Impute missing values
expom <- run_impute_missing(
    exposomicset = expom,
    exposure_impute_method = "missforest",
    exposure_cols = exp_vars
)

## ----filter-omics-------------------------------------------------------------

# filter omics layers by variance and expression
# Methylation filtering
expom <- filter_omics(
    exposomicset = expom,
    method = "variance",
    assays = "Methylation",
    assay_name = 1,
    min_var = 0.05
)

# Gene expression filtering
expom <- filter_omics(
    exposomicset = expom,
    method = "expression",
    assays = "Gene Expression",
    assay_name = 1,
    min_value = 1,
    min_prop = 0.3
)

## ----normality-check----------------------------------------------------------

# Check variable normality
expom <- run_normality_check(
    exposomicset = expom,
    action = "add"
)

## ----trasform-vars------------------------------------------------------------

# Transform variables
expom <- transform_exposure(
    exposomicset = expom,
    transform_method = "boxcox_best",
    exposure_cols = exp_vars
)

## ----norm-plot,fig.width= 4,fig.height= 4, fig.cap= "Normality status of numeric exposure variables after Box-Cox transformation."----

# Examine normality summary
plot_normality_summary(
    exposomicset = expom,
    transformed = TRUE
)

## ----pca----------------------------------------------------------------------

# Perform principal component analysis
expom <- run_pca(
    exposomicset = expom,
    log_trans_exp = TRUE,
    log_trans_omics = TRUE,
    action = "add"
)

## ----pca-plot, fig.align='center', fig.width= 9, fig.height= 7,fig.cap= "PCA of sample and feature space with sample outlier detection."----

# Plot the PCA plot of sample and feature space
plot_pca(exposomicset = expom)

## ----filt-outliers------------------------------------------------------------

# Filter out sample outliers
expom <- filter_sample_outliers(
    exposomicset = expom,
    outliers = c("s1231")
)

## ----corr-pcs-----------------------------------------------------------------

# Run the correlation analysis
expom <- run_correlation(
    exposomicset = expom,
    feature_type = "pcs",
    exposure_cols = exp_vars,
    n_pcs = 20,
    action = "add",
    correlation_cutoff = 0,
    pval_cutoff = 1
)

## ----plot-pc-corr,fig.width= 7,fig.height= 2.2,fig.cap= "Correlation heatmap of exposures versus principal components. Child lead levels (`hs_pb_c_Log2`) and maternal BPA levels (`hs_bpa_madj_Log2`) are associated with the most principal components."----


# Plot the correlation tile plot
plot_correlation_tile(
    exposomicset = expom,
    feature_type = "pcs",
    pval_cutoff = 0.05
)

## ----exp-sum, fig.width=6, fig.height=4---------------------------------------

# Summarize exposure data
run_summarize_exposures(
    exposomicset = expom,
    action = "get",
    exposure_cols = exp_vars
) |>
    head()

## ----plot-aerosol, fig.width=4, fig.height=3.5, fig.cap="Distribution of aerosol exposures by sex."----

# Plot aerosol exposure distributions by sex
plot_exposures(
    exposomicset = expom,
    group_by = "e3_sex_None",
    exposure_cat = "aerosol",
    plot_type = "boxplot",
    ylab = "Values",
    title = "Aerosol Exposure by Sex"
)

## ----sample-clusters----------------------------------------------------------

# Sample clustering
expom <- run_cluster_samples(
    exposomicset = expom,
    exposure_cols = exp_vars,
    clustering_approach = "dynamic",
    action = "add"
)

## ----plot-sample-clusters, fig.height=4, fig.width=12, fig.cap="Sample clustering heatmap using exposure profiles (z-scored). Clusters appear mostly driven by aerosol exposure during pregnancy."----

# Plot the sample clusters
plot_sample_clusters(
    exposomicset = expom,
    exposure_cols = exp_vars
)

## ----correlate-exposures------------------------------------------------------

# Run correlation analysis
expom <- run_correlation(
    exposomicset = expom,
    feature_type = "exposures",
    action = "add",
    exposure_cols = exp_vars,
    correlation_cutoff = 0.3
)

## ----exposure-circos-corr, fig.height=10,fig.width=10,fig.align='center',fig.cap="Circos view of exposure-exposure correlations (threshold 0.3)."----

# Plot exposure correlation circos plot
plot_circos_correlation(
    exposomicset = expom,
    feature_type = "exposures",
    corr_threshold = 0.3,
    exposure_cols = exp_vars
)

## ----assoc-exposures----------------------------------------------------------

# Perform ExWAS
expom <- run_association(
    exposomicset = expom,
    source = "exposures",
    outcome = "hs_asthma",
    feature_set = exp_vars,
    action = "add",
    family = "binomial"
)

## ----plot-exposure-assoc,fig.height=3, fig.width=5, fig.align='center',fig.cap="ExWAS associations of exposures with asthma status. No exposures are significantly associated (P < 0.05) with asthma status."----

# Plot the association forest plot
plot_association(
    exposomicset = expom,
    source = "exposures",
    terms = exp_vars,
    filter_thresh = 0.05,
    filter_col = "p.value",
    r2_col = "r2"
)

## ----associate-top-omics,fig.height=3,fig.width=4.5,fig.align='center'--------

# Perform ExWAS
expom <- run_association(
    exposomicset = expom,
    outcome = "hs_asthma",
    source = "omics",
    top_n = 500,
    action = "add",
    family = "binomial"
)

## ----plot-manhattan,fig.height=7, fig.width=5, fig.align='center',fig.cap="Manhattan plot of omics-wide associations with asthma status."----

# Plot the manhattan plot
plot_manhattan(
    exposomicset = expom,
    min_per_cat = 0,
    feature_col = "feature_clean",
    vars_to_label = c(
        "TC19001180.hg.1",
        "TC01000565.hg.1",
        "cg01937701",
        "hs_mibp_cadj_Log2"
    ),
    panel_sizes = c(1, 3, 1, 3, 1, 1, 1),
    facet_angle = 0
)

## ----calc-exposome, results='hide'--------------------------------------------

# determine which aerosol variables to use
aerosols <- c("h_pm25_ratio_preg_None", "h_pm10_ratio_preg_None")

# Create exposome scores
expom <- expom |>
    run_exposome_score(
        exposure_cols = aerosols,
        score_type = "median",
        score_column_name = "exposome_median_score"
    ) |>
    run_exposome_score(
        exposure_cols = aerosols,
        score_type = "pca",
        score_column_name = "exposome_pca_score"
    ) |>
    run_exposome_score(
        exposure_cols = aerosols,
        score_type = "irt",
        score_column_name = "exposome_irt_score"
    ) |>
    run_exposome_score(
        exposure_cols = aerosols,
        score_type = "quantile",
        score_column_name = "exposome_quantile_score"
    ) |>
    run_exposome_score(
        exposure_cols = aerosols,
        score_type = "var",
        score_column_name = "exposome_var_score"
    )

## ----associate-exposome-score-------------------------------------------------

# Associate exposome scores with outcome
expom <- run_association(
    exposomicset = expom,
    outcome = "hs_asthma",
    source = "exposures",
    feature_set = c(
        "exposome_median_score",
        "exposome_pca_score",
        "exposome_irt_score",
        "exposome_quantile_score",
        "exposome_var_score"
    ),
    action = "add",
    family = "binomial"
)

## ----plot-exposome-scores, fig.height=2.5, fig.width=5,fig.cap="Associations of aerosol exposome scores with asthma status. The variance-based score has the strongest association with asthma status."----

# Plot the association forest plot
plot_association(
    exposomicset = expom,
    source = "exposures",
    terms = c(
        "exposome_median_score",
        "exposome_pca_score",
        "exposome_irt_score",
        "exposome_quantile_score",
        "exposome_var_score"
    ),
    filter_col = "p.value",
    filter_thresh = 0.05,
    r2_col = "r2"
)

## ----diff-abundance,results='hide'--------------------------------------------

# Run differential abundance analysis
expom <- run_differential_abundance(
    exposomicset = expom,
    formula = ~hs_asthma,
    method = "limma_trend",
    scaling_method = "none",
    action = "add"
)

## ----volcano-plot,fig.height=3.5, fig.width=6, fig.align='center',fig.cap="Volcano plot of differentially abundant features across omics layers."----

# Plot the volcano plot
plot_volcano(
    exposomicset = expom,
    top_n_label = 3,
    feature_col = "feature_clean",
    logFC_thresh = log2(1),
    pval_thresh = 0.05,
    pval_col = "P.Value",
    logFC_col = "logFC",
    nrow = 1
)

## ----sensitivity-analysis,results='hide'--------------------------------------

# Perform Sensitivity Analysis
expom <- run_sensitivity_analysis(
    exposomicset = expom,
    base_formula = ~hs_asthma,
    methods = c("limma_trend"),
    scaling_methods = c("none"),
    pval_col = "P.Value",
    logfc_col = "logFC",
    logFC_threshold = log2(1),
    pval_threshold = 0.05,
    stability_metric = "stability_score",
    bootstrap_n = 10,
    action = "add"
)

## ----plot-stability-score,fig.height=2.5, fig.width=8, fig.cap="Sensitivity summary highlighting robust features by stability score."----

# Plot distriubtion of stability scores
plot_sensitivity_summary(
    exposomicset = expom,
    stability_score_thresh = 0.34,
    stability_metric = "stability_score"
)

## ----multiomics-int-----------------------------------------------------------

# Perform multi-omics integration
expom <- run_multiomics_integration(
    exposomicset = expom,
    method = "DIABLO",
    n_factors = 5,
    outcome = "hs_asthma",
    action = "add"
)

## ----factor-summary, fig.height=2, fig.width=5, fig.align='center',fig.cap="Contribution of each omics layer to DIABLO latent factors."----

# Plot factor summary
plot_factor_summary(
    exposomicset = expom,
    midpoint = 2.5
)

## ----associate-factors--------------------------------------------------------

# Identify factors that are associated with the outcome
expom <- run_association(
    exposomicset = expom,
    source = "factors",
    outcome = "hs_asthma",
    feature_set = exp_vars,
    action = "add",
    family = "binomial"
)

## ----plot-associate-factors,fig.width=5, fig.height=3,fig.cap="Associations of latent factors with asthma after covariate adjustment."----

# Plot the forest plot
plot_association(
    exposomicset = expom,
    source = "factors",
    filter_col = "p_adjust",
    filter_thresh = 0.05,
    r2_col = "r2"
)

## ----extract-top-features-----------------------------------------------------

# Extract top features that contribute to a factor
expom <- extract_top_factor_features(
    exposomicset = expom,
    method = "percentile",
    pval_col = "p_adjust",
    pval_thresh = 0.05,
    percentile = 0.7,
    action = "add"
)

## ----plot-top-features,fig.height=8,fig.width=6,fig.align='center',fig.cap="Top-loading features per factor and omics layer."----

# Plot the top factor features
plot_top_factor_features(
    exposomicset = expom,
    top_n = 5,
    feature_col = "feature_clean"
)

## ----omics-exp-corr-----------------------------------------------------------

# Grab top common factor features and ensure
# feature is renamed to variable for the variable_map
top_factor_features <- expom |>
    extract_results(result = "multiomics_integration") |>
    pluck("top_factor_features") |>
    dplyr::select(variable = feature, exp_name)

# Correlate top factor features with exposures
# Perform correlation analysis between
# factor features and exposures
expom <- run_correlation(
    exposomicset = expom,
    feature_type = "omics",
    variable_map = top_factor_features,
    exposure_cols = exp_vars,
    action = "add",
    correlation_cutoff = 0.3,
    pval_cutoff = 0.05,
    cor_pval_column = "p.value"
)

# Perform correlation analysis between factor features
expom <- run_correlation(
    exposomicset = expom,
    feature_type = "omics",
    variable_map = top_factor_features,
    exposure_cols = exp_vars,
    feature_cors = TRUE,
    action = "add",
    correlation_cutoff = 0.3,
    pval_cutoff = 0.05,
    cor_pval_column = "p.value"
)

## ----exp-omic-corr-summary,fig.height=7,fig.width=12,fig.align='center',fig.cap="Summary of exposure-omics and feature-feature associations."----

# Plot correlation summary
plot_correlation_summary(
    exposomicset = expom,
    mode = "summary",
    feature_type = "omics"
)

## ----exposure-shared-corr, fig.height=10,fig.width=10,fig.cap="Circos of exposures sharing correlated molecular features."----

# Plot the circos plot of shared correlated features
plot_circos_correlation(
    exposomicset = expom,
    feature_type = "omics",
    shared_cutoff = 1,
    midpoint = 1.5
)

## ----create-network, fig.width=10,fig.height=7--------------------------------

# Create omics within feature correlation network
expom <- run_create_network(
    exposomicset = expom,
    feature_type = "omics_feature_cor",
    action = "add"
)

# Create omics-exposure correlation network
expom <- run_create_network(
    exposomicset = expom,
    feature_type = "omics",
    action = "add"
)

## ----plot-network,fig.width=7, fig.height=7,fig.cap="Bipartite network of exposures and correlated omics features where node labels reflect most central nodes."----

# Plot the exposure-omics network
# Setting the seed so that the graph layout is consistent 
# given that layouts can have random starting points
set.seed(42)
plot_network(
    exposomicset = expom,
    network = "omics",
    top_n_nodes = 50,
    include_stats = TRUE,
    cor_thresh = 0.2,
    node_color_var = "group",
    label = TRUE,
    label_top_n = 5
)

## ----exposure-impact----------------------------------------------------------

# Run exposure-omics impact analysis
expom <- run_exposure_impact(
    exposomicset = expom,
    feature_type = "omics"
)

## ----plot-exposure-impact,fig.width=5.5,fig.height=5, fig.cap= "Network centrality metrics of omics features associated with each exposure."----

# Plot the exposure impact summary
plot_exposure_impact(
    exposomicset = expom,
    feature_type = "omics",
    min_per_group = 10
)

## ----enrichment---------------------------------------------------------------

# Run enrichment analysis on factor features correlated with exposures
expom <- run_enrichment(
    exposomicset = expom,
    feature_type = c("omics_cor"),
    feature_col = "feature_clean",
    db = c("GO"),
    species = "goa_human",
    fenr_col = "gene_symbol",
    padj_method = "none",
    pval_thresh = 0.1,
    min_set = 1,
    max_set = 800,
    clustering_approach = "diana",
    action = "add"
)

## ----enrichment-summary,fig.width=14,fig.height=5, fig.cap="Summary of enriched GO terms grouped by overlap and exposure category."----

# Plot the summary diagram
plot_enrichment(
    exposomicset = expom,
    feature_type = "omics_cor",
    plot_type = "summary"
)

## ----enrichment-dotplot,fig.height=11, fig.width=9, fig.cap="Dotplot of top enriched GO terms by omics layer and exposure category."----

# Plot a dotplot of terms
plot_enrichment(
    exposomicset = expom,
    feature_type = "omics_cor",
    plot_type = "dotplot",
    top_n = 15,
    add_top_genes = TRUE,
    top_n_genes = 5
)

## ----enrichment-term-network, fig.width=6, fig.height=5, fig.cap="Network of enriched GO terms connected by shared genes."----

# Plot the term network plot
# Setting a seed so that the plot layout is consistent
set.seed(42)
plot_enrichment(
    exposomicset = expom,
    feature_type = "omics_cor",
    plot_type = "network",
    label_top_n = 1
)

## ----enrichment-heatmap, fig.height=3, fig.width=8, fig.cap="Heatmap of genes driving enriched GO terms (Group 1) with log2 fold-change overlay."----

# Plot a heatmap of genes and corresponding GO terms
plot_enrichment(
    exposomicset = expom,
    feature_type = "omics_cor",
    go_groups = "Group 1",
    plot_type = "heatmap",
    heatmap_fill = TRUE,
    feature_col = "feature_clean"
)

## ----enrichment-cnetplot, fig.width=6, fig.height=6, fig.cap="Cnet plot linking enriched terms to contributing genes (Group 1)."----

# Plot the gene-term network
# Setting a seed so that the plot layout is consistent
set.seed(42)
plot_enrichment(
    exposomicset = expom,
    feature_type = "omics_cor",
    go_groups = "Group 1",
    plot_type = "cnet",
    feature_col = "feature_clean"
)

## ----pivot-sample-------------------------------------------------------------

# Pivot sample data to a tibble
expom |>
    pivot_sample() |>
    head()

## ----pivot-sample-example-----------------------------------------------------

# Count the number of asthma cases by sex status
expom |>
    pivot_sample() |>
    group_by(hs_asthma, e3_sex_None) |>
    reframe(n = n())

## ----pivot-feature------------------------------------------------------------

# Pivot feature data to a tibble
expom |>
    pivot_feature() |>
    head()

## ----pivot-feature-example----------------------------------------------------

# Count the number of features per omic layer
expom |>
    pivot_feature() |>
    group_by(.exp_name) |>
    summarise(n = n())

## ----pivot-exp----------------------------------------------------------------

# Pivot experiment data to a tibble
expom |>
    pivot_exp(
        exp_name = "Gene Expression",
        features = "TC01004453.hg.1"
    ) |>
    head()

## ----pivot-exp-example, fig.height=4, fig.width=4, fig.align='center',fig.cap="IL23R gene expression levels (probe TC01004453.hg.1) by asthma status."----

# Plot expression of IL23R
expom |>
    pivot_exp(
        exp_name = "Gene Expression",
        features = "TC01004453.hg.1"
    ) |>
    ggplot(aes(
        x = as.character(hs_asthma),
        y = log2(counts + 1),
        color = as.character(hs_asthma),
        fill = as.character(hs_asthma)
    )) +
    geom_boxplot(alpha = 0.5) +
    geom_jitter(alpha = 0.1) +
    ggpubr::geom_pwc(label = "{p.adj.format}{p.adj.signif}") +
    theme_minimal() +
    ggpubr::rotate_x_text(angle = 45) +
    ggsci::scale_color_cosmic() +
    ggsci::scale_fill_cosmic() +
    labs(
        x = "",
        y = expression(Log[2] * "Abd."),
        fill = "Asthma Status",
        color = "Asthma Status"
    )

## ----pipeline-summary---------------------------------------------------------

# Run the pipeline summary
expom |>
    run_pipeline_summary(console_print = TRUE, include_notes = TRUE)

## ----save-results-------------------------------------------------------------

# Save results
extract_results_excel(
    exposomicset = expom,
    file = tempfile(),
    result_types = "all"
)

## ----session_info-------------------------------------------------------------

sessionInfo()

