library(MultiAssayExperiment)
library(HDF5Array)
library(SummarizedExperiment)

The HDF5Array package provides an on-disk representation of large datasets
without the need to load them into memory. Convenient lazy evaluation
operations allow the user to manipulate such large data files based on
metadata. The DelayedMatrix class in the DelayedArray package provides a
way to connect to a large matrix that is stored on disk.
First, we create a small matrix for constructing the DelayedMatrix class.
smallMatrix <- matrix(rnorm(10e5), ncol = 20)

We add rownames and column names to the matrix object for compatibility with
the MultiAssayExperiment representation.
rownames(smallMatrix) <- paste0("GENE", seq_len(nrow(smallMatrix)))
colnames(smallMatrix) <- paste0("SampleID", seq_len(ncol(smallMatrix)))

Here we use the DelayedArray constructor function to create a
DelayedMatrix object.
smallMatrix <- DelayedArray(smallMatrix)
class(smallMatrix)
## [1] "DelayedMatrix"
## attr(,"package")
## [1] "DelayedArray"
# show method
smallMatrix
## <50000 x 20> DelayedMatrix object of type "double":
##             SampleID1   SampleID2   SampleID3 ...  SampleID19  SampleID20
##     GENE1  0.99863493  0.38402445  0.34563167   .  0.58733759  0.05014844
##     GENE2  0.50178165 -1.72706719 -0.34513229   . -0.23261079  1.33266253
##     GENE3  1.14669017  0.62155928 -0.75700561   .  0.49856519 -0.17014319
##     GENE4 -0.79700138  0.04970944  0.59936655   .  0.90207943 -0.31551263
##     GENE5  0.83084287  0.30082134  0.52008594   .  0.96026464 -0.44805325
##       ...           .           .           .   .           .           .
## GENE49996  -2.3695408   0.4651033  -0.8835262   .  -0.4267942   1.2926334
## GENE49997  -1.3134266   1.3700319   1.9691280   .  -0.7722852   0.3570562
## GENE49998   0.2522855   0.6065744  -0.6423508   .   2.5176884  -1.5817982
## GENE49999  -0.4583904  -1.0459608  -1.1500916   .   0.2770945  -0.4385220
## GENE50000   0.2883502  -0.3348985   0.2726856   .   0.4261716  -2.4640464
dim(smallMatrix)
## [1] 50000    20

Finally, the rhdf5 package stores dimnames in a standard location.
In order to make use of this functionality, we would use writeHDF5Array
with the with.dimnames argument:
testh5 <- tempfile(fileext = ".h5")
writeHDF5Array(smallMatrix, filepath = testh5, name = "smallMatrix",
    with.dimnames = TRUE)
## <50000 x 20> HDF5Matrix object of type "double":
##             SampleID1   SampleID2   SampleID3 ...  SampleID19  SampleID20
##     GENE1  0.99863493  0.38402445  0.34563167   .  0.58733759  0.05014844
##     GENE2  0.50178165 -1.72706719 -0.34513229   . -0.23261079  1.33266253
##     GENE3  1.14669017  0.62155928 -0.75700561   .  0.49856519 -0.17014319
##     GENE4 -0.79700138  0.04970944  0.59936655   .  0.90207943 -0.31551263
##     GENE5  0.83084287  0.30082134  0.52008594   .  0.96026464 -0.44805325
##       ...           .           .           .   .           .           .
## GENE49996  -2.3695408   0.4651033  -0.8835262   .  -0.4267942   1.2926334
## GENE49997  -1.3134266   1.3700319   1.9691280   .  -0.7722852   0.3570562
## GENE49998   0.2522855   0.6065744  -0.6423508   .   2.5176884  -1.5817982
## GENE49999  -0.4583904  -1.0459608  -1.1500916   .   0.2770945  -0.4385220
## GENE50000   0.2883502  -0.3348985   0.2726856   .   0.4261716  -2.4640464

To see the file structure we use h5ls:
h5ls(testh5)
##                    group                  name       otype dclass        dim
## 0                      / .smallMatrix_dimnames   H5I_GROUP                  
## 1 /.smallMatrix_dimnames                     1 H5I_DATASET STRING      50000
## 2 /.smallMatrix_dimnames                     2 H5I_DATASET STRING         20
## 3                      /           smallMatrix H5I_DATASET  FLOAT 50000 x 20

Note that a large matrix from an HDF5 file can also be loaded using the
HDF5ArraySeed and DelayedArray functions.
hdf5Data <- HDF5ArraySeed(file = testh5, name = "smallMatrix")
newDelayedMatrix <- DelayedArray(hdf5Data)
class(newDelayedMatrix)
## [1] "HDF5Matrix"
## attr(,"package")
## [1] "HDF5Array"
newDelayedMatrix
## <50000 x 20> HDF5Matrix object of type "double":
##             SampleID1   SampleID2   SampleID3 ...  SampleID19  SampleID20
##     GENE1  0.99863493  0.38402445  0.34563167   .  0.58733759  0.05014844
##     GENE2  0.50178165 -1.72706719 -0.34513229   . -0.23261079  1.33266253
##     GENE3  1.14669017  0.62155928 -0.75700561   .  0.49856519 -0.17014319
##     GENE4 -0.79700138  0.04970944  0.59936655   .  0.90207943 -0.31551263
##     GENE5  0.83084287  0.30082134  0.52008594   .  0.96026464 -0.44805325
##       ...           .           .           .   .           .           .
## GENE49996  -2.3695408   0.4651033  -0.8835262   .  -0.4267942   1.2926334
## GENE49997  -1.3134266   1.3700319   1.9691280   .  -0.7722852   0.3570562
## GENE49998   0.2522855   0.6065744  -0.6423508   .   2.5176884  -1.5817982
## GENE49999  -0.4583904  -1.0459608  -1.1500916   .   0.2770945  -0.4385220
## GENE50000   0.2883502  -0.3348985   0.2726856   .   0.4261716  -2.4640464

DelayedMatrix with MultiAssayExperiment

A DelayedMatrix alone conforms to the MultiAssayExperiment API requirements.
Shown below, the DelayedMatrix can be put into a named list and passed into
the MultiAssayExperiment constructor function.
HDF5MAE <- MultiAssayExperiment(experiments = list(smallMatrix = smallMatrix))
sampleMap(HDF5MAE)
## DataFrame with 20 rows and 3 columns
##           assay     primary     colname
##        <factor> <character> <character>
## 1   smallMatrix   SampleID1   SampleID1
## 2   smallMatrix   SampleID2   SampleID2
## 3   smallMatrix   SampleID3   SampleID3
## 4   smallMatrix   SampleID4   SampleID4
## 5   smallMatrix   SampleID5   SampleID5
## ...         ...         ...         ...
## 16  smallMatrix  SampleID16  SampleID16
## 17  smallMatrix  SampleID17  SampleID17
## 18  smallMatrix  SampleID18  SampleID18
## 19  smallMatrix  SampleID19  SampleID19
## 20  smallMatrix  SampleID20  SampleID20
colData(HDF5MAE)
## DataFrame with 20 rows and 0 columns

SummarizedExperiment with DelayedMatrix backend

A more information-rich DelayedMatrix can be created when used in conjunction
with the SummarizedExperiment class, and it can even include rowRanges.
The flexibility of the MultiAssayExperiment API supports classes with
minimal requirements. Additionally, this SummarizedExperiment with the
DelayedMatrix backend can be part of a bigger MultiAssayExperiment object.
Below is a minimal example of how this would work:
HDF5SE <- SummarizedExperiment(assays = smallMatrix)
assay(HDF5SE)
## <50000 x 20> DelayedMatrix object of type "double":
##             SampleID1   SampleID2   SampleID3 ...  SampleID19  SampleID20
##     GENE1  0.99863493  0.38402445  0.34563167   .  0.58733759  0.05014844
##     GENE2  0.50178165 -1.72706719 -0.34513229   . -0.23261079  1.33266253
##     GENE3  1.14669017  0.62155928 -0.75700561   .  0.49856519 -0.17014319
##     GENE4 -0.79700138  0.04970944  0.59936655   .  0.90207943 -0.31551263
##     GENE5  0.83084287  0.30082134  0.52008594   .  0.96026464 -0.44805325
##       ...           .           .           .   .           .           .
## GENE49996  -2.3695408   0.4651033  -0.8835262   .  -0.4267942   1.2926334
## GENE49997  -1.3134266   1.3700319   1.9691280   .  -0.7722852   0.3570562
## GENE49998   0.2522855   0.6065744  -0.6423508   .   2.5176884  -1.5817982
## GENE49999  -0.4583904  -1.0459608  -1.1500916   .   0.2770945  -0.4385220
## GENE50000   0.2883502  -0.3348985   0.2726856   .   0.4261716  -2.4640464
MultiAssayExperiment(list(HDF5SE = HDF5SE))
## A MultiAssayExperiment object of 1 listed
##  experiment with a user-defined name and respective class.
##  Containing an ExperimentList class object of length 1:
##  [1] HDF5SE: SummarizedExperiment with 50000 rows and 20 columns
## Functionality:
##  experiments() - obtain the ExperimentList instance
##  colData() - the primary/phenotype DataFrame
##  sampleMap() - the sample coordination DataFrame
##  `$`, `[`, `[[` - extract colData columns, subset, or experiment
##  *Format() - convert into a long or wide DataFrame
##  assays() - convert ExperimentList to a SimpleList of matrices
##  exportClass() - save data to flat files

Additional scenarios are currently in development where an HDF5Matrix is
hosted remotely. Many opportunities exist when considering on-disk and off-disk
representations of data with MultiAssayExperiment.
sessionInfo()
## R version 4.5.0 Patched (2025-04-21 r88169)
## Platform: x86_64-apple-darwin20
## Running under: macOS Monterey 12.7.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] HDF5Array_1.37.0            h5mread_1.1.0              
##  [3] rhdf5_2.53.0                DelayedArray_0.35.1        
##  [5] SparseArray_1.9.0           S4Arrays_1.9.0             
##  [7] abind_1.4-8                 Matrix_1.7-3               
##  [9] survminer_0.5.0             ggpubr_0.6.0               
## [11] ggplot2_3.5.2               survival_3.8-3             
## [13] UpSetR_1.4.0                RaggedExperiment_1.33.2    
## [15] MultiAssayExperiment_1.35.3 SummarizedExperiment_1.39.0
## [17] Biobase_2.69.0              GenomicRanges_1.61.0       
## [19] GenomeInfoDb_1.45.3         IRanges_2.43.0             
## [21] S4Vectors_0.47.0            BiocGenerics_0.55.0        
## [23] generics_0.1.3              MatrixGenerics_1.21.0      
## [25] matrixStats_1.5.0           BiocStyle_2.37.0           
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     dplyr_1.1.4          farver_2.1.2        
##  [4] fastmap_1.2.0        digest_0.6.37        lifecycle_1.0.4     
##  [7] magrittr_2.0.3       compiler_4.5.0       rlang_1.1.6         
## [10] sass_0.4.10          tools_4.5.0          yaml_2.3.10         
## [13] data.table_1.17.0    knitr_1.50           ggsignif_0.6.4      
## [16] labeling_0.4.3       xml2_1.3.8           plyr_1.8.9          
## [19] RColorBrewer_1.1-3   withr_3.0.2          purrr_1.0.4         
## [22] grid_4.5.0           xtable_1.8-4         Rhdf5lib_1.31.0     
## [25] scales_1.4.0         dichromat_2.0-0.1    tinytex_0.57        
## [28] cli_3.6.5            rmarkdown_2.29       crayon_1.5.3        
## [31] km.ci_0.5-6          httr_1.4.7           reshape2_1.4.4      
## [34] commonmark_1.9.5     BiocBaseUtils_1.11.0 cachem_1.1.0        
## [37] stringr_1.5.1        splines_4.5.0        BiocManager_1.30.25 
## [40] XVector_0.49.0       survMisc_0.5.6       vctrs_0.6.5         
## [43] jsonlite_2.0.0       carData_3.0-5        litedown_0.7        
## [46] bookdown_0.43        car_3.1-3            rstatix_0.7.2       
## [49] Formula_1.2-5        magick_2.8.6         tidyr_1.3.1         
## [52] jquerylib_0.1.4      glue_1.8.0           ggtext_0.1.2        
## [55] stringi_1.8.7        gtable_0.3.6         UCSC.utils_1.5.0    
## [58] tibble_3.2.1         pillar_1.10.2        rhdf5filters_1.21.0 
## [61] htmltools_0.5.8.1    R6_2.6.1             KMsurv_0.1-5        
## [64] evaluate_1.0.3       lattice_0.22-7       markdown_2.0        
## [67] backports_1.5.0      gridtext_0.1.5       broom_1.0.8         
## [70] bslib_0.9.0          Rcpp_1.0.14          gridExtra_2.3       
## [73] xfun_0.52            zoo_1.8-14           pkgconfig_2.0.3