Skip to content

matter vs hdf5: which format performs better for storing big array like datasets?

Hervé Pagès edited this page Oct 8, 2019 · 3 revisions

Note that matter does not seem to compress the data, so for our comparison with hdf5 to be meaningful we'll use an uncompressed hdf5 dataset. We'll also compare with a compressed hdf5 dataset, for reference.

The dataset

We'll use a 3-dimensional dataset of type integer for the benchmark:

# Fix the RNG seed so the benchmark dataset is reproducible.
set.seed(123)
# Integer values in 0..99 (as.integer() truncates the uniform draws).
# NOTE(review): prod(c(3000, 800, 125)) is 300e6, but only 250e6 values are
# generated here, so array() recycles the first 50e6 values to fill the rest.
# The 1.2e9-byte file reported below (300e6 * 4 bytes) is consistent with this.
a0 <- array(as.integer(runif(250e6, max=100)), dim=c(3000, 800, 125))

Store the dataset in matter and hdf5 formats (writing operations)

Create matter_arr object and matter_arr-based DelayedArray object

library(matter)
# Build a matter_arr from the in-memory array and time the operation.
# The data ends up in the file(s) listed in a1@paths (see size check below).
system.time(a1 <- matter_arr(a0, datamode="integer", dim=dim(a0)))
#    user  system elapsed 
#   1.210   1.239   2.451

The resulting file is 1.2 GB:

file.info(a1@paths)$size / 1e9
# [1] 1.2

Note that a matter_arr object can be wrapped in a DelayedArray object:

library(DelayedArray)
A1 <- DelayedArray(a1)
A1
# <3000 x 800 x 125> array of class DelayedArray and type "integer":
# ,,1
#           [,1]   [,2]   [,3]   [,4] ... [,797] [,798] [,799] [,800]
#    [1,]     28     20     44     79   .     47     97     83     93
#    [2,]     78     94     39     87   .     22     22     86     73
#     ...      .      .      .      .   .      .      .      .      .
# [2999,]     43     79     26     79   .     50     31     67     52
# [3000,]     68     90     78     45   .     83     60      1     83
#
# ...
#
# ,,125
#           [,1]   [,2]   [,3]   [,4] ... [,797] [,798] [,799] [,800]
#    [1,]     28     10     42     77   .     35     96     35     84
#    [2,]     77      1     66     92   .     67     45      4     46
#     ...      .      .      .      .   .      .      .      .      .
# [2999,]     56     80     87     57   .     77     54     44     30
# [3000,]     27     26     12     46   .     11     64     95      9

Create hdf5-based DelayedArray object (no compression)

library(HDF5Array)
# Write a0 to an HDF5 file in 50 x 50 x 10 chunks and time the operation.
# level=0 means no compression.
system.time(A2 <- writeHDF5Array(a0, chunkdim=c(50, 50, 10), level=0)) 
#    user  system elapsed 
#   1.890   0.831   3.581 

The resulting file is 1.25 GB:

file.info(path(A2))$size / 1e9
# [1] 1.248704

Create hdf5-based DelayedArray object (using compression)

# Same chunk geometry as the uncompressed write, but with compression
# level 6, so the two HDF5 datasets differ only in compression.
system.time(A3 <- writeHDF5Array(a0, chunkdim=c(50, 50, 10), level=6))
#    user  system elapsed 
# 112.849   0.455 115.710 

The resulting file is 0.39 GB:

file.info(path(A3))$size / 1e9
# [1] 0.3891443

Extract data (reading operations)

# --- Test 1: contiguous block ------------------------------------------------
# Read a 510 x 300 block of consecutive rows/columns from slice 77.
system.time(x1 <- a1[891:1400, 401:700, 77])
#    user  system elapsed 
#   0.908   0.000   0.907

system.time(x2 <- as.matrix(A2[891:1400, 401:700, 77]))
#    user  system elapsed 
#   0.028   0.000   0.028

system.time(x3 <- as.matrix(A3[891:1400, 401:700, 77]))
#    user  system elapsed 
#   0.046   0.000   0.047

# All three backends must return exactly the same values.
identical(x1, x2)
# [1] TRUE
identical(x1, x3)
# [1] TRUE

# --- Test 2: strided selection -----------------------------------------------
# 300 rows taken every 7th row in decreasing order (2170, 2163, ..., 77) and
# 100 columns taken every 8th column (8, 16, ..., 800), again from slice 77.
system.time(x1 <- a1[(310:11)*7, (1:100)*8, 77])
#    user  system elapsed 
#   0.213   0.016   0.229

system.time(x2 <- as.matrix(A2[(310:11)*7, (1:100)*8, 77]))
#    user  system elapsed 
#   0.024   0.008   0.034

system.time(x3 <- as.matrix(A3[(310:11)*7, (1:100)*8, 77]))
#    user  system elapsed 
#   0.264   0.016   0.283

identical(x1, x2)
# [1] TRUE
identical(x1, x3)
# [1] TRUE

# --- Test 3: random selection ------------------------------------------------
# 50 x 25 x 10 indices sampled without replacement along each dimension.
# The same index list is reused for all three backends.
i <- list(sample(3000L, 50), sample(800L, 25), sample(125L, 10))

system.time(x1 <- extract_array(a1, i))
#    user  system elapsed 
#   0.089   0.020   0.109

system.time(x2 <- extract_array(A2, i))
#    user  system elapsed 
#   0.016   0.064   0.079

system.time(x3 <- extract_array(A3, i))
#    user  system elapsed 
#   1.045   0.016   1.061

identical(x1, x2)
# [1] TRUE
identical(x1, x3)
# [1] TRUE

Summarization

Summarization functions such as colSums() use parallelized block processing behind the scenes, so concurrent read access comes into play. We'll set the number of workers to 4 and the block size to 2.5 MB:

# Block-processing setup shared by the three colSums() runs below.
workers <- 4
block_size <- 2.5e6  # 2.5 MB (2.5e6 bytes)
# Process blocks in parallel with a multicore backend using `workers` workers.
setAutoBPPARAM(MulticoreParam(workers))
setAutoBlockSize(block_size)
# Internal (non-exported) DelayedArray switch; it produces the
# "Processing block i/n" lines shown in the outputs below.
DelayedArray:::set_verbose_block_processing(TRUE)

# matter-backed DelayedArray: column sums of slice 77.
# The block messages appear out of order, presumably because the blocks are
# handled by concurrent workers.
system.time(cs1 <- colSums(A1[ , , 77L]))
# Processing block 7/8 ... OK
# Processing block 8/8 ... OK
# Processing block 1/8 ... OK
# Processing block 2/8 ... OK
# Processing block 3/8 ... OK
# Processing block 4/8 ... OK
# Processing block 5/8 ... OK
# Processing block 6/8 ... OK
#    user  system elapsed 
#   7.989   0.436   4.696 

# Uncompressed HDF5-backed DelayedArray.
# NOTE(review): the "Error in mcexit(0L) : ignoring SIGPIPE signal" messages
# show up only in the HDF5 runs; their cause is not established here, but the
# results still match (see the identical() checks at the end).
system.time(cs2 <- colSums(A2[ , , 77L]))
# Processing block 1/8 ... OK
# Processing block 2/8 ... OK
# Error in mcexit(0L) : ignoring SIGPIPE signal
# Processing block 3/8 ... OK
# Processing block 4/8 ... OK
# Error in mcexit(0L) : ignoring SIGPIPE signal
# Processing block 5/8 ... OK
# Processing block 6/8 ... OK
# Error in mcexit(0L) : ignoring SIGPIPE signal
# Processing block 7/8 ... OK
# Processing block 8/8 ... OK
#    user  system elapsed 
#   0.135   0.241   0.198 

# Compressed HDF5-backed DelayedArray.
system.time(cs3 <- colSums(A3[ , , 77L]))
# Processing block 1/8 ... OK
# Processing block 2/8 ... OK
# Error in mcexit(0L) : ignoring SIGPIPE signal
# Processing block 3/8 ... OK
# Processing block 4/8 ... OK
# Processing block 7/8 ... OK
# Processing block 8/8 ... OK
# Error in mcexit(0L) : ignoring SIGPIPE signal
# Processing block 5/8 ... OK
# Processing block 6/8 ... OK
#    user  system elapsed 
#   0.204   0.148   0.292 

# All three backends must produce exactly the same column sums.
identical(cs1, cs2)
# [1] TRUE
identical(cs1, cs3)
# [1] TRUE

Conclusion

Writing operations: matter performs slightly better than uncompressed hdf5 for writing. Not surprisingly, matter and uncompressed hdf5 are both much faster than compressed hdf5 for writing, but at the cost of a much bigger resulting file.

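Reading operations: uncompressed hdf5 outperforms matter in every read test, by an order of magnitude or more for the contiguous block read and for colSums(). Compressed hdf5 also beats matter by an order of magnitude in those two tests, but it is slightly slower than matter for the strided selection and about 10 times slower for the random selection.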

sessionInfo()

> sessionInfo()
R version 3.6.0 Patched (2019-05-02 r76454)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.5 LTS

Matrix products: default
BLAS:   /home/hpages/R/R-3.6.r76454/lib/libRblas.so
LAPACK: /home/hpages/R/R-3.6.r76454/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] HDF5Array_1.13.9    rhdf5_2.29.3        DelayedArray_0.11.8
 [4] IRanges_2.19.16     S4Vectors_0.23.25   BiocGenerics_0.31.6
 [7] matrixStats_0.55.0  matter_1.11.1       biglm_0.9-1        
[10] DBI_1.0.0           BiocParallel_1.19.3

loaded via a namespace (and not attached):
[1] lattice_0.20-38 digest_0.6.21   grid_3.6.0      irlba_2.3.3    
[5] Matrix_1.2-17   Rhdf5lib_1.7.5  tools_3.6.0     compiler_3.6.0