TileDBArray 1.15.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.77130227 3.24690387 0.32189158 . -0.54342533 -0.17056161
## [2,] -1.36814041 1.02448007 0.30129413 . -0.18154540 0.71735324
## [3,] -0.07725020 -0.64689223 1.16938150 . 0.72513320 0.03742136
## [4,] -0.02924394 0.83754184 0.21667648 . 0.43990968 0.87899976
## [5,] 1.41319627 1.15562876 0.19034869 . 0.52704296 1.19207541
## ... . . . . . .
## [96,] -0.09087176 0.14821451 1.05887623 . 0.790647029 -0.588015200
## [97,] -1.64288129 0.41546813 1.13309185 . -0.004169193 -1.282189185
## [98,] -2.08686239 0.89015828 -0.08001252 . -1.333592592 -0.721536598
## [99,] -0.09181938 0.86977560 -1.22544313 . -1.646125971 0.360498535
## [100,] -0.23110171 -0.83906869 0.45874610 . 0.061780107 0.770741287
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.77130227 3.24690387 0.32189158 . -0.54342533 -0.17056161
## [2,] -1.36814041 1.02448007 0.30129413 . -0.18154540 0.71735324
## [3,] -0.07725020 -0.64689223 1.16938150 . 0.72513320 0.03742136
## [4,] -0.02924394 0.83754184 0.21667648 . 0.43990968 0.87899976
## [5,] 1.41319627 1.15562876 0.19034869 . 0.52704296 1.19207541
## ... . . . . . .
## [96,] -0.09087176 0.14821451 1.05887623 . 0.790647029 -0.588015200
## [97,] -1.64288129 0.41546813 1.13309185 . -0.004169193 -1.282189185
## [98,] -2.08686239 0.89015828 -0.08001252 . -1.333592592 -0.721536598
## [99,] -0.09181938 0.86977560 -1.22544313 . -1.646125971 0.360498535
## [100,] -0.23110171 -0.83906869 0.45874610 . 0.061780107 0.770741287
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.77130227 3.24690387 0.32189158 . -0.54342533 -0.17056161
## GENE_2 -1.36814041 1.02448007 0.30129413 . -0.18154540 0.71735324
## GENE_3 -0.07725020 -0.64689223 1.16938150 . 0.72513320 0.03742136
## GENE_4 -0.02924394 0.83754184 0.21667648 . 0.43990968 0.87899976
## GENE_5 1.41319627 1.15562876 0.19034869 . 0.52704296 1.19207541
## ... . . . . . .
## GENE_96 -0.09087176 0.14821451 1.05887623 . 0.790647029 -0.588015200
## GENE_97 -1.64288129 0.41546813 1.13309185 . -0.004169193 -1.282189185
## GENE_98 -2.08686239 0.89015828 -0.08001252 . -1.333592592 -0.721536598
## GENE_99 -0.09181938 0.86977560 -1.22544313 . -1.646125971 0.360498535
## GENE_100 -0.23110171 -0.83906869 0.45874610 . 0.061780107 0.770741287
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 0.77130227 -1.36814041 -0.07725020 -0.02924394 1.41319627 -1.31367588
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 0.77130227 3.24690387 0.32189158 1.05300528 0.32255274
## GENE_2 -1.36814041 1.02448007 0.30129413 0.48991726 -2.20785710
## GENE_3 -0.07725020 -0.64689223 1.16938150 -0.64264457 -0.11194536
## GENE_4 -0.02924394 0.83754184 0.21667648 -0.16525518 0.28509208
## GENE_5 1.41319627 1.15562876 0.19034869 0.43153208 -1.16019115
out * 2
## <100 x 10> DelayedMatrix object of type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 1.54260454 6.49380774 0.64378316 . -1.08685065 -0.34112323
## GENE_2 -2.73628082 2.04896014 0.60258826 . -0.36309081 1.43470647
## GENE_3 -0.15450039 -1.29378446 2.33876300 . 1.45026639 0.07484272
## GENE_4 -0.05848788 1.67508368 0.43335296 . 0.87981936 1.75799951
## GENE_5 2.82639253 2.31125752 0.38069738 . 1.05408592 2.38415083
## ... . . . . . .
## GENE_96 -0.1817435 0.2964290 2.1177525 . 1.581294058 -1.176030401
## GENE_97 -3.2857626 0.8309363 2.2661837 . -0.008338386 -2.564378370
## GENE_98 -4.1737248 1.7803166 -0.1600250 . -2.667185185 -1.443073196
## GENE_99 -0.1836388 1.7395512 -2.4508863 . -3.292251941 0.720997071
## GENE_100 -0.4622034 -1.6781374 0.9174922 . 0.123560214 1.541482575
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## -10.19718108 -5.97620986 5.58064853 -4.74519244 -3.84388321 -9.64553070
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## -14.84532184 9.52059831 -10.77986822 -0.04416733
out %*% runif(ncol(out))
## [,1]
## GENE_1 4.43784612
## GENE_2 -0.31823139
## GENE_3 0.48951097
## GENE_4 0.19818679
## GENE_5 1.84937998
## GENE_6 -2.56503183
## GENE_7 -1.82395385
## GENE_8 0.24067532
## GENE_9 2.62863437
## GENE_10 2.13538713
## GENE_11 0.08859003
## GENE_12 0.60369082
## GENE_13 -1.25440807
## GENE_14 -2.67027786
## GENE_15 -0.31830795
## GENE_16 -1.02122065
## GENE_17 -1.16754872
## GENE_18 -2.19396746
## GENE_19 -1.87955083
## GENE_20 4.37667665
## GENE_21 -2.26963186
## GENE_22 1.69741598
## GENE_23 -3.14327261
## GENE_24 0.44417177
## GENE_25 -1.54357427
## GENE_26 0.57849674
## GENE_27 -3.18206187
## GENE_28 -1.40566653
## GENE_29 0.78320913
## GENE_30 -0.20102944
## GENE_31 -1.18852759
## GENE_32 0.44512583
## GENE_33 2.58793737
## GENE_34 4.03748649
## GENE_35 1.74820775
## GENE_36 0.87248473
## GENE_37 -0.96294318
## GENE_38 0.52653761
## GENE_39 -2.43100922
## GENE_40 -0.77587210
## GENE_41 -3.29531088
## GENE_42 -0.17016710
## GENE_43 -2.95786842
## GENE_44 -0.96272371
## GENE_45 1.74412967
## GENE_46 -1.62804004
## GENE_47 1.63249810
## GENE_48 2.07216930
## GENE_49 -0.95687846
## GENE_50 1.62140436
## GENE_51 -2.89767075
## GENE_52 3.13496868
## GENE_53 0.32856503
## GENE_54 -0.46448357
## GENE_55 1.13694960
## GENE_56 -1.16521568
## GENE_57 0.47944035
## GENE_58 0.06023887
## GENE_59 -0.77431227
## GENE_60 -1.23159230
## GENE_61 -3.38287794
## GENE_62 0.18212345
## GENE_63 -0.45125611
## GENE_64 -0.84498904
## GENE_65 0.96429271
## GENE_66 0.83994643
## GENE_67 -2.80021162
## GENE_68 -1.50342859
## GENE_69 0.35572224
## GENE_70 1.65045787
## GENE_71 -0.08415924
## GENE_72 -1.99764242
## GENE_73 -3.80534003
## GENE_74 -0.09006674
## GENE_75 -4.92573073
## GENE_76 2.52093144
## GENE_77 -2.60367611
## GENE_78 2.20649608
## GENE_79 -1.46030367
## GENE_80 -0.79818807
## GENE_81 2.02140790
## GENE_82 -1.58648978
## GENE_83 -0.63317689
## GENE_84 1.44592701
## GENE_85 2.70894772
## GENE_86 0.32690933
## GENE_87 -2.76904569
## GENE_88 -0.55027993
## GENE_89 1.99395576
## GENE_90 -1.39069788
## GENE_91 1.17118506
## GENE_92 -1.98997949
## GENE_93 1.16912607
## GENE_94 1.10863604
## GENE_95 1.20840083
## GENE_96 -2.18306059
## GENE_97 1.18339175
## GENE_98 -2.86718662
## GENE_99 1.19066454
## GENE_100 1.75769333
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.30284746 -0.65738472 -0.71106788 . 0.3114409 -0.7441812
## [2,] 0.40547465 0.18051708 0.08354731 . 0.3520923 1.3198254
## [3,] 0.12492900 0.69285257 0.15557501 . -2.1337386 0.4693052
## [4,] -0.20996425 0.19438122 0.36388345 . -1.1551735 0.3278646
## [5,] -1.40602584 -0.67108867 0.80227465 . -0.3167730 -1.5416560
## ... . . . . . .
## [96,] -0.20279039 1.38893566 1.85670116 . -0.75158010 -1.41310029
## [97,] -2.45398322 0.09350175 1.35927502 . 0.34193036 0.08703692
## [98,] -0.04882512 -0.65416050 -0.10525229 . -0.61611502 0.56033771
## [99,] 0.10668713 -1.00359079 -1.67625924 . -0.48960882 0.82397721
## [100,] 0.06838464 -1.10521608 0.07467330 . -1.32795668 -0.15039921
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.30284746 -0.65738472 -0.71106788 . 0.3114409 -0.7441812
## [2,] 0.40547465 0.18051708 0.08354731 . 0.3520923 1.3198254
## [3,] 0.12492900 0.69285257 0.15557501 . -2.1337386 0.4693052
## [4,] -0.20996425 0.19438122 0.36388345 . -1.1551735 0.3278646
## [5,] -1.40602584 -0.67108867 0.80227465 . -0.3167730 -1.5416560
## ... . . . . . .
## [96,] -0.20279039 1.38893566 1.85670116 . -0.75158010 -1.41310029
## [97,] -2.45398322 0.09350175 1.35927502 . 0.34193036 0.08703692
## [98,] -0.04882512 -0.65416050 -0.10525229 . -0.61611502 0.56033771
## [99,] 0.10668713 -1.00359079 -1.67625924 . -0.48960882 0.82397721
## [100,] 0.06838464 -1.10521608 0.07467330 . -1.32795668 -0.15039921
sessionInfo()
## R version 4.4.0 Patched (2024-04-24 r86482)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.6.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RcppSpdlog_0.0.16 TileDBArray_1.15.0 DelayedArray_0.31.0
## [4] SparseArray_1.5.0 S4Arrays_1.5.0 abind_1.4-5
## [7] IRanges_2.39.0 S4Vectors_0.43.0 MatrixGenerics_1.17.0
## [10] matrixStats_1.3.0 BiocGenerics_0.51.0 Matrix_1.7-0
## [13] BiocStyle_2.33.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.0.5 jsonlite_1.8.8 compiler_4.4.0
## [4] BiocManager_1.30.22 crayon_1.5.2 Rcpp_1.0.12
## [7] nanoarrow_0.4.0.1 jquerylib_0.1.4 yaml_2.3.8
## [10] fastmap_1.1.1 lattice_0.22-6 R6_2.5.1
## [13] RcppCCTZ_0.2.12 XVector_0.45.0 tiledb_0.26.0
## [16] knitr_1.46 bookdown_0.39 bslib_0.7.0
## [19] rlang_1.1.3 cachem_1.0.8 xfun_0.43
## [22] sass_0.4.9 bit64_4.0.5 cli_3.6.2
## [25] zlibbioc_1.51.0 spdl_0.0.5 digest_0.6.35
## [28] grid_4.4.0 lifecycle_1.0.4 data.table_1.15.4
## [31] evaluate_0.23 nanotime_0.3.7 zoo_1.8-12
## [34] rmarkdown_2.26 tools_4.4.0 htmltools_0.5.8.1