Merging OTU or sample indices based on variables in the data can be a useful means of reducing noise or excess features in an analysis or graphic.

Loading included data

library(microbiomedataset)
library(tidyverse)
# Load the included `global_patterns` example dataset into the workspace.
data("global_patterns")
# Auto-print the object to display its summary (slot dimensions and
# processing history).
global_patterns
#> -------------------- 
#> microbiomedataset version: 0.99.1 
#> -------------------- 
#> 1.expression_data:[ 19216 x 26 data.frame]
#> 2.sample_info:[ 26 x 8 data.frame]
#> 3.variable_info:[ 19216 x 8 data.frame]
#> 4.sample_info_note:[ 8 x 2 data.frame]
#> 5.variable_info_note:[ 8 x 2 data.frame]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_microbiome_dataset ---------- 
#>             Package               Function.used                Time
#> 1 microbiomedataset create_microbiome_dataset() 2022-07-10 10:56:13

Merge samples

Remove empty taxa

# Add a `sum_intensity` column to the variable (taxa) annotation, then keep
# only the taxa whose `sum_intensity` is positive, i.e. drop empty taxa.
global_patterns2 <-
  mutate2variable(global_patterns, what = "sum_intensity") %>%
  # Point the dplyr verb below at the variable_info table.
  activate_microbiome_dataset(what = "variable_info") %>%
  dplyr::filter(sum_intensity > 0)

# Sample types treated as human-associated in this example.
humantypes <- c("Feces", "Mock", "Skin", "Tongue")

# Flag each sample with a logical `human` column: TRUE when its SampleType
# is one of `humantypes`, FALSE otherwise.
global_patterns2 <-
  activate_microbiome_dataset(global_patterns2, what = "sample_info") %>%
  dplyr::mutate(human = SampleType %in% humantypes)

Now on to the merging examples.

# Merge samples that share the same SampleType into a single sample each.
merged_global_patterns2 <-
  microbiomedataset::summarise_samples(
    object = global_patterns2,
    group_by = "SampleType"
  )

# Inspect the sample annotation of the merged dataset.
extract_sample_info(merged_global_patterns2)
#>            sample_id  Primer Final_Barcode Barcode_truncated_plus_T
#> 1               Soil ILBC_01        AACGCA                   TGCGTT
#> 2              Feces ILBC_04        AAGAGA                   TCTCTT
#> 3               Skin ILBC_07        AATCGT                   ACGATT
#> 4             Tongue ILBC_10        ACACGA                   TCGTGT
#> 5         Freshwater ILBC_13        ACACTG                   CAGTGT
#> 6 Freshwater (creek) ILBC_16        ACAGCA                   TGCTGT
#> 7              Ocean ILBC_19        ACAGTT                   AACTGT
#> 8 Sediment (estuary) ILBC_22        ACATGT                   ACATGT
#> 9               Mock ILBC_27        ACCGCA                   TGCGGT
#>   Barcode_full_length         SampleType
#> 1         CTAGCGTGCGT               Soil
#> 2         TCGACATCTCT              Feces
#> 3         CGAGTCACGAT               Skin
#> 4         TGTGGCTCGTG             Tongue
#> 5         CATGAACAGTG         Freshwater
#> 6         GACCACTGCTG Freshwater (creek)
#> 7         TCGCGCAACTG              Ocean
#> 8         CACGTGACATG Sediment (estuary)
#> 9         TGACTCTGCGG               Mock
#>                                    Description   class human
#> 1     Calhoun South Carolina Pine soil, pH 4.9 Subject FALSE
#> 2      M3, Day 1, fecal swab, whole body study Subject  TRUE
#> 3      M3, Day 1, right palm, whole body study Subject  TRUE
#> 4         M3, Day 1, tongue, whole body study  Subject  TRUE
#> 5 Lake Mendota Minnesota, 24 meter epilimnion  Subject FALSE
#> 6                 Allequash Creek, 0-1cm depth Subject FALSE
#> 7       Newport Pier, CA surface water, Time 1 Subject FALSE
#> 8               Tijuana River Reserve, depth 1 Subject FALSE
#> 9                                        Even1 Subject  TRUE

Merge taxa

# Merge the first five variables (taxa) into one.
# NOTE(review): `remain_variable_info_index = 1` presumably keeps the
# variable_info annotation of the first merged variable; confirm against the
# package documentation.
merged_variables <- microbiomedataset::summarize_variables(
  object = global_patterns2,
  variable_index = 1:5,
  remain_variable_info_index = 1
)

# Five variables collapsed into one, so the merged dataset has four fewer
# variables than the input; the number of samples is unchanged.
dim(merged_variables)
#> variables   samples 
#>     18984        26
dim(global_patterns2)
#> variables   samples 
#>     18988        26

Session information

sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] forcats_0.5.1.9000       stringr_1.4.0            purrr_0.3.4             
#>  [4] readr_2.1.2              tidyr_1.2.0              tibble_3.1.7            
#>  [7] ggplot2_3.3.6            tidyverse_1.3.1          dplyr_1.0.9             
#> [10] microbiomedataset_0.99.7
#> 
#> loaded via a namespace (and not attached):
#>   [1] utf8_1.2.2                  tidyselect_1.1.2           
#>   [3] htmlwidgets_1.5.4           grid_4.2.1                 
#>   [5] BiocParallel_1.30.3         munsell_0.5.0              
#>   [7] codetools_0.2-18            ragg_1.2.2                 
#>   [9] preprocessCore_1.58.0       withr_2.5.0                
#>  [11] colorspace_2.0-3            Biobase_2.56.0             
#>  [13] phyloseq_1.40.0             knitr_1.39                 
#>  [15] rstudioapi_0.13             stats4_4.2.1               
#>  [17] mzID_1.34.0                 MatrixGenerics_1.8.1       
#>  [19] GenomeInfoDbData_1.2.8      polyclip_1.10-0            
#>  [21] farver_2.1.1                rhdf5_2.40.0               
#>  [23] rprojroot_2.0.3             vctrs_0.4.1                
#>  [25] generics_0.1.3              xfun_0.31                  
#>  [27] R6_2.5.1                    doParallel_1.0.17          
#>  [29] GenomeInfoDb_1.32.2         clue_0.3-61                
#>  [31] graphlayouts_0.8.0          MsCoreUtils_1.8.0          
#>  [33] bitops_1.0-7                rhdf5filters_1.8.0         
#>  [35] cachem_1.0.6                gridGraphics_0.5-1         
#>  [37] DelayedArray_0.22.0         assertthat_0.2.1           
#>  [39] scales_1.2.0                ggraph_2.0.5               
#>  [41] gtable_0.3.0                affy_1.74.0                
#>  [43] tidygraph_1.2.1             rlang_1.0.3                
#>  [45] systemfonts_1.0.4           mzR_2.30.0                 
#>  [47] GlobalOptions_0.1.2         splines_4.2.1              
#>  [49] Rdisop_1.56.0               lazyeval_0.2.2             
#>  [51] impute_1.70.0               broom_1.0.0                
#>  [53] modelr_0.1.8                BiocManager_1.30.18        
#>  [55] yaml_2.3.5                  reshape2_1.4.4             
#>  [57] backports_1.4.1             tools_4.2.1                
#>  [59] ggplotify_0.1.0             affyio_1.66.0              
#>  [61] ellipsis_0.3.2              jquerylib_0.1.4            
#>  [63] biomformat_1.24.0           RColorBrewer_1.1-3         
#>  [65] BiocGenerics_0.42.0         MSnbase_2.22.0             
#>  [67] Rcpp_1.0.8.3                plyr_1.8.7                 
#>  [69] zlibbioc_1.42.0             RCurl_1.98-1.7             
#>  [71] pbapply_1.5-0               GetoptLong_1.0.5           
#>  [73] viridis_0.6.2               S4Vectors_0.34.0           
#>  [75] zoo_1.8-10                  SummarizedExperiment_1.26.1
#>  [77] haven_2.5.0                 ggrepel_0.9.1              
#>  [79] cluster_2.1.3               fs_1.5.2                   
#>  [81] magrittr_2.0.3              masstools_0.99.13          
#>  [83] data.table_1.14.2           openxlsx_4.2.5             
#>  [85] circlize_0.4.15             reprex_2.0.1               
#>  [87] pcaMethods_1.88.0           ProtGenerics_1.28.0        
#>  [89] matrixStats_0.62.0          hms_1.1.1                  
#>  [91] evaluate_0.15               XML_3.99-0.10              
#>  [93] readxl_1.4.0                IRanges_2.30.0             
#>  [95] gridExtra_2.3               shape_1.4.6                
#>  [97] compiler_4.2.1              ncdf4_1.19                 
#>  [99] crayon_1.5.1                htmltools_0.5.2            
#> [101] mgcv_1.8-40                 tzdb_0.3.0                 
#> [103] lubridate_1.8.0             DBI_1.1.3                  
#> [105] tweenr_1.0.2                dbplyr_2.2.1               
#> [107] ComplexHeatmap_2.12.0       MASS_7.3-57                
#> [109] Matrix_1.4-1                ade4_1.7-19                
#> [111] permute_0.9-7               cli_3.3.0                  
#> [113] vsn_3.64.0                  parallel_4.2.1             
#> [115] igraph_1.3.2                GenomicRanges_1.48.0       
#> [117] pkgconfig_2.0.3             pkgdown_2.0.5              
#> [119] plotly_4.10.0               xml2_1.3.3                 
#> [121] MALDIquant_1.21             foreach_1.5.2              
#> [123] bslib_0.3.1                 multtest_2.52.0            
#> [125] XVector_0.36.0              massdataset_1.0.5          
#> [127] rvest_1.0.2                 yulab.utils_0.0.5          
#> [129] digest_0.6.29               vegan_2.6-2                
#> [131] Biostrings_2.64.0           rmarkdown_2.14             
#> [133] cellranger_1.1.0            tidytree_0.3.9             
#> [135] rjson_0.2.21                lifecycle_1.0.1            
#> [137] nlme_3.1-158                jsonlite_1.8.0             
#> [139] Rhdf5lib_1.18.2             desc_1.4.1                 
#> [141] viridisLite_0.4.0           limma_3.52.2               
#> [143] fansi_1.0.3                 pillar_1.7.0               
#> [145] ggsci_2.9                   lattice_0.20-45            
#> [147] fastmap_1.1.0               httr_1.4.3                 
#> [149] survival_3.3-1              glue_1.6.2                 
#> [151] zip_2.2.0                   png_0.1-7                  
#> [153] iterators_1.0.14            ggforce_0.3.3              
#> [155] stringi_1.7.6               sass_0.4.1                 
#> [157] textshaping_0.3.6           memoise_2.0.1              
#> [159] ape_5.6-2