Functions for accessing and preprocessing data

Loading included data

Let’s load the demo data.

library(microbiomedataset)
library(tidyverse)
data("global_patterns")
global_patterns
#> -------------------- 
#> microbiomedataset version: 0.99.1 
#> -------------------- 
#> 1.expression_data:[ 19216 x 26 data.frame]
#> 2.sample_info:[ 26 x 8 data.frame]
#> 3.variable_info:[ 19216 x 8 data.frame]
#> 4.sample_info_note:[ 8 x 2 data.frame]
#> 5.variable_info_note:[ 8 x 2 data.frame]
#> -------------------- 
#> Processing information (extract_process_info())
#> create_microbiome_dataset ---------- 
#>             Package               Function.used                Time
#> 1 microbiomedataset create_microbiome_dataset() 2022-07-10 10:56:13

Accessors

dim(global_patterns)
#> [1] 19216    26
nrow(global_patterns)
#> [1] 19216
ncol(global_patterns)
#> [1] 26

colnames(global_patterns)
#>  [1] "CL3"      "CC1"      "SV1"      "M31Fcsw"  "M11Fcsw"  "M31Plmr" 
#>  [7] "M11Plmr"  "F21Plmr"  "M31Tong"  "M11Tong"  "LMEpi24M" "SLEpi20M"
#> [13] "AQC1cm"   "AQC4cm"   "AQC7cm"   "NP2"      "NP3"      "NP5"     
#> [19] "TRRsed1"  "TRRsed2"  "TRRsed3"  "TS28"     "TS29"     "Even1"   
#> [25] "Even2"    "Even3"

head(rownames(global_patterns))
#> [1] "549322" "522457" "951"    "244423" "586076" "246140"

extract_sample_info(global_patterns) %>% 
  colnames()
#> [1] "sample_id"                "Primer"                  
#> [3] "Final_Barcode"            "Barcode_truncated_plus_T"
#> [5] "Barcode_full_length"      "SampleType"              
#> [7] "Description"              "class"

extract_variable_info(global_patterns) %>% 
  colnames()
#> [1] "variable_id" "Kingdom"     "Phylum"      "Class"       "Order"      
#> [6] "Family"      "Genus"       "Species"

extract_expression_data(global_patterns) %>% 
  head()
#>        CL3 CC1 SV1 M31Fcsw M11Fcsw M31Plmr M11Plmr F21Plmr M31Tong M11Tong
#> 549322   0   0   0       0       0       0       0       0       0       0
#> 522457   0   0   0       0       0       0       0       0       0       0
#> 951      0   0   0       0       0       0       1       0       0       0
#> 244423   0   0   0       0       0       0       0       0       0       0
#> 586076   0   0   0       0       0       0       0       0       0       0
#> 246140   0   0   0       0       0       0       0       0       0       0
#>        LMEpi24M SLEpi20M AQC1cm AQC4cm AQC7cm NP2 NP3 NP5 TRRsed1 TRRsed2
#> 549322        0        1     27    100    130   1   0   0       0       0
#> 522457        0        0      0      2      6   0   0   0       0       0
#> 951           0        0      0      0      0   0   0   0       0       0
#> 244423        0        0      0     22     29   0   0   0       0       0
#> 586076        0        0      0      2      1   0   0   0       0       0
#> 246140        0        0      0      1      3   0   0   0       0       0
#>        TRRsed3 TS28 TS29 Even1 Even2 Even3
#> 549322       0    0    0     0     0     0
#> 522457       0    0    0     0     0     0
#> 951          0    0    0     0     0     0
#> 244423       0    0    0     0     0     0
#> 586076       0    0    0     0     0     0
#> 246140       0    0    0     0     0     0

extract_sample_info(global_patterns) %>% 
  head()
#>   sample_id  Primer Final_Barcode Barcode_truncated_plus_T Barcode_full_length
#> 1       CL3 ILBC_01        AACGCA                   TGCGTT         CTAGCGTGCGT
#> 2       CC1 ILBC_02        AACTCG                   CGAGTT         CATCGACGAGT
#> 3       SV1 ILBC_03        AACTGT                   ACAGTT         GTACGCACAGT
#> 4   M31Fcsw ILBC_04        AAGAGA                   TCTCTT         TCGACATCTCT
#> 5   M11Fcsw ILBC_05        AAGCTG                   CAGCTT         CGACTGCAGCT
#> 6   M31Plmr ILBC_07        AATCGT                   ACGATT         CGAGTCACGAT
#>   SampleType                                Description   class
#> 1       Soil   Calhoun South Carolina Pine soil, pH 4.9 Subject
#> 2       Soil   Cedar Creek Minnesota, grassland, pH 6.1 Subject
#> 3       Soil Sevilleta new Mexico, desert scrub, pH 8.3 Subject
#> 4      Feces    M3, Day 1, fecal swab, whole body study Subject
#> 5      Feces   M1, Day 1, fecal swab, whole body study  Subject
#> 6       Skin    M3, Day 1, right palm, whole body study Subject

extract_variable_info(global_patterns) %>% 
  head()
#>   variable_id Kingdom        Phylum        Class        Order        Family
#> 1      549322 Archaea Crenarchaeota Thermoprotei         <NA>          <NA>
#> 2      522457 Archaea Crenarchaeota Thermoprotei         <NA>          <NA>
#> 3         951 Archaea Crenarchaeota Thermoprotei Sulfolobales Sulfolobaceae
#> 4      244423 Archaea Crenarchaeota        Sd-NA         <NA>          <NA>
#> 5      586076 Archaea Crenarchaeota        Sd-NA         <NA>          <NA>
#> 6      246140 Archaea Crenarchaeota        Sd-NA         <NA>          <NA>
#>        Genus                  Species
#> 1       <NA>                     <NA>
#> 2       <NA>                     <NA>
#> 3 Sulfolobus Sulfolobusacidocaldarius
#> 4       <NA>                     <NA>
#> 5       <NA>                     <NA>
#> 6       <NA>                     <NA>

Preprocessing

The microbiomedataset package also includes functions for filtering, subsetting, and merging abundance data.

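In the following example, the global_patterns data is first transformed to relative abundance, and the mean relative abundance of each OTU is added to the variable information. The data is then filtered so that only OTUs with a mean relative abundance greater than 10^-5 are kept, and the result is stored in the new global_patterns2 object.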

global_patterns2 <-
  global_patterns %>%
  transform2relative_intensity() %>%
  mutate2variable(what = "mean_intensity") %>%
  activate_microbiome_dataset(what = "variable_info") %>%
  filter(mean_intensity > 10 ^ (-5))

This results in a heavily subsetted object, global_patterns2, containing just 4624 of the original 19216 OTUs.

Next, starting again from the original global_patterns object, keep only the variables (OTUs) whose phylum is Chlamydiae.

global_patterns_chl <-
  global_patterns %>%
  activate_microbiome_dataset(what = "variable_info") %>%
  dplyr::filter(Phylum == "Chlamydiae")

Finally, keep only the samples in global_patterns_chl whose total intensity is greater than 20.

global_patterns_chl <-
  global_patterns_chl %>%
  mutate2sample(what = "sum_intensity") %>%
  activate_microbiome_dataset(what = "sample_info") %>%
  filter(sum_intensity > 20)

Session information

sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] forcats_0.5.2             stringr_1.4.1            
#>  [3] purrr_1.0.1               readr_2.1.3              
#>  [5] tidyr_1.2.1               tibble_3.1.8             
#>  [7] ggplot2_3.4.0             tidyverse_1.3.2          
#>  [9] dplyr_1.0.10              microbiomedataset_0.99.10
#> 
#> loaded via a namespace (and not attached):
#>   [1] utf8_1.2.2                  tidyselect_1.1.2           
#>   [3] htmlwidgets_1.5.4           grid_4.2.1                 
#>   [5] BiocParallel_1.30.3         munsell_0.5.0              
#>   [7] codetools_0.2-18            ragg_1.2.2                 
#>   [9] preprocessCore_1.58.0       withr_2.5.0                
#>  [11] colorspace_2.0-3            Biobase_2.56.0             
#>  [13] phyloseq_1.40.0             knitr_1.40                 
#>  [15] rstudioapi_0.14             stats4_4.2.1               
#>  [17] mzID_1.34.0                 MatrixGenerics_1.8.1       
#>  [19] GenomeInfoDbData_1.2.8      polyclip_1.10-4            
#>  [21] farver_2.1.1                rhdf5_2.40.0               
#>  [23] rprojroot_2.0.3             vctrs_0.5.2                
#>  [25] generics_0.1.3              xfun_0.33                  
#>  [27] timechange_0.1.1            R6_2.5.1                   
#>  [29] doParallel_1.0.17           GenomeInfoDb_1.32.4        
#>  [31] clue_0.3-61                 graphlayouts_0.8.1         
#>  [33] MsCoreUtils_1.8.0           bitops_1.0-7               
#>  [35] rhdf5filters_1.8.0          cachem_1.0.6               
#>  [37] gridGraphics_0.5-1          DelayedArray_0.22.0        
#>  [39] assertthat_0.2.1            scales_1.2.1               
#>  [41] googlesheets4_1.0.1         ggraph_2.0.6               
#>  [43] gtable_0.3.1                affy_1.74.0                
#>  [45] tidygraph_1.2.2             rlang_1.0.6                
#>  [47] systemfonts_1.0.4           mzR_2.30.0                 
#>  [49] GlobalOptions_0.1.2         splines_4.2.1              
#>  [51] Rdisop_1.56.0               lazyeval_0.2.2             
#>  [53] gargle_1.2.1                impute_1.70.0              
#>  [55] broom_1.0.1                 modelr_0.1.9               
#>  [57] BiocManager_1.30.18         yaml_2.3.5                 
#>  [59] reshape2_1.4.4              backports_1.4.1            
#>  [61] tools_4.2.1                 ggplotify_0.1.0            
#>  [63] affyio_1.66.0               ellipsis_0.3.2             
#>  [65] jquerylib_0.1.4             biomformat_1.24.0          
#>  [67] RColorBrewer_1.1-3          BiocGenerics_0.42.0        
#>  [69] MSnbase_2.22.0              Rcpp_1.0.9                 
#>  [71] plyr_1.8.7                  zlibbioc_1.42.0            
#>  [73] RCurl_1.98-1.8              pbapply_1.5-0              
#>  [75] GetoptLong_1.0.5            viridis_0.6.2              
#>  [77] S4Vectors_0.34.0            zoo_1.8-11                 
#>  [79] haven_2.5.1                 SummarizedExperiment_1.26.1
#>  [81] ggrepel_0.9.2               cluster_2.1.4              
#>  [83] fs_1.5.2                    magrittr_2.0.3             
#>  [85] masstools_1.0.8             data.table_1.14.6          
#>  [87] openxlsx_4.2.5.1            circlize_0.4.15            
#>  [89] reprex_2.0.2                googledrive_2.0.0          
#>  [91] pcaMethods_1.88.0           ProtGenerics_1.28.0        
#>  [93] matrixStats_0.62.0          hms_1.1.2                  
#>  [95] evaluate_0.16               XML_3.99-0.10              
#>  [97] readxl_1.4.1                IRanges_2.30.1             
#>  [99] gridExtra_2.3               shape_1.4.6                
#> [101] compiler_4.2.1              ncdf4_1.20                 
#> [103] crayon_1.5.1                htmltools_0.5.3            
#> [105] mgcv_1.8-42                 tzdb_0.3.0                 
#> [107] lubridate_1.9.0             DBI_1.1.3                  
#> [109] tweenr_2.0.2                dbplyr_2.2.1               
#> [111] ComplexHeatmap_2.12.1       MASS_7.3-58.1              
#> [113] Matrix_1.5-1                ade4_1.7-19                
#> [115] permute_0.9-7               cli_3.4.1                  
#> [117] vsn_3.64.0                  parallel_4.2.1             
#> [119] igraph_1.3.5                GenomicRanges_1.48.0       
#> [121] pkgconfig_2.0.3             pkgdown_2.0.6              
#> [123] plotly_4.10.0               xml2_1.3.3                 
#> [125] MALDIquant_1.22             foreach_1.5.2              
#> [127] bslib_0.4.0                 multtest_2.52.0            
#> [129] XVector_0.36.0              massdataset_1.0.21         
#> [131] rvest_1.0.3                 yulab.utils_0.0.5          
#> [133] digest_0.6.31               vegan_2.6-2                
#> [135] Biostrings_2.64.1           cellranger_1.1.0           
#> [137] rmarkdown_2.16              tidytree_0.4.1             
#> [139] curl_4.3.3                  rjson_0.2.21               
#> [141] lifecycle_1.0.3             nlme_3.1-159               
#> [143] jsonlite_1.8.0              Rhdf5lib_1.18.2            
#> [145] desc_1.4.2                  viridisLite_0.4.1          
#> [147] limma_3.52.3                fansi_1.0.3                
#> [149] pillar_1.8.1                ggsci_2.9                  
#> [151] lattice_0.20-45             fastmap_1.1.0              
#> [153] httr_1.4.4                  survival_3.4-0             
#> [155] glue_1.6.2                  remotes_2.4.2              
#> [157] zip_2.2.1                   png_0.1-8                  
#> [159] iterators_1.0.14            ggforce_0.3.4              
#> [161] stringi_1.7.8               sass_0.4.2                 
#> [163] textshaping_0.3.6           memoise_2.0.1              
#> [165] ape_5.6-2

Xiaotao Shen (https://www.shenxt.info/)

Created on 2022-07-09 and updated on 2023-04-19

Loading included data

Accessors

Preprocessing

Session information