vignettes/preprocess.Rmd
preprocess.Rmd
Let’s load the demo data.
library(microbiomedataset)
library(tidyverse)
data("global_patterns")
global_patterns
#> --------------------
#> microbiomedataset version: 0.99.1
#> --------------------
#> 1.expression_data:[ 19216 x 26 data.frame]
#> 2.sample_info:[ 26 x 8 data.frame]
#> 3.variable_info:[ 19216 x 8 data.frame]
#> 4.sample_info_note:[ 8 x 2 data.frame]
#> 5.variable_info_note:[ 8 x 2 data.frame]
#> --------------------
#> Processing information (extract_process_info())
#> create_microbiome_dataset ----------
#> Package Function.used Time
#> 1 microbiomedataset create_microbiome_dataset() 2022-07-10 10:56:13
dim(global_patterns)
#> [1] 19216 26
nrow(global_patterns)
#> [1] 19216
ncol(global_patterns)
#> [1] 26
colnames(global_patterns)
#> [1] "CL3" "CC1" "SV1" "M31Fcsw" "M11Fcsw" "M31Plmr"
#> [7] "M11Plmr" "F21Plmr" "M31Tong" "M11Tong" "LMEpi24M" "SLEpi20M"
#> [13] "AQC1cm" "AQC4cm" "AQC7cm" "NP2" "NP3" "NP5"
#> [19] "TRRsed1" "TRRsed2" "TRRsed3" "TS28" "TS29" "Even1"
#> [25] "Even2" "Even3"
extract_sample_info(global_patterns) %>%
colnames()
#> [1] "sample_id" "Primer"
#> [3] "Final_Barcode" "Barcode_truncated_plus_T"
#> [5] "Barcode_full_length" "SampleType"
#> [7] "Description" "class"
extract_variable_info(global_patterns) %>%
colnames()
#> [1] "variable_id" "Kingdom" "Phylum" "Class" "Order"
#> [6] "Family" "Genus" "Species"
extract_expression_data(global_patterns) %>%
head()
#> CL3 CC1 SV1 M31Fcsw M11Fcsw M31Plmr M11Plmr F21Plmr M31Tong M11Tong
#> 549322 0 0 0 0 0 0 0 0 0 0
#> 522457 0 0 0 0 0 0 0 0 0 0
#> 951 0 0 0 0 0 0 1 0 0 0
#> 244423 0 0 0 0 0 0 0 0 0 0
#> 586076 0 0 0 0 0 0 0 0 0 0
#> 246140 0 0 0 0 0 0 0 0 0 0
#> LMEpi24M SLEpi20M AQC1cm AQC4cm AQC7cm NP2 NP3 NP5 TRRsed1 TRRsed2
#> 549322 0 1 27 100 130 1 0 0 0 0
#> 522457 0 0 0 2 6 0 0 0 0 0
#> 951 0 0 0 0 0 0 0 0 0 0
#> 244423 0 0 0 22 29 0 0 0 0 0
#> 586076 0 0 0 2 1 0 0 0 0 0
#> 246140 0 0 0 1 3 0 0 0 0 0
#> TRRsed3 TS28 TS29 Even1 Even2 Even3
#> 549322 0 0 0 0 0 0
#> 522457 0 0 0 0 0 0
#> 951 0 0 0 0 0 0
#> 244423 0 0 0 0 0 0
#> 586076 0 0 0 0 0 0
#> 246140 0 0 0 0 0 0
extract_sample_info(global_patterns) %>%
head()
#> sample_id Primer Final_Barcode Barcode_truncated_plus_T Barcode_full_length
#> 1 CL3 ILBC_01 AACGCA TGCGTT CTAGCGTGCGT
#> 2 CC1 ILBC_02 AACTCG CGAGTT CATCGACGAGT
#> 3 SV1 ILBC_03 AACTGT ACAGTT GTACGCACAGT
#> 4 M31Fcsw ILBC_04 AAGAGA TCTCTT TCGACATCTCT
#> 5 M11Fcsw ILBC_05 AAGCTG CAGCTT CGACTGCAGCT
#> 6 M31Plmr ILBC_07 AATCGT ACGATT CGAGTCACGAT
#> SampleType Description class
#> 1 Soil Calhoun South Carolina Pine soil, pH 4.9 Subject
#> 2 Soil Cedar Creek Minnesota, grassland, pH 6.1 Subject
#> 3 Soil Sevilleta new Mexico, desert scrub, pH 8.3 Subject
#> 4 Feces M3, Day 1, fecal swab, whole body study Subject
#> 5 Feces M1, Day 1, fecal swab, whole body study Subject
#> 6 Skin M3, Day 1, right palm, whole body study Subject
extract_variable_info(global_patterns) %>%
head()
#> variable_id Kingdom Phylum Class Order Family
#> 1 549322 Archaea Crenarchaeota Thermoprotei <NA> <NA>
#> 2 522457 Archaea Crenarchaeota Thermoprotei <NA> <NA>
#> 3 951 Archaea Crenarchaeota Thermoprotei Sulfolobales Sulfolobaceae
#> 4 244423 Archaea Crenarchaeota Sd-NA <NA> <NA>
#> 5 586076 Archaea Crenarchaeota Sd-NA <NA> <NA>
#> 6 246140 Archaea Crenarchaeota Sd-NA <NA> <NA>
#> Genus Species
#> 1 <NA> <NA>
#> 2 <NA> <NA>
#> 3 Sulfolobus Sulfolobusacidocaldarius
#> 4 <NA> <NA>
#> 5 <NA> <NA>
#> 6 <NA> <NA>
The microbiomedataset
package also includes functions
for filtering, subsetting, and merging abundance data.
In the following example, the global_patterns
data is
first transformed to relative abundance, creating the new
global_patterns2
object, which is then filtered such that
only OTUs with a mean relative abundance greater than 10^-5 are kept.
global_patterns2 <-
global_patterns %>%
transform2relative_intensity() %>%
mutate2variable(what = "mean_intensity") %>%
activate_microbiome_dataset(what = "variable_info") %>%
filter(mean_intensity > 10 ^ (-5))
This results in a heavily subsetted object,
global_patterns2
, containing just 4624 of the original
19216 OTUs.
Next, keep only the variables (OTUs) that belong to the phylum Chlamydiae.
global_patterns_chl <-
global_patterns %>%
activate_microbiome_dataset(what = "variable_info") %>%
dplyr::filter(Phylum == "Chlamydiae")
Next, keep only the samples whose total intensity is greater than 20.
global_patterns_chl <-
global_patterns_chl %>%
mutate2sample(what = "sum_intensity") %>%
activate_microbiome_dataset(what = "sample_info") %>%
filter(sum_intensity > 20)
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-apple-darwin17.0 (64-bit)
#> Running under: macOS Big Sur ... 10.16
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] forcats_0.5.2 stringr_1.4.1
#> [3] purrr_1.0.1 readr_2.1.3
#> [5] tidyr_1.2.1 tibble_3.1.8
#> [7] ggplot2_3.4.0 tidyverse_1.3.2
#> [9] dplyr_1.0.10 microbiomedataset_0.99.10
#>
#> loaded via a namespace (and not attached):
#> [1] utf8_1.2.2 tidyselect_1.1.2
#> [3] htmlwidgets_1.5.4 grid_4.2.1
#> [5] BiocParallel_1.30.3 munsell_0.5.0
#> [7] codetools_0.2-18 ragg_1.2.2
#> [9] preprocessCore_1.58.0 withr_2.5.0
#> [11] colorspace_2.0-3 Biobase_2.56.0
#> [13] phyloseq_1.40.0 knitr_1.40
#> [15] rstudioapi_0.14 stats4_4.2.1
#> [17] mzID_1.34.0 MatrixGenerics_1.8.1
#> [19] GenomeInfoDbData_1.2.8 polyclip_1.10-4
#> [21] farver_2.1.1 rhdf5_2.40.0
#> [23] rprojroot_2.0.3 vctrs_0.5.2
#> [25] generics_0.1.3 xfun_0.33
#> [27] timechange_0.1.1 R6_2.5.1
#> [29] doParallel_1.0.17 GenomeInfoDb_1.32.4
#> [31] clue_0.3-61 graphlayouts_0.8.1
#> [33] MsCoreUtils_1.8.0 bitops_1.0-7
#> [35] rhdf5filters_1.8.0 cachem_1.0.6
#> [37] gridGraphics_0.5-1 DelayedArray_0.22.0
#> [39] assertthat_0.2.1 scales_1.2.1
#> [41] googlesheets4_1.0.1 ggraph_2.0.6
#> [43] gtable_0.3.1 affy_1.74.0
#> [45] tidygraph_1.2.2 rlang_1.0.6
#> [47] systemfonts_1.0.4 mzR_2.30.0
#> [49] GlobalOptions_0.1.2 splines_4.2.1
#> [51] Rdisop_1.56.0 lazyeval_0.2.2
#> [53] gargle_1.2.1 impute_1.70.0
#> [55] broom_1.0.1 modelr_0.1.9
#> [57] BiocManager_1.30.18 yaml_2.3.5
#> [59] reshape2_1.4.4 backports_1.4.1
#> [61] tools_4.2.1 ggplotify_0.1.0
#> [63] affyio_1.66.0 ellipsis_0.3.2
#> [65] jquerylib_0.1.4 biomformat_1.24.0
#> [67] RColorBrewer_1.1-3 BiocGenerics_0.42.0
#> [69] MSnbase_2.22.0 Rcpp_1.0.9
#> [71] plyr_1.8.7 zlibbioc_1.42.0
#> [73] RCurl_1.98-1.8 pbapply_1.5-0
#> [75] GetoptLong_1.0.5 viridis_0.6.2
#> [77] S4Vectors_0.34.0 zoo_1.8-11
#> [79] haven_2.5.1 SummarizedExperiment_1.26.1
#> [81] ggrepel_0.9.2 cluster_2.1.4
#> [83] fs_1.5.2 magrittr_2.0.3
#> [85] masstools_1.0.8 data.table_1.14.6
#> [87] openxlsx_4.2.5.1 circlize_0.4.15
#> [89] reprex_2.0.2 googledrive_2.0.0
#> [91] pcaMethods_1.88.0 ProtGenerics_1.28.0
#> [93] matrixStats_0.62.0 hms_1.1.2
#> [95] evaluate_0.16 XML_3.99-0.10
#> [97] readxl_1.4.1 IRanges_2.30.1
#> [99] gridExtra_2.3 shape_1.4.6
#> [101] compiler_4.2.1 ncdf4_1.20
#> [103] crayon_1.5.1 htmltools_0.5.3
#> [105] mgcv_1.8-42 tzdb_0.3.0
#> [107] lubridate_1.9.0 DBI_1.1.3
#> [109] tweenr_2.0.2 dbplyr_2.2.1
#> [111] ComplexHeatmap_2.12.1 MASS_7.3-58.1
#> [113] Matrix_1.5-1 ade4_1.7-19
#> [115] permute_0.9-7 cli_3.4.1
#> [117] vsn_3.64.0 parallel_4.2.1
#> [119] igraph_1.3.5 GenomicRanges_1.48.0
#> [121] pkgconfig_2.0.3 pkgdown_2.0.6
#> [123] plotly_4.10.0 xml2_1.3.3
#> [125] MALDIquant_1.22 foreach_1.5.2
#> [127] bslib_0.4.0 multtest_2.52.0
#> [129] XVector_0.36.0 massdataset_1.0.21
#> [131] rvest_1.0.3 yulab.utils_0.0.5
#> [133] digest_0.6.31 vegan_2.6-2
#> [135] Biostrings_2.64.1 cellranger_1.1.0
#> [137] rmarkdown_2.16 tidytree_0.4.1
#> [139] curl_4.3.3 rjson_0.2.21
#> [141] lifecycle_1.0.3 nlme_3.1-159
#> [143] jsonlite_1.8.0 Rhdf5lib_1.18.2
#> [145] desc_1.4.2 viridisLite_0.4.1
#> [147] limma_3.52.3 fansi_1.0.3
#> [149] pillar_1.8.1 ggsci_2.9
#> [151] lattice_0.20-45 fastmap_1.1.0
#> [153] httr_1.4.4 survival_3.4-0
#> [155] glue_1.6.2 remotes_2.4.2
#> [157] zip_2.2.1 png_0.1-8
#> [159] iterators_1.0.14 ggforce_0.3.4
#> [161] stringi_1.7.8 sass_0.4.2
#> [163] textshaping_0.3.6 memoise_2.0.1
#> [165] ape_5.6-2