Skip to contents

This article focuses on the core manipulation verbs that keep the abundance matrix and metadata synchronized.

# Attach the package that provides the verbs used in the examples below.
library(microbiomedataset)
# Load the example dataset `global_patterns` that ships with the package.
data("global_patterns", package = "microbiomedataset")

# Short alias used by every example that follows.
object <- global_patterns

Filter with dplyr semantics

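Use `activate_microbiome_dataset()` to target `sample_info` or `variable_info`, then apply regular dplyr verbs. Filtering either table also subsets the matching columns (samples) or rows (taxa) of `expression_data`.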

# Sample-level filter: keep only the soil samples.
soil_object <- object %>%
  activate_microbiome_dataset("sample_info") %>%
  dplyr::filter(SampleType == "Soil")

# All 19216 taxa (rows) are kept; only the 3 soil samples (columns) remain.
dim(soil_object@expression_data)
#> [1] 19216     3
# Unused factor levels are not dropped, hence the "9 Levels" line below.
unique(soil_object@sample_info$SampleType)
#> [1] Soil
#> 9 Levels: Feces Freshwater Freshwater (creek) Mock ... Tongue

# Taxon-level filter: keep only the taxa whose Phylum is Firmicutes.
firmicutes_object <- object %>%
  activate_microbiome_dataset("variable_info") %>%
  dplyr::filter(Phylum == "Firmicutes")

# All 26 samples (columns) are kept; only the Firmicutes taxa (rows) remain.
dim(firmicutes_object@expression_data)
#> [1] 4356   26
head(unique(firmicutes_object@variable_info$Phylum))
#> [1] "Firmicutes"

Use explicit pruning verbs

When you want the operation to be obvious from the function name, use the package-level verbs `prune_samples()` and `prune_taxa()`, which keep only the samples or taxa whose IDs you supply.

# Keep the first five samples, selected by their IDs.
pruned_samples <- prune_samples(
  object,
  sample_id = object@sample_info$sample_id[1:5]
)
# Keep the first fifty taxa, selected by their IDs.
pruned_taxa <- prune_taxa(
  object,
  variable_id = object@variable_info$variable_id[1:50]
)

# Pruning samples changes only the column count ...
dim(pruned_samples@expression_data)
#> [1] 19216     5
# ... and pruning taxa changes only the row count.
dim(pruned_taxa@expression_data)
#> [1] 50 26

Apply common filtering rules

Use the microbiome-specific helpers `filter_taxa()` and `filter_samples()` to filter by prevalence, abundance, and sample library size.

# Filter taxa using a prevalence threshold and an abundance threshold.
taxa_filtered <- microbiomedataset::filter_taxa(
  object,
  min_prevalence = 0.2,
  min_abundance = 10
)

# Filter samples using an abundance threshold.
sample_filtered <- microbiomedataset::filter_samples(
  object,
  min_abundance = 1000
)

# 12294 of the 19216 taxa remain; the number of samples is unchanged.
dim(taxa_filtered@expression_data)
#> [1] 12294    26
# No sample is removed at this threshold, so the dimensions are unchanged.
dim(sample_filtered@expression_data)
#> [1] 19216    26

Transform counts

`transform_counts()` is the unified entry point for common abundance transformations; the `method` argument selects which transformation is applied.

# One call per transformation; only `method` (and its options) changes.
relative_object <- transform_counts(
  object,
  method = "relative"
)
log_object <- transform_counts(
  object,
  method = "log",
  pseudocount = 1
)
pa_object <- transform_counts(
  object,
  method = "presence_absence"
)

# After the relative transformation each sample (column) sums to 1.
colSums(relative_object@expression_data)[1:5]
#>     CL3     CC1     SV1 M31Fcsw M11Fcsw 
#>       1       1       1       1       1
# Range of the log-transformed values in the first sample.
range(log_object@expression_data[, 1])
#> [1] 0.000000 4.528711
# Tally of the 0 and 1 values in the first sample.
table(pa_object@expression_data[, 1])[1:2]
#> 
#>     0     1 
#> 12252  6964

You can also transform after taxonomic aggregation:

# Aggregate at the Genus rank and apply the relative transformation in one call.
genus_relative <- transform_taxa(
  object,
  taxonomic_rank = "Genus",
  what = "sum_intensity",
  method = "relative"
)

# Each sample (column) sums to 1 after the transformation.
colSums(genus_relative@expression_data)[1:5]
#>     CL3     CC1     SV1 M31Fcsw M11Fcsw 
#>       1       1       1       1       1