Data are converted to phyloseq
format. Experiment
samples are first run on their own, and then artificial calculus samples
are run together with calculus and plaque reference samples.
calculus_otu <- taxatable %>%
column_to_rownames(var = "#OTU ID") %>%
dplyr::select(which(
colnames(.) %in% filter(
analysis_metadata, str_detect(Env, "calculus|plaque"))$`#SampleID`
)) %>%
otu_table(taxa_are_rows = T)
experiment_otu <- taxatable %>%
column_to_rownames(var = "#OTU ID") %>%
dplyr::select(which(colnames(.) %in% experiment_metadata$`#SampleID`)) %>%
otu_table(taxa_are_rows = T)
experiment_sample <- experiment_metadata %>%
filter(`#SampleID` %in% analysis_metadata$`#SampleID`) %>%
column_to_rownames(var = "#SampleID") %>%
sample_data()
experiment_phyloseq <- phyloseq(experiment_otu, experiment_sample)
calculus_sample <- analysis_metadata %>%
filter(str_detect(Env, "calculus|plaque")) %>%
column_to_rownames(var = "#SampleID") %>%
sample_data()
calculus_phyloseq <- phyloseq(calculus_otu, calculus_sample)
Differential abundance is calculated using the ANCOM-BC method from
the ANCOMBC
R package. P-values adjusted using the false
discovery rate (FDR) method. Samples grouped by sample type
(i.e. saliva, plaque, modern calculus, artificial calculus).
First, differential abundance of experimental samples only, comparing abundance between donated saliva, medium, and the final calculus product.
exp_da <- ANCOMBC::ancombc(
experiment_phyloseq,
formula = "Env",
group = "Env",
global = F,
p_adj_method = "fdr")
Warning: Small sample size detected for the following group(s):
saliva
ANCOM-BC results would be unstable when the sample size is < 5 per group
Species with the highest log-fold change mainly see a reduction from saliva to medium, and then a slight further reduction from medium to calculus. The largest reduction of species occurs in Rothia dentocariosa (LFC = 12.9250215).
calculus_da <- ANCOMBC::ancombc(
calculus_phyloseq,
formula = "Env",
group = "Env",
global = F,
p_adj_method = "fdr")
# table with log-fold change statistics between artificial calculus and others
# negative lfc means higher abundance in artificial calculus
# positive lfc means lower abundance in art calculus
calc_logf_change <- as_tibble(calculus_da$res$lfc, rownames = "species") #%>%
#select(species, Envmedium) %>%
calc_logf_se <- as_tibble(calculus_da$res$se, rownames = "species") #%>%
#select(!c(Envsaliva.x, Envsaliva.y)) %>%
calc_logf_q <- as_tibble(calculus_da$res$q_val, rownames = "species")
calc_logf_full <- calc_logf_change %>%
pivot_longer(-species, values_to = "lfc") %>%
full_join(
calc_logf_se %>%
pivot_longer(-species, values_to = "se"),
by = c("species", "name")
) %>%
full_join(
calc_logf_q %>%
pivot_longer(-species, values_to = "q_value"),
by = c("species", "name")
) %>%
mutate(lower = lfc - se,
upper = lfc + se,
name = str_remove(name, "^Env"),
abn = case_when(sign(lfc) == -1 ~ "byoc_calculus",
TRUE ~ name)) %>%
rename(env = name)
head(calc_logf_full)
Plots of log-fold change between samples
# Plot of largest absolute log-fold changes
calc_logf_full %>%
group_by(env) %>%
arrange(desc(abs(lfc))) %>%
slice_head(n = 20) %>% # causes loss of some env values
ungroup() %>%
slice_head(n = 20) %>% # extract top 20 lfc from modern_calculus
bind_rows(filter( # recombine with other env values so all are included in plot
calc_logf_full,
env != "modern_calculus",
species %in% .$species
)
) %>%
ggplot(aes(x = lfc, y = reorder(species, lfc), col = abn)) +
geom_point(size = 2) +
geom_linerange(aes(xmin = lower, xmax = upper), size = 1) +
geom_vline(xintercept = 0, linetype = "dashed") +
facet_wrap(~ env) +
theme_bw() +
theme(
legend.position = "none",
axis.title.y = element_blank()
)
# Top 30 loadings from PC1
calc_logf_full %>%
filter(species %in% pca_loadings$species[1:20]) %>%
slice_head(n = 60) %>% # pca_loadings already arranged by PC1
ggplot(aes(x = lfc, y = reorder(species, lfc), col = abn)) +
geom_point(size = 2) +
geom_linerange(aes(xmin = lower, xmax = upper), size = 1) +
geom_vline(xintercept = 0, linetype = "dashed") +
facet_wrap(~ env) +
theme_bw() +
theme(
legend.position = "none",
axis.title.y = element_blank()
)
calc_logf_full %>%
filter(species %in% arrange(pca_loadings, desc(abs(PC2)))$species[1:30]) %>%
slice_head(n = 30) #%>% # pca_loadings already arranged by PC1
# Top 30 loadings from PC2
calc_logf_full %>%
filter(species %in% arrange(pca_loadings, desc(abs(PC2)))$species[1:20]) %>%
slice_head(n = 60) %>% # pca_loadings already arranged by PC1
ggplot(aes(x = lfc, y = reorder(species, lfc), col = abn)) +
geom_point(size = 2) +
geom_linerange(aes(xmin = lower, xmax = upper), size = 1) +
geom_vline(xintercept = 0, linetype = "dashed") +
facet_wrap(~ env) +
theme_bw() +
theme(
legend.position = "none",
axis.title.y = element_blank()
)
Bias-corrected log observed abundances were calculated by following
the example in vignette("ANCOMBC")
.
# from vignette("ANCOMBC")
calc_samp_frac <- calculus_da$samp_frac
# Replace NA with 0
calc_samp_frac[is.na(calc_samp_frac)] = 0
# add 1 to counts for log-transformation
calc_log_exp_otu <- log(microbiome::abundances(calculus_otu) + 1)
# Adjust the log observed abundances
# Bias-corrected (log) observed abundances
calc_log_abn <- t(t(calc_log_exp_otu) - calc_samp_frac) %>%
as_tibble(rownames = "species")
# need to convert counts back to zero (counts that are = samp_frac)
calc_log_abn
Enterococcus faecalis has the highest log-fold change, with a higher abundance in the artificial calculus samples compared to the reference samples and may represent one of the main differences between artificial calculus and other reference samples, especially modern calculus, consistent with the results of the sPCA analysis.