Load necessary packages.

library(cuperdec)
library(decontam)
library(here)
library(tidyverse)

Upload the data and metadata.

# upload data
lib_conc <- readr::read_tsv(here("01-documentation/SYN_DNA_concentrations.tsv")) # library concentrations
metadata <- readr::read_tsv(here("01-documentation/metadata.tsv"))
experiment_metadata <- readr::read_tsv(here("01-documentation/experiment-metadata.tsv"))
analysis_metadata <- readr::read_tsv(here("01-documentation/analysis-metadata.tsv"))
kraken_taxatab <- readr::read_tsv(here("04-analysis/OTUfilter_table.tsv"))
sourcetracker2 <- readr::read_tsv(here("04-analysis/sourcetracker/sourcetracker2_output/mixing_proportions.txt"))
sourcetracker2_stdevs <- readr::read_tsv(here("04-analysis/sourcetracker/sourcetracker2_output/mixing_proportions_stds.txt")) 
otu_decontam <- read_tsv(here("05-results/post-decontam_taxatable.tsv"))
all_data_long <- read_tsv(here("04-analysis/sourcetracker/source-comb_long.tsv"))

SourceTracker

Steps taken for SourceTracker analysis.

OTU table was filtered for relative abundance. Percent abundance of each taxon across all samples was calculated and then taxa with lower than 0.001% abundance were filtered out. See data prep script.

Installation of Qiime2 and dev version of SourceTracker via conda:

wget https://data.qiime2.org/distro/core/qiime2-2022.2-py38-linux-conda.yml
conda env create -n qiime2-2022.2 --file qiime2-2022.2-py38-linux-conda.yml
rm qiime2-2022.2-py38-linux-conda.yml # cleanup
conda activate qiime2-2022.2
pip install https://github.com/biota/sourcetracker2/archive/master.zip

Installation of Qiime1 to access filter_samples_from_otu_table.py

conda create -n qiime1 python=2.7 qiime -c bioconda

Convert OTU table from .tsv to .biom.

biom convert -i 04-analysis/OTUfilter_table.tsv -o 04-analysis/sourcetracker/OTUfilter-table-from-tsv_json.biom --table-type="OTU table" --to-json

Filter OTU table

filter_samples_from_otu_table.py \
-i 04-analysis/sourcetracker/OTUfilter-table-from-tsv_json.biom \
-o 04-analysis/sourcetracker/OTUs1000filter_table.biom \
-n 1000

Table summary

biom summarize-table -i 04-analysis/sourcetracker/OTUs1000filter_table.biom > 04-analysis/sourcetracker/summary_OTUs1000filter-table.txt

Convert to TSV for use with the decontam package.

biom convert -i 04-analysis/sourcetracker/OTUs1000filter_table.biom -o 04-analysis/decontam/pre-decontam_OTUfiltered-table_from-biom.tsv --to-tsv

Run SourceTracker2 on the filtered OTU table with rarefaction depth of 1000 for both source and samples. Samples and sources were mapped in the ST_comb-plaque-map.txt. The plaque source is a combination of supragingival and subgingival plaque.

conda activate qiime2-2022.2

Another run with indoor_air sources included and sediment removed

sourcetracker2 \
    -i 04-analysis/sourcetracker/OTUs1000filter_table.biom \
    -m 04-analysis/sourcetracker/ST_comb-plaque-map.txt \
    --source_sink_column SourceSink  \
    --source_column_value source \
    --source_rarefaction_depth 1000 \
    --sink_rarefaction_depth 1000 \
    --sink_column_value sink \
    --source_category_column Env \
    -o 04-analysis/sourcetracker/sourcetracker2_output \
    --jobs 2 \
    --per_sink_feature_assignments

The full results can be found in 04-analysis/sourcetracker/sourcetracker2_output

# convert to long format
sourcetracker2_long <- sourcetracker2 %>%
  pivot_longer(cols = where(is.numeric), 
               values_to = "proportion", 
               names_to = "SampleID") %>%
  rename(source = ...1)

Visualisation of sources

The samples were organised by day they were sampled,

day_order <- experiment_metadata %>%
  mutate(Env = factor(
    Env, 
    levels = c("saliva", "medium", "calculus") # force level order so it doesn't order alphabetically
    ) 
  ) %>% 
  group_by(Env) %>%
  arrange(day, .by_group = T) %>%
  filter(`#SampleID` %in% colnames(sourcetracker2)) %>%
  mutate(rm = if_else(`#SampleID` %in% analysis_metadata$`#SampleID`, F, T),
         col = case_when(rm == T ~ "red",
                         rm == F ~ "black"))

removed_samples <- experiment_metadata %>%
  mutate(rm = if_else(`#SampleID` %in% analysis_metadata$`#SampleID`, F, T),
         col = case_when(rm == T ~ "red",
                         rm == F ~ "black")) %>% 
  mutate(sample = `#SampleID`) %>%
  arrange(day)

then visualised.

sourcetracker2_long %>% 
  ggplot(aes(y = SampleID, x= proportion, fill = source)) +
    geom_col() +
    theme_minimal() +
    scale_y_discrete(limits = day_order$`#SampleID`) +
    scale_fill_viridis_d(option = "C") +
    theme(axis.title = element_blank(),
          axis.text.y = element_text(vjust = 0.5, hjust = 0.5, colour = day_order$col))

Many of the later medium samples were assigned to the ‘Unknown’ and ‘indoor_air’ categories. To see whether this was the result of external contamination or the presence of oral taxa with an unknown origin (which could be due to the taxa matching multiple sources), the taxa from ‘Unknown’ were compared to the oral reference database from the cuperdec R package. Only samples with a large proportion of oral taxa and a low proportion of indoor_air (indoor_air + Unknown < oral and oral > 70%) were included. Included sample names are indicated with black, and removed samples with red.

oral_taxa <- cuperdec_database_ex %>%
  filter(isolation_source == "oral")
all_data_long %>% 
  filter(count > 0) %>% 
  mutate(oral_source = if_else(taxon %in% oral_taxa$species, "oral", "other")) %>%
  ggplot(aes(y = SampleID, x = count, fill = oral_source)) +
    geom_col(position = "fill") +
    scale_fill_viridis_d(option = "C") +
    theme_minimal() +
    theme(axis.text.y = element_text(colour = day_order$col),
          axis.title.y = element_blank()) +
    scale_y_discrete(limits = day_order$`#SampleID`)

A large proportion of the species assigned to the various sources are known to be oral taxa. The samples do also seem to contain some external contamination.

Based on the results from SourceTracker, samples c(“SYN015.F0101”, “SYN017.F0101”, “SYN015.G0101”, “SYN017.G0101”, “SYN015.H0101”, “SYN018.H0101”, “SYN013.I0101”, “SYN016.I0101”) were removed from the analysis.

cuperdec

taxa_table <- load_taxa_table(kraken_taxatab)
iso_database <- load_database(cuperdec_database_ex, target = "oral")
metadata_table <- load_map(metadata,
                           sample_col = "#SampleID",
                           source_col = "Env"
                           )

curves <- calculate_curve(taxa_table, iso_database)
filter_result <- simple_filter(curves, 60)
plot_cuperdec(curves, metadata_table, filter_result)

decontam

species_table_long <- otu_decontam %>%
  pivot_longer(cols = where(is.numeric), names_to = "sample", values_to = "count")

byoc_table_long <- species_table_long %>% 
  filter(str_detect(sample, "SYN"),
         sample %in% analysis_metadata$`#SampleID`,
         count > 0)

species_summ <- byoc_table_long %>%
  group_by(sample) %>%
  count(`#OTU ID`) %>%
  summarise(count = sum(n))

After running the decontam package, the samples contained between 87 and 269 species with a mean of 172.89.

---
title: "Authentication"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_knit$set(message = FALSE)
```

Load necessary packages.

```{r}
#| label: dependencies
library(cuperdec)
library(decontam)
library(here)
library(tidyverse)
```

Upload the data and metadata.

```{r message=FALSE}
#| label: data-upload
# upload data
lib_conc <- readr::read_tsv(here("01-documentation/SYN_DNA_concentrations.tsv")) # library concentrations
metadata <- readr::read_tsv(here("01-documentation/metadata.tsv"))
experiment_metadata <- readr::read_tsv(here("01-documentation/experiment-metadata.tsv"))
analysis_metadata <- readr::read_tsv(here("01-documentation/analysis-metadata.tsv"))
kraken_taxatab <- readr::read_tsv(here("04-analysis/OTUfilter_table.tsv"))
sourcetracker2 <- readr::read_tsv(here("04-analysis/sourcetracker/sourcetracker2_output/mixing_proportions.txt"))
sourcetracker2_stdevs <- readr::read_tsv(here("04-analysis/sourcetracker/sourcetracker2_output/mixing_proportions_stds.txt")) 
otu_decontam <- read_tsv(here("05-results/post-decontam_taxatable.tsv"))
all_data_long <- read_tsv(here("04-analysis/sourcetracker/source-comb_long.tsv"))
```

## SourceTracker

Steps taken for SourceTracker analysis.

OTU table was filtered for relative abundance. Percent abundance of each taxon
across all samples was calculated and then taxa with lower than 0.001% abundance
were filtered out. See [data prep script](../02-scripts/01-dataprep.R).

Installation of Qiime2 and dev version of SourceTracker via conda:

```sh
wget https://data.qiime2.org/distro/core/qiime2-2022.2-py38-linux-conda.yml
conda env create -n qiime2-2022.2 --file qiime2-2022.2-py38-linux-conda.yml
rm qiime2-2022.2-py38-linux-conda.yml # cleanup
conda activate qiime2-2022.2
pip install https://github.com/biota/sourcetracker2/archive/master.zip
```

Installation of Qiime1 to access filter_samples_from_otu_table.py

```sh
conda create -n qiime1 python=2.7 qiime -c bioconda
```

Convert OTU table from *.tsv* to *.biom*.

```sh
biom convert -i 04-analysis/OTUfilter_table.tsv -o 04-analysis/sourcetracker/OTUfilter-table-from-tsv_json.biom --table-type="OTU table" --to-json
```

Filter OTU table

```sh
filter_samples_from_otu_table.py \
-i 04-analysis/sourcetracker/OTUfilter-table-from-tsv_json.biom \
-o 04-analysis/sourcetracker/OTUs1000filter_table.biom \
-n 1000
```

Table summary

```sh
biom summarize-table -i 04-analysis/sourcetracker/OTUs1000filter_table.biom > 04-analysis/sourcetracker/summary_OTUs1000filter-table.txt
```

Convert to TSV for use with the **decontam** package.

```sh
biom convert -i 04-analysis/sourcetracker/OTUs1000filter_table.biom -o 04-analysis/decontam/pre-decontam_OTUfiltered-table_from-biom.tsv --to-tsv
```

Run SourceTracker2 on the filtered OTU table with rarefaction depth of 1000
for both source and samples. Samples and sources were mapped in the
[ST_comb-plaque-map.txt](../04-analysis/sourcetracker/ST_comb-plaque-map.txt).
The plaque source is a combination of supragingival and subgingival plaque.

```sh
conda activate qiime2-2022.2
```

Another run with indoor_air sources included and sediment removed

```sh
sourcetracker2 \
    -i 04-analysis/sourcetracker/OTUs1000filter_table.biom \
    -m 04-analysis/sourcetracker/ST_comb-plaque-map.txt \
    --source_sink_column SourceSink  \
    --source_column_value source \
    --source_rarefaction_depth 1000 \
    --sink_rarefaction_depth 1000 \
    --sink_column_value sink \
    --source_category_column Env \
    -o 04-analysis/sourcetracker/sourcetracker2_output \
    --jobs 2 \
    --per_sink_feature_assignments
```

The full results can be found in [04-analysis/sourcetracker/sourcetracker2_output](../04-analysis/sourcetracker/sourcetracker2_output/)

```{r}
# convert to long format
sourcetracker2_long <- sourcetracker2 %>%
  pivot_longer(cols = where(is.numeric), 
               values_to = "proportion", 
               names_to = "SampleID") %>%
  rename(source = ...1)
```


### Visualisation of sources

The samples were organised by day they were sampled,

```{r}
day_order <- experiment_metadata %>%
  mutate(Env = factor(
    Env, 
    levels = c("saliva", "medium", "calculus") # force level order so it doesn't order alphabetically
    ) 
  ) %>% 
  group_by(Env) %>%
  arrange(day, .by_group = T) %>%
  filter(`#SampleID` %in% colnames(sourcetracker2)) %>%
  mutate(rm = if_else(`#SampleID` %in% analysis_metadata$`#SampleID`, F, T),
         col = case_when(rm == T ~ "red", # excluded samples coloured red
                         rm == F ~ "black"))

# removed_samples <- experiment_metadata %>%
#   mutate(rm = if_else(`#SampleID` %in% analysis_metadata$`#SampleID`, F, T),
#          col = case_when(rm == T ~ "red",
#                          rm == F ~ "black")) %>% 
#   mutate(sample = `#SampleID`) %>%
#   arrange(day)
```

then visualised.

```{r}
#| label: st-plot
#| fig-cap: "Plot of estimated contributions of various sources to the artificial calculus and artificial saliva samples. Samples are arranged from left to right by how late in the experiment they were sampled, with left being the earliest samples."
sourcetracker2_long %>% 
  ggplot(aes(y = SampleID, x= proportion, fill = source)) +
    geom_col() +
    theme_minimal() +
    scale_y_discrete(limits = day_order$`#SampleID`) +
    scale_fill_viridis_d(option = "C") +
    theme(axis.title = element_blank(),
          axis.text.y = element_text(vjust = 0.5, hjust = 0.5, colour = day_order$col))
```

Many of the later medium samples were assigned to the 'Unknown' and 'indoor_air'
categories. To see whether this was the result of external
contamination or the presence of oral taxa with an unknown origin (which could be
due to the taxa matching multiple sources), the taxa from 'Unknown' were compared
to the oral reference database from the **cuperdec** R package. Only samples with
a large proportion of oral taxa and a low proportion of indoor_air
(`indoor_air + Unknown < oral` and `oral > 70%`) were included. Included sample
names are indicated with black, and removed samples with red.

```{r}
#| label: unknowns-plot
#| fig-cap: "Plot of SourceTracker samples, whether are oral or other."
oral_taxa <- cuperdec_database_ex %>%
  filter(isolation_source == "oral")
all_data_long %>% 
  filter(count > 0) %>% 
  mutate(oral_source = if_else(taxon %in% oral_taxa$species, "oral", "other")) %>%
  ggplot(aes(y = SampleID, x = count, fill = oral_source)) +
    geom_col(position = "fill") +
    scale_fill_viridis_d(option = "C") +
    theme_minimal() +
    theme(axis.text.y = element_text(colour = day_order$col),
          axis.title.y = element_blank()) +
    scale_y_discrete(limits = day_order$`#SampleID`)
```

A large proportion of the species assigned to the various sources are known to
be oral taxa. The samples do also seem to contain some external contamination.

Based on the results from SourceTracker, samples
`r paste(as.vector(filter(removed_samples, rm == T)[,1]), sep = ",")`
were removed from the analysis.

## cuperdec

```{r}
taxa_table <- load_taxa_table(kraken_taxatab)
iso_database <- load_database(cuperdec_database_ex, target = "oral")
metadata_table <- load_map(metadata,
                           sample_col = "#SampleID",
                           source_col = "Env"
                           )

curves <- calculate_curve(taxa_table, iso_database)
filter_result <- simple_filter(curves, 60)
```

```{r}
plot_cuperdec(curves, metadata_table, filter_result)
```

## decontam

```{r}
species_table_long <- otu_decontam %>%
  pivot_longer(cols = where(is.numeric), names_to = "sample", values_to = "count")

byoc_table_long <- species_table_long %>% 
  filter(str_detect(sample, "SYN"),
         sample %in% analysis_metadata$`#SampleID`,
         count > 0)

species_summ <- byoc_table_long %>%
  group_by(sample) %>%
  count(`#OTU ID`) %>%
  summarise(count = sum(n))
```

After running the **decontam** package, the samples contained between
`r paste(range(species_summ$count), collapse = " and ")`
species with a mean of
`r paste(round(mean(species_summ$count), 2))`.

```{r}
# remaining species count in each sample
byoc_table_long %>% 
  group_by(sample) %>%
  count(`#OTU ID`) %>% 
  ggplot(aes(y = sample, x = n)) +
    geom_col() +
  theme_minimal() +
    theme(
      axis.line.x = element_line(),
      axis.title.x = element_blank(),
      panel.grid.major.y = element_blank(),
      axis.title.y = element_blank())
```

