---
title: "alphadiversity"
author: "Stijn Schreven"
date: "19 juli 2018"
output: html_document
---
# Alpha diversity  

## Loading Data  
```{r}
# Read the two cached objects produced by earlier steps of the workflow,
# then print a summary of each.
# NOTE(review): both are presumably phyloseq objects (they are passed to
# phyloseq/microbiome functions further down) - confirm.
ps1.decontam <- readRDS("./phyobjects/ps1.decontam.rds")
ps1.exp <- readRDS("./phyobjects/ps1.exp.rds")

print(ps1.decontam)
print(ps1.exp)
```

## 1. Exploration  
### 1.1. View phylum abundance in all samples  
```{r}
# Per-ASV table: the taxonomy table columns plus the total count and the
# identifier of every ASV in the decontaminated data set.
ps1_df_taxa <- data.table(
  tax_table(ps1.decontam),
  ASVabundance = taxa_sums(ps1.decontam),
  ASV = taxa_names(ps1.decontam)
)

# Histogram of the total ASV counts, drawn on a log10 x axis.
ps1_tax_plot <- ggplot(ps1_df_taxa, aes(ASVabundance)) +
  geom_histogram() +
  scale_x_log10() +
  labs(
    title = "Histogram of ASVs (unique sequence) counts",
    x = "Abundance (raw counts)",
    y = "Frequency of ASVs"
  ) +
  theme_bw()

print(ps1_tax_plot)

# Abundance versus prevalence of the taxa, grouped at Phylum level.
p <- plot_taxa_prevalence(ps1.decontam, "Phylum")
print(p)
```

### 1.2. Coefficient of variation (CV)  
```{r}
# Scatter plot of the per-taxon coefficient of variation for the
# experimental samples, displayed with a log10 x axis.
p1 <- plot_taxa_cv(ps1.exp, plot.type = "scatter")
print(p1 + scale_x_log10())
```

### 1.3. Sequencing depth  
```{r}
# Density plot of the read counts per sample, grouped by Timepoint and
# split into a grid with one row per Density and one column per Diet.
p_seqdepth.time <- plot_read_distribution(ps1.exp, "Timepoint", "density") +
  facet_grid(Density ~ Diet)
print(p_seqdepth.time)
```

### 1.4. Variation in reads  
```{r}
# Distribution of the total number of reads per sample.
summary(sample_sums(ps1.exp))

# Rarefaction curves.
# vegan::rarecurve() is given the transposed abundance table.
# NOTE(review): this assumes abundances() returns taxa in rows, so that the
# transpose has one row per sample - confirm.
otu_tab <- t(abundances(ps1.exp))

# `sample` is the smallest library size. `col` and `cex` are graphical
# arguments of rarecurve() itself; previously a misplaced parenthesis
# passed them to min(), which broke the value handed to `sample`.
prar <- vegan::rarecurve(otu_tab,
                         step = 50, label = FALSE,
                         sample = min(rowSums(otu_tab)),
                         col = "blue", cex = 0.6)
```

### 1.5. Normalize to lower sequence depth  
This will remove some samples (Nreads = 0 in some substrate samples of timepoint 0) and OTUs (for diverse samples with many reads). Try out different sample sizes (2000, 5000, 10000, 100000).
```{r}
# Fix the random seed so that the random subsampling below (and therefore
# the filtering and normalisation) can be reproduced.
set.seed(9242)

# Subsample every sample to 2000 reads.
ps0.rar <- rarefy_even_depth(physeq = ps1.exp, sample.size = 2000)

# Cache the rarefied object for later use.
saveRDS(object = ps0.rar, file = "./phyobjects/ps0.rar.rds")
```
Results for the sample sizes tried:

- 2000: removes 16 samples and 38 OTUs
- 5000: removes 19 samples and .. OTUs
- 10000: removes 26 samples and .. OTUs
- 100000: removes 70 samples and ... OTUs

Chosen 2000 for now.

```{r}
# Total reads per sample after rarefaction; las = 2 turns the axis labels
# perpendicular to the axis.
barplot(height = sample_sums(ps0.rar), las = 2)

# Abundance versus prevalence at Phylum level for the rarefied data.
p.rar <- plot_taxa_prevalence(ps0.rar, "Phylum")
print(p.rar)

# No plot is passed, so ggsave() writes ggplot2's last plot.
ggsave(
  filename = "./figures/Taxa_prevalence_rarefied2000rd.pdf",
  width = 10, height = 7
)
```

## 2. Diversity indices  
```{r}
# All diversity indices, computed on the data rarefied to 2000 reads.
bsf.div <- diversities(ps0.rar, index = "all")

# Interactive view of the index table.
datatable(bsf.div)

# Sample metadata as a separate data frame.
bsf.meta <- meta(ps0.rar)

# Copy the row names of both tables into a shared key column ...
bsf.meta[["sam_name"]] <- rownames(bsf.meta)
bsf.div[["sam_name"]] <- rownames(bsf.div)

# ... and join diversity indices and metadata on that key.
div.df <- merge(x = bsf.div, y = bsf.meta, by = "sam_name")

# Show which columns ended up in the merged table.
print(colnames(div.df))
```

Plotting a diversity index:
```{r}
# Relabel the treatment levels for plotting.
# NOTE(review): levels are replaced by position, and level 1 of Density and
# Timepoint is left as it is - confirm the level order in the metadata.
levels(div.df$Timepoint)
levels(div.df$Density)[2] <- "50"
levels(div.df$Density)[3] <- "100"
levels(div.df$Density)[4] <- "200"
levels(div.df$Timepoint)[2] <- "5"
levels(div.df$Timepoint)[3] <- "10"
levels(div.df$Timepoint)[4] <- "15"
levels(div.df$Diet)
levels(div.df$Diet)[1] <- "CF"
levels(div.df$Diet)[2] <- "CM"
levels(div.df$Diet)[3] <- "CS"

# Mean, SD and SE of Shannon diversity per Diet x Type x Density x Timepoint.
# NOTE(review): se() is not a base R function; it has to be provided by a
# package or definition loaded elsewhere - confirm.
div.sum <- ddply(
  div.df, ~ Diet + Type + Density + Timepoint, summarise,
  shmean = mean(shannon),
  shsd = sd(shannon),
  shse = se(shannon)
)

# Boxplots of Shannon diversity over time, filled by Type, in a grid with
# one row per Diet and one column per Density.
p <- ggboxplot(
  div.df,
  x = "Timepoint", y = "shannon",
  fill = "Type", palette = "jco",
  xlab = "Time (d)", ylab = "Shannon diversity"
) +
  facet_grid(Diet ~ Density) +
  theme(
    text = element_text(size = 12),
    axis.text.y = element_text(size = rel(0.8)),
    panel.border = element_rect(linetype = "solid", fill = NA),
    panel.spacing.y = unit(1, "lines")
  )
print(p)

# Line plots of mean Shannon diversity (+/- SE), built from the per-treatment
# summary tables. Each figure is written to ./figures with ggsave(), which
# saves ggplot2's last plot when no plot is passed.

# Shared dodge (total width 0.1) so overlapping groups are shifted sideways.
# It is applied to lines, points and error bars alike; previously the lines
# were left undodged and did not pass through their own points.
pd <- position_dodge(0.1)

# Diversity over time, coloured by Type; rows = Density, columns = Diet.
pl <- ggplot(div.sum, aes(x = Timepoint, y = shmean, colour = Type)) +
  ylab("Shannon diversity") + xlab("Time (d)") +
  geom_errorbar(aes(ymin = shmean - shse, ymax = shmean + shse), width = .1, position = pd) +
  geom_point(position = pd) +
  facet_grid(rows = vars(Density), cols = vars(Diet))
pl
ggsave("./figures/Shannon_diversity.pdf", height = 7, width = 10)

# Alternative format: one line per Density over time; rows = Type.
pl2 <- ggplot(div.sum, aes(x = Timepoint, y = shmean, colour = Density, group = Density))
pl2 + geom_line(aes(linetype = Density), size = .6, position = pd) +
  geom_point(aes(shape = Density), size = 2, position = pd) +
  geom_errorbar(aes(ymin = shmean - shse, ymax = shmean + shse), width = .1, position = pd) +
  ylab("Shannon diversity") + xlab("Time (d)") +
  facet_grid(rows = vars(Type), cols = vars(Diet))
ggsave("./figures/Shannon_div_perDensity.pdf", height = 7, width = 10)

# One line per Timepoint across larval densities; rows = Type.
pl3 <- ggplot(div.sum, aes(x = Density, y = shmean, colour = Timepoint, group = Timepoint))
pl3 + geom_line(aes(linetype = Timepoint), size = .6, position = pd) +
  geom_point(aes(shape = Timepoint), size = 2, position = pd) +
  geom_errorbar(aes(ymin = shmean - shse, ymax = shmean + shse), width = .1, position = pd) +
  ylab("Shannon diversity") + xlab("Larval density (N per container)") +
  facet_grid(rows = vars(Type), cols = vars(Diet))
ggsave("./figures/Shannon_div_perTimepoint.pdf", height = 7, width = 10)

# Shannon diversity per container: Type is left out of the grouping, so
# substrate and larvae samples are pooled.
# NOTE(review): se() is not a base R function; it has to be provided by a
# package or definition loaded elsewhere - confirm.
div.sumT <- ddply(div.df, ~Diet+Density+Timepoint, summarise,
                  shmean = mean(shannon), shsd = sd(shannon), shse = se(shannon))

pl4 <- ggplot(div.sumT, aes(x = Density, y = shmean, colour = Timepoint, group = Timepoint))
pl4 + geom_line(aes(linetype = Timepoint), size = .6, position = pd) +
  geom_point(aes(shape = Timepoint), size = 2, position = pd) +
  geom_errorbar(aes(ymin = shmean - shse, ymax = shmean + shse), width = .1, position = pd) +
  ylab("Shannon diversity") + xlab("Larval density (N per container)") +
  facet_grid(cols = vars(Diet))
ggsave("./figures/Shannon_div_Total_perTimepoint.pdf", height = 7, width = 10)

# Same pooled data over time, one line per Density.
# The x label previously read "Time (d))" with a stray closing parenthesis.
pl5 <- ggplot(div.sumT, aes(x = Timepoint, y = shmean, colour = Density, group = Density))
pl5 + geom_line(aes(linetype = Density), size = .6, position = pd) +
  geom_point(aes(shape = Density), size = 2, position = pd) +
  geom_errorbar(aes(ymin = shmean - shse, ymax = shmean + shse), width = .1, position = pd) +
  ylab("Shannon diversity") + xlab("Time (d)") +
  facet_grid(cols = vars(Diet))
ggsave("./figures/Shannon_div_Total_perDensity.pdf", height = 7, width = 10)
```

Plotting several diversity indices:
```{r}
# Alternative way: plot several diversity indices side by side.
# Keep the four design columns plus a selection of indices from the merged
# diversity/metadata table.
div.df2 <- div.df[, c("Diet", "Density", "Timepoint", "Type",
                      "inverse_simpson", "gini_simpson", "shannon", "fisher", "coverage")]

# Replace the index column names with prettier labels for the facet titles.
colnames(div.df2) <- c("Diet", "Density", "Timepoint", "Type",
                       "Inverse Simpson", "Gini-Simpson", "Shannon", "Fisher", "Coverage")
colnames(div.df2)

# Reshape to long format with one row per sample and index. The design
# columns are named explicitly as id variables instead of letting melt()
# guess them from the column types.
div_df_melt <- reshape2::melt(div.df2,
                              id.vars = c("Diet", "Density", "Timepoint", "Type"))
head(div_df_melt)

# Boxplot per Diet, one facet per diversity index, each with its own scales.
p <- ggboxplot(div_df_melt, x = "Diet", y = "value",
               fill = "Diet",
               palette = "jco",
               legend = "right",
               facet.by = "variable",
               scales = "free")
p

# Remove the x axis labels; the fill legend already identifies the diets.
p <- p + rremove("x.text")
p
ggsave("./figures/Diversities.pdf", height = 4, width = 10)
```

Adding pairwise comparisons:
```{r}
# Diet levels present in the long-format diversity table.
lev <- levels(div_df_melt$Diet)

# All pairwise combinations of diets, as a list of length-2 vectors.
L.pairs <- combn(lev, 2, simplify = FALSE)

# Add significance symbols for every pair. `symnum.args` needs exactly one
# more cutpoint than symbols; the previous call had 7 cutpoints (6 intervals)
# for 5 symbols, so the symbols did not line up with the intervals. Dropping
# the 0.1 cutpoint labels every p-value above 0.05 as "n.s".
p2 <- p + stat_compare_means(comparisons = L.pairs,
                             label = "p.signif",
                             symnum.args = list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
                                                symbols = c("****", "***", "**", "*", "n.s")))

## Stijn: better do multiple comparisons and add letters in Photoshop or so, if including the multifacet design as above (rows = Diet, cols = Density, x = Timepoint, y = diversity index)

print(p2)
```

## 3. Phylogenetic diversity  
### 3.1. Prepare data  
```{r}
library(picante)

# Pull the phylogenetic tree and the abundance table (as a data frame) out
# of the rarefied object.
ps0.rar.tree <- ps0.rar@phy_tree
ps0.rar.asvtab <- as.data.frame(ps0.rar@otu_table)

# Print the tree summary to check whether the tree is rooted.
print(ps0.rar.tree)
```

### 3.2. Running picante::pd function  
```{r}
# Compute the phylogenetic diversity table for the rarefied data.
# The abundance table is transposed before it is passed to picante.
# NOTE(review): this assumes the table is stored with taxa in rows, so the
# transpose has one row per sample - confirm.
# The tree was found to be rooted, so the root is included.
# picante::pd() is called with its namespace because other chunks create a
# position_dodge object that is also named `pd`.
# include.root uses TRUE rather than the reassignable shorthand T.
df.pd <- picante::pd(t(ps0.rar.asvtab), ps0.rar.tree, include.root = TRUE)

datatable(df.pd)
```

Plotting phylogenetic diversity index with pairwise comparisons:
```{r}
# Add the PD values to the sample metadata.
# NOTE(review): values are assigned by position, which assumes df.pd and
# bsf.meta list the samples in the same order - confirm.
bsf.meta$Phylogenetic_Diversity <- df.pd$PD

# Boxplot of phylogenetic diversity per Diet.
pd.plot <- ggboxplot(bsf.meta, x = "Diet",
                     y = "Phylogenetic_Diversity",
                     fill = "Diet",
                     palette = "jco",
                     ylab = "Phylogenetic Diversity",
                     xlab = "Diet",
                     legend = "right")
pd.plot <- pd.plot + rotate_x_text()

# Pairwise diet comparisons are built from the Diet values of the plotted
# table itself, so the group names always match the x axis. A pair list
# taken from another table could carry relabelled levels.
diet.pairs <- combn(levels(factor(bsf.meta$Diet)), 2, simplify = FALSE)

# `symnum.args` needs exactly one more cutpoint than symbols; the previous
# call had 7 cutpoints (6 intervals) for 5 symbols. Every p-value above
# 0.05 is now labelled "n.s".
pd.plot + stat_compare_means(comparisons = diet.pairs,
                             label = "p.signif",
                             symnum.args = list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
                                                symbols = c("****", "***", "**", "*", "n.s")))
```

Plotting per Diet and Density.
```{r}
# Stijn: alternative display of phylogenetic diversity (PD) per treatment.
# Re-merge the diversity table with the metadata, which at this point is
# expected to carry the Phylogenetic_Diversity column.
div.df3 <- merge(bsf.div, bsf.meta, by = "sam_name")

# Relabel Density and Timepoint levels for plotting.
# NOTE(review): levels are replaced by position and level 1 is left as it
# is; Diet levels are not relabelled here - confirm this is intended.
levels(div.df3$Density)
levels(div.df3$Density)[2] <- "50"
levels(div.df3$Density)[3] <- "100"
levels(div.df3$Density)[4] <- "200"
levels(div.df3$Timepoint)[2] <- "5"
levels(div.df3$Timepoint)[3] <- "10"
levels(div.df3$Timepoint)[4] <- "15"
colnames(div.df3)

# Mean, SD and SE of PD per Diet x Type x Density x Timepoint.
# NOTE(review): se() is not a base R function; it has to be provided by a
# package or definition loaded elsewhere - confirm.
div.sumPD <- ddply(div.df3, ~Diet+Type+Density+Timepoint, summarise,
                   pdmean = mean(Phylogenetic_Diversity),
                   pdsd = sd(Phylogenetic_Diversity),
                   pdse = se(Phylogenetic_Diversity))

# Shared dodge (total width 0.1), applied to lines, points and error bars
# alike so that the lines pass through their own points.
pd <- position_dodge(0.1)

# PD over time, coloured by Type; rows = Density, columns = Diet.
ppd <- ggplot(div.sumPD, aes(x = Timepoint, y = pdmean, colour = Type)) +
  ylab("Phylogenetic diversity") + xlab("Time (d)") +
  geom_errorbar(aes(ymin = pdmean - pdse, ymax = pdmean + pdse), width = .1, position = pd) +
  geom_point(position = pd) +
  facet_grid(rows = vars(Density), cols = vars(Diet))
ppd
ggsave("./figures/Phylogenetic_diversity.pdf", plot = ppd, height = 7, width = 10)

# Alternative plots.
# PD across larval densities, one line per Timepoint (alpha encodes the day).
# Fixes: the x label had a stray ")" and the theme used the deprecated
# `panel.margin` element, now `panel.spacing`.
ppd2 <- ggplot(div.sumPD, aes(x = Density, y = pdmean, colour = Type, alpha = Timepoint, group = Timepoint))
ppd2 <- ppd2 + geom_line(aes(linetype = Timepoint), size = .6, position = pd) +
  geom_point(aes(shape = Timepoint), size = 2, position = pd) +
  scale_alpha_discrete(name = "Day", range = c(0.4, 1)) +
  scale_color_manual(name = "Type", values = c("#1f78b4", "#33a02c")) +
  geom_errorbar(aes(ymin = pdmean - pdse, ymax = pdmean + pdse), width = .1, position = pd) +
  ylab("Phylogenetic diversity") + xlab("Larval density (N / container)") +
  facet_grid(rows = vars(Type), cols = vars(Diet)) +
  theme_classic() + theme(panel.grid.major = element_line(colour = "grey80"), panel.spacing = unit(.5, "lines"),
                          panel.border = element_rect(color = "black", fill = NA, size = .3))
ggsave("./figures/Phylog_div_perTimepoint.pdf", plot = ppd2, height = 7, width = 10)
png("./figures/Phylog_div_perTimepoint.png", width = 10, height = 7, units = "in", res = 200)
ppd2
dev.off()

# PD over time, one line per Density (alpha encodes larval density).
# The x label previously read "Time (d))".
ppd3 <- ggplot(div.sumPD, aes(x = Timepoint, y = pdmean, colour = Type, alpha = Density, group = Density))
ppd3 <- ppd3 + geom_line(aes(linetype = Density), size = .6, position = pd) +
  geom_point(aes(shape = Density), size = 2, position = pd) +
  scale_alpha_discrete(name = "Larval density", range = c(0.4, 1)) +
  scale_color_manual(name = "Type", values = c("#1f78b4", "#33a02c")) +
  geom_errorbar(aes(ymin = pdmean - pdse, ymax = pdmean + pdse), width = .1, position = pd) +
  ylab("Phylogenetic diversity") + xlab("Time (d)") +
  facet_grid(rows = vars(Type), cols = vars(Diet)) +
  theme_classic() + theme(panel.grid.major = element_line(colour = "grey80"), panel.spacing = unit(.5, "lines"),
                          panel.border = element_rect(color = "black", fill = NA, size = .3))
ggsave("./figures/Phylog_div_perDensity.pdf", plot = ppd3, height = 7, width = 10)
png("./figures/Phylog_div_perDensity.png", width = 10, height = 7, units = "in", res = 200)
ppd3
dev.off()
```

## 4. Richness  
```{r}
# Richness per sample on the non-rarefied experimental data, with the
# detection threshold set to 0.
# NOTE(review): the rename below assumes richness() returns a single
# column - confirm for the installed microbiome version.
S.ps1exp <- richness(ps1.exp, detection = 0)
colnames(S.ps1exp) <- c("S")

# Sample metadata as a separate data frame.
ps1exp.meta <- meta(ps1.exp)

# Copy the row names of both tables into a shared key column.
ps1exp.meta$sam_name <- rownames(ps1exp.meta)
S.ps1exp$sam_name <- rownames(S.ps1exp)

# Join richness and metadata on that key.
S.df <- merge(S.ps1exp, ps1exp.meta, by = "sam_name")

# Check the merged table.
colnames(S.df)

# Relabel Density and Timepoint levels for plotting.
# NOTE(review): levels are replaced by position and level 1 is left as it
# is - confirm the level order in the metadata.
levels(S.df$Density)
levels(S.df$Density)[2] <- "50"
levels(S.df$Density)[3] <- "100"
levels(S.df$Density)[4] <- "200"
levels(S.df$Timepoint)[2] <- "5"
levels(S.df$Timepoint)[3] <- "10"
levels(S.df$Timepoint)[4] <- "15"

# Mean, SD and SE of richness per Diet x Type x Density x Timepoint.
# NOTE(review): se() is not a base R function; it has to be provided by a
# package or definition loaded elsewhere - confirm.
div.sumS <- ddply(S.df, ~Diet+Type+Density+Timepoint, summarise,
                  S_mean = mean(S), S_sd = sd(S), S_se = se(S))

# Shared dodge (total width 0.1), applied to lines, points and error bars
# alike so that the lines pass through their own points.
pd <- position_dodge(0.1)

# Richness over time, coloured by Type; rows = Density, columns = Diet.
Sp <- ggplot(div.sumS, aes(x = Timepoint, y = S_mean, colour = Type)) +
  ylab("Richness") + xlab("Time (d)") +
  geom_errorbar(aes(ymin = S_mean - S_se, ymax = S_mean + S_se), width = .1, position = pd) +
  geom_point(position = pd) +
  facet_grid(rows = vars(Density), cols = vars(Diet))
Sp
ggsave("./figures/Richness.pdf", plot = Sp, height = 7, width = 10)

# Alternative plot: one line per Density (alpha encodes larval density).
# Fixes: the x label previously read "Time (d))" and the theme used the
# deprecated `panel.margin` element, now `panel.spacing`.
Sp2 <- ggplot(div.sumS, aes(x = Timepoint, y = S_mean, colour = Type, alpha = Density, group = Density))
Sp2 <- Sp2 + geom_line(aes(linetype = Density), size = .6, position = pd) +
  geom_point(aes(shape = Density), size = 2, position = pd) +
  scale_alpha_discrete(name = "Larval density", range = c(0.4, 1)) +
  scale_color_manual(name = "Type", values = c("#1f78b4", "#33a02c")) +
  geom_errorbar(aes(ymin = S_mean - S_se, ymax = S_mean + S_se), width = .1, position = pd) +
  ylab("OTU Richness") + xlab("Time (d)") +
  facet_grid(rows = vars(Type), cols = vars(Diet)) +
  theme_classic() + theme(panel.grid.major = element_line(colour = "grey80"), panel.spacing = unit(.5, "lines"),
                          panel.border = element_rect(color = "black", fill = NA, size = .3))
ggsave("./figures/Richness_perDensity.pdf", plot = Sp2, height = 7, width = 10)

png("./figures/Richness_perDensity.png", width = 10, height = 7, units = "in", res = 200)
Sp2
dev.off()

# Treatment effects on richness: ordinary linear model with the four
# treatment variables as additive terms.
S.lm1 <- lm(S ~ Diet + Type + Density + Timepoint, data = S.df)
print(anova(S.lm1))
print(summary(S.lm1))

# Diagnostics: the standard lm plots, then residuals against each
# treatment variable with a reference line at zero, then a histogram.
plot(S.lm1)
plot(residuals(S.lm1)~S.df$Diet)
abline(h = 0)
plot(residuals(S.lm1)~S.df$Type)
abline(h = 0)
plot(residuals(S.lm1)~S.df$Density)
abline(h = 0)
plot(residuals(S.lm1)~S.df$Timepoint)
abline(h = 0)
hist(residuals(S.lm1), breaks = 80, col = "grey")
# Findings: heteroskedasticity for Timepoint and a heavy right tail in the
# residual distribution.

# Generalised least squares: same fixed effects, fitted without (S.gls1)
# and with (S.gls2) a separate residual variance per Timepoint.
S.gls1 <- gls(S ~ Diet + Type + Density + Timepoint, data = S.df)
S.gls2 <- gls(S ~ Diet + Type + Density + Timepoint, data = S.df,
              weights = varIdent(form = ~1|Timepoint))
AIC(S.gls1, S.gls2)
print(anova(S.gls2))
print(summary(S.gls2))
# Based on AIC, S.gls2 is the better model.

# The same diagnostics for the variance-weighted model.
plot(S.gls2)
plot(residuals(S.gls2)~S.df$Diet)
abline(h = 0)
plot(residuals(S.gls2)~S.df$Type)
abline(h = 0)
plot(residuals(S.gls2)~S.df$Density)
abline(h = 0)
plot(residuals(S.gls2)~S.df$Timepoint)
abline(h = 0)
hist(residuals(S.gls2), breaks = 80, col = "grey")
# Open question: is a time correlation structure needed (repeated measures)?

```

## 5. Correlation between library size and richness  
Check for correlation between increasing library size (sequence depth, number of reads in a sample) and richness.
```{r}
# Diversity indices and richness on the non-rarefied experimental data.
lib.div <- diversities(ps1.exp, index = "all")
lib.div2 <- richness(ps1.exp)

# Attach the total read count per sample and the richness column (named
# `0` in the richness table) to the diversity table.
lib.div[["ReadsPerSample"]] <- sample_sums(ps1.exp)
lib.div[["Richness"]] <- lib.div2$`0`

print(colnames(lib.div))
```

Plotting correlation between library size and richness
```{r}
# Scatter plots of three diversity measures (x) against reads per sample
# (y), each annotated with the Pearson correlation.

# Shannon diversity (no smoother added).
p1 <- ggscatter(lib.div, x = "shannon", y = "ReadsPerSample") +
  stat_cor(method = "pearson")

# Inverse Simpson, with a loess smoother.
p2 <- ggscatter(lib.div, x = "inverse_simpson", y = "ReadsPerSample", add = "loess") +
  stat_cor(method = "pearson")

# Richness, with a loess smoother and the correlation label placed at
# fixed coordinates.
p3 <- ggscatter(lib.div, x = "Richness", y = "ReadsPerSample", add = "loess") +
  stat_cor(method = "pearson", label.x = 100, label.y = 50000)

# Arrange the three panels on a 2 x 2 grid.
ggarrange(p1, p2, p3, nrow = 2, ncol = 2)
```
There seem to be some correlations between sequence depth and diversity. So it is better to normalize to a lower sequence depth...?

```{r}
# Print the R session details (R version, platform and package versions)
# so the environment used to build this report is recorded with it.
sessionInfo()
```