---
title: "abundancetests"
author: "Stijn Schreven"
date: "12 juni 2018"
output: html_document
---
# Advanced models for differential abundance  

## Data properties  
```{r}
ps1.exp <- readRDS("//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/phyobjects/ps1.exp.rds")

# remove all OTUs with less than 10 reads (approx 400)
x1 <- prune_taxa(taxa_sums(ps1.exp) > 10, ps1.exp) 
summarize_phyloseq(x1)
# remove all samples with less than 500 reads (16 samples)
#x1 <- prune_samples(sample_sums(x1) > 500, x1) # don't subset, because need this level for deseq all contrasts

# discrete count data
print(abundances(x1)[1:5,1:3])

# sparsity
hist(log10(1 + abundances(x1)), 100)
sumx <- sample_sums(x1)

print(ps1.exp)
print(x1)

# rarity
medians <- apply(abundances(x1),1,median)/1e3
A <- melt(abundances(x1))
A$Var1 <- factor(A$Var1, levels = rev(names(sort(medians))))
p <- ggplot(A, aes(x = Var1, y = value)) +
        geom_boxplot() +
    labs(y = "Abundance (reads)", x = "Taxonomic Group") +
    scale_y_log10()
print(p)

# overdispersion
means <- apply(abundances(x1),1,mean)
variances <- apply(abundances(x1),1,var)
## Calculate mean and variance over samples for each taxon
df <- melt(abundances(x1))
names(df) <- c("Taxon", "Sample", "Reads")
df <- df %>% group_by(Taxon) %>%
             summarise(mean = mean(Reads),
                   variance = var(Reads))
## Illustrate overdispersion
p <- ggplot(df, aes(x = mean, y = variance)) +
       geom_point() +
       geom_abline(aes(intercept = 0, slope = 1)) +
       scale_x_log10(labels = scales::scientific) +
       scale_y_log10(labels = scales::scientific) +
       labs(title = "Overdispersion (variance > mean)")
print(p)
ggsave("//wurnet.nl/homes/schre030/My Documents/Courses and workshops/Microbioma data analysis/SpringSchoolOwnData/figures/Overdispersion.pdf", height = 5, width = 7)
```

## Differential abundance testing for sequencing data (DESeq2)  
```{r}
# more info on DESeq2: https://bioconductor.statistik.tu-dortmund.de/packages/3.5/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
# Start by converting phyloseq object to deseq2 format
ds2 <- phyloseq_to_deseq2(x1, ~ Diet + Type + Density + Timepoint) # this notation does not take Timepoint as repeated measures, but as independent factor, which is not true...

# Run DESeq2 analysis (all taxa at once!)
dds <- DESeq(ds2)
# gives: "Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, : every gene contains at least one zero, cannot compute log geometric means"
# see https://github.com/joey711/phyloseq/issues/387
## this error is because of high prevalence of sparsely sampled OTUs; calculate size factors separately using a zero-tolerant variant of geometric mean
# calculate geometric means prior to estimate size factors
gm_mean <- function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans <- apply(counts(ds2), 1, gm_mean)
ds2 <- estimateSizeFactors(ds2, geoMeans = geoMeans)
ds2 <- DESeq(ds2, fitType="local") # function takes about 20-30 minutes to run!

# Investigate results
deseq.results <- as.data.frame(results(ds2))
deseq.results$taxon <- rownames(results(ds2))

# Sort (arrange) by pvalue and effect size
deseq.results <- deseq.results %>%
                   arrange(pvalue, log2FoldChange)

# Print the result table
# Let us only show significant hits
knitr::kable(deseq.results %>%
               filter(pvalue < 0.05 & log2FoldChange > 1.5),
         digits = 5)

# Store DESeq results in data frame
res <- results(ds2)
res <- res[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab <- res[(res$padj < alpha), ]
sigtab <- cbind(as(sigtab, "data.frame"), as(tax_table(x1)[rownames(sigtab), ], "matrix"))
head(sigtab)

# show only significantly enriched OTUs (as opposed to reduced OTUs)
posigtab <- sigtab[sigtab[, "log2FoldChange"] > 0, ]
posigtab <- posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]

# Plot results
theme_set(theme_bw())
sigtabgen <- subset(sigtab, !is.na(Genus))
# Phylum order
x <- tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x <- sort(x, TRUE)
sigtabgen$Phylum <- factor(as.character(sigtabgen$Phylum), levels=names(x))
# Genus order
x <- tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x <- sort(x, TRUE)
sigtabgen$Genus <- factor(as.character(sigtabgen$Genus), levels=names(x))
ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=6) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

```

## Extracting DEseq2 results and in-depth output  
```{r}
plotMA(res, ylim=c(-2,2))
# use shrinkage of logFoldChange first
# coef=2 depends on the contrast you want to make (factor levels, by default takes last entered factor), for now just to try out shrinkage effect
resLFC <- lfcShrink(ds2, coef=2, type="apeglm")
resNorm <- lfcShrink(ds2, coef=2, type="normal")
resAsh <- lfcShrink(ds2, coef=2, type="ashr")

# compare spread in shrunk LFC values
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-3,3)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")
#apeglm seems best to reduce noise

# Transform count data for visualisation
# Regularized log-transformation
#rld <- rlog(ds2)
# Variance stabilizing transformation (vst)
#vsd <- vst(ds2,blind=F)
vsd <- varianceStabilizingTransformation(ds2,blind=F)

p.vsd <- meanSdPlot(assay(vsd))

# Heatmap of deseq2 output
select <- order(rowMeans(counts(ds2,normalized=TRUE)),
                decreasing=TRUE)[1:50]
df <- as.data.frame(colData(ds2)[,c("Diet","Type","Density","Timepoint")])
heat.vsd <- pheatmap(assay(vsd)[select,], cluster_rows=T, show_rownames=F, show_colnames=F,
         cluster_cols=T, annotation_col=df)
# how to attach genus IDs to OTU in heatmap?
```



