library (iNEXT)
## Warning: package 'iNEXT' was built under R version 4.2.2
library(MicrobiotaProcess)
library(readr)
library(phyloseq)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(plyr)
library(vegan)
library(ggplot2)
library(ggforce)
library(Biostrings)
library(microbiome)
library(DESeq2)
library(tools)
library(compositions)
library(psadd)
library(RRPP)
library(rcompanion)
library(philr)
library(ggordiplots)
load("DADA2_output_and_phyloseq_object.Rdata")
votu <- otu_table(ps)
votu <- as.data.frame(votu) # convert to data frame
str(votu) # check that observations are numeric
anyNA(votu) # check for NA in data
vdata <- sample_data(ps)
str(vdata) # examine classifications
rarecurve <- get_rarecurve(obj=ps, chunks=400)
p_rare <- ggrarecurve(obj=rarecurve, factorNames="Group",
shadow=F,
indexNames=c("Observe")) +
scale_color_manual(values=c("#00AED7", "#FD9347","#a3b18a","#f0a6ca"))+
theme_bw()+
theme(axis.text=element_text(size=10),
panel.grid=element_blank(), panel.border = element_rect(size=1),
strip.background = element_blank(),
strip.text.x = element_blank())
## The color has been set automatically, you can reset it manually by adding scale_color_manual(values=yourcolors)
p_rare <- p_rare + geom_vline(xintercept = min(rowSums(votu)))
vegan::rarecurve(votu, step=100, ylab="ASVs", lwd=1.5, col = vdata$Treatment, label=F)
abline(v=(min(rowSums(votu))))
alpha <- phyloseq::estimate_richness(ps, split=TRUE, measures=c("Observed", "Chao1", "Shannon", "Simpson"))
row.names(alpha) <- sample_names(ps)
sample1 <- sample[-56,]
sample_alpha <- merge(sample1, alpha, by='row.names')
alpha1 <- ggplot(sample_alpha, aes(x = Group, y = Chao1, fill=Group)) +
geom_boxplot() +
theme_classic() +
theme(axis.title = element_text(size = 12, face = "bold", colour = "grey30"),
panel.background = element_blank(), panel.border = element_rect(size =1, fill = NA),
axis.ticks = element_blank(), legend.key = element_blank(),
legend.position = "none")
alpha2 <- ggplot(sample_alpha, aes(x = Group, y = Shannon, fill=Group)) +
geom_boxplot() +
theme_classic() +
theme(axis.title = element_text(size = 12, face = "bold", colour = "grey30"),
panel.background = element_blank(), panel.border = element_rect(size =1, fill = NA),
axis.ticks = element_blank(), legend.key = element_blank(),
legend.position = "none")
alpha3 <- ggplot(sample_alpha, aes(x = Group, y = Simpson, fill=Group)) +
geom_boxplot() +
theme_classic() +
theme(axis.title = element_text(size = 12, face = "bold", colour = "grey30"),
panel.background = element_blank(), panel.border = element_rect(size =1, fill = NA),
axis.ticks = element_blank(), legend.key = element_blank(),
legend.title = element_text(size = 12, face = "bold", colour = "grey30"),
legend.text = element_text(size = 10, colour = "grey30"))
alpha1+alpha2+alpha3
# Check assumptions
shapiro.test((sample_alpha$Observed))
##
## Shapiro-Wilk normality test
##
## data: (sample_alpha$Observed)
## W = 0.89256, p-value = 0.0001386
rcompanion::plotNormalHistogram(sample_alpha$Observed)
rich <- sample_alpha$Shannon
rich.rrpp <- RRPP::lm.rrpp(rich ~ Group,
data = sample_alpha, SS.type="III",
print.progress = FALSE, iter=9999)
anova(rich.rrpp, effect.type = "F")
##
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals
## Number of permutations: 10000
## Estimation method: Ordinary Least Squares
## Sums of Squares and Cross-products: Type III
## Effect sizes (Z) based on F distributions
##
## Df SS MS Rsq F Z Pr(>F)
## Group 3 0.3918 0.13060 0.01744 0.3018 -0.93758 0.8225
## Residuals 51 22.0694 0.43273 0.98256
## Total 54 22.4612
##
## Call: RRPP::lm.rrpp(f1 = rich ~ Group, iter = 9999, SS.type = "III",
## data = sample_alpha, print.progress = FALSE)
ps.nosing <- phyloseq::prune_taxa(phyloseq::taxa_sums(ps) > 1, ps) # Remove singleton
# Check numbers of taxa
ntaxa(ps)
## [1] 5894
ntaxa(ps.nosing)
## [1] 5817
ps.bacteria <- phyloseq::subset_taxa(ps.nosing, Kingdom=="Bacteria") # Subset Bacteria
ps.bacteria
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5616 taxa and 55 samples ]
## sample_data() Sample Data: [ 55 samples by 7 sample variables ]
## tax_table() Taxonomy Table: [ 5616 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 5616 reference sequences ]
ps.archaea <- phyloseq::subset_taxa(ps.nosing, Kingdom=="Archaea") # Subset Archaea
ps.archaea
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 152 taxa and 55 samples ]
## sample_data() Sample Data: [ 55 samples by 7 sample variables ]
## tax_table() Taxonomy Table: [ 152 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 152 reference sequences ]
# VST using DeSeq2
ps.bac.ds <- phyloseq::phyloseq_to_deseq2(ps.bacteria, ~1)
## converting counts to integer mode
ps.bac.ds <- DESeq2::estimateSizeFactors(ps.bac.ds)
ps.bac.ds <- DESeq2::estimateDispersions(ps.bac.ds, fitType = "parametric")
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
DESeq2::plotDispEsts(ps.bac.ds)
ps.bac.vst <- ps.bacteria
vst.bac <- DESeq2::getVarianceStabilizedData(ps.bac.ds)
phyloseq::otu_table(ps.bac.vst) <- phyloseq::otu_table(vst.bac, taxa_are_rows = TRUE)
ps.bac.vst
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5616 taxa and 55 samples ]
## sample_data() Sample Data: [ 55 samples by 7 sample variables ]
## tax_table() Taxonomy Table: [ 5616 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 5616 reference sequences ]
# Sanity check
psadd::plot_mv(ps.bacteria, title = "Raw Mean-Variance Plot with log-transform")
psadd::plot_mv(ps.bac.vst, title = "VST Mean-Variance Plot with log-transform")
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5070 rows containing missing values (geom_point).
phyloseq::plot_bar(ps.bacteria)
phyloseq::plot_bar(ps.bac.vst)
hist(phyloseq::sample_sums(ps.bacteria))
hist(phyloseq::sample_sums(ps.bac.vst))
# Make all values positive
min(otu_table(ps.bac.vst))
## [1] -5.159595
ps.bac.vst.pos <- transform_sample_counts(ps.bac.vst, function(x) x+5.2)
min(otu_table(ps.bac.vst.pos))
## [1] 0.04040469
With this ps object, one can use phyloseq package to plot the beta diversity graph.
I used vegan instead of phyloseq here to have more freedom to graph the plots using ggplot2.
asv.vst.bac <- t(vst.bac)
min(asv.vst.bac)
## [1] -5.159595
asv.vst.bac <- asv.vst.bac + 5.2 # Make sure all data positive
dist.bac <- vegdist(asv.vst.bac, method = "bray")
ord.bac.vst <- cmdscale(dist.bac, eig = TRUE)
scores.bac <- as.data.frame(scores(ord.bac.vst))
scores.bac1 <- scores.bac[match(row.names(sample1), row.names(scores.bac)),] #reorder the scores to desired sample order by matching with sample row names
scores.bac1$Group <- sample1$Group
scores.bac1$Time <- sample1$Time
scores.bac1$HeatShock <- sample1$HeatShock
eig.bac <- ord.bac.vst$eig # Access the Eigenvalues of each principle components
eig.bac.per <- eig.bac/sum(eig.bac) # percentage of eigenvalues
# Remove initial D3 samples in the dataset
scores.bac1.new <- scores.bac1[!grepl("D3",row.names(scores.bac1)),]
ggplot(data = scores.bac1, aes(x = Dim1, y = Dim2)) +
geom_mark_ellipse(data = scores.bac1.new, aes(color = Group))+
geom_point(data = scores.bac1, aes(alpha = factor(Time),colour = Group, shape = Group), size = 5, stroke=1.5) +
geom_point(data=scores.bac1 %>%
filter(HeatShock=="Yes"),
pch = 3, size=2, colour = "#4a4e69", stroke = 1) +
scale_alpha_discrete(range = c(0.2, 1), name = "Time (d)")+
scale_shape_manual(values=c(15, 17, 18, 19))+
scale_colour_manual(values = c( "#4ba3c3","orange","#8cb369","#ff595e")) +
theme(axis.title = element_text(size = 15, face = "bold"),
axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),
panel.background = element_blank(), panel.border = element_rect(size =1.5, fill = NA),
axis.ticks = element_blank(), legend.key = element_blank(),
legend.title = element_text(size = 12, face = "bold", colour = "grey30"),
legend.text = element_text(size = 10, colour = "grey30")) +
labs(colour = "Group") + xlab("PC1 31.0%") + ylab("PC2 14.8%") +
coord_fixed(xlim = c(-0.4,0.4),ylim = c(-0.4,0.3)) +
ggtitle("(B) Bacterial Diversity")+ theme(plot.title = element_text(size = 18,margin = margin(b = 10, unit = "pt")))
## Warning: Using alpha for a discrete variable is not advised.
# Remove Initial D3 samples
asv.vst.bac.new <- asv.vst.bac[!grepl("D3",row.names(asv.vst.bac)),]
dist.bac.new <- vegdist(asv.vst.bac.new, method = "bray")
sample.perm <- sample1[!grepl("D3",row.names(sample1)),]
bac.adonis.G <- adonis2(dist.bac.new ~ Group, data = sample.perm, permutations = 9999, method = "bray")
print(bac.adonis.G)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dist.bac.new ~ Group, data = sample.perm, permutations = 9999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## Group 3 0.8024 0.13094 2.0592 0.0033 **
## Residual 41 5.3257 0.86906
## Total 44 6.1281 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bac.rrpp <- RRPP::lm.rrpp(dist.bac.new ~ Group,
data=sample.perm, SS.type="III",
print.progress = FALSE,
seed="random",
iter=9999)
anova(bac.rrpp, effect.type = "F", error = c("Residuals"))
##
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals
## Number of permutations: 10000
## Estimation method: Ordinary Least Squares
## Sums of Squares and Cross-products: Type III
## Effect sizes (Z) based on F distributions
##
## Df SS MS Rsq F Z Pr(>F)
## Group 3 0.8024 0.26748 0.13094 2.0592 2.643 0.0032 **
## Residuals 41 5.3257 0.12990 0.86906
## Total 44 6.1281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call: RRPP::lm.rrpp(f1 = dist.bac.new ~ Group, iter = 9999, seed = "random",
## SS.type = "III", data = sample.perm, print.progress = FALSE)
group <- interaction(sample.perm$Group)
PW.bac <- pairwise(bac.rrpp, groups = group)
summary(PW.bac,confidence = 0.95, test.type = "dist")
##
## Pairwise comparisons
##
## Groups: C37 T50 T65 T80
##
## RRPP: 10000 permutations
##
## LS means:
## Vectors hidden (use show.vectors = TRUE to view)
##
## Pairwise distances between means, plus statistics
## d UCL (95%) Z Pr > d
## C37:T50 0.1286000 0.2184631 -0.9520118 0.8235
## C37:T65 0.2600543 0.2353704 2.0488974 0.0183
## C37:T80 0.3135088 0.2415742 2.6797890 0.0013
## T50:T65 0.1911724 0.1989987 1.4866287 0.0718
## T50:T80 0.2498265 0.2047458 2.4335826 0.0057
## T65:T80 0.1848961 0.2216693 0.8586459 0.2029
# VST
ps.archaea.pos <- transform_sample_counts(ps.archaea, function(x) x+1) # Avoid 0 in all samples for DESeq2 to work
ps.arc.ds <- phyloseq::phyloseq_to_deseq2(ps.archaea.pos, ~1)
## converting counts to integer mode
ps.arc.ds <- DESeq2::estimateSizeFactors(ps.arc.ds)
ps.arc.ds <- DESeq2::estimateDispersions(ps.arc.ds, fitType = "parametric")
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
DESeq2::plotDispEsts(ps.arc.ds)
ps.arc.vst <- ps.archaea
vst.arc <- DESeq2::getVarianceStabilizedData(ps.arc.ds)
phyloseq::otu_table(ps.arc.vst) <- phyloseq::otu_table(vst.arc, taxa_are_rows = TRUE)
ps.arc.vst
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 152 taxa and 55 samples ]
## sample_data() Sample Data: [ 55 samples by 7 sample variables ]
## tax_table() Taxonomy Table: [ 152 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 152 reference sequences ]
# Sanity check
psadd::plot_mv(ps.archaea, title = "Raw Mean-Variance Plot with log-transform")
psadd::plot_mv(ps.arc.vst, title = "VST Mean-Variance Plot with log-transform")
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 98 rows containing missing values (geom_point).
phyloseq::plot_bar(ps.archaea)
phyloseq::plot_bar(ps.arc.vst)
hist(phyloseq::sample_sums(ps.archaea))
hist(phyloseq::sample_sums(ps.arc.vst))
# Use Vegan to plot Beta diversity to make graph prettier
asv.vst.arc <- t(vst.arc)
min(asv.vst.arc)
## [1] -1.52006
asv.vst.arc <- asv.vst.arc + 1.6
dist.arc <- vegdist(asv.vst.arc, method = "bray")
ord.arc.vst <- cmdscale(dist.arc, eig = TRUE)
scores.arc <- as.data.frame(scores(ord.arc.vst))
scores.arc1 <- scores.arc[match(row.names(sample1), row.names(scores.arc)),] #reorder the scores to desired sample order by matching with sample row names
scores.arc1$Group <- sample1$Group
scores.arc1$Time <- sample1$Time
scores.arc1$HeatShock <- sample1$HeatShock
eig.arc <- ord.arc.vst$eig
eig.arc.per <- eig.arc/sum(eig.arc)
# Remove initial D3 samples in the dataset
scores.arc1.new <- scores.arc1[!grepl("D3",row.names(scores.arc1)),]
ggplot(data = scores.arc1, aes(x = Dim1, y = Dim2)) +
geom_mark_ellipse(data = scores.arc1.new, aes(color = Group))+
geom_point(data = scores.arc1, aes(alpha = factor(Time),colour = Group, shape = Group), size = 5, stroke=1.5) +
geom_point(data=scores.arc1 %>%
filter(HeatShock=="Yes"),
pch = 3, size=2, colour = "#4a4e69", stroke = 1) +
scale_alpha_discrete(range = c(0.2, 1), name = "Time (d)")+
scale_shape_manual(values=c(15, 17, 18, 19))+
scale_colour_manual(values = c("#4ba3c3","orange","#8cb369","#ff595e")) +
theme(axis.title = element_text(size = 15, face = "bold"),
axis.text.y = element_text(size=12),axis.text.x = element_text(size=12),
panel.background = element_blank(), panel.border = element_rect(size =1.5, fill = NA),
axis.ticks = element_blank(), legend.key = element_blank(),
legend.title = element_text(size = 12, face = "bold", colour = "grey30"),
legend.text = element_text(size = 10, colour = "grey30")) +
labs(colour = "Group") + xlab("PC1 34.2%") + ylab("PC2 18.2%") +
coord_fixed(xlim = c(-0.4,0.4),ylim = c(-0.4,0.3)) +
ggtitle("(A) Archaeal Diversity")+ theme(plot.title = element_text(size = 18,margin = margin(b = 10, unit = "pt")))
## Warning: Using alpha for a discrete variable is not advised.
# permANOVA
asv.vst.arc.new <- asv.vst.arc[!grepl("D3",row.names(asv.vst.arc)),]
dist.arc.new <- vegdist(asv.vst.arc.new, method = "bray")
sample.perm <- sample1[!grepl("D3",row.names(sample1)),]
arc.adonis.G <- adonis2(dist.arc.new ~ Group, data = sample.perm, permutations = 9999, method = "bray")
print(arc.adonis.G)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 9999
##
## adonis2(formula = dist.arc.new ~ Group, data = sample.perm, permutations = 9999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## Group 3 0.8988 0.15235 2.4564 0.0034 **
## Residual 41 5.0005 0.84765
## Total 44 5.8993 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arc.rrpp <- RRPP::lm.rrpp(dist.arc.new ~ Group,
data=sample.perm, SS.type="III",
print.progress = FALSE,
seed="random",
iter=9999)
anova(arc.rrpp, effect.type = "F", error = c("Residuals"))
##
## Analysis of Variance, using Residual Randomization
## Permutation procedure: Randomization of null model residuals
## Number of permutations: 10000
## Estimation method: Ordinary Least Squares
## Sums of Squares and Cross-products: Type III
## Effect sizes (Z) based on F distributions
##
## Df SS MS Rsq F Z Pr(>F)
## Group 3 1.7677 0.58922 0.1279 2.0044 2.7128 0.0025 **
## Residuals 41 12.0525 0.29396 0.8721
## Total 44 13.8201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call: RRPP::lm.rrpp(f1 = dist.arc.new ~ Group, iter = 9999, seed = "random",
## SS.type = "III", data = sample.perm, print.progress = FALSE)
group <- interaction(sample.perm$Group)
PW.arc <- pairwise(arc.rrpp, groups = group)
summary(PW.arc,confidence = 0.95, test.type = "dist")
##
## Pairwise comparisons
##
## Groups: C37 T50 T65 T80
##
## RRPP: 10000 permutations
##
## LS means:
## Vectors hidden (use show.vectors = TRUE to view)
##
## Pairwise distances between means, plus statistics
## d UCL (95%) Z Pr > d
## C37:T50 0.1929113 0.3210114 -1.094786 0.8633
## C37:T65 0.3361846 0.3472361 1.524595 0.0688
## C37:T80 0.4793835 0.3550464 2.895115 0.0008
## T50:T65 0.2691085 0.2909663 1.293633 0.1010
## T50:T80 0.3803979 0.3014603 2.646089 0.0028
## T65:T80 0.3097240 0.3249433 1.431697 0.0770