## Beta diversity significance method ## Written by Dr. Arda Gülay (2015), DTU @ Technical University of Denmark. ##If you use this method/codes, please cite: ##Gülay A, Smets BF. 2015. An improved method to set significance thresholds for ß diversity testing in microbial community comparisons. Environ. Microbiol. 17:3154–3167. ##R session info used in this code:--------------------------------------------------------- ##R version 3.1.1 (2014-07-10) ##Platform: x86_64-w64-mingw32/x64 (64-bit) ##locale: ##[1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252 ##[3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C ##[5] LC_TIME=English_United Kingdom.1252 ##attached base packages: ##[1] stats graphics grDevices utils datasets methods base ##other attached packages: ##[1] phyloseq_1.9.12 data.table_1.9.6 ggplot2_1.0.0 reshape2_1.4 vegan_2.0-10 ##[6] lattice_0.20-29 permute_0.8-3 ##------------------------------------------------------------------------------------------- #Load libraries library(phyloseq) library(vegan) library(reshape2) library(ggplot2) library(data.table) #Data import (choose one: Use biom or txt based) ## Below is "biom" based data import R-codes using outputs ("biom","sample.data","Tree") from QIIME # Set working directory setwd("H:/Beta diversity/R-code/Phyloseq_based") # Import biom file import_biom("otutable_for_R_code.biom")->sample_biom # Import sample metadata file import_qiime_sample_data("Sample_data.txt")->b ## An example Sample_data: (Determination of Meta community) row names are sampleIDs and Meta-Descriptions ## Each Meta group should at least contain more than one sample ## #SampleID Description ## ISL.F2.Com.a Meta1 ## ISL.F2.Com.b Meta1 ## ISL.F2.Com.c Meta1 ## ISL.F2.Top.a Meta2 ## ISL.F2.Top.b Meta2 ## ISL.F2.Top.c Meta2 # Import phylogenetic tree file read_tree("rep_set_from_unaligned.tree")->c #Below is "species_table" based data import R-codes using outputs ("otutable","sample.data") (you can still use this if you do not have #a phylogenetic tree; select the distance method from below list rather than "unifrac") ## read.table("otutable_for_R_code.txt", header = TRUE)->sample_txt ## otu_table(sample_otutable.txt, taxa_are_rows=TRUE)->sample_biom ## import_qiime_sample_data("Sample_data.txt")->b ##An example otutable: column names are samples, row names are species (could also be species names), values are species counts: ## ISL.F2.Top.b ISL.F2.Com.b ISL.F2.Top.a ISL.F7.Com.a ## 0 0 3 0 7 ## 1 5 0 1 0 ## 2 7 0 1 6 ## 3 0 8 1 0 ## An example Sample_data: (Determination of Meta community) row names are sampleIDs and Meta-Descriptions ## Each Meta group should at least contain more than one sample ## #SampleID Description ## ISL.F2.Com.a Meta1 ## ISL.F2.Com.b Meta1 ## ISL.F2.Com.c Meta1 ## ISL.F2.Top.a Meta2 ## ISL.F2.Top.b Meta2 ## ISL.F2.Top.c Meta2 #Option selections: distance_method<-c("unifrac") ###select the distance method from below: ## UniFrac DPCoA JSD vegdist1 vegdist2 ## "unifrac" "dpcoa" "jsd" "manhattan" "euclidean" ## vegdist3 vegdist4 vegdist5 vegdist6 vegdist7 ## "canberra" "bray" "kulczynski" "jaccard" "gower" ## vegdist8 vegdist9 vegdist10 vegdist11 vegdist12 ## "altGower" "morisita" "horn" "mountford" "raup" ## vegdist13 vegdist14 vegdist15 betadiver1 betadiver2 ## "binomial" "chao" "cao" "w" "-1" ## betadiver3 betadiver4 betadiver5 betadiver6 betadiver7 ## "c" "wb" "r" "I" "e" ## betadiver8 betadiver9 betadiver10 betadiver11 betadiver12 ## "t" "me" "j" "sor" "m" ## betadiver13 betadiver14 betadiver15 betadiver16 betadiver17 ## "-2" "co" "cc" "g" "-3" ## betadiver18 betadiver19 betadiver20 betadiver21 betadiver22 ## "l" "19" "hk" "rlb" "sim" ## betadiver23 betadiver24 dist1 dist2 dist3 ## "gl" "z" "maximum" "binary" "minkowski" ## designdist ## "ANY" unifrac_weighted=c(TRUE) #select weighted or unweighted by writing "TRUE" or "FALSE" for unifrac analysis #Format adjustment merge_phyloseq(sample_biom,b,c)->sample merge_samples(sample,"Description")->sample_meta row.names(sample_data(sample_meta))->j #listing descriptions list_subsamples_phyloseq<-list() otutables_in_meta<-list() Distance_matrix_list<-list() Data_quantile_all<-list() Data_mean_all<-list() Quantitable_list<-list() MeanTable_list<-list() meta_tables<-list() merged_otutables<-list() rarified_otutables<-list() for (i in 1:length(unlist(strsplit(as.character(row.names(sample_data(sample_meta)))," ")))) { otutables_in_meta[[i]]<-subset_samples(sample, Description%in%j[i]) meta_tables[[i]] <- subset_samples(sample_meta, Description%in%i) merged_otutables[[i]] <- merge_phyloseq(otutables_in_meta[[i]],meta_tables[[i]]) subsampled_merged_otutables<-c() subsamples_phyloseq<-c() distance_matrix<-c() melted_distance_matrix<-c() filtered_list<-c() sample_size<-min(sample_sums(merged_otutables[[i]])) #Calculation of rarified otutables: ##As a defult: samples size is the lowest sample size among samples of each Meta-group. ##On the other hand, you can determine your own threshold with "sample.size=" option of "rarefy_even_depth" command below. sample.size = min(sample_sums(merged_otutables[[i]])) for (s in 1:10) { rarified_otutables[[s]] <- rarefy_even_depth(merged_otutables[[i]], replace=TRUE, trimOTUs=FALSE, rngseed=FALSE,sample.size=sample_size) subsampled_merged_otutables<- cbind(subsampled_merged_otutables,otu_table(rarified_otutables[[s]])) } ##Changing colnames in subsampled otutables for unifrac analysis: (sorun burda cikiyor!!) for (k in seq(from=(length(colnames(otu_table(rarified_otutables[[1]])))), to=(length(colnames(otu_table(rarified_otutables[[1]])))*10), by=(length(colnames(otu_table(rarified_otutables[[1]])))))) { if (k==length(colnames(otu_table(rarified_otutables[[1]])))*10) break for (m in 1:length(colnames(otu_table(rarified_otutables[[1]])))) { colnames(subsampled_merged_otutables)[[k+m]]<-paste(colnames(subsampled_merged_otutables)[[m]], k, sep = "_") } } OTU1 <- otu_table(subsampled_merged_otutables, taxa_are_rows=TRUE) subsamples_phyloseq<-phyloseq(OTU1,c) list_subsamples_phyloseq[[i]]<- subsamples_phyloseq rm(subsampled_merged_otutables) rm(subsamples_phyloseq) list_subsamples_phyloseq[[i]] <- prune_taxa(taxa_sums(list_subsamples_phyloseq[[i]]) > 0, list_subsamples_phyloseq[[i]]) ##Calculation of distance matrix if (distance_method=="unifrac") { Distance_matrix_list[[i]] <- UniFrac(list_subsamples_phyloseq[[i]], weighted=unifrac_weighted, normalized=TRUE, parallel=FALSE, fast=TRUE) } else { Distance_matrix_list[[i]] <- distance(list_subsamples_phyloseq[[i]], method = distance_method) } distance_matrix<-as.matrix(na.omit(Distance_matrix_list[[i]])) melted_distance_matrix<-melt(distance_matrix) melted_distance_matrix<-melted_distance_matrix[grep ("[^0.00000000]", melted_distance_matrix$value, ignore.case=T),] Data_quantile<-list() Data_mean<-list() filtered_list<-c() for (t in sample_names(merged_otutables[[i]])) { for (g in sample_names(merged_otutables[[i]])){if (t!=g&&t!=sample_names(meta_tables[[i]])){ filtered_list<-melted_distance_matrix[grep(t, melted_distance_matrix$Var1, ignore.case=F),] filtered_list<-filtered_list[grep(g, filtered_list$Var2, ignore.case=F),] Data_quantile[[paste(t,g,sep = ".vs.")]]<-quantile(filtered_list$value) Data_mean[[paste(t,g,sep = ".vs.")]]<-mean(filtered_list$value) }else{break}}} Data_quantile2<-list() Data_mean2<-list() filtered_list<-c() for (q in 1:length(sample_names(merged_otutables[[i]]))) { filtered_list<-melted_distance_matrix[grep(sample_names(merged_otutables[[i]])[q], melted_distance_matrix$Var1, ignore.case=F),] filtered_list<-filtered_list[grep(sample_names(merged_otutables[[i]])[q], filtered_list$Var2, ignore.case=F),] Data_quantile2[[sample_names(merged_otutables[[i]])[q]]]<-quantile(filtered_list$value) Data_mean2[[sample_names(merged_otutables[[i]])[q]]]<-mean(filtered_list$value) } Data_quantile<-append(Data_quantile,Data_quantile2) Data_mean<-append(Data_mean,Data_mean2) Data_quantile_all<-append(Data_quantile_all,list(Data_quantile)) Data_mean_all<-append(Data_mean_all,list(Data_mean)) ##Calculation of algorithm results QuantileTable<-c() for (w in unique(as.data.frame.table(t(as.data.frame(Data_quantile_all[[i]])))$Var2)) { ww<-paste("^",w,"$",sep = "") filtered0<-as.data.frame.table(t(as.data.frame(Data_quantile_all[[i]])))[grep(ww, as.data.frame.table(t(as.data.frame(Data_quantile_all[[i]])))$Var2, ignore.case=F),] #Quantile region filtered1<-filtered0[grep(".vs.", filtered0$Var1, ignore.case=F, invert=TRUE),] #intra_Beta filtered2<-filtered0[grep(".vs.", filtered0$Var1, ignore.case=F),] # inter_Beta filtered3<-filtered0[grep(row.names(otu_table(meta_tables[[i]])),filtered0$Var1, ignore.case=F),] for (r in unique(filtered2$Var1)) { corrected<-(filtered2[grep(r, filtered2$Var1, ignore.case=F, invert=FALSE),]$Freq)-(((filtered1[grep(unlist(strsplit(r,".vs."))[1], filtered1$Var1, ignore.case=F, invert=FALSE),]$Freq)+(filtered1[grep(unlist(strsplit(r,".vs."))[2], filtered1$Var1, ignore.case=F, invert=FALSE),]$Freq))/2) QuantileTable <-rbind(QuantileTable,c(paste("Corrected_",r,sep = ""),w,corrected)) } QuantileTable <- rbind(QuantileTable,c(c(row.names(otu_table(meta_tables[[i]])),w,filtered3$Freq))) } Quantitable_list[[i]]<- QuantileTable MeanTable<-c() filtered0<-as.data.frame.table(t(as.data.frame(Data_mean_all[[i]])))[grep(".vs.", as.data.frame.table(t(as.data.frame(Data_mean_all[[i]])))$Var1, ignore.case=F),] filtered1<-as.data.frame.table(t(as.data.frame(Data_mean_all[[i]])))[grep(".vs.", as.data.frame.table(t(as.data.frame(Data_mean_all[[i]])))$Var1, invert=TRUE,ignore.case=F),] for (u in filtered0$Var1) { corrected<-(filtered2[grep(u, filtered0$Var1, ignore.case=F, invert=FALSE),]$Freq)-(((filtered1[grep(unlist(strsplit(u,".vs."))[1], filtered1$Var1, ignore.case=F, invert=FALSE),]$Freq)+(filtered1[grep(unlist(strsplit(u,".vs."))[2], filtered1$Var1, ignore.case=F, invert=FALSE),]$Freq))/2) MeanTable <- rbind(MeanTable,c(paste("Corrected_",u,sep = ""),corrected)) } meta<-as.data.frame.table(t(as.data.frame(Data_mean_all[[i]])))[grep(row.names(otu_table(meta_tables[[i]])), as.data.frame.table(t(as.data.frame(Data_mean_all[[i]])))$Var1, ignore.case=F, invert=FALSE),]$Freq MeanTable <-rbind(MeanTable,c(row.names(otu_table(meta_tables[[i]])),meta)) MeanTable_list[[i]]<- MeanTable } ##Formating the method results for output table<-c() table_list<-c() for (i in 1:length(unlist(strsplit(as.character(row.names(sample_data(sample_meta)))," ")))) { a<-reshape(as.data.frame(Quantitable_list[[i]]), idvar="V1", timevar="V2", direction="wide") rownames(a)<-a$V1 a$V1<-NULL for (w in 1:length(colnames(a))) { a[,w]->b table<-cbind(table,as.numeric(levels(b))[b]) colnames(table)[w]<-colnames(a)[w] } rownames(table)<-rownames(a) table_list<-rbind(table_list,table) table<-c() } data.frame(table_list)->table_list table_list$V1 <- factor(row.names(table_list), levels = row.names(table_list)) ##Boxplot visualization of method output ggplot(table_list, aes(x=V1, ymin=`V3.0.`, lower=`V3.25.`, middle=`V3.50.`,upper=`V3.75.`, ymax=`V3.100.`),ordered=TRUE) + geom_boxplot(stat="identity")+theme_bw()+theme(axis.text.x = element_text(angle = 90, hjust = 1)) ##Numerical visualization of results of the method for (i in 1:length(unlist(strsplit(as.character(row.names(sample_data(sample_meta)))," ")))) { a<-data.frame(MeanTable_list[[i]]) rownames(a)<-a$X1 a$X1<-NULL colnames(a)<-paste("Results:",row.names(otu_table(meta_tables[[i]])),"group/","Subsample Size:", min(sample_sums(merged_otutables[[i]])), sep=" ") print(a) if (i==length(unlist(strsplit(as.character(row.names(sample_data(sample_meta)))," ")))){ print("If the corrected inter-beta value > Meta intra-beta value, then, the sample pairs are significantly different. more detail in Gülay et al., 2015") } }