#Thesis Title: BENEFICIAL PLANT-MICROBE INTERACTIONS IN AGROECOSYSTEMS: DECIPHERING THE RHIZOSPHERE MICROBIAL COMMUNITY 
#              IN FIELD GROWN Brassica napus L. 
# PhD Candidate: Zelalem M. Taye
# Electronic archive for the codes used in the data analysis

# 1. Sequence data processing

## FASTQ files were processed using DADA2 implemented in QIIME2. Data processing was done on a server.
# Below you will find script to install and get started with your sequence data processiong.

# a.	Set up for processing sequences reads using QIIME2 on a server  

## First obtain access to server
##ssh to the server and provide your password as needed

ssh xxx@oats.usask.ca # ssh to your account name xxx, provide password when prompted.

## create working director for your analysis. Remember where you created your directory.

mkdir canola_2016

## Install Miniconda and qiime2: using the guide for Linux system. Please refer to QIIME2 website 
## (https://qiime2.org/) for more details.

## install miniconda first

wget https://repo.continuum.io/miniconda/Miniconda3-3.7.0-Linux-x86_64.sh -O ~/miniconda.sh

bash ~/miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda update conda
conda install wget

## install qiime2: through the Linux approach from qiime2 webpage. See QIIME2 webpage 
## (qiime2.https://org/) for more details if needed.

wget https://data.qiime2.org/distro/core/qiime2-2019.1-py36-linux-conda.yml
conda env create -n qiime2-2019.1 --file qiime2-2019.1-py36-linux-conda.yml

## Useful scripts for working between your local machine and server
## Transferring files to and from server. First exist from the server and cd to your local directory 
## that contains files you want to transfer.

## to transfer individual files

scp paired-end-trimmed_MF_17.qza xxx@oats:/home/xxx/canola_2016
scp manifest.csv xxx@oats:/home/xxx/canola_2016
scp MF_metadata.txt xxx@oats:/home/xxx/canola_2016

## to transfer the whole folder

scp -r Soil xxx@oats:/home/xxx/canola_2016/soilraw
scp -r primer_trimmed_fastqs xxx@oats:/home/xxx/canola_2016

## To transfer from server to local folder

scp xxx@oats:/home/xxx/canola_2016/paired-end-trimmed.qza /mnt/d
scp xxx@oats:/home/xxx/canola_2016/paired-end-trimmed.qzv /mnt/d

## to copy a file from one directory into another in linux

cp my_SILVA_V3_V5_342F_806R_qiime2_2019_1_classifier.qza /home/xxx/canola_2016/
  my_SILVA_V3_V5_342F_806R_qiime2_2019_1_classifier.qza

## to extract tar.gz files

tar xvzf gg_13_8_otus.tar.gz

#### ................................#### ........................................... ####

## b.	Process the sequence data using QIIME2

## import your data (fastq) into qiime. To import an already demultiplexed sequence files use the manifest 
## approach in qiime tutorial (https://qiime2.org/).

## use qiime tools import command to import. Copy and paste the following command. 
## Just change the input-path and output- path for your specific case

qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path manifest.csv \
--output-path paired-end-trimmed.qza \
--input-format PairedEndFastqManifestPhred33

## visualized the imported paired end reads
qiime demux summarize \
--i-data paired-end-trimmed.qza \
--o-visualization paired-end-trimmed.qzv    ## qzv files can be visualized in qiime2's webpage (qiime2.https://org/)

## denoising DAAD2: trim and trunc specification will be decided based on quality score. 
## Examine the qza file and decide on these parameters. You will also be able to 
## check how much of your sequence were retained after applying your parameters (denoising stats).

qiime dada2 denoise-paired \
--i-demultiplexed-seqs paired-end-trimmed.qza \
--p-trim-left-f 13 \
--p-trim-left-r 0 \
--p-trunc-len-f 260 \
--p-trunc-len-r 208 \
--o-table table.qza \
--o-representative-sequences rep-seqs.qza \
--o-denoising-stats denoising-stats.qza

## visualize the denoising stats

qiime metadata tabulate \
--m-input-file denoising-stats.qza \
--o-visualization denoising-stats.qzv

##FeatureTable and FeatureData summaries: generate summaries of the artifacts generate by dada2 denoising step

qiime feature-table summarize \
--i-table table.qza \
--o-visualization table.qzv \
--m-sample-metadata-file metadata.txt

qiime tools export \
--input-path table.qza \
--output-path feature-table
qiime feature-table tabulate-seqs \
--i-data rep-seqs.qza \
--o-visualization rep-seqs.qzv

## Now train taxonomy classifier and assign taxonomy
## train classifying sequence.
## Download latest greengenes database 
## use the 99 seq and taxonomy for training
## change sequence in to qiime artifact

qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path 99_otus.fasta \
--output-path 99_seqs.qza 

qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-format HeaderlessTSVTaxonomyFormat \
--input-path 99_otu_taxonomy.txt \
--output-path ref-taxonomy.qza

qiime feature-classifier extract-reads \
--i-sequences 99_seqs.qza \
--p-f-primer CTACGGGGGGCAGCAG  \
--p-r-primer GGACTACCGGGGTATCT \
--p-min-length 100 \
--p-max-length 440 \
--o-reads ref-seqsg.qza

qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads ref-seqsg.qza \
--i-reference-taxonomy ref-taxonomy.qza \
--o-classifier my_ggenes_V3_V5_342F_806R_qiime2_2019_1_classifier.qza

## Now you have trained your classifier proceed to testing it on your denoised rep.sequence.

qiime feature-classifier classify-sklearn \
--i-classifier my_ggenes_V3_V5_342F_806R_qiime2_2019_1_classifier.qza \
--i-reads rep-seqs.qza \
--o-classification taxonomy2g.qza

qiime tools export \
--input-path taxonomy2g.qza \
--output-path taxonomy-with-spaces2g
qiime metadata tabulate \
--m-input-file taxonomy-with-spaces2g/taxonomy.tsv  \
--o-visualization taxonomy-as-metadata2.qzv

qiime tools export \
--input-path taxonomy-as-metadata2.qzv \
--output-path taxonomy-as-metadata2g

qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-path taxonomy-as-metadata2g/metadata.tsv \
--output-path taxonomy-without-spaces2g.qza

qiime metadata tabulate \
--m-input-file taxonomy-without-spaces2g.qza \
--o-visualization taxonomy-without-spaces2g.qzv

qiime tools export \
--input-path taxonomy-without-spaces2g.qza \
--output-path taxonomy-without-spaces2g

## generate interactive bar plots with the taxonomy

qiime taxa barplot \
--i-table table.qza \
--i-taxonomy taxonomy-without-spaces2g.qza \
--m-metadata-file metadata.txt \
--o-visualization taxa-bar-plots2g.qzv

## Filtering mitochondria, chloroplast from: table.qza, rep.seq
#filter table-qza

qiime taxa filter-table \
--i-table table.qza \
--i-taxonomy taxonomy-without-spaces2g.qza \
--p-exclude mitochondria,chloroplast \
--o-filtered-table table-no-mitochondria-no-chloroplast2g.qza

qiime feature-table summarize \
--i-table table-no-mitochondria-no-chloroplast2g.qza \
--o-visualization table-no-mitochondria-no-chloroplast2g.qzv \
--m-sample-metadata-file metadata.txt

## export filtered feature table as biome
qiime tools export \
--input-path table-no-mitochondria-no-chloroplast2g.qza \
--output-path feature-table-m-c-filtered2g

##Generate a tree 

qiime phylogeny align-to-tree-mafft-fasttree \
--i-sequences rep-seqs.qza \
--o-alignment aligned-rep-seqs.qza \
--o-masked-alignment masked-aligned-rep-seqs.qza \
--o-tree unrooted-tree.qza \
--o-rooted-tree rooted-tree.qza

## ........................## ....................##........................##

# c.	QIIME2 to phyloseq object: example script to move from Qiime2 output to working with phyloseq object

setwd("D:/.. ") # set working directory

## read the otu table. This is qiime2 feature tabel (asv) converted to txt format. Conver the biome to txt file.

##Mitochondria and chloroplast are removed 

otu<-read.table(file = "otu_table-no-mc_SC.txt",header = TRUE)
head(otu)

##read taxonomy ## from qiime2

tax<-read.table(file = "taxonomy_SC.txt",sep = '\t', header = TRUE)
head(tax)

## merge the above two files: the merging is necessary as the two files have different row length. 
## We have filtered out chloroplast and mitochondria from the asv but not from taxonomy. By merging we will match them.

merged_file<-merge(otu,tax,by.x=c("OTUID"),by.y=c("OTUID"))

## output merged file

write.table(merged_file, file = "combined_otu_tax_SC",sep = '\t',col.names = TRUE,row.names = FALSE)

## You need to open the merged files and create two files. 
## one for taxonomy taking the seq and taxonomy columns (also change the ';' separated taxon to column).
## This is done manually

## asv table with sample names and abundance of each sequence. 
## Then these independent files will be imported to R again for further processing.

### upload the final tax and seq(otu) files and convert into phyloseq object

## load the following libraries or install if you don't have

library(ggplot2)
library(phyloseq)
library(ape)
library(biomformat)

## read seq table

otu_table=read.csv("seq_final_SC.csv", sep = ",",row.names = 1)
otu_table=as.matrix(otu_table)

## read in taxonomy: have been separated into different taxon level above- manually in excel

taxonomy=read.csv("tax_final_SC.csv",sep="," ,row.names=1)
taxonomy=as.matrix(taxonomy)

## metadata=read.table("metadata.txt",row.names = 1)# Reading it as txt file created a problem downstream 
## by not showing sample variables in phyloseq object

metadata=read.csv("SC_metadata.csv",sep="," ,row.names=1)
head(metadata)

### import them as phyloseq objects

OTU=otu_table(otu_table,taxa_are_rows=TRUE)
TAX=tax_table(taxonomy)
META=sample_data(metadata)

## cheke that your otu names are consistent across objects

taxa_names(TAX)
taxa_names(OTU)

## make sure files have the same sample names

sample_names(OTU)
sample_names(META)

## Everything looks good --- now proceed to merging them into one phyloseq object

physeq=phyloseq(OTU,TAX,META)

sample_variables(physeq)## check if sample variables are correctly presented in the phyloseq object

## now collect the Aliivibrio and adjust for sequencing depth

can_rhi_2016.fischeri <- subset_taxa(physeq, Genus == "Aliivibrio" )

## remove internal standard from dataset

can_rhi_2016.no_fischeri <- subset_taxa(physeq, Genus != "Aliivibrio")

## now using the internal standard to correct the abundance

fischeri <- as.vector(sample_sums(can_rhi_2016.fischeri))   ## vector created for the internal standard
wi <- 1e-10  ## weight of internal standard gDNA added to the samples
gi <- 4.49e-15  ### Weight of genome of the internal standard
ci <- 8   ##16S copy number of the internal standard
adj <- wi/gi*ci
fischeri.1 <- adj/fischeri
fischeri.1[is.infinite(fischeri.1)] <- 0
snv.table <- as.matrix(can_rhi_2016.no_fischeri@otu_table)  ### export asv table as matrix
snv.table.norm <- as.data.frame(sweep(snv.table, 2, fischeri.1, `*`))  # This divides the abundance matrix 
                                                                      # by the vector to normalize the data
tax.table.norm <- as.matrix(can_rhi_2016.no_fischeri@tax_table)  # This will extract taxonomic information 
                                                                # excluding the internal standard 

## recreate the phyloseq object using the internal standard abundance adjusted otutable
## import them as phyloseq objects

OTU=otu_table(snv.table.norm,taxa_are_rows=TRUE)
TAX=tax_table(tax.table.norm)
META=sample_data(metadata)

## check that your otu names are consistent across objects

taxa_names(TAX)
taxa_names(OTU)

## make sure files have the same sample names

sample_names(OTU)
sample_names(META)

## Everything looks good --- now proceed to merging them into one phyloseq object

physeq.norm=phyloseq(OTU,TAX,META)
sample_variables(physeq.norm)

## Further processing may be needed based on your data. For example to remove duplicates (PCR and extraction), checks, etc.
## You can subset by week separate fiels or you can during analysis within your phyloseq object 

rhiz_2016.norm.w1 <- subset_samples(can_2016.no.dup.2, Week == "1")   # example Subset to soil w1 samples

rhiz_2016.norm.2.w1<- prune_taxa(taxa_sums(rhiz_2016.norm.w1) > 0, rhiz_2016.norm.w1) # Remove zero-sum taxa before                                                                                             # outputting

## Remove zero-sum taxa before outputting (this also makes subsequent processing steps proceed much more quickly)

can_2016.norm.soil_zeor_fintered <- prune_taxa(taxa_sums(can_2016.no.dup.2) > 0, can_2016.no.dup.2)

## save as phyloseq object the zero sum filtered object
save(can_2016.norm.soil_zeor_fintered,file="D:/out_puts/..../myphylo_zero_sum_filtered_SC_17.RData")

## export taxa, asv and metadata files of the filtered phyloseq object

can.snv.table.2 <- as.matrix(can_2016.norm.soil_zeor_fintered@otu_table)
can.tax.table.2 <- as.matrix(can_2016.norm.soil_zeor_fintered@tax_table)
can.sam.table.2 <- as.data.frame(can_2016.norm.soil_zeor_fintered@sam_data)

# write the above files
write.csv(can.snv.table.2, "D:/out_puts/...../can.asv.rhizo_SC.csv")
write.csv(can.tax.table.2, "D:/out_puts/...../can.tax.rhizo_SC.csv")
write.csv(can.sam.table.2, "D:/out_puts/....../can.sam.rhizo_SC.csv")

##creat biomfile: for future use when needed

can.2016.biom.rhizo <- make_biom(can.snv.table.2, can.sam.table.2, can.tax.table.2)
write_biom(can.2016.biom.rhizo, "D:/out_puts//canola_rhizo_SC.biom")

# ........................................... # .............................. # .............................. # .............................

#Summary_statistics_canola_sequence_data.R file
### This scripts are to do several summary stats on the final phyloseq object
# load the phyloseq object saved as can.physq.data.2016
#### Summary statistics 2016 canola sequence data##
# load can.physq.data.2016 object

## Published article title: Core and Differentially Abundant Bacterial Taxa in the Rhizosphere of Field Grown Brassica #napus Genotypes: Implications for Canola Breeding

##2016 phyloseq object

can_2016.norm.soil_zeor_fintered

# There ate 12567 zero sum taxa in 477 samples
#phyloseq-class experiment-level object
#otu_table()   OTU Table:         [ 12567 taxa and 477 samples ]
#sample_data() Sample Data:       [ 477 samples by 22 sample variables ]
#tax_table()   Taxonomy Table:    [ 12567 taxa by 7 taxonomic ranks ]

# number of taxa

ntaxa(can_2016.norm.soil_zeor_fintered) # there are 12567 taxa

#number of samples

nsamples(can_2016.norm.soil_zeor_fintered)## there are 477 samples 


### summery of the sample reads

sum(sample_sums(can_2016.norm.soil_zeor_fintered))## 1571433759 total number of reads in the dataset
min(sample_sums(can_2016.norm.soil_zeor_fintered))### 5925.293 minimum number of reads in the dataset
max(sample_sums(can_2016.norm.soil_zeor_fintered))### 354234808 maximum number of reads in the dataset
mean(sample_sums(can_2016.norm.soil_zeor_fintered))### 3294410 mean number of reads in the dataset

### creat a data table with total number of reads per sample (per canola line)

library(data.table)
sample.read.per.sample = data.table(as(sample_data(can_2016.norm.soil_zeor_fintered), "data.frame"),
                                    TotalReads = sample_sums(can_2016.norm.soil_zeor_fintered), keep.rownames = TRUE)

## write as csv fiel

write.csv(sample.read.per.sample, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/totalread.per.sample_gg.csv')


## if you want to work on the rounded abundance valuece to nearest intiger run the following code first

canola2016_nearest_integer_gg<-ceiling(otu_table(can_2016.norm.soil_zeor_fintered))

### merge the otu table with integer values with taxa table and meta data

can.physq.data.2016.nearest.intiger_gg<-merge_phyloseq(canola2016_nearest_integer_gg,tax_table(can_2016.norm.soil_zeor_fintered),
                                                    sample_data(can_2016.norm.soil_zeor_fintered))

#save nearest intiger as RData for later easier loading

save(can.physq.data.2016.nearest.intiger_gg,file='D:/out_puts/phyloseq/with_green_genes_taxa/canola2016_nearest_integer_filtered_gg.RData')

## based on rounded to nearest intiger data

sum(sample_sums(can.physq.data.2016.nearest.intiger_gg))##1571475596 total number of reads in the dataset
min(sample_sums(can.physq.data.2016.nearest.intiger_gg))### 5936 minimum number of reads in the dataset
max(sample_sums(can.physq.data.2016.nearest.intiger_gg))### 354234976 maximum number of reads in the dataset
mean(sample_sums(can.physq.data.2016.nearest.intiger_gg))### 3294498 mean number of reads in the dataset

## the following is a function to get the most abundandant taxa for each sample and its corresponding abundance:find.top.taxa

find.top.taxa <- function(x,taxa){
  require(phyloseq)
  top.taxa <- tax_glom(x, taxa)
  otu <- otu_table(t(top.taxa))
  if (taxa_are_rows(otu)){
    otu <- t(otu)
  }
  tax <- tax_table(top.taxa)
  j<-apply(otu,1,which.max)
  k <- j[!duplicated(j)]
  l <- data.frame(tax[k,])
  m <- data.frame(otu[,k])
  colnames(m) = l[,taxa]
  n <- colnames(m)[apply(m,1,which.max)]
  m[,taxa] <- n
  return(m)
}
top.taxa <- find.top.taxa(can_2016.norm.soil_zeor_fintered,"Phylum")

#

### use the find.top.taxa function to get the most abundant phyla within each sample

top.taxa <- find.top.taxa(can_2016.norm.soil_zeor_fintered,"Phylum")

### save the top phyla per sample as csv file: summarize frequency of 

write.csv(top.taxa, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/top.phylum.per.sample_gg.csv')


#### creat csv file with prevalence and total abunndance of each snv and add taxomomy of each snv

prevelancedf = apply(X = otu_table(can_2016.norm.soil_zeor_fintered),
                     MARGIN = 1,
                     FUN = function(x){sum(x > 0)})                              ### calculate prevalence of each snv


# Add taxonomy and total read counts to this data.frame

prevelancedf = data.frame(Prevalence = prevelancedf,
                          TotalAbundance = taxa_sums(can_2016.norm.soil_zeor_fintered),
                          tax_table(can_2016.norm.soil_zeor_fintered))
prevelancedf[1:10,]

# write it as a csv file

write.csv(prevelancedf, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/Prevalenceandtotalabundace_each_snv_gg.csv')

## plot ##Plot of all phylum showing total abunandce and relative prevalence in each sample

library(ggplot2)

prevelancedf1 = subset(prevelancedf, Phylum %in% get_taxa_unique(can_2016.norm.soil_zeor_fintered, taxonomic.rank = "Phylum"))

ggplot(prevelancedf1, aes(TotalAbundance, Prevalence / nsamples(can_2016.norm.soil_zeor_fintered),color=Phylum)) +theme_bw()+
  geom_point(size = 2) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") + theme(axis.text.x=element_text(angle=90)) +
  facet_wrap(~Phylum) + theme(legend.position="none")

#prevelance/abundance phylum 

phylum_prevalece_totaladundance=plyr::ddply(prevelancedf, "Phylum", function(df1){
  data.frame(frequence=length(df1$Prevalence), mean_prevalence=mean(df1$Prevalence),total_abundance=sum(df1$TotalAbundance,na.rm = T),stringsAsFactors = F)
})

# write it as a csv file
write.csv(phylum_prevalece_totaladundance, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/PHYLUM_Prevalence_totalabundace_gg.csv')


#### Summary of the three major phyla : total abundance, mean, Acidobacteria, Actinobacteria, Proteobacteria 

threemajorphyla = subset_taxa(can_2016.norm.soil_zeor_fintered, Phylum=="Acidobacteria" | Phylum=="Actinobacteria" 
                              |Phylum=="Proteobacteria")  ## subset the three phyla that you want to have summary for


threemajorphyla_glom=tax_glom(threemajorphyla, taxrank="Phylum")

threemajorphyla_glom_df = psmelt(threemajorphyla_glom)# change to data frame

summary_threemajorphyla=ddply(threemajorphyla_glom_df,c("CanolaLine","Phylum"), summarise, N=length(Abundance), 
                   Average_Abundace=mean(Abundance),Minimum=min(Abundance),Maximum=max(Abundance), Total=sum(Abundance))

### a data frame with the summary you want

write.csv(summary_threemajorphyla,"D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/summary_threemajorphyla_perline_gg.csv")# save this summary for reporting


# .............................. # ............................................ # ...................................... # .............................
#Chapter 3
#Title:Core and Differentially Abundant Bacterial Taxa in the Rhizosphere of Field Grown Brassica napus Genotypes:
#Implications for Canola Breeding 
#Taye et al., 2020

setwd("D:/.. ") # set working directory
# load
library(microbiomeSeq)
library(phyloseq)
library(ggplot2)
library(vegan)
library(plotly)

## Alpha diversity
# with canola line as grouping
## rename YN04-C1213 TO NAM-94
#load the NAM-94 Corrected sample data

can.sam.rhizo_gg <- read.csv("D:/out_puts/phyloseq/with_green_genes_taxa/can.sam.rhizo_gg.csv", sep=",", row.names=1)## new                                                                           # sample_data with corrected canola line neame
sampledata=as.data.frame(can.sam.rhizo_gg)
head(sampledata)

# turn the sampledata dataframe into sample_data. Don't forget this step otherwise your new variable will not be merged in the phyloseq object you create

sampledata=sample_data(sampledata)

## merge the sample data with the phyloseq object using merge_phyloseq 

phyloseq.object.corrected.line.names.2016 = merge_phyloseq(can.physq.data.2016.nearest.intiger_gg, sampledata)

# save this phyloseq object for later use

save(phyloseq.object.corrected.line.names.2016,file="D:/out_puts/phyloseq/with_green_genes_taxa/phyloseq.object.corrected.line.names.2016_gg.RData")

# check levels of the canola.Line variable

sample_variables(phyloseq.object.corrected.line.names.2016) ## your new variable is added, now proceed with your analysis

### Different normalizations affect the diversity analysis. 

# First without any normalization as we have already normalized our data with internal standard. Eveness was not significant with internal standadr normalized data, so it is removed.

p<-plot_anova_diversity(phyloseq.object.corrected.line.names.2016, method = c("richness", "simpson", "shannon","evenness"), grouping_column = "Canola.Lines", pValueCutoff = 0.05) 

p=p+xlab("Canola Lines")

p=p+theme(axis.title.x = element_text(size = 14, face = "bold"), axis.title.y = element_text(size = 14, face = "bold"))
p=p+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
print(p)

# normalize abundance by relative abundance method, relative abundance normlization creates decimal numbers and the alpha diversity analsis in vegan only takes intgers. 

phyloseq.object.corrected.line.names.2016.relative = normalise_data(phyloseq.object.corrected.line.names.2016, norm.method = "relative")
phyloseq.object.corrected.line.names.2016.relative_integer<-ceiling(otu_table(phyloseq.object.corrected.line.names.2016.relative))
phyloseq.object.corrected.line.names.2016.relative_integer<-merge_phyloseq(phyloseq.object.corrected.line.names.2016.relative_integer,tax_table(phyloseq.object.corrected.line.names.2016.relative),sample_data(phyloseq.object.corrected.line.names.2016.relative))

#p1<-plot_anova_diversity(phyloseq.object.corrected.line.names.2016.relative, method = c("richness", "simpson", "shannon","evenness"), grouping_column = "Canola.Lines", pValueCutoff = 0.05) 
p1<-plot_anova_diversity(phyloseq.object.corrected.line.names.2016.relative_integer, method = c("richness", "simpson", "shannon","evenness"), grouping_column = "Canola.Lines", pValueCutoff = 0.05) 

p1=p1+xlab("Canola Lines")

p1=p1+theme(axis.title.x = element_text(size = 14, face = "bold"), axis.title.y = element_text(size = 14, face = "bold"))
p1=p1+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
print(p1)


### Alpha divrsity comparison following: random sub sampling normalization approach

set.seed(TRUE)
phyloseq.object.corrected.line.names.2016.randsubsampling = normalise_data(phyloseq.object.corrected.line.names.2016, norm.method = "randomsubsample")
p1<-plot_anova_diversity(phyloseq.object.corrected.line.names.2016.randsubsampling, method = c("richness", "simpson", "shannon","evenness"), grouping_column = "Canola.Lines", pValueCutoff = 0.05) 

p1=p1+xlab("Canola Lines")

p1=p1+theme(axis.title.x = element_text(size = 12, face = "bold", family="TT Times New Roman"), axis.title.y = element_text(size = 12, face = "bold", family="TT Times New Roman"))
p1=p1+theme(axis.text.x = element_text(size=10, family="TT Times New Roman"), axis.text.y=element_text(size = 10,family="TT Times New Roman"))
print(p1)

# Alpha diversiy comparison following edgeR normlalization method : I will be using the edgeR normalized alpha diversity output for the manuscript.

phyloseq.object.corrected.line.names.2016.edgernorm = normalise_data(phyloseq.object.corrected.line.names.2016, norm.method = "edgernorm")
p2<-plot_anova_diversity(phyloseq.object.corrected.line.names.2016.edgernorm, method = c("richness","evenness"), grouping_column = "Canola.Lines", pValueCutoff = 0.05, outfile='D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/alpha_diversity_line_rich_evennes.csv') 

p2=p2+xlab("")

p2=p2+theme(axis.title.x = element_blank(), axis.title.y = element_text(12, face = "bold", family="TT Times New Roman", colour="black"))
p2=p2+theme(axis.text.x = element_text(size=8, family="TT Times New Roman"), axis.text.y=element_text(size = 8, family="TT Times New Roman"))+theme(legend.position = "none")
print(p2)

### With sampling week as a grouping factor
#creat a factor column for week as it is indicated in intiger names

sample_data(phyloseq.object.corrected.line.names.2016.edgernorm)$Weeks = as.factor(sample_data(phyloseq.object.corrected.line.names.2016.edgernorm)$Week)

## now plot the alphadiversity :eveness was not significant

p3<-plot_anova_diversity(phyloseq.object.corrected.line.names.2016.edgernorm, method = c("richness", "simpson", "shannon", "evenness"), grouping_column = "Weeks", pValueCutoff = 0.05) 

p3=p3+xlab("Sampling Week")

p3=p3+theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p3=p3+theme(axis.text.x = element_text(size=12, angle = 0), axis.text.y=element_text(size = 12))

print(p3)

####2016 beta-diversity

#PERMANOVA ON edgeR method normalized data using adonis: using phyloseq # 

set.seed(100)
can_bray.2016 <-phyloseq::distance(phyloseq.object.corrected.line.names.2016.edgernorm, method = "bray")# calculate bray                                                                                                    curtis distance matrix

sampledf = data.frame(sample_data(phyloseq.object.corrected.line.names.2016.edgernorm))# make a data frame from the                                                                                               sample_data of the phyloseq object

adonis(can_bray.2016~Canola.Lines, data = sampledf) # perform permanova test using adonis function in Vegan ## is significant- rejucting that genotypes have the same centroid 

beta = betadisper(can_bray.2016, sampledf$Canola.Lines)# Homogenity of dispersion test 

permutest(beta)## was not significant - so we can not reject the null hypothessis that canola genotypes have the same disperssion. This makes accepting the adonis result and that it is not due to difference in group disperssion

#Calculate mean distance between genotypes

can_bray_mean = meandist(can_bray.2016, sampledf$Canola.Lines)## calculates mean distance between canola lines
head(can_bray_mean) ## this is a dist object, so first change it into matrix
library(reshape) # you need this to export the distance measures in data.frame format
can_bray_mean_matrix = as.matrix(can_bray_mean) # change it into matrix
m2 <- melt(can_bray_mean_matrix)[melt(upper.tri(can_bray_mean_matrix))$value,] # use melt, upper.tri from reshape 
names(m2) <- c("c1", "c2", "distance") # give name to the columns
m2 # check the out put

write.table(m2,'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/can_bray_mean_line.2016.csv', sep = ",")                                                                                              # write the output as csv file

# plot and correlate genetic distance between reference line and genotypes with bray distane between reference line and genotypes

library(ggpubr)
bray_genetic_distance_with_NAM_0 = as.data.frame(bray_genetic_distance_with_NAM_0) # as.dataframe

p4= ggscatter(bray_genetic_distance_with_NAM_0, x = "Genetic_Dstance", y = "Mean_Bray_Curtis", 
              add = "reg.line", conf.int = TRUE,
              cor.coef = TRUE, cor.method = "pearson",
              xlab="", ylab = "Mean Bray-curtis")## scater plot with trend line and correlation coefficient with p value 

p4= p4 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p4=p4+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p4

#2016 multivariate: subseted into Vegetative (1,2,3) Flowering(4,5,6,7) and maturity (8,9,10)

#subset week 123: vegetative

## adonis test for sampling week

adonis(can_bray.2016~Week, data = sampledf) #significant
beta_w = betadisper(can_bray.2016, as.factor(sampledf$Week))
permutest(beta_w)# also significant therefore dificult to trust adonis significant difference as it could be due to the the variation in disperssion

## plot PCoA ordination plot using the calculated bray distance 
can.2016.pcoa = ordinate(phyloseq.object.corrected.line.names.2016.edgernorm, method = "PCoA", distance = "bray")
plot_ordination(physeq = phyloseq.object.corrected.line.names.2016.edgernorm, ordination = can.2016.pcoa, color = "Canola.Lines")
#

##Dominat taxa persample: 
library(microbiome)
library(ggpubr)
library(knitr)
library(dplyr)

dp = dominant(physeqGenus, level = "Phylum")
dp = dominant(physeqGenus, level = "Class")
dp = dominant(physeqGenus, level = "Order")
dp = dominant(physeqGenus, level = "Family")
dp = dominant(physeqGenus, level = "Genus")


kable(head(dp))


#Phylum to genus
write.csv(dp, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/dominant_taxa_per_sample_Genus.csv')

### Dominant taxonomic groups with each canola line

dp = dominant(NAM0_0, level = "Phylum")
dp = dominant(NAM0_0, level = "Class")
dp = dominant(NAM0_0, level = "Order")
dp = dominant(NAM0_0, level = "Family")
dp = dominant(NAM0_0, level = "Genus")
##
dp = dominant(NAM_13, level = "Phylum")
dp = dominant(NAM_13, level = "Class")
dp = dominant(NAM_13, level = "Order")
dp = dominant(NAM_13, level = "Family")
dp = dominant(NAM_13, level = "Genus")
##
dp = dominant(NAM_14, level = "Phylum")
dp = dominant(NAM_14, level = "Class")
dp = dominant(NAM_14, level = "Order")
dp = dominant(NAM_14, level = "Family")
dp = dominant(NAM_14, level = "Genus")
##
dp = dominant(NAM_17, level = "Phylum")
dp = dominant(NAM_17, level = "Class")
dp = dominant(NAM_17, level = "Order")
dp = dominant(NAM_17, level = "Family")
dp = dominant(NAM_17, level = "Genus")
##
dp = dominant(NAM_23, level = "Phylum")
dp = dominant(NAM_23, level = "Class")
dp = dominant(NAM_23, level = "Order")
dp = dominant(NAM_23, level = "Family")
dp = dominant(NAM_23, level = "Genus")
##
dp = dominant(NAM_32, level = "Phylum")
dp = dominant(NAM_32, level = "Class")
dp = dominant(NAM_32, level = "Order")
dp = dominant(NAM_32, level = "Family")
dp = dominant(NAM_32, level = "Genus")
##
dp = dominant(NAM_37, level = "Phylum")
dp = dominant(NAM_37, level = "Class")
dp = dominant(NAM_37, level = "Order")
dp = dominant(NAM_37, level = "Family")
dp = dominant(NAM_37, level = "Genus")
##
dp = dominant(NAM_43, level = "Phylum")
dp = dominant(NAM_43, level = "Class")
dp = dominant(NAM_43, level = "Order")
dp = dominant(NAM_43, level = "Family")
dp = dominant(NAM_43, level = "Genus")
##
dp = dominant(NAM_46, level = "Phylum")
dp = dominant(NAM_46, level = "Class")
dp = dominant(NAM_46, level = "Order")
dp = dominant(NAM_46, level = "Family")
dp = dominant(NAM_46, level = "Genus")
##
dp = dominant(NAM_48, level = "Phylum")
dp = dominant(NAM_48, level = "Class")
dp = dominant(NAM_48, level = "Order")
dp = dominant(NAM_48, level = "Family")
dp = dominant(NAM_48, level = "Genus")
##
dp = dominant(NAM_5, level = "Phylum")
dp = dominant(NAM_5, level = "Class")
dp = dominant(NAM_5, level = "Order")
dp = dominant(NAM_5, level = "Family")
dp = dominant(NAM_5, level = "Genus")
##
dp = dominant(NAM_72, level = "Phylum")
dp = dominant(NAM_72, level = "Class")
dp = dominant(NAM_72, level = "Order")
dp = dominant(NAM_72, level = "Family")
dp = dominant(NAM_72, level = "Genus")
##
dp = dominant(NAM_76, level = "Phylum")
dp = dominant(NAM_76, level = "Class")
dp = dominant(NAM_76, level = "Order")
dp = dominant(NAM_76, level = "Family")
dp = dominant(NAM_76, level = "Genus")
##
dp = dominant(NAM_79, level = "Phylum")
dp = dominant(NAM_79, level = "Class")
dp = dominant(NAM_79, level = "Order")
dp = dominant(NAM_79, level = "Family")
dp = dominant(NAM_79, level = "Genus")
##
dp = dominant(NAM_94, level = "Phylum")
dp = dominant(NAM_94, level = "Class")
dp = dominant(NAM_94, level = "Order")
dp = dominant(NAM_94, level = "Family")
dp = dominant(NAM_94, level = "Genus")

###WRITE THE DOMINANTE TAXONOMIC GROUPS IN each line as csv

write.csv(dp, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/Dominant_taxa_per_Line/dominant_Phylum_NAM_94.csv')
write.csv(dp, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/Dominant_taxa_per_Line/dominant_Class_NAM_94.csv')
write.csv(dp, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/Dominant_taxa_per_Line/dominant_Order_NAM_94.csv')
write.csv(dp, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/Dominant_taxa_per_Line/dominant_Family_NAM_94.csv')
write.csv(dp, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/Dominant_taxa_per_Line/dominant_Genus_NAM_94.csv')


##### ....................#######.............................#######..................########

# Heritability analysis

## Import the phyloseq object and EdgeR normalize 
phyloseq.object.corrected.line.names.2016 # this is my phyloseq object

phyloseq.object.corrected.line.names.2016.edgernorm = normalise_data(phyloseq.object.corrected.line.names.2016, norm.method = "edgernorm")
phyloseq.object.corrected.line.names.2016.edgernorm

# Calculate all alpha diversity measures: use phyloseq and microbiome R packages
library(microbiome)
library(knitr)
library(dplyr)
library(ggpubr)

Alpha.can.2016 <- alpha(phyloseq.object.corrected.line.names.2016.edgernorm, index = "all") ## alpha diversity based on whole data

kable(head(Alpha.can.2016))

# get metadata from the phyloseq object and add the calculated diversity measures
can.meta <- meta(phyloseq.object.corrected.line.names.2016.edgernorm)

kable(head(can.meta))

#observed|    chao1| diversity_inverse_simpson| diversity_gini_simpson| diversity_shannon| diversity_fisher|
# diversity_coverage| evenness_camargo| evenness_pielou| evenness_simpson| evenness_evar| evenness_bulla| dominance_dbp|
#dominance_dmn| dominance_absolute| dominance_relative| dominance_simpson| dominance_core_abundance| dominance_gini|
#  rarity_log_modulo_skewness| rarity_low_abundance| rarity_noncore_abundance| rarity_rare_abundance|

### Diversity_shannon, diversity_gini_simpson, diversity_inverse_simpson, evenness_pielou and eveness_simpson 
#were considered for heritability analysis. You can test the other diversity measures if interested

can.meta$observed <- Alpha.can.2016$observed
can.meta$chao1 <- Alpha.can.2016$chao1
can.meta$diversity_inverse_simpson <- Alpha.can.2016$diversity_inverse_simpson
can.meta$diversity_gini_simpson <- Alpha.can.2016$diversity_gini_simpson
can.meta$diversity_shannon <- Alpha.can.2016$diversity_shannon
can.meta$diversity_fisher <- Alpha.can.2016$diversity_fisher
can.meta$diversity_coverage <- Alpha.can.2016$diversity_coverage
can.meta$evenness_camargo <- Alpha.can.2016$evenness_camargo
can.meta$evenness_pielou <- Alpha.can.2016$evenness_pielou
can.meta$evenness_simpson <- Alpha.can.2016$evenness_simpson
can.meta$evenness_evar <- Alpha.can.2016$evenness_evar
can.meta$evenness_bulla <- Alpha.can.2016$evenness_bulla
can.meta$dominance_dbp <- Alpha.can.2016$dominance_dbp
can.meta$dominance_dmn <- Alpha.can.2016$dominance_dmn
can.meta$dominance_absolute <- Alpha.can.2016$dominance_absolute
can.meta$dominance_relative <- Alpha.can.2016$dominance_relative
can.meta$dominance_simpson <- Alpha.can.2016$dominance_simpson
can.meta$dominance_core_abundance <- Alpha.can.2016$dominance_core_abundance
can.meta$dominance_gini <- Alpha.can.2016$dominance_gini
can.meta$rarity_log_modulo_skewness <- Alpha.can.2016$rarity_log_modulo_skewness
can.meta$rarity_low_abundance <- Alpha.can.2016$rarity_low_abundance
can.meta$rarity_noncore_abundance <- Alpha.can.2016$rarity_noncore_abundance
can.meta$rarity_rare_abundance <- Alpha.can.2016$rarity_rare_abundance

####
write.csv(can.meta, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/all_alpha_diversity_measures_microbiomer.csv')
Plot=as.factor(can.meta$Plot)

#### use the diversity measures to calculate broad-sense heritability
### variance components were extracted from each fitted models from which table 5 in manuscript is prepared

library(lme4)

#m1 = lmer(log(chao1)~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
#summary(m1) # 
#h1= (0.000001146)/((0.000001146) +(( 0.000016432)/10)+((0.000435987)/30))

m2 = lmer(diversity_inverse_simpson~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
summary(m2)

h2= (46.79 )/((46.79 )+((204.49)/10)+((3688.61)/30))

m3 = lmer(diversity_gini_simpson~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
summary(m3) 
h3= (0.00002678)/((0.00002678) +((0.00002116)/10)+((0.00132184/30)))


m4 = lmer(diversity_shannon~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
summary(m4)
h4= (0.003671)/((0.003671) +((0.019060)/10)+ ((0.542428)/30))

m5 = lmer(evenness_pielou~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
summary(m5)
h5= ( 0.0000412)/( (0.0000412) +((0.0002139)/10) +((0.0060884)/30))


m6 = lmer(evenness_simpson~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
summary(m6)
h6= (0.0000002963)/((0.0000002963) +((0.0000012948)/10) + ((0.0000233561)/30))

#m7 = lmer(dominance_dbp~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
#summary(m7)
#h7= (0.0001648)/((0.0001648) +((0.0004511)/10) + ((0.0057704)/30))

#m8 = lmer(dominance_dmn~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
#summary(m8)
#h8= (0.0002727)/((0.0002727) +(( 0.0007020)/10) + ((0.0105888)/30))

#m9 = lmer(dominance_relative~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
#summary(m9)
#h9= (0.0001648)/((0.0001648) +(( 0.0004511)/10) + ((0.0057704)/30))

#m10 = lmer(dominance_simpson~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
#summary(m10)
#h10= (0.00002678)/((0.00002678) +(( 0.00002116)/10) + ((0.00132184)/30))

#m11 = lmer(rarity_low_abundance~(1|CanolaLine) + (1|CanolaLine:Weeks), data=all_alpha_diversity_measures_microbiomer, REML = TRUE)
#summary(m11)
#h11= (0.00002116 +0.00002678)/(0.00002116 +0.00002678 +0.0001648+0.00132184)

##################     //          #######################################

#### subset data to only flowering - calculate alpha diversity measures and estimate heritability

can.Flowering.2016 = subset_samples(phyloseq.object.corrected.line.names.2016, Week=="4"|Week=="5"|Week=="6"|Week=="7") ##                                                                                    #subset to weeks 4,5,6,7:Flowering stage

can.Flowering.2016.edgernorm = normalise_data(can.Flowering.2016, norm.method = "edgernorm")## normalize the data using                                                                                                edgeR normalization

Alpha.can.2016.Flowering <- alpha(can.Flowering.2016.edgernorm, index = "all")

# get metadata from the phyloseq object and add the calculated diversity measures

can.meta <- meta(can.Flowering.2016.edgernorm)

kable(head(can.meta))

#
can.meta$observed <- Alpha.can.2016.Flowering$observed
can.meta$chao1 <- Alpha.can.2016.Flowering$chao1
can.meta$diversity_inverse_simpson <- Alpha.can.2016.Flowering$diversity_inverse_simpson
can.meta$diversity_gini_simpson <- Alpha.can.2016.Flowering$diversity_gini_simpson
can.meta$diversity_shannon <- Alpha.can.2016.Flowering$diversity_shannon
can.meta$diversity_fisher <- Alpha.can.2016.Flowering$diversity_fisher
can.meta$diversity_coverage <- Alpha.can.2016.Flowering$diversity_coverage
can.meta$evenness_camargo <- Alpha.can.2016.Flowering$evenness_camargo
can.meta$evenness_pielou <- Alpha.can.2016.Flowering$evenness_pielou
can.meta$evenness_simpson <- Alpha.can.2016.Flowering$evenness_simpson
can.meta$evenness_evar <- Alpha.can.2016.Flowering$evenness_evar
can.meta$evenness_bulla <- Alpha.can.2016.Flowering$evenness_bulla
can.meta$dominance_dbp <- Alpha.can.2016.Flowering$dominance_dbp
can.meta$dominance_dmn <- Alpha.can.2016.Flowering$dominance_dmn
can.meta$dominance_absolute <- Alpha.can.2016.Flowering$dominance_absolute
can.meta$dominance_relative <- Alpha.can.2016.Flowering$dominance_relative
can.meta$dominance_simpson <- Alpha.can.2016.Flowering$dominance_simpson
can.meta$dominance_core_abundance <- Alpha.can.2016.Flowering$dominance_core_abundance
can.meta$dominance_gini <- Alpha.can.2016.Flowering$dominance_gini
can.meta$rarity_log_modulo_skewness <- Alpha.can.2016.Flowering$rarity_log_modulo_skewness
can.meta$rarity_low_abundance <- Alpha.can.2016.Flowering$rarity_low_abundance
can.meta$rarity_noncore_abundance <- Alpha.can.2016.Flowering$rarity_noncore_abundance
can.meta$rarity_rare_abundance <- Alpha.can.2016.Flowering$rarity_rare_abundance

####
write.csv(can.meta, 'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/all_alpha_diversity_measures_flowering_microbiomer.csv')

# 
#### use the diversity measures to calculate broad-sense heritability at flowering

library(lme4)

#m11 = lmer(chao1~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
#summary(m11) #model did not converge
#h11 = (123775943273)/(123775943273 + 2483337164217)

m12 = lmer(diversity_inverse_simpson~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
summary(m12)

h12= (403.1)/((403.1) + ((3366.9)/12))

m13 = lmer(diversity_gini_simpson~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
summary(m13)
h13= (0.00005502)/((0.00005502) + ((0.00114562)/12))


m14 = lmer(diversity_shannon~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
summary(m14)
h14= (0.0489)/((0.0489) +((0.5645)/12))

m15 = lmer(evenness_pielou~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
summary(m15)
h15= (0.0005489)/((0.0005489) + ((0.0063357)/12))

m16 = lmer(evenness_simpson~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
summary(m16)
h16= (0.000002552)/((0.000002552) + ((0.000021319)/12))

#m17 = lmer(dominance_dbp~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
#summary(m17)
#h17= (0.0003317)/((0.0003317) + ((0.0063080)/12))

#m18 = lmer(dominance_dmn~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
#summary(m18)
#h18= (0.0008469)/((0.0008469) + ((0.0111165)/12))

#m19 = lmer(dominance_relative~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
#summary(m19)
#h19= (0.0003317)/((0.0003317) + ((0.0063080)/12))

#m20 = lmer(dominance_simpson~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
#summary(m20)
#h20= (0.00005502)/((0.00005502) + ((0.00114562)/12))

#m21 = lmer(rarity_low_abundance~(1|CanolaLine), data=all_alpha_diversity_measures_flowering_microbiomer, REML = TRUE)
#summary(m21)
#h21= (0.0002578)/((0.0002578) + ((0.0097715)/12))

############################################ //######################################################

##PERMANOVA, correlation between bray-curtis and genotype genetic distance.

phyloseq.object.corrected.line.names.2016 ## 2016 canola rhizosphere dataset

## Vegetative stage

can.vegetative.2016 = subset_samples(phyloseq.object.corrected.line.names.2016, Week=="1"|Week=="2"|Week=="3") ## subset to                                                                                               weeks 1,2,3:vegetative stage

can.vegetative.2016.edgernorm = normalise_data(can.vegetative.2016, norm.method = "edgernorm")## normalize the data using                                                                                                   edgeR normalization

###PERMANOVA on the normalized data using adonis: using phyloseq # 

set.seed(100)
can_bray.2016.veg <-phyloseq::distance(can.vegetative.2016.edgernorm, 
                                       method = "bray")# calculate bray curtis distance matrix

sampledf.vg = data.frame(sample_data(can.vegetative.2016.edgernorm))# make a data frame from the sample_data of the                                                                             phyloseq object

adonis(can_bray.2016.veg~Canola.Lines, data = sampledf.vg) # perform permanova test using adonis function in Vegan ## is not significant- accepting that genotypes have the same centroid 

beta = betadisper(can_bray.2016.veg, sampledf.vg$Canola.Lines)# Homogenity of dispersion test 

permutest(beta)## was  significant - so we can  reject the null hypothessis that canola genotypes have the same disperssion 
#we did not proceed for correlating mean bray distance with genetic distance since we have not observed significant variation in bray distance.

## Flowering stage

can.Flowering.2016 = subset_samples(phyloseq.object.corrected.line.names.2016, Week=="4"|Week=="5"|Week=="6"|Week=="7") ##                                                                                    subset to weeks 4,5,6,7:Flowering stage

can.Flowering.2016.edgernorm = normalise_data(can.Flowering.2016, norm.method = "edgernorm")## normalize the data using                                                                                                edgeR normalization 

#PERMANOVA ON edgeR method normalized data using adonis: using phyloseq # 

set.seed(100)
can_bray.2016.fl <-phyloseq::distance(can.Flowering.2016.edgernorm, 
                                      method = "bray")# calculate bray curtis distance matrix

sampledf.fl = data.frame(sample_data(can.Flowering.2016.edgernorm))# make a data frame from the sample_data of the phyloseq                                                                     object

adonis(can_bray.2016.fl~Canola.Lines, data = sampledf.fl) # perform permanova test using adonis function in Vegan ## was significant- rejecting that genotypes have the same centroid 

beta = betadisper(can_bray.2016.fl, sampledf.fl$Canola.Lines)# Homogenity of dispersion test 

permutest(beta)## was not significant - so we can not reject the null hypothessis that canola genotypes have the same disperssion. 

#### calculate mean bray-curtis distance between each pair of genotypes : then correlate it with genetic distance

#Calculate mean distance between genotypes

can_bray_mean.fl = meandist(can_bray.2016.fl, sampledf.fl$Canola.Lines)## calculates mean distance between canola lines
head(can_bray_mean.fl) ## this is a dist object, so first change it into matrix
library(reshape) # you need this to export the distance measures in data.frame format
can_bray_mean_matrix.fl = as.matrix(can_bray_mean.fl) # change it into matrix
m3 <- melt(can_bray_mean_matrix.fl)[melt(upper.tri(can_bray_mean_matrix.fl))$value,] # use melt, upper.tri from reshape 
names(m3) <- c("c1", "c2", "distance") # give name to the columns
m3 # check the out put

write.table(m3,'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/can_bray_mean_line.2016.flowering.csv', sep = ",") # write the output as csv file

# plot, correlate plant genetic distance + bray distane 

library(ggpubr)

# creat a data frame manually with mean bray dis and genetic dist
#bray curtis- genetic distance correlation based on flowering data, all line pairs

bray_genetic_distance_all_lines_Flowering <- read_excel("D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/bray_genetic_distance_all_lines_Flowering.xlsx")

p12= ggscatter(bray_genetic_distance_all_lines_Flowering, x = "Genetic.Distance", y = "Mean.Bray.Curtis", 
               add = "reg.line", conf.int = TRUE,
               cor.coef = TRUE, cor.method = "pearson",
               xlab="", ylab = "")## scater plot with trend line and correlation coefficient with p value 
p12= p12 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p12=p12+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p12 ## positive and significant correlation

## Maturity

can.maturity.2016 = subset_samples(phyloseq.object.corrected.line.names.2016, Week=="8"|Week=="9"|Week=="10") ## subset to                                                                                              weeks 8. 9,10 - maturity stage

can.maturity.2016.edgernorm = normalise_data(can.maturity.2016, norm.method = "edgernorm")## normalize the data using edgeR                                                                                              normalization

#PERMANOVA ON edgeR method normalized data using adonis: using phyloseq # 

set.seed(100)
can_bray.2016.ma <-phyloseq::distance(can.maturity.2016.edgernorm, 
                                      method = "bray")# calculate bray curtis distance matrix
sampledf.ma = data.frame(sample_data(can.maturity.2016.edgernorm))# make a data frame from the sample_data of the phyloseq                                                                    object
adonis(can_bray.2016.ma~Canola.Lines, data = sampledf.ma) # perform permanova test using adonis function in Vegan ## was not significant- Acepting that genotypes have the same centroid 
beta = betadisper(can_bray.2016.ma, sampledf.ma$Canola.Lines)# Homogenity of dispersion test 
permutest(beta)## was not significant - so we can not reject the null hypothessis that canola genotypes have the same disperssion. 

#we did not proceed for correlating mean bray distance with genetic distance since we have not observed significant variation in bray distance 


#### Weeks 1-7(vegetative to flowering) 

can.Veg.Flow.2016 = subset_samples(phyloseq.object.corrected.line.names.2016, Week=="1"| Week=="2"| Week=="3"|Week=="4"|Week=="5"|
                                     Week=="6"|Week=="7") ## subset to weeks1,2,3 4,5,6,7:vegetative+Flowering stage


can.Veg.Flow.2016.edgernorm = normalise_data(can.Veg.Flow.2016, norm.method = "edgernorm")## normalize the data using edgeR                                                                                              normalization

set.seed(100)
can_bray.2016.vg.fl <-phyloseq::distance(can.Veg.Flow.2016.edgernorm, 
                                         method = "bray")# calculate bray curtis distance matrix
sampledf.vg.fl = data.frame(sample_data(can.Veg.Flow.2016.edgernorm))# make a data frame from the sample_data of the                                                                              phyloseq object
adonis(can_bray.2016.vg.fl~Canola.Lines, data = sampledf.vg.fl) # perform permanova test using adonis function in Vegan ## was significant- rejecting that genotypes have the same centroid 
beta = betadisper(can_bray.2016.vg.fl, sampledf.vg.fl$Canola.Lines)# Homogenity of dispersion test 
permutest(beta)## was not significant - so we can not reject the null hypothessis that canola genotypes have the same disperssion. 


#### calculate mean bray-curtis distance between each pair of genotypes : then correlate it with genetic distance

#Calculate mean distance between genotypes
can_bray_mean.vg.fl = meandist(can_bray.2016.vg.fl, sampledf.vg.fl$Canola.Lines)## calculates mean distance between canola                                                                                   lines
head(can_bray_mean.vg.fl) ## this is a dist object, so first change it into matrix
library(reshape) # you need this to export the distance measures in data.frame format
can_bray_mean_matrix.vg.fl = as.matrix(can_bray_mean.vg.fl) # change it into matrix
m4 <- melt(can_bray_mean_matrix.vg.fl)[melt(upper.tri(can_bray_mean_matrix.vg.fl))$value,] # use melt, upper.tri from                                                                                                   reshape 
names(m4) <- c("c1", "c2", "distance") # give name to the columns
m4 # check the out put

write.table(m4,'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/can_bray_mean_line.2016.vegetative.flowering.csv', sep = ",") # write the output as csv file

# plot and correlate genetic distance between reference line and genotypes with bray distane between reference line and genotypes

library(ggpubr)

# creat a data frame manually with mean bray dis and genetic dist
##bray curtis- genetic distance correlation based on vegetative-flowering data, all line pairs

bray_genetic_distance_all_lines_vegetative_flowering <- read_excel("D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/bray_genetic_distance_all_lines_vegetative_flowering.xlsx")
p13= ggscatter(bray_genetic_distance_all_lines_vegetative_flowering, x = "Genetic.Distance", y = "Mean.Bray.Curtis", 
               add = "reg.line", conf.int = TRUE,
               cor.coef = TRUE, cor.method = "pearson",
               xlab="Genetic distance", ylab = "Mean Bray-Curtis")## scater plot with trend line and correlation coefficien                                                                      t with p value 
p13= p13 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p13=p13+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p13 ## positive and significant correlation

####and weeks 4-10 (flowering to maturity) 

can.Flow.matu.2016 = subset_samples(phyloseq.object.corrected.line.names.2016, Week=="4"|Week=="5"|
                                      Week=="6"|Week=="7"|Week=="8"|Week=="9"|Week=="10") ## subset to weeks 4,5,6,7,8,9,10                                                                                              :Fowering+maturity stage

can.Flow.matu.2016.edgernorm = normalise_data(can.Flow.matu.2016, norm.method = "edgernorm")## normalize the data using                                                                                                   edgeR normalization

set.seed(100)
can_bray.2016.fl.ma <-phyloseq::distance(can.Flow.matu.2016.edgernorm, 
                                         method = "bray")# calculate bray curtis distance matrix
sampledf.fl.ma = data.frame(sample_data(can.Flow.matu.2016.edgernorm))# make a data frame from the sample_data of the                                                                              phyloseq object
adonis(can_bray.2016.fl.ma~Canola.Lines, data = sampledf.fl.ma) # perform permanova test using adonis function in Vegan ## was  significant- rejecting that genotypes have the same centroid 
beta = betadisper(can_bray.2016.fl.ma, sampledf.fl.ma$Canola.Lines)# Homogenity of dispersion test 
permutest(beta)## was not significant - so we can not reject the null hypothessis that canola genotypes have the same disperssion. 

#### calculate mean bray-curtis distance between each pair of genotypes : then correlate it with genetic distance

#Calculate mean distance between genotypes

can_bray_mean.fl.ma = meandist(can_bray.2016.fl.ma, sampledf.fl.ma$Canola.Lines)## calculates mean distance between canola                                                                                    lines
head(can_bray_mean.fl.ma) ## this is a dist object, so first change it into matrix
library(reshape) # you need this to export the distance measures in data.frame format
can_bray_mean_matrix.fl.ma = as.matrix(can_bray_mean.fl.ma) # change it into matrix
m5 <- melt(can_bray_mean_matrix.fl.ma)[melt(upper.tri(can_bray_mean_matrix.fl.ma))$value,] # use melt, upper.tri from                                                                                                  reshape 
names(m5) <- c("c1", "c2", "distance") # give name to the columns
m5 # check the out put

write.table(m5,'D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/can_bray_mean_line.2016.flowering.maturity.csv', sep = ",") # write the output as csv file

# creat a data frame manually with mean bray dis and genetic dist
##bray curtis- genetic distance correlation based on flowering-maturity data, all line pairs

bray_genetic_distance_all_lines_Flowering_maturity <- read_excel("D:/out_puts/phyloseq/with_green_genes_taxa/microbiomeseq_analyssi/bray_genetic_distance_all_lines_Flowering_maturity.xlsx")
p14= ggscatter(bray_genetic_distance_all_lines_Flowering_maturity, x = "Genetic.Distance", y = "Mean.Bray.Curtis", 
               add = "reg.line", conf.int = TRUE,
               cor.coef = TRUE, cor.method = "pearson",
               xlab="Genetic distance", ylab = "")## scater plot with trend line and correlation coefficient with p value 
p14= p14 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p14=p14+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p14 ## positive and significant correlation

# ......................... # .................................. # .......................................... # ..................................

# Chapter 3
#Title:Core and Differentially Abundant Bacterial Taxa in the Rhizosphere of Field Grown Brassica napus Genotypes:
#Implications for Canola Breeding 
#Taye et al., 2020

# Core Microbiome Analysis

library(microbiome) ## Install microbiome R package and all dependencies if you have not installed
library(knitr)

# load the 2016 phyloseq object and convert absolute counts to compositional (relative) abundance

phyloseq.object.corrected.line.names.2016

pysqcdata.compositional <- transform(phyloseq.object.corrected.line.names.2016, "compositional")

## Core Microbiome with detection threshold of 0.01/100 and a series of prevalence tresholds ranging from 50 - 100/100

#### Core taxa at 50% prevalnce treshold 

pysqcdata.core1 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 50/100)
pysqcdata.core1 ## this is a phyloseq object with core taxa identified with the specified tresholds

## Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core1)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core1)
tax.df <- as.data.frame(tax.mat)
# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class1 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class1)

#Save csv of core microbiome at 50 prevalence treshold
write.csv(core.taxa.class1, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.50.gg.csv')

### Core taxa at 55% prevalnce treshold 

pysqcdata.core2 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 55/100)
pysqcdata.core2

## Names of the core taxa based on the above specified detection and prevalence treshold
core.taxa <- taxa(pysqcdata.core2)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core2)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class2 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class2)

write.csv(core.taxa.class2, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.55.gg.csv')

### Core taxa at 60% prevalnce treshold 

pysqcdata.core3 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 60/100)
pysqcdata.core3

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core3)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core3)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class3 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class3)

write.csv(core.taxa.class3, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.60.gg.csv')

### Core taxa at 65% prevalnce treshold

pysqcdata.core4 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 65/100)
pysqcdata.core4

core.taxa <- taxa(pysqcdata.core4)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core4)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class4 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class4)

write.csv(core.taxa.class4, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.65.gg.csv')

### Core taxa at 70% prevalnce treshold

pysqcdata.core5 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 70/100)
pysqcdata.core5

# Names of the core taxa based on the above specified detection and prevalence treshold
core.taxa <- taxa(pysqcdata.core5)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core5)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class5 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class5)

write.csv(core.taxa.class5, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.70.gg.csv')

### Core taxa at 75% prevalnce treshold

pysqcdata.core6 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 75/100)
pysqcdata.core6

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core6)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core6)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class6 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class6)

write.csv(core.taxa.class6, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.75.gg.csv')

### Core taxa at 80% prevalnce treshold

pysqcdata.core7 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 80/100)
pysqcdata.core7

# Names of the core taxa based on the above specified detection and prevalence treshold
core.taxa <- taxa(pysqcdata.core7)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core7)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that was used.
core.taxa.class7 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class7)

write.csv(core.taxa.class7, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.80.gg.csv')

### Core taxa at 85% prevalnce treshold

pysqcdata.core8 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 85/100)
pysqcdata.core8

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core8)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core8)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class8 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class8)

write.csv(core.taxa.class8, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.85.gg.csv')

### Core taxa at 90% prevalnce treshold

pysqcdata.core9 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 90/100)
pysqcdata.core9

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core9)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core9)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class9 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class9)

write.csv(core.taxa.class9, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.90.gg.csv')

### Core taxa at 95% prevalnce treshold

pysqcdata.core10 <- core(pysqcdata.compositional, 
                         detection = .01/100, prevalence = 96/100)
pysqcdata.core10

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core10)
class(core.taxa)
# get the taxonomy data
tax.mat <- tax_table(pysqcdata.core10)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column
tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.
core.taxa.class10 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class10)

write.csv(core.taxa.class10, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/core_taxa_tables/core.taxa.96.gg.csv')

#There were no core taxa at 100% prevalence treshold

#####################  //#####################################

## 2017 three site core microbiome analysis

### Site: LL

# Load your phyloseq object

myphylo_zero_sum_filtered_LL_17.RData

library(microbiome)

# convert absolute counts to compositional (relative) abundances

pysqcdata.compositional <- transform(myphylo_zero_sum_filtered_LL_17"compositional")

### Core taxa at 75% prevalnce treshold

pysqcdata.core6 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 75/100)
pysqcdata.core6

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core6)
class(core.taxa)

# get the taxonomy data

tax.mat <- tax_table(pysqcdata.core6)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column

tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.

core.taxa.class6 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class6)

write.csv(core.taxa.class6, 'D:/out_puts/2017_canola/LL_qqime2_outputs/tables/core_taxa_tables/core.taxa.75.LL_17.csv')

### Site: MF

# Load your phyloseq object

myphylo_zero_sum_filtered_MF_17.RData

# convert absolute counts to compositional (relative) abundances

pysqcdata.compositional <- transform(myphylo_zero_sum_filtered_MF_17, "compositional")

### Core taxa at 75% prevalnce treshold

pysqcdata.core6 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 75/100)
pysqcdata.core6

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core6)
class(core.taxa)

# get the taxonomy data

tax.mat <- tax_table(pysqcdata.core6)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column

tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.

core.taxa.class6 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class6)

write.csv(core.taxa.class6, 'D:/out_puts/2017_canola/MF_qqime2_outputs/tables/core_taxa_tables/core.taxa.75.MF_17.csv')

### Site: SC

# Load your phyloseq object

myphylo_zero_sum_filtered_SC_17.RData

# convert absolute counts to compositional (relative) abundances

pysqcdata.compositional <- transform(myphylo_zero_sum_filtered_SC_17, "compositional")

### Core taxa at 75% prevalnce treshold

pysqcdata.core6 <- core(pysqcdata.compositional, 
                        detection = .01/100, prevalence = 75/100)
pysqcdata.core6

# Names of the core taxa based on the above specified detection and prevalence treshold

core.taxa <- taxa(pysqcdata.core6)
class(core.taxa)

# get the taxonomy data

tax.mat <- tax_table(pysqcdata.core6)
tax.df <- as.data.frame(tax.mat)

# add the OTus to last column

tax.df$OTU <- rownames(tax.df)

# select taxonomy of only 
# those OTUs that are core memebers based on the thresholds that were used.

core.taxa.class6 <- dplyr::filter(tax.df, rownames(tax.df) %in% core.taxa)
knitr::kable(core.taxa.class6)

write.csv(core.taxa.class6, 'D:/out_puts/2017_canola/SC_qqime2_outputs/tables/core_taxa_tables/core.taxa.75.SC_17.csv')

# .............................. # ......................... # ................................... # ..........................

# Chapter 3
#Title:Core and Differentially Abundant Bacterial Taxa in the Rhizosphere of Field Grown Brassica napus Genotypes: 
#Implications for Canola Breeding
#Taye et al., 2020

## Differential abundance analysis 
## Install edgeR and all other dependencies if you have not already

source("https://bioconductor.org/biocLite.R")
biocLite("edgeR")

## load all required packages
## load required ackage: edgeR, ggplot2
#library(phyloseq)
#library(edgeR)
#library(ggplot2)

## ........................       #### ....................   #### ....................... ####

## load the phyloseq object created
## convert the abundace values to the nearest intigers: we have spiked in A.fisheri during PCR amplification and latter used for normalization to acount for variation in sequecnign depth. The normalized values are no longer integers. Hence, we rounded the abundance value to nearest integer.

canola2016_nearest_integer<-ceiling(otu_table(can_2016.norm.soil_zeor_fintered)) # 'can_2016.norm.soil_zeor_fintered' is our                                                                                    phyloseq object.

## merge the otu table with integer values with taxa table and meta data


can.physq.data.2016.nearest.intiger <- merge_phyloseq(canola2016_nearest_integer,tax_table(can_2016.norm.soil_zeor_fintered), sample_data(can_2016.norm.soil_zeor_fintered))

#save nearest intiger as RData# for later use as needed

#save(canola2016_nearest_integer,file='T:/_Docs/R_scripts/Annotated_r_scripts_and_working_r_objects/canola2016_nearest_integer.RData')

## Aglomerate to a genus level
## Differencial abundace analysis will be done at genus level.The following function will aggregrate taxa to Genus level

physeqGenus <- tax_glom(can.physq.data.2016.nearest.intiger, "Genus")

#save agglomerated physeqGenus object # for later use as needed

save(physeqGenus,file='D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/physeqGenus_gg.RData')

##Check the phyloseq object (physeqGenus) you created: geneus level

physeqGenus ## will show summary of the phyloseq object

#phyloseq-class experiment-level object
#otu_table()   OTU Table:         [ 558 taxa and 477 samples ]
#sample_data() Sample Data:       [ 477 samples by 22 sample variables ]
#tax_table()   Taxonomy Table:    [ 558 taxa by 7 taxonomic ranks ]

#Get variables of interest from your phyloseq object 
## we are interested to compare canola genotypes and hence "CanolaLine" is our variable of interest

canolaline = get_variable(physeqGenus, "CanolaLine")
head(canolaline)

##Now get your otutable from the phyloseq object. If it does not run add 1 to avoide log(0) error:over flow
## we have added 1 

x = as(otu_table(physeqGenus), "matrix") + 1L
taxonomy = data.frame(as(tax_table(physeqGenus), "matrix"))

## Now you can turn into a DGEList

x = DGEList(counts=x, group=canolaline, genes=taxonomy, remove.zeros=TRUE)

## you can explore the DGEList created
x$samples

## Calculate normalization factro. Through our experimental design we have acounted for variation in sequencinng depth
# by normalizing the abundance values using the spikein. We tried different normalization methods implemented in edgeR
#and we did not find noticeable variation form plots (see plotBCV), finally we decided to aligning the upper quantiles 
#of the count per million within the libraries.

x = calcNormFactors(x, method="upperquartile")

## you can explore the value of the normalization

x$samples

##Before proceeding to the differential abundance analysis lets see the sample relations in multidimentional scaling

plotMDS(x, method="bcv", col=as.numeric(canolaline)) 

## Specify the design matrix: canola genotype is our trait of interest and as such we have specified the design

design = model.matrix(~canolaline)

## save the desgn matrix for your reference

head(design)

write.csv(design, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/design.csv')

## Disperssion estimate
#estimateDisp function runs all three estimates and is indicated as better rather than running indiviudually. 

library(statmod) # load statmod

x=estimateDisp(x,design, robust=TRUE) #robust=TRUE has been used to protect the empirical Bayes estimates against the possibility of outlier taxa with exceptionally large or small individual dispersions.

# visualize disperssion estimates using  BCV plot

plotBCV(x)

##Fit the negative binomial model with the specified design


fit <- glmQLFit(x, design)

##You can examine the fitted coeficients

head(fit$coefficients)

plotQLDisp(fit) #visualize the fitted mean-QL disperssion trend, squeezed QL estimates

## Now proceed to determining differentially expressed genus. Here quasi-likelihood F-tests is fitted.
#First a Genus wise glm is fitted and quasi-likelihood F-test comparing 
#pairs of canola lines (NAM-0 VS NAM13 (COEF=2 indicates NAM-13 since it is in the 2nd column of the design matrix)) is condcuted. 
#This will be done for all pair of comparisons, The reference genotype NAM-0 vs the remaining fiftteen genotypes.

############# NAM-0 Vs NAM-13###############

qlf <- glmQLFTest(fit, coef=2)
topTags(qlf)# to look at the toptags

##Now lets specify a false FDR treshold and take only those significant at this value. alpha 0.01 

qlfnam13 = topTags(qlf, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam13 = qlfnam13@.Data[[1]]
alpha = 0.01
sigtab = qlfnam13[(qlfnam13$FDR < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab), ], "matrix"))
dim(sigtab) # prints the number of differentially abundant genera at FDR of 0.01
head(sigtab)# you can have a quick look at the head of the table with differentially significantlly abundant genera in NAM              -13

##There is a separate script below (extracting edgeR results) for extracting and better plotting edgeR results.Use this one for a quick overview

# Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM13.

detags <- rownames(sigtab)
plotSmear(qlf, de.tags=detags)
abline(h=c(-3, 3), col="blue")

## get the number of significanty more abundant and less abundant genus at adj pvalue= 0.01

nam13res=decideTestsDGE(qlf,adjust.method="BH", p.value=0.01)

summary(nam13res)

#Visualize the significnat DE GENUS IN NAM13. Use separate scrip for better plotting 

sigtanam13 = subset(sigtab, !is.na(Genus))
# Phylum order
m = tapply(sigtanam13$logFC, sigtanam13$Phylum, function(y) max(y))
m = sort(m, TRUE)
sigtanam13$Phylum = factor(as.character(sigtanam13$Phylum), levels = names(m))
# Genus order
m = tapply(sigtanam13$logFC, sigtanam13$Genus, function(m) max(m))
m = sort(m, TRUE)
sigtanam13$Genus = factor(as.character(sigtanam13$Genus), levels = names(m))
ggplot(sigtanam13, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-13")

## save the significant DE geneus in NAME13 as csv in your folder/WD

write.csv(sigtab, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_13_line_gg.csv')

############# NAM-0 Vs NAM-14###############

qlf1 <- glmQLFTest(fit, coef=3)
topTags(qlf1)

##Now lets specify a false FDR treshold and take only those significant at this value. alpha 0.01

qlfnam14 = topTags(qlf1, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam14 = qlfnam14@.Data[[1]]
alpha = 0.01
sigtab1 = qlfnam14[(qlfnam14$FDR < alpha), ]
sigtab1 = cbind(as(sigtab1, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab1), ], "matrix"))
dim(sigtab1)
head(sigtab1)#you can have a quick look at the head of the table with differentially significantlly abundant genera in NAM-14

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM14

detags1 <- rownames(sigtab1)
plotSmear(qlf1, de.tags=detags1)
abline(h=c(-3, 3), col="blue")

#number of up, down and not significant

nam14res=decideTestsDGE(qlf1,adjust.method="BH", p.value=0.01)
summary(nam14res)

## Visualize the significnat DE GENUS IN NAM14

sigtanam14 = subset(sigtab1, !is.na(Genus))
# Phylum order
j = tapply(sigtanam14$logFC, sigtanam14$Phylum, function(j) max(j))
j = sort(j, TRUE)
sigtanam14$Phylum = factor(as.character(sigtanam14$Phylum), levels = names(j))
# Genus order
j = tapply(sigtanam14$logFC, sigtanam14$Genus, function(j) max(j))
j = sort(j, TRUE)
sigtanam14$Genus = factor(as.character(sigtanam14$Genus), levels = names(j))
ggplot(sigtanam14, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-14")

## save the significant DE geneus in NAME14 as csv in your folder/WD

write.csv(sigtab1, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_14_line_gg.csv')

############# NAM-0 Vs NAM-17###############

qlf2 <- glmQLFTest(fit, coef=4)
topTags(qlf2)

##Now lets specify a false FDR treshold and take only those significant at this value. alpha 0.01

qlfnam17 = topTags(qlf2, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam17 = qlfnam17@.Data[[1]]
alpha = 0.01
sigtab2 = qlfnam17[(qlfnam17$FDR < alpha), ]
sigtab2 = cbind(as(sigtab2, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab2), ], "matrix"))
dim(sigtab2)
head(sigtab2)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM17

detags2 <- rownames(sigtab2)
plotSmear(qlf2, de.tags=detags2)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant geneus

nam17res=decideTestsDGE(qlf2,adjust.method="BH", p.value=0.01)

summary(nam17res)

## Visualize the significnat DE GENUS IN NAM17

sigtanam17 = subset(sigtab2, !is.na(Genus))
# Phylum order
k = tapply(sigtanam17$logFC, sigtanam17$Phylum, function(k) max(k))
k = sort(k, TRUE)
sigtanam17$Phylum = factor(as.character(sigtanam17$Phylum), levels = names(k))
# Genus order
k = tapply(sigtanam17$logFC, sigtanam17$Genus, function(k) max(k))
k = sort(k, TRUE)
sigtanam17$Genus = factor(as.character(sigtanam17$Genus), levels = names(k))
ggplot(sigtanam17, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-17")

## save the significant DE geneus in NAM17 as csv in your folder/WD

write.csv(sigtab2, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_17_line_gg.csv')

############# NAM-0 Vs NAM-23###############

qlf3 <- glmQLFTest(fit, coef=5)
topTags(qlf3)

#Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam23 = topTags(qlf3, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam23 = qlfnam23@.Data[[1]]
alpha = 0.01
sigtab3 = qlfnam23[(qlfnam23$FDR < alpha), ]
sigtab3 = cbind(as(sigtab3, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab3), ], "matrix"))
dim(sigtab3)
head(sigtab3)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM23

detags3 <- rownames(sigtab3)
plotSmear(qlf3, de.tags=detags3)
abline(h=c(-1, 1), col="blue")

#number of up, down, not significnat genus

nam23res=decideTestsDGE(qlf3,adjust.method="BH", p.value=0.01)
summary(nam23res)

## Visualize the significnat DE GENUS IN NAM23

sigtanam23 = subset(sigtab3, !is.na(Genus))
# Phylum order
l = tapply(sigtanam23$logFC, sigtanam23$Phylum, function(l) max(l))
l = sort(l, TRUE)
sigtanam23$Phylum = factor(as.character(sigtanam23$Phylum), levels = names(l))
# Genus order
l = tapply(sigtanam23$logFC, sigtanam23$Genus, function(l) max(l))
l = sort(l, TRUE)
sigtanam23$Genus = factor(as.character(sigtanam23$Genus), levels = names(l))
ggplot(sigtanam23, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-23")

## save the significant DE geneus in NAME30 as csv in your folder/WD

write.csv(sigtab3, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_23_line_gg.csv')

############# NAM-0 Vs NAM-30###############

qlf4 <- glmQLFTest(fit, coef = 6)
topTags(qlf4)

#Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam30 = topTags(qlf4, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam30 = qlfnam30@.Data[[1]]
alpha = 0.01
sigtab4 = qlfnam30[(qlfnam30$FDR < alpha), ]
sigtab4 = cbind(as(sigtab4, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab4), ], "matrix"))
dim(sigtab4)
head(sigtab4)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM30

detags4 <- rownames(sigtab4)
plotSmear(qlf4, de.tags=detags4)
abline(h=c(-1, 1), col="blue")

#number of up, down, and not significant

nam30res=decideTestsDGE(qlf4,adjust.method="BH", p.value=0.01)
summary(nam30res)

## Visualize the significnat DE GENUS IN NAM30

sigtanam30 = subset(sigtab4, !is.na(Genus))
# Phylum order
b = tapply(sigtanam30$logFC, sigtanam30$Phylum, function(b) max(b))
b = sort(b, TRUE)
sigtanam30$Phylum = factor(as.character(sigtanam30$Phylum), levels = names(b))
# Genus order
b = tapply(sigtanam30$logFC, sigtanam30$Genus, function(b) max(b))
b = sort(b, TRUE)
sigtanam30$Genus = factor(as.character(sigtanam30$Genus), levels = names(b))
ggplot(sigtanam30, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-30")

## save the significant DE geneus in NAME32 as csv in your folder/WD

write.csv(sigtab4, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_30_line_gg.csv')

############# NAM-0 Vs NAM-32###############

qlf5 <- glmQLFTest(fit, coef=7)
topTags(qlf5)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam32 = topTags(qlf5, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam32 = qlfnam32@.Data[[1]]
alpha = 0.01
sigtab5 = qlfnam32[(qlfnam32$FDR < alpha), ]
sigtab5 = cbind(as(sigtab5, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab5), ], "matrix"))
dim(sigtab5)
head(sigtab5)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM32

detags5 <- rownames(sigtab5)
plotSmear(qlf5, de.tags=detags5)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant geneus

nam32res=decideTestsDGE(qlf5,adjust.method="BH", p.value=0.01)
summary(nam32res)

## Visualize the significnat DE GENUS IN NAM32

sigtanam32 = subset(sigtab5, !is.na(Genus))
# Phylum order
c = tapply(sigtanam32$logFC, sigtanam32$Phylum, function(c) max(c))
c = sort(c, TRUE)
sigtanam32$Phylum = factor(as.character(sigtanam32$Phylum), levels = names(c))
# Genus order
c = tapply(sigtanam32$logFC, sigtanam32$Genus, function(c) max(c))
c = sort(c, TRUE)
sigtanam32$Genus = factor(as.character(sigtanam32$Genus), levels = names(c))
ggplot(sigtanam32, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-32")

## save the significant DE geneus in NAME32 as csv in your folder/WD

write.csv(sigtab5, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_32_line_gg.csv')

############# NAM-0 Vs NAM-37###############

qlf6 <- glmQLFTest(fit, coef=8)
topTags(qlf6)

#Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam37 = topTags(qlf6, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam37 = qlfnam37@.Data[[1]]
alpha = 0.01
sigtab6 = qlfnam37[(qlfnam37$FDR < alpha), ]
sigtab6 = cbind(as(sigtab6, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab6), ], "matrix"))
dim(sigtab6)
head(sigtab6)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM37

detags6 <- rownames(sigtab6)
plotSmear(qlf6, de.tags=detags6)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant genus

nam37res=decideTestsDGE(qlf6,adjust.method="BH", p.value=0.01)
summary(nam37res)

## Visualize the significnat DE GENUS IN NAM37

sigtanam37 = subset(sigtab6, !is.na(Genus))
# Phylum order
d = tapply(sigtanam37$logFC, sigtanam37$Phylum, function(d) max(d))
d = sort(d, TRUE)
sigtanam37$Phylum = factor(as.character(sigtanam37$Phylum), levels = names(d))
# Genus order
d = tapply(sigtanam37$logFC, sigtanam37$Genus, function(d) max(d))
d = sort(d, TRUE)
sigtanam37$Genus = factor(as.character(sigtanam37$Genus), levels = names(d))
ggplot(sigtanam37, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-37")

## save the significant DE geneus in NAM37 as csv in your folder/WD

write.csv(sigtab6, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_37_line_gg.csv')

############# NAM-0 Vs NAM-43###############

qlf7 <- glmQLFTest(fit, coef=9)
topTags(qlf7)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam43 = topTags(qlf7, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam43 = qlfnam43@.Data[[1]]
alpha = 0.01
sigtab7 = qlfnam43[(qlfnam43$FDR < alpha), ]
sigtab7 = cbind(as(sigtab7, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab7), ], "matrix"))
dim(sigtab7)
head(sigtab7)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM43

detags7 <- rownames(sigtab7)
plotSmear(qlf7, de.tags=detags7)
abline(h=c(-1, 1), col="blue")

## number of up, down and not signifant taxa

nam43res=decideTestsDGE(qlf7,adjust.method="BH", p.value=0.01)
summary(nam43res)

## Visualize the significnat DE GENUS IN NAM43

sigtanam43 = subset(sigtab7, !is.na(Genus))
# Phylum order
f = tapply(sigtanam43$logFC, sigtanam43$Phylum, function(f) max(f))
f = sort(f, TRUE)
sigtanam43$Phylum = factor(as.character(sigtanam43$Phylum), levels = names(f))
# Genus order
f = tapply(sigtanam43$logFC, sigtanam43$Genus, function(f) max(f))
f = sort(f, TRUE)
sigtanam43$Genus = factor(as.character(sigtanam43$Genus), levels = names(f))
ggplot(sigtanam43, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-43")

## save the significant DE geneus in NAME43 as csv in your folder/WD

write.csv(sigtab7, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_43_line_gg.csv')

############# NAM-0 Vs NAM-46 ###############

qlf8 <- glmQLFTest(fit, coef=10)
topTags(qlf8)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam46 = topTags(qlf8, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam46 = qlfnam46@.Data[[1]]
alpha = 0.01
sigtab8 = qlfnam46[(qlfnam46$FDR < alpha), ]
sigtab8 = cbind(as(sigtab8, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab8), ], "matrix"))
dim(sigtab8)
head(sigtab8)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM46

detags8 <- rownames(sigtab8)
plotSmear(qlf8, de.tags=detags8)
abline(h=c(-1, 1), col="blue")

# number of up, down and not significant genus

nam46res=decideTestsDGE(qlf8,adjust.method="BH", p.value=0.01)
summary(nam46res)

## Visualize the significnat DE GENUS IN NAM46

sigtanam46 = subset(sigtab8, !is.na(Genus))
# Phylum order
g = tapply(sigtanam46$logFC, sigtanam46$Phylum, function(g) max(g))
g = sort(g, TRUE)
sigtanam46$Phylum = factor(as.character(sigtanam46$Phylum), levels = names(g))
# Genus order
g = tapply(sigtanam46$logFC, sigtanam46$Genus, function(g) max(g))
g = sort(g, TRUE)
sigtanam46$Genus = factor(as.character(sigtanam46$Genus), levels = names(g))
ggplot(sigtanam46, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-46")

## save the significant DE geneus in NAME43 as csv in your folder/WD

write.csv(sigtab8, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_46_line_gg.csv')

############# NAM-0 Vs NAM-48 ###############

qlf9 <- glmQLFTest(fit, coef=11)
topTags(qlf9)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam48 = topTags(qlf9, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam48 = qlfnam48@.Data[[1]]
alpha = 0.01
sigtab9 = qlfnam48[(qlfnam48$FDR < alpha), ]
sigtab9 = cbind(as(sigtab9, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab9), ], "matrix"))
dim(sigtab9)
head(sigtab9)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM48

detags9 <- rownames(sigtab9)
plotSmear(qlf9, de.tags=detags9)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant

nam48res=decideTestsDGE(qlf9,adjust.method="BH", p.value=0.01)
summary(nam48res)

## Visualize the significnat DE GENUS IN NAM48

sigtanam48 = subset(sigtab9, !is.na(Genus))
# Phylum order
h = tapply(sigtanam48$logFC, sigtanam48$Phylum, function(h) max(h))
h = sort(h, TRUE)
sigtanam48$Phylum = factor(as.character(sigtanam48$Phylum), levels = names(h))
# Genus order
h = tapply(sigtanam48$logFC, sigtanam48$Genus, function(h) max(h))
h = sort(h, TRUE)
sigtanam48$Genus = factor(as.character(sigtanam48$Genus), levels = names(h))
ggplot(sigtanam48, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-48")

## save the significant DE geneus in NAME5 as csv in your folder/WD

write.csv(sigtab9, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_48_line_gg.csv')

############# NAM-0 Vs NAM-5 ###############

qlf10 <- glmQLFTest(fit, coef=12)
topTags(qlf10)

####Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam5 = topTags(qlf10, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam5 = qlfnam5@.Data[[1]]
alpha = 0.01
sigtab10 = qlfnam5[(qlfnam5$FDR < alpha), ]
sigtab10 = cbind(as(sigtab10, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab10), ], "matrix"))
dim(sigtab10)
head(sigtab10)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM5

detags10 <- rownames(sigtab10)
plotSmear(qlf10, de.tags=detags10)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant genus

nam5res=decideTestsDGE(qlf10,adjust.method="BH", p.value=0.01)
summary(nam5res)

## Visualize the significnat DE GENUS IN NAM5

sigtanam5 = subset(sigtab10, !is.na(Genus))
# Phylum order
i = tapply(sigtanam5$logFC, sigtanam5$Phylum, function(i) max(i))
i = sort(i, TRUE)
sigtanam5$Phylum = factor(as.character(sigtanam5$Phylum), levels = names(i))
# Genus order
i = tapply(sigtanam5$logFC, sigtanam5$Genus, function(i) max(i))
i = sort(i, TRUE)
sigtanam5$Genus = factor(as.character(sigtanam5$Genus), levels = names(i))
ggplot(sigtanam5, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-5")

## save the significant DE geneus in NAME5 as csv in your folder/WD

write.csv(sigtab10, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_5_line_gg.csv')

############# NAM-0 Vs NAM-72 ###############

qlf11 <- glmQLFTest(fit, coef=13)
topTags(qlf11)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam72 = topTags(qlf11, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam72 = qlfnam72@.Data[[1]]
alpha = 0.01
sigtab11 = qlfnam72[(qlfnam72$FDR < alpha), ]
sigtab11 = cbind(as(sigtab11, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab11), ], "matrix"))
dim(sigtab11)
head(sigtab11)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM72

detags11 <- rownames(sigtab11)
plotSmear(qlf11, de.tags=detags11)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant genus

nam72res=decideTestsDGE(qlf11,adjust.method="BH", p.value=0.01)

summary(nam72res)

## Visualize the significnat DE GENUS IN NAM72

sigtanam72 = subset(sigtab11, !is.na(Genus))
# Phylum order
n = tapply(sigtanam72$logFC, sigtanam72$Phylum, function(n) max(n))
n = sort(n, TRUE)
sigtanam72$Phylum = factor(as.character(sigtanam72$Phylum), levels = names(n))
# Genus order
n = tapply(sigtanam72$logFC, sigtanam72$Genus, function(n) max(n))
n = sort(n, TRUE)
sigtanam72$Genus = factor(as.character(sigtanam72$Genus), levels = names(n))
ggplot(sigtanam72, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-72")

## save the significant DE geneus in NAME72 as csv in your folder/WD

write.csv(sigtab11, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_72_line_gg.csv')

############# NAM-0 Vs NAM-76 ###############

qlf12 <- glmQLFTest(fit, coef=14)
topTags(qlf12)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam76 = topTags(qlf12, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam76 = qlfnam76@.Data[[1]]
alpha = 0.01
sigtab12 = qlfnam76[(qlfnam76$FDR < alpha), ]
sigtab12 = cbind(as(sigtab12, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab12), ], "matrix"))
dim(sigtab12)
head(sigtab12)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM76

detags12 <- rownames(sigtab12)
plotSmear(qlf12, de.tags=detags12)
abline(h=c(-1, 1), col="blue")

#number of up, down and not significant

nam76res=decideTestsDGE(qlf12,adjust.method="BH", p.value=0.01)
summary(nam76res)

### Visualize the significnat DE GENUS IN NAM76

sigtanam76 = subset(sigtab12, !is.na(Genus))
# Phylum order
p = tapply(sigtanam76$logFC, sigtanam76$Phylum, function(p) max(p))
p = sort(p, TRUE)
sigtanam76$Phylum = factor(as.character(sigtanam76$Phylum), levels = names(p))
# Genus order
p = tapply(sigtanam76$logFC, sigtanam76$Genus, function(p) max(p))
p = sort(p, TRUE)
sigtanam76$Genus = factor(as.character(sigtanam76$Genus), levels = names(p))
ggplot(sigtanam76, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-76")


# save the significant DE geneus in NAME76 as csv in your folder/WD

write.csv(sigtab12, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_76_line_gg.csv')

############# NAM-0 Vs NAM-79 ###############

qlf13 <- glmQLFTest(fit, coef=15)
topTags(qlf13)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnam79 = topTags(qlf13, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnam79 = qlfnam79@.Data[[1]]
alpha = 0.01
sigtab13 = qlfnam79[(qlfnam79$FDR < alpha), ]
sigtab13 = cbind(as(sigtab13, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab13), ], "matrix"))
dim(sigtab13)
head(sigtab13)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR NAM79

detags13 <- rownames(sigtab13)
plotSmear(qlf13, de.tags=detags13)
abline(h=c(-1, 1), col="blue")

# number of up, down and not significant

nam79res=decideTestsDGE(qlf13,adjust.method="BH", p.value=0.01)
summary(nam79res)

## Visualize the significnat DE GENUS IN NAM79

sigtanam79 = subset(sigtab13, !is.na(Genus))
# Phylum order
q = tapply(sigtanam79$logFC, sigtanam79$Phylum, function(q) max(q))
q = sort(q, TRUE)
sigtanam79$Phylum = factor(as.character(sigtanam79$Phylum), levels = names(q))
# Genus order
q = tapply(sigtanam79$logFC, sigtanam79$Genus, function(q) max(q))
q = sort(q, TRUE)
sigtanam79$Genus = factor(as.character(sigtanam79$Genus), levels = names(q))
ggplot(sigtanam79, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-79")

# save the significant DE geneus in NAME79 as csv in your folder/WD

write.csv(sigtab13, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/NAM_79_line_gg.csv')

############# NAM-0 Vs YN04_C1213(NAM-94)###############

qlf14 <- glmQLFTest(fit, coef=16)
topTags(qlf14)

##Here lets specify a FDR treshold and take only those significant at this value. alpha 0.01  

qlfnamYN04 = topTags(qlf14, n=nrow(x), adjust.method="BH", sort.by="PValue")
qlfnamYN04 = qlfnamYN04@.Data[[1]]
alpha = 0.01
sigtab14 = qlfnamYN04[(qlfnamYN04$FDR < alpha), ]
sigtab14 = cbind(as(sigtab14, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab14), ], "matrix"))
dim(sigtab14)
head(sigtab14)

## Plot log-fold change against log-counts per million, with DE genus highlighted FOR YN04-C1213

detags14 <- rownames(sigtab14)
plotSmear(qlf14, de.tags=detags14)
abline(h=c(-1, 1), col="blue")

# number of up, down and not significant

namyno4res=decideTestsDGE(qlf14,adjust.method="BH", p.value=0.01)
summary(namyno4res)

## Visualize the significnat DE GENUS IN YN04-C1213

sigtanamYN04 = subset(sigtab14, !is.na(Genus))
# Phylum order
r = tapply(sigtanamYN04$logFC, sigtanamYN04$Phylum, function(r) max(r))
r = sort(r, TRUE)
sigtanamYN04$Phylum = factor(as.character(sigtanamYN04$Phylum), levels = names(r))
# Genus order
r = tapply(sigtanamYN04$logFC, sigtanamYN04$Genus, function(r) max(r))
r = sort(r, TRUE)
sigtanamYN04$Genus = factor(as.character(sigtanamYN04$Genus), levels = names(r))
ggplot(sigtanamYN04, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-YN04")


# save the significant DE geneus in YN04-C1213 (NAM-94) as csv in your folder/WD

write.csv(sigtab14, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/YN04-C1213_line_gg.csv')

##### extracting edgeR results ######

## sample relationship in multidimentional scaling (calculated common disperssion is also shown in the graph). Common disperssion,
#trend and genus wise disperssion estimates plot

par( mfrow=c(1 ,2) )

plotMDS(x, method="bcv",cex.lab=1.2, cex.axis=1.2, cex.main=1.2 )

plotBCV(x,main="Estimate of the negative binomial dispersion", cex.lab=1.2, cex.axis=1.2, cex.main=1.2, lwd=2,pch=14)
plotBCV(x, cex.lab=1.2, cex.axis=1.2, cex.main=1.2, lwd=2,pch=14)  ## NB disperssion estimate
plotQLDisp(fit,cex.lab=1.2,cex.axis=1.2, cex.main=1.2, lwd=2,pch=14) ## fitted mean-QL disperssion trend

abline(h=c(-3, 3), col="blue", lwd=2)
plotMD(qlf,adjust.method="BH", p.value=0.05)

plotMD(qlf)
sum( sigtab2$Genus %in% sigtab$Genus ) / length( sigtab$Genus ) * 100

sum( sigtab2$Genus %in% sigtab$Genus )
sum( sigtab2$Genus %in% sigtab1$Genus )
sum( sigtab2$Genus %in% sigtab2$Genus )
sum( sigtab2$Genus %in% sigtab3$Genus )
sum( sigtab2$Genus %in% sigtab4$Genus )
sum( sigtab2$Genus %in% sigtab5$Genus )
sum( sigtab2$Genus %in% sigtab6$Genus )
sum( sigtab2$Genus %in% sigtab7$Genus )
sum( sigtab2$Genus %in% sigtab8$Genus )
sum( sigtab2$Genus %in% sigtab9$Genus )
sum( sigtab2$Genus %in% sigtab10$Genus )
sum( sigtab2$Genus %in% sigtab11$Genus )
sum( sigtab2$Genus %in% sigtab12$Genus )
sum( sigtab2$Genus %in% sigtab13$Genus )
sum( sigtab2$Genus %in% sigtab14$Genus )

# INTERSECT THE SIGNIFCANT differential abundant genus in each line to find the commonly occuring differential abundant gens in all the line compared with Nam-0
sharedsig_taxa= Reduce(intersect, list(sigtab$Genus,sigtab2$Genus,sigtab3$Genus,sigtab4$Genus,sigtab5$Genus,sigtab6$Genus,sigtab7$Genus,sigtab8$Genus,sigtab9$Genus,sigtab10$Genus,sigtab11$Genus, sigtab12$Genus,sigtab13$Genus,sigtab14$Genus))

write.csv(sharedsig_taxa, 'D:/out_puts/phyloseq/with_green_genes_taxa/summary_tables/differential_abundance/all_canola_line_shared_sig_taxa.csv')


### histogram of the logCPM of the top 100 significantly differentailly expressed taxa

par( mfrow=c(2 ,2) )

hist(sigtab$logCPM, breaks=25 , xlab="LogCPM " 
     , xlim=c(0,12) , ylim=c(0,0.2), freq=FALSE , main="Nam-13", cex.lab=1.6, cex.axis=1.5, cex.main=1.5)
text(0.2,0.2,"a", cex=1.2)
hist( sigtab1$logCPM, breaks=25 , xlab="LogCPM",
      xlim=c(0,12) , ylim=c(0,0.2), freq=FALSE , main="Nam-14", cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"b", cex=1.2)
hist(sigtab2$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-17",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"c", cex=1.2)

hist(sigtab3$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-23",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"d", cex=1.2)


par( mfrow=c(2 ,2) )
hist(sigtab4$logCPM, breaks=25 , xlab="LogCPM " 
     , xlim=c(0,12) , ylim=c(0,0.2), freq=FALSE , main="Nam-30", cex.lab=1.6, cex.axis=1.5, cex.main=1.5)
text(0.1,0.2,"e", cex=1.2)
hist( sigtab5$logCPM, breaks=25 , xlab="LogCPM",
      xlim=c(0,12) , ylim=c(0,0.2), freq=FALSE , main="Nam-32", cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"f", cex=1.2)
hist(sigtab6$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-37",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"g", cex=1.2)

hist(sigtab7$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-43",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"h", cex=1.2)

par( mfrow=c(2 ,2) )
hist(sigtab8$logCPM, breaks=25 , xlab="LogCPM " 
     , xlim=c(0,12) , ylim=c(0,0.2), freq=FALSE , main="Nam-46", cex.lab=1.6, cex.axis=1.5, cex.main=1.5)
text(0.1,0.2,"i", cex=1.2)
hist( sigtab9$logCPM, breaks=25 , xlab="LogCPM",
      xlim=c(0,12) , ylim=c(0,0.2), freq=FALSE , main="Nam-48", cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"j", cex=1.2)
hist(sigtab10$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-5",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"k", cex=1.2)

hist(sigtab11$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-72",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"l", cex=1.2)

par( mfrow=c(2 ,2) )
hist(sigtab12$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-76",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"m", cex=1.2)

hist(sigtab13$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="Nam-79",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"n", cex=1.2)

hist(sigtab14$logCPM, breaks=25 , xlab="LogCPM" ,
     xlim=c(0,12) , ylim=c(0,0.2) , freq=FALSE , main="YN04-C1213",cex.lab=1.6, cex.axis=1.5, cex.main=1.5 )
text(0.1,0.2,"o", cex=1.2)

### MA plots showing the relationhip between average abundence (averageLOGCPM) and logfold change. Red dot indicates the significantly differentially
#abundant genus, the black dot the not significant. The blue line indicates the the minimum fold change at which the significant abundunt genus have. 

par( mfrow=c(2 ,2) )
plotSmear(qlf, de.tags=detags,main="Nam-13",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10) 
abline(h=c(-3, 3), col="blue", lwd=2)

plotSmear(qlf1, de.tags=detags1,main="Nam-14",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-3, 3), col="blue",lwd=2)

plotSmear(qlf2, de.tags=detags2,main="Nam-17",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)

plotSmear(qlf3, de.tags=detags3,main="Nam-23",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2) 


par( mfrow=c(2 ,2) )
plotSmear(qlf4, de.tags=detags4,main="Nam-30",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2) 

plotSmear(qlf5, de.tags=detags5,main="Nam-32",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2) 

plotSmear(qlf6, de.tags=detags6,main="Nam-37",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)
plotSmear(qlf7, de.tags=detags7,main="Nam-43",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)


par( mfrow=c(2 ,2) ) 
plotSmear(qlf8, de.tags=detags8,main="Nam-46",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)

plotSmear(qlf9, de.tags=detags9,main="Nam-48",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)

plotSmear(qlf10, de.tags=detags10,main="Nam-5",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)

plotSmear(qlf11, de.tags=detags11,main="Nam-72",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)


par( mfrow=c(2 ,2) ) 
plotSmear(qlf12, de.tags=detags12,main="Nam-76",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)
plotSmear(qlf13, de.tags=detags13,main="Nam-79",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)
plotSmear(qlf14, de.tags=detags14,main="YN04-C1213",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, pch=10)
abline(h=c(-1, 1), col="blue",lwd=2)


## plotMD better option than plotsmear visually

par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 13", cex=1.2)

plotMD(qlf1,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE, ylab="",bty = "n", xlim=c(0,15))
text(2,10,"NAM 14", cex=1.2)


plotMD(qlf2, main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,14,"NAM 17", cex=1.2)

plotMD(qlf3, main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,ylab="",legend=FALSE,bty = "n", xlim=c(0,15)) 
text(2,10,"NAM 23", cex=1.2)  

#
par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5)  )
plotMD(qlf4,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 30", cex=1.2)

plotMD(qlf5,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE, ylab="",bty = "n",xlim=c(0,15))
text(2,10,"NAM 32", cex=1.2)

plotMD(qlf6,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 37", cex=1.2)

plotMD(qlf7,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 43", cex=1.2)

#
par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf8,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,15,"NAM 46", cex=1.2)

plotMD(qlf9,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="", legend=FALSE,ylab="",bty = "n", xlim=c(0,15))
text(2,10,"NAM 48", cex=1.2)

plotMD(qlf10,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 5", cex=1.2)

plotMD(qlf11,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 72", cex=1.2)


par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf12,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 76", cex=1.2)

plotMD(qlf13,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM 79", cex=1.2)

plotMD(qlf14,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,bty = "n", legend=FALSE, xlim=c(0,15))
text(3,10,"YN04-C1213", cex=1.2)


### genotype plot order based on totla number of differentially abundnat genera in each line.

par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
xx= plotMD(qlf8,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,15,"NAM-46", cex=1.2)

plotMD(qlf2, main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,14,"NAM-17", cex=1.2)

plotMD(qlf13,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-79", cex=1.2)

plotMD(qlf4,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-30", cex=1.2)

####
par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf1,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-14", cex=1.2)

plotMD(qlf,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-13", cex=1.2)

plotMD(qlf10,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-5", cex=1.2)

plotMD(qlf3, main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,ylab="",legend=FALSE,bty = "n", xlim=c(0,15)) 
text(2,10,"NAM-23", cex=1.2)  


####

par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf7,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-43", cex=1.2)

plotMD(qlf12,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-76", cex=1.2)

plotMD(qlf6,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-37", cex=1.2)

plotMD(qlf14,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,bty = "n",ylab="", legend=FALSE, xlim=c(0,15))
text(3,10,"NAM-94", cex=1.2)

####
par( mfrow=c(2 ,2),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf5,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE,bty = "n",xlim=c(0,15))
text(2,10,"NAM-32", cex=1.2)

plotMD(qlf9,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, legend=FALSE,ylab="",bty = "n", xlim=c(0,15))
text(2,10,"NAM-48", cex=1.2)

plotMD(qlf11,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-72", cex=1.2)


### final all in one

## genotype plot order based on totla number of differentially abundnat genera in each line.

par( mfrow=c(4 ,4),mar=c(4.5, 4.5, 0.5,0.5) ) 
plotMD(qlf8,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,15,"NAM-46", cex=1.2)

plotMD(qlf2, main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,14,"NAM-17", cex=1.2)

plotMD(qlf13,main="",cex.lab=1.6, cex.axis=1.5,xlab="", ylab="", cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-79", cex=1.2)

plotMD(qlf4,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-30", cex=1.2)

plotMD(qlf1,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-14", cex=1.2)

plotMD(qlf,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-13", cex=1.2)

plotMD(qlf10,main="",cex.lab=1.6, cex.axis=1.5, xlab="", ylab="", cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-5", cex=1.2)

plotMD(qlf3, main="",cex.lab=1.6, cex.axis=1.5,xlab="", cex.main=1.5,ylab="",legend=FALSE,bty = "n", xlim=c(0,15)) 
text(2,10,"NAM-23", cex=1.2)  

plotMD(qlf7,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-43", cex=1.2)

plotMD(qlf12,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,xlab="",ylab="",legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-76", cex=1.2)

plotMD(qlf6,main="",cex.lab=1.6, cex.axis=1.5,xlab="", ylab="", cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-37", cex=1.2)

plotMD(qlf14,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,bty = "n",ylab="", legend=FALSE, xlim=c(0,15))
text(3,10,"NAM-94", cex=1.2)

plotMD(qlf5,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5,legend=FALSE,bty = "n",xlim=c(0,15))
text(2,10,"NAM-32", cex=1.2)

plotMD(qlf9,main="",cex.lab=1.6, cex.axis=1.5, cex.main=1.5, legend=FALSE,ylab="",bty = "n", xlim=c(0,15))
text(2,10,"NAM-48", cex=1.2)

plotMD(qlf11,main="",cex.lab=1.6, cex.axis=1.5,ylab="", cex.main=1.5,legend=FALSE,bty = "n", xlim=c(0,15))
text(2,10,"NAM-72", cex=1.2)
###

# visualizing the top 20 genus based on logflod change

qlfnam13 = topTags(qlf, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam13 = qlfnam13@.Data[[1]]
alpha = 0.01
sigtab = qlfnam13[(qlfnam13$FDR < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab), ], "matrix"))
dim(sigtab)

sigtanam13 = subset(sigtab, !is.na(Genus))
# Phylum order
m = tapply(sigtanam13$logFC, sigtanam13$Phylum, function(y) max(y))
m = sort(m, TRUE)
sigtanam13$Phylum = factor(as.character(sigtanam13$Phylum), levels = names(m))
# Genus order
m = tapply(sigtanam13$logFC, sigtanam13$Genus, function(m) max(m))
m = sort(m, TRUE)
sigtanam13$Genus = factor(as.character(sigtanam13$Genus), levels = names(m))
ggplot(sigtanam13, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-13")


#
qlfnam14 = topTags(qlf1, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam14 = qlfnam14@.Data[[1]]
alpha = 0.01
sigtab1 = qlfnam14[(qlfnam14$FDR < alpha), ]
sigtab1 = cbind(as(sigtab1, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab1), ], "matrix"))
dim(sigtab1)

sigtanam14 = subset(sigtab1, !is.na(Genus))
# Phylum order
j = tapply(sigtanam14$logFC, sigtanam14$Phylum, function(j) max(j))
j = sort(j, TRUE)
sigtanam14$Phylum = factor(as.character(sigtanam14$Phylum), levels = names(j))
# Genus order
j = tapply(sigtanam14$logFC, sigtanam14$Genus, function(j) max(j))
j = sort(j, TRUE)
sigtanam14$Genus = factor(as.character(sigtanam14$Genus), levels = names(j))
ggplot(sigtanam14, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-14")
#

qlfnam17 = topTags(qlf2, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam17 = qlfnam17@.Data[[1]]
alpha = 0.01
sigtab2 = qlfnam17[(qlfnam17$FDR < alpha), ]
sigtab2 = cbind(as(sigtab2, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab2), ], "matrix"))
dim(sigtab2)

sigtanam17 = subset(sigtab2, !is.na(Genus))
# Phylum order
k = tapply(sigtanam17$logFC, sigtanam17$Phylum, function(k) max(k))
k = sort(k, TRUE)
sigtanam17$Phylum = factor(as.character(sigtanam17$Phylum), levels = names(k))
# Genus order
k = tapply(sigtanam17$logFC, sigtanam17$Genus, function(k) max(k))
k = sort(k, TRUE)
sigtanam17$Genus = factor(as.character(sigtanam17$Genus), levels = names(k))
ggplot(sigtanam17, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-17")
#

qlfnam23 = topTags(qlf3, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam23 = qlfnam23@.Data[[1]]
alpha = 0.01
sigtab3 = qlfnam23[(qlfnam23$FDR < alpha), ]
sigtab3 = cbind(as(sigtab3, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab3), ], "matrix"))
dim(sigtab3)

sigtanam23 = subset(sigtab3, !is.na(Genus))
# Phylum order
l = tapply(sigtanam23$logFC, sigtanam23$Phylum, function(l) max(l))
l = sort(l, TRUE)
sigtanam23$Phylum = factor(as.character(sigtanam23$Phylum), levels = names(l))
# Genus order
l = tapply(sigtanam23$logFC, sigtanam23$Genus, function(l) max(l))
l = sort(l, TRUE)
sigtanam23$Genus = factor(as.character(sigtanam23$Genus), levels = names(l))
ggplot(sigtanam23, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-23")
#

qlfnam30 = topTags(qlf4, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam30 = qlfnam30@.Data[[1]]
alpha = 0.01
sigtab4 = qlfnam30[(qlfnam30$FDR < alpha), ]
sigtab4 = cbind(as(sigtab4, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab4), ], "matrix"))
dim(sigtab4)

sigtanam30 = subset(sigtab4, !is.na(Genus))
# Phylum order
b = tapply(sigtanam30$logFC, sigtanam30$Phylum, function(b) max(b))
b = sort(b, TRUE)
sigtanam30$Phylum = factor(as.character(sigtanam30$Phylum), levels = names(b))
# Genus order
b = tapply(sigtanam30$logFC, sigtanam30$Genus, function(b) max(b))
b = sort(b, TRUE)
sigtanam30$Genus = factor(as.character(sigtanam30$Genus), levels = names(b))
ggplot(sigtanam30, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-30")
#

qlfnam32 = topTags(qlf5, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam32 = qlfnam32@.Data[[1]]
alpha = 0.01
sigtab5 = qlfnam32[(qlfnam32$FDR < alpha), ]
sigtab5 = cbind(as(sigtab5, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab5), ], "matrix"))
dim(sigtab5)

sigtanam32 = subset(sigtab5, !is.na(Genus))
# Phylum order
c = tapply(sigtanam32$logFC, sigtanam32$Phylum, function(c) max(c))
c = sort(c, TRUE)
sigtanam32$Phylum = factor(as.character(sigtanam32$Phylum), levels = names(c))
# Genus order
c = tapply(sigtanam32$logFC, sigtanam32$Genus, function(c) max(c))
c = sort(c, TRUE)
sigtanam32$Genus = factor(as.character(sigtanam32$Genus), levels = names(c))
ggplot(sigtanam32, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-32")
#

qlfnam37 = topTags(qlf6, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam37 = qlfnam37@.Data[[1]]
alpha = 0.01
sigtab6 = qlfnam37[(qlfnam37$FDR < alpha), ]
sigtab6 = cbind(as(sigtab6, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab6), ], "matrix"))
dim(sigtab6)

sigtanam37 = subset(sigtab6, !is.na(Genus))
# Phylum order
d = tapply(sigtanam37$logFC, sigtanam37$Phylum, function(d) max(d))
d = sort(d, TRUE)
sigtanam37$Phylum = factor(as.character(sigtanam37$Phylum), levels = names(d))
# Genus order
d = tapply(sigtanam37$logFC, sigtanam37$Genus, function(d) max(d))
d = sort(d, TRUE)
sigtanam37$Genus = factor(as.character(sigtanam37$Genus), levels = names(d))
ggplot(sigtanam37, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-37")
#

qlfnam43 = topTags(qlf7, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam43 = qlfnam43@.Data[[1]]
alpha = 0.01
sigtab7 = qlfnam43[(qlfnam43$FDR < alpha), ]
sigtab7 = cbind(as(sigtab7, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab7), ], "matrix"))
dim(sigtab7)

sigtanam43 = subset(sigtab7, !is.na(Genus))
# Phylum order
f = tapply(sigtanam43$logFC, sigtanam43$Phylum, function(f) max(f))
f = sort(f, TRUE)
sigtanam43$Phylum = factor(as.character(sigtanam43$Phylum), levels = names(f))
# Genus order
f = tapply(sigtanam43$logFC, sigtanam43$Genus, function(f) max(f))
f = sort(f, TRUE)
sigtanam43$Genus = factor(as.character(sigtanam43$Genus), levels = names(f))
ggplot(sigtanam43, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-43")
#

qlfnam46 = topTags(qlf8, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam46 = qlfnam46@.Data[[1]]
alpha = 0.01
sigtab8 = qlfnam46[(qlfnam46$FDR < alpha), ]
sigtab8 = cbind(as(sigtab8, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab8), ], "matrix"))
dim(sigtab8)

sigtanam46 = subset(sigtab8, !is.na(Genus))
# Phylum order
g = tapply(sigtanam46$logFC, sigtanam46$Phylum, function(g) max(g))
g = sort(g, TRUE)
sigtanam46$Phylum = factor(as.character(sigtanam46$Phylum), levels = names(g))
# Genus order
g = tapply(sigtanam46$logFC, sigtanam46$Genus, function(g) max(g))
g = sort(g, TRUE)
sigtanam46$Genus = factor(as.character(sigtanam46$Genus), levels = names(g))
ggplot(sigtanam46, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-46")
#

qlfnam48 = topTags(qlf9, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam48 = qlfnam48@.Data[[1]]
alpha = 0.01
sigtab9 = qlfnam48[(qlfnam48$FDR < alpha), ]
sigtab9 = cbind(as(sigtab9, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab9), ], "matrix"))
dim(sigtab9)

sigtanam48 = subset(sigtab9, !is.na(Genus))
# Phylum order
h = tapply(sigtanam48$logFC, sigtanam48$Phylum, function(h) max(h))
h = sort(h, TRUE)
sigtanam48$Phylum = factor(as.character(sigtanam48$Phylum), levels = names(h))
# Genus order
h = tapply(sigtanam48$logFC, sigtanam48$Genus, function(h) max(h))
h = sort(h, TRUE)
sigtanam48$Genus = factor(as.character(sigtanam48$Genus), levels = names(h))
ggplot(sigtanam48, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-48")
#

qlfnam5 = topTags(qlf10, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam5 = qlfnam5@.Data[[1]]
alpha = 0.01
sigtab10 = qlfnam5[(qlfnam5$FDR < alpha), ]
sigtab10 = cbind(as(sigtab10, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab10), ], "matrix"))
dim(sigtab10)

sigtanam5 = subset(sigtab10, !is.na(Genus))
# Phylum order
i = tapply(sigtanam5$logFC, sigtanam5$Phylum, function(i) max(i))
i = sort(i, TRUE)
sigtanam5$Phylum = factor(as.character(sigtanam5$Phylum), levels = names(i))
# Genus order
i = tapply(sigtanam5$logFC, sigtanam5$Genus, function(i) max(i))
i = sort(i, TRUE)
sigtanam5$Genus = factor(as.character(sigtanam5$Genus), levels = names(i))
ggplot(sigtanam5, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-5")
#

qlfnam72 = topTags(qlf11, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam72 = qlfnam72@.Data[[1]]
alpha = 0.01
sigtab11 = qlfnam72[(qlfnam72$FDR < alpha), ]
sigtab11 = cbind(as(sigtab11, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab11), ], "matrix"))
dim(sigtab11)

sigtanam72 = subset(sigtab11, !is.na(Genus))
# Phylum order
n = tapply(sigtanam72$logFC, sigtanam72$Phylum, function(n) max(n))
n = sort(n, TRUE)
sigtanam72$Phylum = factor(as.character(sigtanam72$Phylum), levels = names(n))
# Genus order
n = tapply(sigtanam72$logFC, sigtanam72$Genus, function(n) max(n))
n = sort(n, TRUE)
sigtanam72$Genus = factor(as.character(sigtanam72$Genus), levels = names(n))
ggplot(sigtanam72, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-72")
#
qlfnam76 = topTags(qlf12, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam76 = qlfnam76@.Data[[1]]
alpha = 0.01
sigtab12 = qlfnam76[(qlfnam76$FDR < alpha), ]
sigtab12 = cbind(as(sigtab12, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab12), ], "matrix"))
dim(sigtab12)

sigtanam76 = subset(sigtab12, !is.na(Genus))
# Phylum order
p = tapply(sigtanam76$logFC, sigtanam76$Phylum, function(p) max(p))
p = sort(p, TRUE)
sigtanam76$Phylum = factor(as.character(sigtanam76$Phylum), levels = names(p))
# Genus order
p = tapply(sigtanam76$logFC, sigtanam76$Genus, function(p) max(p))
p = sort(p, TRUE)
sigtanam76$Genus = factor(as.character(sigtanam76$Genus), levels = names(p))
ggplot(sigtanam76, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-76")
#

qlfnam79 = topTags(qlf13, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnam79 = qlfnam79@.Data[[1]]
alpha = 0.01
sigtab13 = qlfnam79[(qlfnam79$FDR < alpha), ]
sigtab13 = cbind(as(sigtab13, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab13), ], "matrix"))
dim(sigtab13)

sigtanam79 = subset(sigtab13, !is.na(Genus))
# Phylum order
q = tapply(sigtanam79$logFC, sigtanam79$Phylum, function(q) max(q))
q = sort(q, TRUE)
sigtanam79$Phylum = factor(as.character(sigtanam79$Phylum), levels = names(q))
# Genus order
q = tapply(sigtanam79$logFC, sigtanam79$Genus, function(q) max(q))
q = sort(q, TRUE)
sigtanam79$Genus = factor(as.character(sigtanam79$Genus), levels = names(q))
ggplot(sigtanam79, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-79")
#

qlfnamYN04 = topTags(qlf14, n=nrow(x), adjust.method="BH", sort.by="logFC")
qlfnamYN04 = qlfnamYN04@.Data[[1]]
alpha = 0.01
sigtab14 = qlfnamYN04[(qlfnamYN04$FDR < alpha), ]
sigtab14 = cbind(as(sigtab14, "data.frame"), as(tax_table(physeqGenus)[rownames(sigtab14), ], "matrix"))
dim(sigtab14)

sigtanamYN04 = subset(sigtab14, !is.na(Genus))
# Phylum order
r = tapply(sigtanamYN04$logFC, sigtanamYN04$Phylum, function(r) max(r))
r = sort(r, TRUE)
sigtanamYN04$Phylum = factor(as.character(sigtanamYN04$Phylum), levels = names(r))
# Genus order
r = tapply(sigtanamYN04$logFC, sigtanamYN04$Genus, function(r) max(r))
r = sort(r, TRUE)
sigtanamYN04$Genus = factor(as.character(sigtanamYN04$Genus), levels = names(r))
ggplot(sigtanamYN04, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  ggtitle("Log Fold Change of Significant Genus in NAM-YN04")


### NOW EXTRACT THE TO 20

top20nam13=sigtanam13[1:20,]
top30nam14=sigtanam14[1:20,]
top30nam17=sigtanam17[1:20,]
top30nam23=sigtanam23[1:20,]
top30nam30=sigtanam30[1:20,]
top30nam32=sigtanam32[1:20,]
top30nam37=sigtanam37[1:20,]
top30nam43=sigtanam43[1:20,]
top30nam46=sigtanam46[1:20,]
top30nam48=sigtanam48[1:20,]
top30nam5=sigtanam5[1:20,]
top30nam72=sigtanam72[1:20,]
top30nam76=sigtanam76[1:20,]
top30nam79=sigtanam79[1:20,]
top30yn04=sigtanamYN04[1:20,]


#PRODUCE THE PLOTS

library(gridExtra)

#install.packages("devtools")
#library(devtools)
#install_github("DavidGarciaCallejas/DGC")

#### NAM 13

# Phylum order

m = tapply(top20nam13$logFC, top20nam13$Phylum, function(y) max(y))
m = sort(m, TRUE)
top20nam13$Phylum = factor(as.character(top20nam13$Phylum), levels = names(m))
# Genus order
m = tapply(top20nam13$logFC, top20nam13$Genus, function(m) max(m))
m = sort(m, TRUE)
top20nam13$Genus = factor(as.character(top20nam13$Genus), levels = names(m))

p13= ggplot(top20nam13, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+ xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) +
  ggtitle("NAM 13")+theme(plot.title = element_text(size = 16, face = "bold"))

## NAM 14

# Phylum order
m = tapply(top30nam14$logFC, top30nam14$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam14$Phylum = factor(as.character(top30nam14$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam14$logFC, top30nam14$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam14$Genus = factor(as.character(top30nam14$Genus), levels = names(m))

p14= ggplot(top30nam14, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) +
  ggtitle("NAM 14")+theme(plot.title = element_text(size = 16, face = "bold")) 

### NAM 17

# Phylum order
m = tapply(top30nam17$logFC, top30nam17$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam17$Phylum = factor(as.character(top30nam17$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam17$logFC, top30nam17$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam17$Genus = factor(as.character(top30nam17$Genus), levels = names(m))

p17= ggplot(top30nam17, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) +
  ggtitle("NAM 17")+theme(plot.title = element_text(size = 16, face = "bold"))

#### NAM 23

# Phylum order
m = tapply(top30nam23$logFC, top30nam23$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam23$Phylum = factor(as.character(top30nam23$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam23$logFC, top30nam23$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam23$Genus = factor(as.character(top30nam23$Genus), levels = names(m))

p23= ggplot(top30nam23, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 23")+theme(plot.title = element_text(size = 16, face = "bold"))

### NAM 30

# Phylum order
m = tapply(top30nam30$logFC, top30nam30$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam30$Phylum = factor(as.character(top30nam30$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam30$logFC, top30nam30$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam30$Genus = factor(as.character(top30nam30$Genus), levels = names(m))

p30= ggplot(top30nam30, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) +
  ggtitle("NAM 30")+theme(plot.title = element_text(size = 16, face = "bold"))

### NAM 32

# Phylum order
m = tapply(top30nam32$logFC, top30nam32$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam32$Phylum = factor(as.character(top30nam32$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam32$logFC, top30nam32$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam32$Genus = factor(as.character(top30nam32$Genus), levels = names(m))

p32= ggplot(top30nam32, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 32")+theme(plot.title = element_text(size = 16, face = "bold"))

####NAM 37

# Phylum order
m = tapply(top30nam37$logFC, top30nam37$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam37$Phylum = factor(as.character(top30nam37$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam37$logFC, top30nam37$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam37$Genus = factor(as.character(top30nam37$Genus), levels = names(m))

p37= ggplot(top30nam37, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 37")+theme(plot.title = element_text(size = 16, face = "bold"))

### NAM 43

# Phylum order
m = tapply(top30nam43$logFC, top30nam43$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam43$Phylum = factor(as.character(top30nam43$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam43$logFC, top30nam43$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam43$Genus = factor(as.character(top30nam43$Genus), levels = names(m))

p43= ggplot(top30nam43, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 43")+theme(plot.title = element_text(size = 16, face = "bold"))

###NAM 46

# Phylum order
m = tapply(top30nam46$logFC, top30nam46$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam46$Phylum = factor(as.character(top30nam46$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam46$logFC, top30nam46$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam46$Genus = factor(as.character(top30nam46$Genus), levels = names(m))

p46= ggplot(top30nam46, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 46")+theme(plot.title = element_text(size = 16, face = "bold"))

####NAM 48

# Phylum order
m = tapply(top30nam48$logFC, top30nam48$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam48$Phylum = factor(as.character(top30nam48$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam48$logFC, top30nam48$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam48$Genus = factor(as.character(top30nam48$Genus), levels = names(m))

p48= ggplot(top30nam48, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 48")+theme(plot.title = element_text(size = 16, face = "bold"))

#### NAM 5

# Phylum order
m = tapply(top30nam5$logFC, top30nam5$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam5$Phylum = factor(as.character(top30nam5$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam5$logFC, top30nam5$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam5$Genus = factor(as.character(top30nam5$Genus), levels = names(m))

p5= ggplot(top30nam5, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 5")+theme(plot.title = element_text(size = 16, face = "bold"))

### NAM 72

# Phylum order
m = tapply(top30nam72$logFC, top30nam72$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam72$Phylum = factor(as.character(top30nam72$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam72$logFC, top30nam72$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam72$Genus = factor(as.character(top30nam72$Genus), levels = names(m))

p72= ggplot(top30nam72, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 72")+theme(plot.title = element_text(size = 16, face = "bold"))

#NAM 76

# Phylum order
m = tapply(top30nam76$logFC, top30nam76$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam76$Phylum = factor(as.character(top30nam76$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam76$logFC, top30nam76$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam76$Genus = factor(as.character(top30nam76$Genus), levels = names(m))

p76= ggplot(top30nam76, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) +
  ggtitle("NAM 76")+theme(plot.title = element_text(size = 16, face = "bold"))

####NAM 79

# Phylum order
m = tapply(top30nam79$logFC, top30nam79$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30nam79$Phylum = factor(as.character(top30nam79$Phylum), levels = names(m))
# Genus order
m = tapply(top30nam79$logFC, top30nam79$Genus, function(m) max(m))
m = sort(m, TRUE)
top30nam79$Genus = factor(as.character(top30nam79$Genus), levels = names(m))

p79= ggplot(top30nam79, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) +
  ggtitle("NAM 79")+theme(plot.title = element_text(size = 16, face = "bold"))

#YN04C1213

# Phylum order
m = tapply(top30yn04$logFC, top30yn04$Phylum, function(y) max(y))
m = sort(m, TRUE)
top30yn04$Phylum = factor(as.character(top30yn04$Phylum), levels = names(m))
# Genus order
m = tapply(top30yn04$logFC, top30yn04$Genus, function(m) max(m))
m = sort(m, TRUE)
top30yn04$Genus = factor(as.character(top30yn04$Genus), levels = names(m))

pyno4= ggplot(top30yn04, aes(x = Genus, y = logFC, color = Phylum)) + geom_point(size=8) + theme_bw()+xlab("")+
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +theme(axis.text=element_text(size=16),axis.title=element_text(size=16,face="bold"))+
  theme(legend.title = element_text(size=16,face="bold")) +theme(legend.text = element_text(size=16,face="bold")) + 
  ggtitle("NAM 94")+theme(plot.title = element_text(size = 16, face = "bold"))

## plot 
library(ggpubr)
library(reports)
multi.page <- ggarrange(p13,p14,p17,p23,p30,p32,p37,p43,p46,p48,p5,p72,p76,p79,pyno4,
                        nrow = 1, ncol = 2)

multi.page[[8]]#
ggexport(multi.page, filename = "multi.page.ggplot2.png")
p13 
p14
p17
p23
p30
p32
p37
p43
p46
p48
p5
p72
p76
p79
pyno4

# .................................... # ..................................... # ..................................... #..................................

# Chapter 4
## Root Growth Dynamics, Dominant Rhizosphere Bacteria, and Correlation Between Dominant Bacterial Genera and Root Traits Through Brassica Napus Development Taye et al. 2020

setwd("D:/.....")

##

library(ggplot2)
library(phyloseq)
library(ape)
library(biomformat)

#import tax, asv and metadata tables to create phyloseq object. Here the processed final asv and tax table is used. the only update is more sample vaiables (root traits) are added in the metadata

otu_table=read.csv("can.asv.rhizo_gg.csv", sep = ",",row.names = 1)
otu_table=as.matrix(otu_table)

taxonomy=read.csv("can.tax.rhizo_gg.csv",sep="," ,row.names=1)
taxonomy=as.matrix(taxonomy)

metadata=read.csv("can.sam.rhizo_gg.csv",sep="," ,row.names=1)

### import them as phyloseq objects

OTU=otu_table(otu_table,taxa_are_rows=TRUE)
TAX=tax_table(taxonomy)
META=sample_data(metadata)

## cheke that your otu names are consistent across objects

taxa_names(TAX)
taxa_names(OTU)

## make sure files have the same sample names

sample_names(OTU)
sample_names(META)


## Everything looks good --- now proceed to merging them into one phyloseq object

physeq.updated.line.names.new.variable=phyloseq(OTU,TAX,META)

sample_variables(physeq.updated.line.names.new.variable)## check if sample varibales are correctly presented in the phyloseq object

## save the updated phyloseq object

save(physeq.updated.line.names.new.variable,file="2016.physeq.updated.line.names.new.variable.RData")

### Changes in rhizosphere bacterial community associated to three different developmental stages of B. napus

setwd("D:.....")

library(microbiome)
library(ggpubr)
library(knitr)
library(dplyr)

### Dominant taxonomic groups per sample: at different canola lines, growth stages

# subset vegetative stage

veg = subset_samples(physeq.updated.line.names.new.variable, Growth.Stage=="Vegetative")
dp.1 = dominant(veg, level = "Phylum")
dp.2 = dominant(veg, level = "Class")
dp.3 = dominant(veg, level = "Order")
dp.4 = dominant(veg, level = "Family")
dp.5 = dominant(veg, level = "Genus")

## write dominant taxa at vegetative

write.csv(dp.1, 'veg.dominant.phylum.csv')
write.csv(dp.2, 'veg.dominant.class.csv')
write.csv(dp.3, 'veg.dominant.order.csv')
write.csv(dp.4, 'veg.dominant.family.csv')
write.csv(dp.5, 'veg.dominant.genus.csv')

#subset flowering

Flw = subset_samples(physeq.updated.line.names.new.variable, Growth.Stage=="Flowering")

dp.1 = dominant(Flw, level = "Phylum")
dp.2 = dominant(Flw, level = "Class")
dp.3 = dominant(Flw, level = "Order")
dp.4 = dominant(Flw, level = "Family")
dp.5 = dominant(Flw, level = "Genus")

## write dominant taxa at vegetative

write.csv(dp.1, 'Flw.dominant.phylum.csv')
write.csv(dp.2, 'Flw.dominant.class.csv')
write.csv(dp.3, 'Flw.dominant.order.csv')
write.csv(dp.4, 'Flw.dominant.family.csv')
write.csv(dp.5, 'Flw.dominant.genus.csv')

# subset maturity

mtu = subset_samples(physeq.updated.line.names.new.variable, Growth.Stage=="Maturity")

dp.1 = dominant(mtu, level = "Phylum")
dp.2 = dominant(mtu, level = "Class")
dp.3 = dominant(mtu, level = "Order")
dp.4 = dominant(mtu, level = "Family")
dp.5 = dominant(mtu, level = "Genus")

## write dominant taxa at vegetative
write.csv(dp.1, 'mtu.dominant.phylum.csv')
write.csv(dp.2, 'mtu.dominant.class.csv')
write.csv(dp.3, 'mtu.dominant.order.csv')
write.csv(dp.4, 'mtu.dominant.family.csv')
write.csv(dp.5, 'mtu.dominant.genus.csv')

## top ten taxa by mean abundance across growth stage

setwd("D:/out_puts/phyloseq/with_green_genes_taxa/Ampvis_visualization/Mean_abundance_growth_stage_ampvis")
setwd("D:/out_puts/phyloseq/with_green_genes_taxa/Ampvis_visualization")

library(ampvis2)
library(radiant) ## this is for rownames_to_column function
library(tibble)## for as.table function
library(phyloseq)

### make ampvis data object from phyloseq

ps_data = physeq.updated.line.names.new.variable ## my already created phyloseq object

## convert phyloseq object into ampvis object 
# Load data into ampvis

otu.table <- otu_table(ps_data)
tax.table <- tax_table(ps_data)
otu.table <- as.data.frame(otu.table)
tax.table <- as.data.frame(tax.table)
meta.table <- sample_data(ps_data)
meta.table <- as.tibble(meta.table)
write.csv(otu.table, 'otu.table.amp.csv')## and add OTU column name and reload
write.csv(tax.table, 'tax.table.amp.csv')

## load the OTUID name otu and tax files

otu.table.amp <- read.csv("otu.table.amp.csv")
tax.table.amp <- read.csv("tax.table.amp.csv")

#merge the two as datafrmae

otu.table.tax <- merge.data.frame(otu.table.amp, tax.table.amp, by= "OTU")
head(rownames(otu.table.tax))
head(rownames(otu.table.tax))

class(meta.table)
head(rownames(otu.table))
head(rownames(tax.table))
head(colnames(otu.table))
head(colnames(tax.table))

# Creat the Ampvis object

ps1.av2 <- amp_load(otutable = otu.table.tax, metadata = meta.table)

##cummulative rank abundance of genera at three growth stages

amp_rankabundance(ps1.av2, group_by = "Growth_Stage")

# Top ten most abudnant Phyla based on relative mean read abundance
## Phyla - across canola growth stages
# reordr the levels of growthsatge so that it goes from veg to mat in the plots

ps1.av2$metadata$Growth_Stage <- factor(ps1.av2$metadata$Growth_Stage, levels = c("Vegetative", "Flowering", "Maturity"))

# plot top 10 phylum across growth stages

amp_heatmap(ps1.av2, group_by = "Growth_Stage", plot_values = TRUE) +
  theme(axis.text.x = element_text(angle = 90, size=12, vjust = 1),
        axis.text.y = element_text(size=12),axis.ticks.x = element_blank(), axis.ticks.y = element_blank(), 
        legend.position="right", legend.text = element_text(size = 12), legend.title = element_text(size = 12))## across                                                                                  canola growth stages## top 10 phyal

## Top 10 class

# plot top 10 class across growth stages
amp_heatmap(ps1.av2, group_by = "Growth_Stage", tax_aggregate = "Class", tax_add = "Phylum",plot_values = TRUE) +
  theme(axis.text.x = element_text(angle = 90, size=12, vjust = 1),axis.ticks.x = element_blank(), axis.ticks.y = element_blank(),
        axis.text.y = element_text(size=12),
        legend.position="right", legend.text = element_text(size = 12), legend.title = element_text(size = 12))## across                                                                                                      canola growth stages

## Top 10 Family

# plot top 10 Family across growth stages

amp_heatmap(ps1.av2, group_by = "Growth_Stage", tax_aggregate = "Family", tax_add = "Phylum",plot_values = TRUE) +
  theme(axis.text.x = element_text(angle = 90, size=12, vjust = 1),axis.ticks.x = element_blank(), axis.ticks.y = element_blank(),
        axis.text.y = element_text(size=12),
        legend.position="right", legend.text = element_text(size = 12), legend.title = element_text(size = 12))## across                                                                                                      canola growth stages

## Top 10 Genus

# plot top 10 genera across growth stages

amp_heatmap(ps1.av2, group_by = "Growth_Stage", tax_aggregate = "Genus", tax_add = "Phylum",plot_values = TRUE) +
  theme(axis.text.x = element_text(angle = 90, size=12, vjust = 1),
        axis.text.y = element_text(size=12),axis.ticks.x = element_blank(), axis.ticks.y = element_blank(),
        legend.position="right", legend.text = element_text(size = 12), legend.title = element_text(size = 12))## across                                                                                                       canola growth stages

############## Frequency of each taxa at different growth stages #############

setwd("D:/out_puts/phyloseq/with_green_genes_taxa/Ampvis_visualization/Frequency_of_taxa_by_growth_stage")

# subset each growth stage

veg
Flw
mtu

# remove zero sum taxa

veg= prune_taxa(taxa_sums(veg) > 0, veg)
Flw= prune_taxa(taxa_sums(Flw) > 0, Flw)
mtu= prune_taxa(taxa_sums(mtu) > 0, mtu)

## frequecny  of phyla in each growth stage

num.taxa.py.vg = table(tax_table(veg)[,"Phylum"])
num.taxa.py.fl = table(tax_table(Flw)[,"Phylum"])
num.taxa.py.mt = table(tax_table(mtu)[,"Phylum"])

# create a dataframe

num.taxa.py.vg = as.data.frame(num.taxa.py.vg)
head(num.taxa.py.vg)

num.taxa.py.fl = as.data.frame(num.taxa.py.fl)
head(num.taxa.py.fl)

num.taxa.py.mt = as.data.frame(num.taxa.py.mt)
head(num.taxa.py.mt)

## save as csv

write.csv(num.taxa.py.vg, 'Phylum_frequency_vegeative.csv')
write.csv(num.taxa.py.fl, 'Phylum_frequency_flowering.csv')
write.csv(num.taxa.py.mt, 'Phylum_frequency_maturity.csv')

## Frequency of Class at each growth stage

num.taxa.cls.vg = table(tax_table(veg)[,"Class"])
num.taxa.cls.fl = table(tax_table(Flw)[,"Class"])
num.taxa.cls.mt = table(tax_table(mtu)[,"Class"])

# create a dataframe

num.taxa.cls.vg = as.data.frame(num.taxa.cls.vg)
head(num.taxa.cls.vg)

num.taxa.cls.fl = as.data.frame(num.taxa.cls.fl)
head(num.taxa.cls.fl)

num.taxa.cls.mt = as.data.frame(num.taxa.cls.mt)
head(num.taxa.cls.mt)

## save as csv

write.csv(num.taxa.cls.vg, 'Class_frequency_vegeative.csv')
write.csv(num.taxa.cls.fl, 'Class_frequency_flowering.csv')
write.csv(num.taxa.cls.mt, 'Class_frequency_maturity.csv')

## Frequence of order at eahc growth stage

num.taxa.ord.vg = table(tax_table(veg)[,"Order"])
num.taxa.ord.fl = table(tax_table(Flw)[,"Order"])
num.taxa.ord.mt = table(tax_table(mtu)[,"Order"])

# create a dataframe

num.taxa.ord.vg = as.data.frame(num.taxa.ord.vg)
head(num.taxa.ord.vg)

num.taxa.ord.fl = as.data.frame(num.taxa.ord.fl)
head(num.taxa.ord.fl)

num.taxa.ord.mt = as.data.frame(num.taxa.ord.mt)
head(num.taxa.ord.mt)

## save as csv

write.csv(num.taxa.ord.vg, 'Order_frequency_vegeative.csv')
write.csv(num.taxa.ord.fl, 'Order_frequency_flowering.csv')
write.csv(num.taxa.ord.mt, 'Order_frequency_maturity.csv')

## Frequency of Family at eahc growth stage

num.taxa.fam.vg = table(tax_table(veg)[,"Family"])
num.taxa.fam.fl = table(tax_table(Flw)[,"Family"])
num.taxa.fam.mt = table(tax_table(mtu)[,"Family"])

# create a dataframe

num.taxa.fam.vg = as.data.frame(num.taxa.fam.vg)
head(num.taxa.fam.vg)

num.taxa.fam.fl = as.data.frame(num.taxa.fam.fl)
head(num.taxa.fam.fl)

num.taxa.fam.mt = as.data.frame(num.taxa.fam.mt)
head(num.taxa.fam.mt)

## save as csv

write.csv(num.taxa.fam.vg, 'Family_frequency_vegeative.csv')
write.csv(num.taxa.fam.fl, 'Family_frequency_flowering.csv')
write.csv(num.taxa.fam.mt, 'Family_frequency_maturity.csv')

## Frequency of genus at eahc growth stage

num.taxa.gns.vg = table(tax_table(veg)[,"Genus"])
num.taxa.gns.fl = table(tax_table(Flw)[,"Genus"])
num.taxa.gns.mt = table(tax_table(mtu)[,"Genus"])

# create a dataframe

num.taxa.gns.vg = as.data.frame(num.taxa.gns.vg)
head(num.taxa.gns.vg)

num.taxa.gns.fl = as.data.frame(num.taxa.gns.fl)
head(num.taxa.gns.fl)

num.taxa.gns.mt = as.data.frame(num.taxa.gns.mt)
head(num.taxa.gns.mt)

## save as csv

write.csv(num.taxa.gns.vg, 'Genus_frequency_vegeative.csv')
write.csv(num.taxa.gns.fl, 'Genus_frequency_flowering.csv')
write.csv(num.taxa.gns.mt, 'Genus_frequency_maturity.csv')

##### ............................#### ...............##### ......................... #####

## Correlation between canola rhizosphere dominat genera and previously identified canola root-growth promoting 
#bacteria and canola root traits under field condition at different growth stages

setwd("D:/out_puts/phyloseq/with_green_genes_taxa/Ampvis_visualization/Root_traits_canola_PGPB")

## top ten genera in all growth stages based on percent of frequency of occurence

# Pseudomonas 
# Flavobacterium 
# Lysobacter 
# Rhodoplanes 
# Pedobacter 
# Vibrio 
# Stenotrophomonas 
# Arthrobacter 


## top ten genera in all growth stages based on percent mean relative abundance 

# Arthrobacter
# Stenotrophomonas
# Gluconacetobacter
# Skermanella
# Pseudomonas
# Bradyrhizobium 


## Total dominat genera (based on both frequence and mean relative abundance): "Use this Dominat genera for correlation"

# Pseudomonas 
# Flavobacterium 
# Lysobacter 
# Rhodoplanes 
# Pedobacter 
# Vibrio 
# Stenotrophomonas 
# Arthrobacter
# Gluconacetobacter
# Skermanella
# Bradyrhizobium 

## Canola PGPR previously indetified in canola (identified by Jorge et al 2019 and Bertrand et al., 2000): 
## " use this canola PGPR for coreelation" 

# Paenibacillus 
# Pantoea 
# Pseudomonas 
# Rhizobium 
# Stenotrophomonas 
# agrococcus 
# Bacillus 
# Leifsonia 
# Microbacterium 
# Rhodococcus 
# Xanthomonas
# Variovorax 
# Agrobacterium 
# Phyllobacterium

####load libraries 

library(ggplot2)
library(phyloseq)
library(ape)
library(biomformat)
library(microbiome)
library(ggpubr)
library(knitr)
library(dplyr)
library(ampvis2)
library(radiant) ## this is for rownames_to_column function
library(tibble)## for as.table function
library(microbiomeSeq)


# load the phyloseq object containing the  2016 rhizosphere bacterial dataset

load("D:/out_puts/phyloseq/with_green_genes_taxa/2016.physeq.updated.line.names.new.variable.RData")

#### subset dominat genera based on both frequence and mean relative abundance


physeq.top.10.genera.root.trait =  subset_taxa(physeq.updated.line.names.new.variable, Genus=="Pseudomonas"| Genus=="Flavobacterium"|
                                                 Genus=="Lysobacter"|Genus=="Rhodoplanes"|Genus=="Pedobacter"|Genus=="Vibrio"|Genus=="Stenotrophomonas"
                                               |Genus=="Arthrobacter"|Genus=="Gluconacetobacter"|Genus=="Skermanella"|Genus=="Bradyrhizobium")

## remove zero sum taxa

physeq.top.10.genera.root.trait= prune_taxa(taxa_sums(physeq.top.10.genera.root.trait) > 0, physeq.top.10.genera.root.trait)

## Using the taxa.env.correlation function of microbiome seq, lets do correlation between genera and root traits

sample_variables(physeq.top.10.genera.root.trait) ## get the sample variable names

physeq <- taxa_level(physeq.top.10.genera.root.trait, "Genus") ### tax_glom at geneus level with genus name indicated


# do the correlation at each growth stage (veg, flw and Mat)

root.top.10.taxa.cor <- taxa.env.correlation(physeq, grouping_column = "Growth.Stage", method = "pearson", 
                                             pvalue.threshold = 0.05, padjust.method = "BH", adjustment = 4, num.taxa = 173, 
                                             select.variables = c("PercentExtraFRL", "PercentFRL" , "Root.Biomass", "Total_RL","CoarseRL" ,"TFRL2mm", "ExtraFRL", "FRL"))


# reorder growth stage levels

root.top.10.taxa.cor$Type <- factor(root.top.10.taxa.cor$Type, levels = c("Vegetative", "Flowering", "Maturity"))

p <- plot_taxa_env(root.top.10.taxa.cor)
p = p + xlab("Growth stages")
p = p + theme(axis.title.x = element_text(size = 12, face = "bold"), axis.text.x=element_text(size = 12))
p = p + theme(axis.text.y=element_text(size = 12))
p = p + theme(axis.text.y=element_text(angle = 45))

print(p) #  p-values for multiple comparisions using Benjamin and Hochberg: adjustment = 4 - adjust Taxa (row on the correlation plot)

#  save csv file containing the correlation and p values

write.csv(root.top.10.taxa.cor, 'dominat.genera.root.trait.corr.with.pvalues.csv')

### Subset Canola PGPR previously indetified in canola (identified by Jorge et al 2019 and Bertrand et al., 2000)
## of the 14 pgpr reported 12 of thee genera were present in our dataset 

physeq.pgpr.genera.root.trait =  subset_taxa(physeq.updated.line.names.new.variable, Genus=="Paenibacillus"| Genus=="Pantoea"|
                                               Genus=="Pseudomonas"|Genus=="Rhizobium"|Genus=="Stenotrophomonas"|Genus=="agrococcus"|Genus=="Bacillus"
                                             |Genus=="Leifsonia"|Genus=="Microbacterium"|Genus=="Rhodococcus"|Genus=="Xanthomonas"|Genus=="Variovorax"
                                             |Genus=="Agrobacterium"|Genus=="Phyllobacterium")

## remove zero sum taxa

physeq.pgpr.genera.root.trait= prune_taxa(taxa_sums(physeq.pgpr.genera.root.trait) > 0, physeq.pgpr.genera.root.trait)

## Using the taxa.env.correlation function of microbiome seq, lets do correlation between genera and root traits

sample_variables(physeq.pgpr.genera.root.trait) ## get the sample variable names

physeq1 <- taxa_level(physeq.pgpr.genera.root.trait, "Genus") ### tax_glom at geneus level with genus name indicated


# do the correlation at each growth stage (veg, flw and Mat)

root.pgpr.taxa.cor <- taxa.env.correlation(physeq1, grouping_column = "Growth.Stage", method = "pearson", 
                                           pvalue.threshold = 0.05, padjust.method = "BH", adjustment = 4, num.taxa = 173, 
                                           select.variables = c("PercentExtraFRL", "PercentFRL" , "Root.Biomass", "Total_RL","CoarseRL" ,"TFRL2mm", "ExtraFRL", "FRL"))


# reorder growth stage levels

root.pgpr.taxa.cor$Type <- factor(root.pgpr.taxa.cor$Type, levels = c("Vegetative", "Flowering", "Maturity"))

p1 <- plot_taxa_env(root.pgpr.taxa.cor)
p1 = p1 + xlab("Growth stages")
p1 = p1 + theme(axis.title.x = element_text(size = 12, face = "bold"), axis.text.x=element_text(size = 12))
p1 = p1 + theme(axis.text.y=element_text(size = 12))
p1 = p1 + theme(axis.text.y=element_text(angle = 45))

print(p1) #  p-values for multiple comparisions using Benjamin and Hochberg: adjustment = 4 - adjust Taxa (row on the correlation plot)

#  save csv file containing the correlation and p values

write.csv(root.pgpr.taxa.cor, 'pgprt.genera.root.trait.corr.with.pvalues.csv')

### Correlation of dominat and canola gprb with environmental and soil characteristics


physeq.top.10.genera.root.trait =  subset_taxa(physeq.updated.line.names.new.variable, Genus=="Pseudomonas"| Genus=="Flavobacterium"|
                                                 Genus=="Lysobacter"|Genus=="Rhodoplanes"|Genus=="Pedobacter"|Genus=="Vibrio"|Genus=="Stenotrophomonas"
                                               |Genus=="Arthrobacter"|Genus=="Gluconacetobacter"|Genus=="Skermanella"|Genus=="Bradyrhizobium")

## remove zero sum taxa

physeq.top.10.genera.root.trait= prune_taxa(taxa_sums(physeq.top.10.genera.root.trait) > 0, physeq.top.10.genera.root.trait)

physeq <- taxa_level(physeq.top.10.genera.root.trait, "Genus") ### tax_glom at geneus level with genus name indicated

# do the correlation at each growth stage (veg, flw and Mat)

envi.top.10.taxa.cor <- taxa.env.correlation(physeq, grouping_column = "Growth.Stage", method = "pearson", 
                                             pvalue.threshold = 0.05, padjust.method = "BH", adjustment = 4, num.taxa = 173, 
                                             select.variables = c("T.WP", "PPT.WP", "T.SD","PPT.SD" ,"Soil.moisture", "Soil.water.content"))


# reorder growth stage levels
envi.top.10.taxa.cor$Type <- factor(envi.top.10.taxa.cor$Type, levels = c("Vegetative", "Flowering", "Maturity"))

p2 <- plot_taxa_env(envi.top.10.taxa.cor)
p2 = p2 + xlab("Growth stages")
p2 = p2 + theme(axis.title.x = element_text(size = 12, face = "bold"), axis.text.x=element_text(size = 12))
p2 = p2 + theme(axis.text.y=element_text(size = 12))
p2 = p2 + theme(axis.text.y=element_text(angle = 45))

print(p2) #  p-values for multiple comparisions using Benjamin and Hochberg: adjustment = 4 - adjust Taxa (row on the correlation plot)

#  save csv file containing the correlation and p values

write.csv(envi.top.10.taxa.cor, 'ominat.genera.envi.soil.variable.corr.with.pvalues.csv')

##

physeq.pgpr.genera.root.trait =  subset_taxa(physeq.updated.line.names.new.variable, Genus=="Paenibacillus"| Genus=="Pantoea"|
                                               Genus=="Pseudomonas"|Genus=="Rhizobium"|Genus=="Stenotrophomonas"|Genus=="agrococcus"|Genus=="Bacillus"
                                             |Genus=="Leifsonia"|Genus=="Microbacterium"|Genus=="Rhodococcus"|Genus=="Xanthomonas"|Genus=="Variovorax"
                                             |Genus=="Agrobacterium"|Genus=="Phyllobacterium")

## remove zero sum taxa

physeq.pgpr.genera.root.trait= prune_taxa(taxa_sums(physeq.pgpr.genera.root.trait) > 0, physeq.pgpr.genera.root.trait)

## Using the taxa.env.correlation function of microbiome seq, lets do correlation between genera and root traits

sample_variables(physeq.pgpr.genera.root.trait) ## get the sample variable names

physeq1 <- taxa_level(physeq.pgpr.genera.root.trait, "Genus") ### tax_glom at geneus level with genus name indicated


# do the correlation at each growth stage (veg, flw and Mat)

envi.pgpr.taxa.cor <- taxa.env.correlation(physeq1, grouping_column = "Growth.Stage", method = "pearson", 
                                           pvalue.threshold = 0.05, padjust.method = "BH", adjustment = 4, num.taxa = 173, 
                                           select.variables = c("T.WP", "PPT.WP", "T.SD","PPT.SD" ,"Soil.moisture", "Soil.water.content"))


# reorder growth stage levels

envi.pgpr.taxa.cor$Type <- factor(envi.pgpr.taxa.cor$Type, levels = c("Vegetative", "Flowering", "Maturity"))

p3 <- plot_taxa_env(envi.pgpr.taxa.cor)
p3 = p3 + xlab("Growth stages")
p3 = p3 + theme(axis.title.x = element_text(size = 12, face = "bold"), axis.text.x=element_text(size = 12))
p3 = p3 + theme(axis.text.y=element_text(size = 12))
p3 = p3 + theme(axis.text.y=element_text(angle = 45))

print(p3) #  p-values for multiple comparisions using Benjamin and Hochberg: adjustment = 4 - adjust Taxa (row on the correlation plot)

#  save csv file containing the correlation and p values

write.csv(envi.pgpr.taxa.cor, 'pgprt.genera.envi.soil.variables.corr.with.pvalues.csv')


#### ....... ####


### Ratio among the three dominat phyla: how the ratio changes across growth stages + correlation between acidobacteria:proteobactera to root traits

## Agglomerate taxa to phylum level

Physeq.phylum = tax_glom(physeq.updated.line.names.new.variable, taxrank = "Phylum")

##subset the two most abundant phyla: Acidobacteria and proteobacteria

Physeq.phylum.aciso.proteo = subset_taxa(Physeq.phylum, Phylum=="Acidobacteria"|Phylum=="Proteobacteria")

# get abudnace of Acidobacteria at the three growth stages

## modify the source microbiomer r function (brratio) that calculate the ratio Bacteriodetes/Firmicutes

library(microbiome)

bfratio <- function (x) {
  a <- transform(aggregate_taxa(x, level = "Phylum"), "compositional")
  b <- abundances(a)["Proteobacteria",]
  f <- abundances(a)["Acidobacteria",]  
  #data.frame(Bacteroidetes = b, Firmicutes = f, Ratio = b/f)
  b/f
}

## subset vegetative, flowering and maturity stage samples

Physeq.phylum.aciso.proteo.veg = subset_samples(Physeq.phylum.aciso.proteo, Growth.Stage=="Vegetative")
Physeq.phylum.aciso.proteo.flw = subset_samples(Physeq.phylum.aciso.proteo, Growth.Stage=="Flowering")
Physeq.phylum.aciso.proteo.mat = subset_samples(Physeq.phylum.aciso.proteo, Growth.Stage=="Maturity")

## calculate Proteobacteria/Acidobacteria ratio

mean(bfratio(Physeq.phylum.aciso.proteo.veg)) #  5.094917

pro.acid.ratio= bfratio(Physeq.phylum.aciso.proteo.flw) # there is one sample (ORIG1CS1143NW06000c) with inf value (no acidobacteria). Remove it to calculate mean

pro.acid.ratio = pro.acid.ratio[!is.infinite(pro.acid.ratio)]
mean(pro.acid.ratio)# flowering sateg P/A ratio =  10.26853

mean(bfratio(Physeq.phylum.aciso.proteo.mat)) # 7.939675

## save proteobacteria:Acidobacteria as a datarame and correlate with fine root traits.

veg.Pro.acido.ratio = bfratio(Physeq.phylum.aciso.proteo.veg)

write.csv(veg.Pro.acido.ratio, 'veg.Pro.acido.ratio.csv')## add column nammes and re-read to merge with metadata
veg.Pro.acido.ratio= read.csv("veg.Pro.acido.ratio.csv")
veg.Pro.acido.ratio= as.data.frame(veg.Pro.acido.ratio)
metav = sample_data(Physeq.phylum.aciso.proteo.veg)
metav = as.data.frame(metav)
metav$P.A.ratio = veg.Pro.acido.ratio$P.A.Ratio ## add the P:A ratio to the meta data table containing the root traits.

#
flw.pro.acid.ratio= bfratio(Physeq.phylum.aciso.proteo.flw)
write.csv(flw.pro.acid.ratio, 'flw.Pro.acido.ratio.csv')## add column nammes and re-read to merge with metadata
flw.pro.acid.ratio= read.csv("flw.Pro.acido.ratio.csv")
flw.pro.acid.ratio= as.data.frame(flw.pro.acid.ratio)
metaf = sample_data(Physeq.phylum.aciso.proteo.flw)
metaf = as.data.frame(metaf)
metaf$P.A.ratio = flw.pro.acid.ratio$P.A.Ratio ## add the P:A ratio to the meta data table containing the root traits.

##
mat.pro.acid.ratio= bfratio(Physeq.phylum.aciso.proteo.mat)
write.csv(mat.pro.acid.ratio, 'mat.pro.acid.ratio.csv')## add column nammes and re-read to merge with metadata
mat.pro.acid.ratio= read.csv("mat.pro.acid.ratio.csv")
mat.pro.acid.ratio= as.data.frame(mat.pro.acid.ratio)
metam = sample_data(Physeq.phylum.aciso.proteo.mat)
metam = as.data.frame(metam)
metam$P.A.ratio = mat.pro.acid.ratio$P.A.Ratio ## add the P:A ratio to the meta data table containing the root traits.

#### plot correlation between root traits and P:A ratio

library(ggpubr)

p= ggscatter(metav, x = "Total_RL", y = "P.A.ratio", 
             add = "reg.line", conf.int = TRUE,
             cor.coef = TRUE, cor.method = "pearson",
             xlab="Total root length", ylab = "Proteobacteria:Acidobacteria")## scater plot with trend line and correlation                                                                                 coefficient with p value 
p= p +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p=p+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p


##

p1= ggscatter(metav, x = "CoarseRL", y = "P.A.ratio", 
              add = "reg.line", conf.int = TRUE,
              cor.coef = TRUE, cor.method = "pearson",
              xlab="Coarse root length", ylab = "")## scater plot with trend line and correlation coefficient with p value 
p1= p1 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p1=p1+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p1

##

p2= ggscatter(metav, x = "TFRL2mm", y = "P.A.ratio", 
              add = "reg.line", conf.int = TRUE,
              cor.coef = TRUE, cor.method = "pearson",
              xlab="Total Fine root length", ylab = "Proteobacteria:Acidobacteria")## scater plot with trend line and                                                                                          correlation coefficient with p value 
p2= p2 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p2=p2+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p2

##

p3= ggscatter(metav, x = "Root.Biomass", y = "P.A.ratio", 
              add = "reg.line", conf.int = TRUE,
              cor.coef = TRUE, cor.method = "pearson",
              xlab="Root biomass", ylab = "")## scater plot with trend line and correlation coefficient with p value 
p3= p3 +theme(axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
p3=p3+theme(axis.text.x = element_text(size=12), axis.text.y=element_text(size = 12))
p3

###
ggarrange(p, p1, p2, p3, labels = c("A", "B","C", "D"))

#### Actinobacteria: Acidobacteria ratio

aaratio <- function (x) {
  a <- transform(aggregate_taxa(x, level = "Phylum"), "compositional")
  b <- abundances(a)["Actinobacteria",]
  f <- abundances(a)["Acidobacteria",]  
  #data.frame(Bacteroidetes = b, Firmicutes = f, Ratio = b/f)
  b/f
}

##

Physeq.phylum.acti.acido = subset_taxa(Physeq.phylum, Phylum=="Acidobacteria"|Phylum=="Actinobacteria")

## subset vegetative, flowering and maturity stage samples
Physeq.phylum.acti.acido.veg = subset_samples(Physeq.phylum.acti.acido, Growth.Stage=="Vegetative")
Physeq.phylum.acti.acido.flw = subset_samples(Physeq.phylum.acti.acido, Growth.Stage=="Flowering")
Physeq.phylum.acti.acido.mat = subset_samples(Physeq.phylum.acti.acido, Growth.Stage=="Maturity")
##

mean(aaratio(Physeq.phylum.acti.acido.veg))#2.027333
acti.acid.ratio = aaratio(Physeq.phylum.acti.acido.flw)# there is one sample (ORIG1CS1143NW06000c) with inf value (no acidobacteria). Remove it to calculate mean
acti.acid.ratio = acti.acid.ratio[!is.infinite(acti.acid.ratio)]
mean(acti.acid.ratio)#3.530675 #flowering

mean(aaratio(Physeq.phylum.acti.acido.mat))# 4.224899

############## genus level P:A RATIO ################################

Physeq.class = tax_glom(physeq.updated.line.names.new.variable, taxrank = "Genus")

##subset the two most abundant phyla: Acidobacteria and proteobacteria

Physeq.steno_arthro = subset_taxa(Physeq.class, Genus=="Stenotrophomonas"|Genus=="Arthrobacter")
Physeq.skerm_arthro = subset_taxa(Physeq.class, Genus=="Skermanella"|Genus=="Arthrobacter")
Physeq.Pseudo_arthro = subset_taxa(Physeq.class, Genus=="Bradyrhizobium"|Genus=="Arthrobacter")


#get_taxa_unique(Physeq.class.aciso.proteo, taxonomic.rank=rank_names(Physeq.class.aciso.proteo)[3], errorIfNULL=TRUE)

# get abudnace of Acidobacteria at the three growth stages

## modify the source microbiomer r function (brratio) that calculate the ratio Bacteriodetes/Firmicutes
library(microbiome)
bfratio <- function (x) {
  a <- transform(aggregate_taxa(x, level = "Genus"), "compositional")
  b <- abundances(a)["Bradyrhizobium",]
  f <- abundances(a)["Arthrobacter",]  
  #data.frame(Bacteroidetes = b, Firmicutes = f, Ratio = b/f)
  b/f
}
## subset vegetative, flowering and maturity stage samples
Physeq.steno_arthro.veg = subset_samples(Physeq.steno_arthro, Growth.Stage=="Vegetative")
Physeq.steno_arthro.flw = subset_samples(Physeq.steno_arthro, Growth.Stage=="Flowering")
Physeq.steno_arthro.mat = subset_samples(Physeq.steno_arthro, Growth.Stage=="Maturity")

## calculate Proteobacteria/Acidobacteria ratio
alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.veg)
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# vegetative sateg steno/Arthro ratio =  0.4964963

alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.flw) # 
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# flowering sateg steno/Arthro ratio =  0.9993716

alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.mat) # 
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# Maturity sateg steno/Arthro ratio =  0.5879472

##

## subset vegetative, flowering and maturity stage samples
Physeq.steno_arthro.veg = subset_samples(Physeq.skerm_arthro, Growth.Stage=="Vegetative")
Physeq.steno_arthro.flw = subset_samples(Physeq.skerm_arthro, Growth.Stage=="Flowering")
Physeq.steno_arthro.mat = subset_samples(Physeq.skerm_arthro, Growth.Stage=="Maturity")

## calculate Proteobacteria/Acidobacteria ratio
alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.veg)
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# vegetative sateg Skerm/Arthro ratio =  1.152353

alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.flw) # 
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# flowering sateg Skerm/Arthro ratio =  0.4422514

alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.mat) # 
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# Maturity sateg steno/Arthro ratio =  0.4416642

##

## subset vegetative, flowering and maturity stage samples
Physeq.steno_arthro.veg = subset_samples(Physeq.Pseudo_arthro, Growth.Stage=="Vegetative")
Physeq.steno_arthro.flw = subset_samples(Physeq.Pseudo_arthro, Growth.Stage=="Flowering")
Physeq.steno_arthro.mat = subset_samples(Physeq.Pseudo_arthro, Growth.Stage=="Maturity")

## calculate Proteobacteria/Acidobacteria ratio
alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.veg)
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# vegetative sateg Brady/Arthro ratio =  0.9194767

alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.flw) # 
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# flowering sateg Brady/Arthro ratio =  0.3059721

alphapro.steno_arthro.ratio= bfratio(Physeq.steno_arthro.mat) # 
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.nan(alphapro.steno_arthro.ratio)]
alphapro.steno_arthro.ratio = alphapro.steno_arthro.ratio[!is.infinite(alphapro.steno_arthro.ratio)]
mean(alphapro.steno_arthro.ratio)# Maturity sateg Brady/Arthro ratio =  0.2874067


#### ........................................ #### ...................... #### ..................... ####

## Root growth dynamics 

setwd("D:/out_puts/phyloseq/with_green_genes_taxa/Ampvis_visualization/Root_traits_canola_PGPB")

#load the meta data with root traits

head(can_sam_rhizo_gg)
data.root = as.data.frame(can.sam.rhizo_gg)

##
#hist(data.root$Total_RL)
#hist(sqrt(data.root$FRL)) ## square root transformation normalize the data
#hist(sqrt(data.root$TFRL2mm)) 
#hist(sqrt(data.root$CoarseRL)) 
#hist(sqrt(data.root$ExtraFRL)) 
#hist(sqrt(data.root$Total_RL)) 

#### visualize the data: by plotting
#with(data.root, cor(FRL, TFRL2mm))
#ggplot(data=data.root, aes(x=Growth.Stage, y=FRL))+ geom_point()

## fit generalized additive models for each root lenght trait to evaluate root dynaics throughout the ten sampling weeks
##fine root length:FRL
## GAM model using whole data and sampling point
# fine root length
#Growth= as.factor(data.root$Growth.Stage)

library(mgcv)

#library(gam)
#week=as.factor(data.root$Week)

gm1 <- gam(FRL ~ s(Week),data=data.root)
plot(gm1)
summary(gm1)

pd_0 <- data.frame(Week=seq(1,10,length=477))

pv_0 <- predict(gm1,newdata=pd_0,type="link",se=TRUE)
plot(data.root$Week,data.root$FRL,type="p",xlab="Sampling Week",ylab="Total Fine Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_0[,1]),pv_0$fit,lwd=3,col="blue")
lines((pd_0[,1]),pv_0$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_0[,1]),pv_0$fit-pv_0$se.fit *1.96,lwd=1,lty=3)

plt= list(data.root$Plot)
gmx <- gamm(FRL ~ s(Week),random=list(plt=~1),data=data.root)

##TFRL2mm : fine root length 0.5 to 2mm

gm2 <- gam(TFRL2mm ~ s(Week)+s(plot,bs="re"),data=data.root)


gm2 <- gam(TFRL2mm ~ s(Week),data=data.root)
plot(gm2)
summary(gm2)

pd_1 <- data.frame(Week=seq(1,10,length=477))

pv_1 <- predict(gm2,newdata=pd_1,type="link",se=TRUE)
plot(data.root$Week,data.root$TFRL2mm,type="p",xlab="Sampling Week",ylab="Fine Root Length (0.5-2 mm) (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_1[,1]),pv_1$fit,lwd=3,col="blue")
lines((pd_1[,1]),pv_1$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_1[,1]),pv_1$fit-pv_0$se.fit *1.96,lwd=1,lty=3)

### ExtraFRL : extra fine root length (0 -0.5mm)

gm3 <- gam(ExtraFRL ~ s(Week),data=data.root)
plot(gm3)
summary(gm3)

pd_2 <- data.frame(Week=seq(1,10,length=477))

pv_2 <- predict(gm3,newdata=pd_2,type="link",se=TRUE)
plot(data.root$Week,data.root$ExtraFRL,type="p",xlab="Sampling Week",ylab="Extra Fine Root Length (0-0.5 mm) (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_2[,1]),pv_2$fit,lwd=3,col="blue")
lines((pd_2[,1]),pv_2$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_2[,1]),pv_2$fit-pv_0$se.fit *1.96,lwd=1,lty=3)

##CoarseRL : coarse rot length (cm)

gm4 <- gam(CoarseRL ~ s(Week),data=data.root)
plot(gm4)
summary(gm4)

pd_3 <- data.frame(Week=seq(1,10,length=477))

pv_3 <- predict(gm4,newdata=pd_3,type="link",se=TRUE)
plot(data.root$Week,data.root$CoarseRL,type="p",xlab="Sampling Week",ylab="Coarse root length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_3[,1]),pv_3$fit,lwd=3,col="blue")
lines((pd_3[,1]),pv_3$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_3[,1]),pv_3$fit-pv_0$se.fit *1.96,lwd=1,lty=3)

##Total_RL: 

gm5 <- gam(Total_RL ~ s(Week),data=data.root)
plot(gm5)
summary(gm5)

pd_4 <- data.frame(Week=seq(1,10,length=477))

pv_4 <- predict(gm5,newdata=pd_4,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="Total root length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_4[,1]),pv_4$fit,lwd=3,col="blue")
lines((pd_4[,1]),pv_4$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_4[,1]),pv_4$fit-pv_0$se.fit *1.96,lwd=1,lty=3)

##Root biomass

rootbiomass= data.root$`Root Biomass`
gm6 <- gam(rootbiomass ~ s(Week),data=data.root)
plot(gm6)
summary(gm6)

pd_5 <- data.frame(Week=seq(1,10,length=477))

pv_5 <- predict(gm6,newdata=pd_5,type="link",se=TRUE)
plot(data.root$Week,data.root$`Root Biomass`,type="p",xlab="Sampling Week",ylab="Root Biomass (g)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_5[,1]),pv_5$fit,lwd=3,col="blue")
lines((pd_5[,1]),pv_5$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_5[,1]),pv_5$fit-pv_0$se.fit *1.96,lwd=1,lty=3)

#### plot together 

par(mfrow = c(2,2),xpd = NA,
    oma = c(4,4,2,0),
    mar = c(2,3,2,3))

plot(data.root$Week,data.root$FRL,type="p",xlab="",ylab ="Total Fine Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
p+lines((pd_0[,1]),pv_0$fit,lwd=3,col="blue")
lines((pd_0[,1]),pv_0$fit+pv_0$se.fit*1.96,lwd=1,lty=3)
lines((pd_0[,1]),pv_0$fit-pv_0$se.fit *1.96,lwd=1,lty=3)
text(1,180,"(a)",cex=1.5)

plot(data.root$Week,data.root$TFRL2mm,type="p",xlab="",ylab="Fine Root Length (0.5-2 mm) (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_1[,1]),pv_1$fit,lwd=3,col="blue")
lines((pd_1[,1]),pv_1$fit+pv_1$se.fit*1.96,lwd=1,lty=3)
lines((pd_1[,1]),pv_1$fit-pv_1$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(b)",cex=1.5)

plot(data.root$Week,data.root$ExtraFRL,type="p",xlab="Sampling Week",ylab="Extra Fine Root Length (0-0.5 mm) (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_2[,1]),pv_2$fit,lwd=3,col="blue")
lines((pd_2[,1]),pv_2$fit+pv_2$se.fit*1.96,lwd=1,lty=3)
lines((pd_2[,1]),pv_2$fit-pv_2$se.fit *1.96,lwd=1,lty=3)
text(1,850,"(c)",cex=1.5)

plot(data.root$Week,data.root$CoarseRL,type="p",xlab="Sampling Week",ylab="Coarse root length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_3[,1]),pv_3$fit,lwd=3,col="blue")
lines((pd_3[,1]),pv_3$fit+pv_3$se.fit*1.96,lwd=1,lty=3)
lines((pd_3[,1]),pv_3$fit-pv_3$se.fit *1.96,lwd=1,lty=3)
text(1,50,"(d)",cex=1.5)

par(mfrow = c(1,2),xpd = NA,
    oma = c(4,4,2,0),
    mar = c(2,3,2,3))
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="Total root length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_4[,1]),pv_4$fit,lwd=3,col="blue")
lines((pd_4[,1]),pv_4$fit+pv_4$se.fit*1.96,lwd=1,lty=3)
lines((pd_4[,1]),pv_4$fit-pv_4$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(a)",cex=1.5)

plot(data.root$Week,data.root$`Root Biomass`,type="p",xlab="Sampling Week",ylab="Root Biomass (g)",cex.axis=1.5,cex.lab=1.5, lwd=1)
lines((pd_5[,1]),pv_5$fit,lwd=3,col="blue")
lines((pd_5[,1]),pv_5$fit+pv_5$se.fit*1.96,lwd=1,lty=3)
lines((pd_5[,1]),pv_5$fit-pv_5$se.fit *1.96,lwd=1,lty=3)
text(1,4,"(b)",cex=1.5)

### Is there variability in root growth dynamics among genotypes. gam models considering canola genotypes: do on total root length and biomass

canola= as.factor(data.root$Canola.Lines)
plot= as.factor(data.root$Plot)

# fit gam model 

gm6 <- gam(Total_RL ~ s(Week, by = canola),data=data.root)
summary(gm6)
plot(gm6)


gm6 <- gam(FRL ~ s(Week,by = canola ),data=data.root)
plot(gm6)
summary(gm6)

par(mfrow = c(2,2),
    oma = c(5,3,2,0),
    mar = c(4,4,2,3))

pd_6 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-0",30))
pv_6 <- predict(gm6,newdata=pd_6,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-0",],points(Week,subset(Total_RL, canola=="NAM-0")))
lines((pd_6[,1]),pv_6$fit,lwd=3,col="blue")
lines((pd_6[,1]),pv_6$fit+pv_6$se.fit*1.96,lwd=1,lty=3)
lines((pd_6[,1]),pv_6$fit-pv_6$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(a)",cex=1.5)

pd_7 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-13",30))
pv_7 <- predict(gm6,newdata=pd_7,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-13",],points(Week,subset(Total_RL, canola=="NAM-13")))
lines((pd_7[,1]),pv_7$fit,lwd=3,col="blue")
lines((pd_7[,1]),pv_7$fit+pv_7$se.fit*1.96,lwd=1,lty=3)
lines((pd_7[,1]),pv_7$fit-pv_7$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(b)",cex=1.5)

pd_8 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-14",30))
pv_8 <- predict(gm6,newdata=pd_8,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-14",],points(Week,subset(Total_RL, canola=="NAM-14")))
lines((pd_8[,1]),pv_8$fit,lwd=3,col="blue")
lines((pd_8[,1]),pv_8$fit+pv_8$se.fit*1.96,lwd=1,lty=3)
lines((pd_8[,1]),pv_8$fit-pv_8$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(c)",cex=1.5)

pd_9 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-17",30))
pv_9 <- predict(gm6,newdata=pd_9,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-17",],points(Week,subset(Total_RL, canola=="NAM-17")))
lines((pd_9[,1]),pv_9$fit,lwd=3,col="blue")
lines((pd_9[,1]),pv_9$fit+pv_9$se.fit*1.96,lwd=1,lty=3)
lines((pd_9[,1]),pv_9$fit-pv_9$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(d)",cex=1.5)

par(mfrow = c(2,2),
    oma = c(5,3,2,0),
    mar = c(4,4,2,3))

pd_10 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-23",30))
pv_10 <- predict(gm6,newdata=pd_10,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-23",],points(Week,subset(Total_RL, canola=="NAM-23")))
lines((pd_10[,1]),pv_10$fit,lwd=3,col="blue")
lines((pd_10[,1]),pv_10$fit+pv_10$se.fit*1.96,lwd=1,lty=3)
lines((pd_10[,1]),pv_10$fit-pv_10$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(e)",cex=1.5)

pd_11 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-30",30))
pv_11 <- predict(gm6,newdata=pd_11,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-30",],points(Week,subset(Total_RL, canola=="NAM-30")))
lines((pd_11[,1]),pv_11$fit,lwd=3,col="blue")
lines((pd_11[,1]),pv_11$fit+pv_11$se.fit*1.96,lwd=1,lty=3)
lines((pd_11[,1]),pv_11$fit-pv_11$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(f)",cex=1.5)

pd_12 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-32",30))
pv_12 <- predict(gm6,newdata=pd_12,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-32",],points(Week,subset(Total_RL, canola=="NAM-32")))
lines((pd_12[,1]),pv_12$fit,lwd=3,col="blue")
lines((pd_12[,1]),pv_12$fit+pv_12$se.fit*1.96,lwd=1,lty=3)
lines((pd_12[,1]),pv_12$fit-pv_12$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(g)",cex=1.5)

pd_13 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-37",30))
pv_13 <- predict(gm6,newdata=pd_13,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-37",],points(Week,subset(Total_RL, canola=="NAM-37")))
lines((pd_13[,1]),pv_13$fit,lwd=3,col="blue")
lines((pd_13[,1]),pv_13$fit+pv_13$se.fit*1.96,lwd=1,lty=3)
lines((pd_13[,1]),pv_13$fit-pv_13$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(h)",cex=1.5)

par(mfrow = c(2,2),
    oma = c(5,3,2,0),
    mar = c(4,4,2,3))

pd_14 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-43",30))
pv_14 <- predict(gm6,newdata=pd_14,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-43",],points(Week,subset(Total_RL, canola=="NAM-43")))
lines((pd_14[,1]),pv_14$fit,lwd=3,col="blue")
lines((pd_14[,1]),pv_14$fit+pv_14$se.fit*1.96,lwd=1,lty=3)
lines((pd_14[,1]),pv_14$fit-pv_14$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(i)",cex=1.5)


pd_15 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-46",30))
pv_15 <- predict(gm6,newdata=pd_15,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-46",],points(Week,subset(Total_RL, canola=="NAM-46")))
lines((pd_15[,1]),pv_15$fit,lwd=3,col="blue")
lines((pd_15[,1]),pv_15$fit+pv_15$se.fit*1.96,lwd=1,lty=3)
lines((pd_15[,1]),pv_15$fit-pv_15$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(j)",cex=1.5)


pd_16 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-48",30))
pv_16 <- predict(gm6,newdata=pd_16,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-48",],points(Week,subset(Total_RL, canola=="NAM-48")))
lines((pd_16[,1]),pv_16$fit,lwd=3,col="blue")
lines((pd_16[,1]),pv_16$fit+pv_16$se.fit*1.96,lwd=1,lty=3)
lines((pd_16[,1]),pv_16$fit-pv_16$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(k)",cex=1.5)


pd_17 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-5",30))
pv_17 <- predict(gm6,newdata=pd_17,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-5",],points(Week,subset(Total_RL, canola=="NAM-5")))
lines((pd_17[,1]),pv_17$fit,lwd=3,col="blue")
lines((pd_17[,1]),pv_17$fit+pv_17$se.fit*1.96,lwd=1,lty=3)
lines((pd_17[,1]),pv_17$fit-pv_17$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(l)",cex=1.5)

par(mfrow = c(2,2),
    oma = c(5,3,2,0),
    mar = c(4,4,2,3))

pd_18 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-72",30))
pv_18 <- predict(gm6,newdata=pd_18,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-72",],points(Week,subset(Total_RL, canola=="NAM-72")))
lines((pd_18[,1]),pv_18$fit,lwd=3,col="blue")
lines((pd_18[,1]),pv_18$fit+pv_18$se.fit*1.96,lwd=1,lty=3)
lines((pd_18[,1]),pv_18$fit-pv_18$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(m)",cex=1.5)


pd_19 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-76",30))
pv_19 <- predict(gm6,newdata=pd_19,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-76",],points(Week,subset(Total_RL, canola=="NAM-76")))
lines((pd_19[,1]),pv_19$fit,lwd=3,col="blue")
lines((pd_19[,1]),pv_19$fit+pv_19$se.fit*1.96,lwd=1,lty=3)
lines((pd_19[,1]),pv_19$fit-pv_19$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(n)",cex=1.5)


pd_20 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-79",30))
pv_20 <- predict(gm6,newdata=pd_20,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="Total Root Length (cm)",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-79",],points(Week,subset(Total_RL, canola=="NAM-79")))
lines((pd_20[,1]),pv_20$fit,lwd=3,col="blue")
lines((pd_20[,1]),pv_20$fit+pv_20$se.fit*1.96,lwd=1,lty=3)
lines((pd_20[,1]),pv_20$fit-pv_20$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(o)",cex=1.5)


pd_21 <- data.frame(Week=seq(1,10,length=30),canola=rep("NAM-94",30))
pv_21 <- predict(gm6,newdata=pd_21,type="link",se=TRUE)
plot(data.root$Week,data.root$Total_RL,type="p",xlab="Sampling Week",ylab="",cex.axis=1.5,cex.lab=1.5, lwd=1)
with(data.root[canola=="NAM-94",],points(Week,subset(Total_RL, canola=="NAM-94")))
lines((pd_21[,1]),pv_21$fit,lwd=3,col="blue")
lines((pd_21[,1]),pv_21$fit+pv_21$se.fit*1.96,lwd=1,lty=3)
lines((pd_21[,1]),pv_21$fit-pv_21$se.fit *1.96,lwd=1,lty=3)
text(1,1000,"(p)",cex=1.5)

#### summary statistics by canola line and growth stage

library(psych)

RL.summary= describeBy(data.root$Total_RL,list(data.root$Canola.Lines,data.root$Growth.Stage),mat=TRUE)
RL.summary= as.data.frame(RL.summary)
write.csv(RL.summary, 'RL.summary.csv')

# ................................................. # .................................. # ..................................... # ...................................

# Chapter 5

## BRASSICA NAPUS RHIZOSPHERE BACTERIAL COMMUNITY ASSEMBLY: DRIVER AND CORE-HUB COMMUNITIES ACROSS GROWTH STAGES AND BETWEEN GENOTYPES. Taye et al., 2020

# Co-occurence network construction
# method used: CCREPE: Compositionality Corrected by PErmutation and REnormalization: in ccrepe r package

# set your working directory

setwd('D:/....')

# install ccrepe r package 

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("ccrepe")

# metagMisc package contains miscellaneous functions for metagenomic analysis

install.packages("remotes")
remotes::install_github("vmikk/metagMisc")

# Load libraries 

library(ccrepe)
library(metagMisc)
library(igraph)
library(phyloseq)


# Load your data: already created phyloseq object
load("D:/...../2016.physeq.updated.line.names.new.variable.RData")

physeq.updated.line.names.new.variable

# agglomerate to genus level

physeqgenus.2016 = tax_glom(physeq.updated.line.names.new.variable,taxrank = 'Genus')

# remove zerosum taxa

physeqgenus.2016 = prune_taxa(taxa_sums(physeqgenus.2016) > 0, physeqgenus.2016)

# I. co-occurence networks at different growth stages (vegetative, flowering and maturity)
# subset shared taxa between vegitative, flowering and maturity
# install phylosmith package for subsetting common taxa

devtools::install_github('schuyler-smith/phylosmith')
library(phylosmith)

# subset common taxa: to construt the netwrok using a comparable set

physeqgenus.2016.common = common_taxa(physeqgenus.2016, treatment = "Growth.Stage", n='all')

# subset otu with the common taxa

physeqgenus.2016= prune_taxa(physeqgenus.2016.common, physeqgenus.2016)

# Summary of the subseted phyloseq object

physeqgenus.2016

#get variables of the phyloseq object

get_variable(physeqgenus.2016)

# subset dataset by growth stage
# Vegetative

can.veg.2016<-subset_samples(physeqgenus.2016,Growth.Stage=="Vegetative")
can.veg.2016

# Flowering

Can.flw.2016<-subset_samples(physeqgenus.2016,Growth.Stage=="Flowering")
Can.flw.2016

# maturity

Can.mat.2016<-subset_samples(physeqgenus.2016,Growth.Stage=="Maturity")
Can.mat.2016

# Remove zero sum taxa from each growth stage

can.veg.2016 = prune_taxa(taxa_sums(can.veg.2016) > 0, can.veg.2016)
Can.flw.2016 = prune_taxa(taxa_sums(Can.flw.2016) > 0, Can.flw.2016)
Can.mat.2016 = prune_taxa(taxa_sums(Can.mat.2016) > 0, Can.mat.2016)

# Remove unclassified genera

can.veg.2016 = subset_taxa(can.veg.2016, Genus!="unclassified")
Can.flw.2016 = subset_taxa(Can.flw.2016, Genus!="unclassified")
Can.mat.2016 = subset_taxa(Can.mat.2016, Genus!="unclassified")

# Summary of your phyloseq objects

can.veg.2016
Can.flw.2016
Can.mat.2016

# transform to compositionality

library(microbiome)

# Vegtative stage

can.veg.2016.comp<-transform(can.veg.2016, "compositional")

# Flowering stage

Can.flw.2016.comp<-transform(Can.flw.2016, "compositional")

# Maturity

Can.mat.2016.comp<-transform(Can.mat.2016, "compositional")

## conver it to dataframe

# chnage the phyloseq object to dataframe. I prefered phyloseq_to_df  function (psmelt is another option....)

# Vegetative stage

m<-phyloseq_to_df(can.veg.2016.comp, addtax = T, addtot = F, addmaxrank = F,
                  sorting = "abundance")

# Flowering stage

n<-phyloseq_to_df(Can.flw.2016.comp, addtax = T, addtot = F, addmaxrank = F,
                  sorting = "abundance")

# maturity stage

o<-phyloseq_to_df(Can.mat.2016.comp, addtax = T, addtot = F, addmaxrank = F,
                  sorting = "abundance")

# save  the above dataframes as csv, Open and transpose it so that you have samples in a row ( t(data))
# and genera in column.

# save the final txt file as Can_veg_2016_genus_comp.txt, Can_flw_2016_genus_comp.txt, and Can_mat_2016_genus_comp.txt in the same folder as the raw data - untrasposed csv file

# Vegetative, flowering and maturity dataframes

write.csv(m, "can.veg.2016.comp.df.csv")
write.csv(n, "Can.flw.2016.comp.df.csv")
write.csv(o, "Can.mat.2016.comp.df.csv")

# load the .txt fiels with relative abundance 

#Vegetative

data=read.table(file = 'Can_veg_2016_genus_comp.txt',sep="\t",header=T, row.names = 1, check.names = F)
data1=read.table(file = 'Can_flw_2016_genus_comp.txt',sep="\t",header=T, row.names = 1, check.names = F)
data2=read.table(file = 'Can_mat_2016_genus_comp.txt',sep="\t",header=T, row.names = 1, check.names = F)

# generate edgelist document for vegetative

data.rowsum <- apply(data, 1, sum)
data.norm <- data/data.rowsum

# default iterations = 1000 and metric = spearman

ccrepe.output <- ccrepe(x = data.norm)

# considering only positive significant correlations

ccrepe.output.sig<-ifelse(ccrepe.output$sim.score > 0 & ccrepe.output$p.values<0.05,1,0) 

# consider only the upper triangular matrix 

ccrepe.output.sig.upper <- graph.adjacency(ccrepe.output.sig,mode="upper")

# remove loops which occur due to the diagonal matrix

ccrepe.output.sig.upper <- simplify(ccrepe.output.sig.upper)

# from igraph package

ccrepe.output.sig.upper.edges<-get.edgelist(ccrepe.output.sig.upper)

# save the edgelist file

write.table(ccrepe.output.sig.upper.edges,paste("edgelist",'veg_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

# considering only negative significant correlations

ccrepe.output.sig.neg<-ifelse(ccrepe.output$sim.score < 0 & ccrepe.output$p.values<0.05,1,0)

# consider only the upper triangular matrix 

ccrepe.output.sig.upper.neg <- graph.adjacency(ccrepe.output.sig.neg,mode="upper")

# remove loops which occur due to the diagonal matrix

ccrepe.output.sig.upper.neg <- simplify(ccrepe.output.sig.upper.neg)

# from igraph package

ccrepe.output.sig.upper.edges.neg<-get.edgelist(ccrepe.output.sig.upper.neg)

# save the edgelist file with negative sig correlations

write.table(ccrepe.output.sig.upper.edges.neg,paste("edgelist",'neg_veg_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

## generate edgelist document for flowering

data.rowsum1 <- apply(data1, 1, sum)
data.norm1 <- data1/data.rowsum1

# default iterations = 1000 and metric = spearman

ccrepe.output1 <- ccrepe(x = data.norm1)

# considering only positive significant correlations

ccrepe.output.sig1<-ifelse(ccrepe.output1$sim.score > 0 & ccrepe.output1$p.values<0.05,1,0) 

# consider only the upper triangular matrix

ccrepe.output.sig.upper1 <- graph.adjacency(ccrepe.output.sig1,mode="upper")

# remove loops which occur due to the diagonal matrix

ccrepe.output.sig.upper1 <- simplify(ccrepe.output.sig.upper1)

# from igraph package

ccrepe.output.sig.upper.edges1<-get.edgelist(ccrepe.output.sig.upper1)

write.table(ccrepe.output.sig.upper.edges1,paste("edgelist",'flw_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

# considering only negative significant correlations

ccrepe.output.sig1.neg<-ifelse(ccrepe.output1$sim.score < 0 & ccrepe.output1$p.values<0.05,1,0) 

# consider only the upper triangular matrix

ccrepe.output.sig.upper1.neg <- graph.adjacency(ccrepe.output.sig1.neg,mode="upper")
ccrepe.output.sig.upper1.neg <- simplify(ccrepe.output.sig.upper1.neg)

# from igraph package

ccrepe.output.sig.upper.edges1.neg<-get.edgelist(ccrepe.output.sig.upper1.neg)

write.table(ccrepe.output.sig.upper.edges1.neg,paste("edgelist",'neg_flw_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

## generate edgelist document for maturity

data.rowsum2 <- apply(data2, 1, sum)
data.norm2 <- data2/data.rowsum2

# default iterations = 1000 and metric = spearman

ccrepe.output2 <- ccrepe(x = data.norm2)

# considering only positive significant correlations

ccrepe.output.sig2<-ifelse(ccrepe.output2$sim.score > 0 & ccrepe.output2$p.values<0.05,1,0) 

# consider only the upper triangular matrix 

ccrepe.output.sig.upper2 <- graph.adjacency(ccrepe.output.sig2,mode="upper")

# remove loops which occur due to the diagonal matrix

ccrepe.output.sig.upper2 <- simplify(ccrepe.output.sig.upper2)

# from igraph package

ccrepe.output.sig.upper.edges2<-get.edgelist(ccrepe.output.sig.upper2)

write.table(ccrepe.output.sig.upper.edges2,paste("edgelist",'mat_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

# considering only negative significant correlations

ccrepe.output.sig2.neg<-ifelse(ccrepe.output2$sim.score < 0 & ccrepe.output2$p.values<0.05,1,0) 

ccrepe.output.sig.upper2.neg <- graph.adjacency(ccrepe.output.sig2.neg,mode="upper")
ccrepe.output.sig.upper2.neg <- simplify(ccrepe.output.sig.upper2.neg)
ccrepe.output.sig.upper.edges2.neg<-get.edgelist(ccrepe.output.sig.upper2.neg)

write.table(ccrepe.output.sig.upper.edges2.neg,paste("edgelist",'neg_mat_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

#### for the cross growth stage comparisons the following edgelist files created above are explored in the Netshift tool for further netwrok inferences 

veg_ccrepe.output.sig.upper.edges
neg_veg_ccrepe.output.sig.upper.edges
flw_ccrepe.output.sig.upper.edges
neg_flw_ccrepe.output.sig.upper.edges
mat_ccrepe.output.sig.upper.edges
neg_mat_ccrepe.output.sig.upper.edges


#### ......... ##### .............. ##### ........... ####

# load 2016 rhizosphere microbiome dataset as phyloseq object

load("D:/....../2016.physeq.updated.line.names.new.variable.RData")

# check summary of your phyloseq object

physeq.updated.line.names.new.variable

# Load required libraries

library(metagMisc)
library(ccrepe)
library(igraph)
library(phyloseq)
library(phylosmith)

# agglomerate to genus level

physeqgenus.2016 = tax_glom(physeq.updated.line.names.new.variable,taxrank = 'Genus')

# remove zerosum taxa

physeqgenus.2016 = prune_taxa(taxa_sums(physeqgenus.2016) > 0, physeqgenus.2016)

# subset the four canola lines (2 yellow seeded (NAM-48 and NAM-94) and 2 balck seeded (NAM-13 and NAM-23))

physeqgenus.2016.subset = subset_samples(physeqgenus.2016, Canola.Lines =="NAM-13"|Canola.Lines=="NAM-23"|Canola.Lines=="NAM-48"|Canola.Lines=="NAM-94")

# summary of subseted phyloseq

physeqgenus.2016.subset

# subset shared taxa between all four lines

physeqgenus.2016.subset.common = common_taxa(physeqgenus.2016.subset, treatment = "Canola.Lines", n='all')

# subset otu with the common taxa

physeqgenus.2016.subset= prune_taxa(physeqgenus.2016.subset.common, physeqgenus.2016)

# summary of common taxa

physeqgenus.2016.subset

# subset dataset by breeding line vs diversity collection - seed color and genetic distance similarity

# breeding lines - yellow seeded:(NAM-48 and NAM-94)

can.yellow.seeded.2016<-subset_samples(physeqgenus.2016.subset,Canola.Lines=="NAM-48"|Canola.Lines=="NAM-94")

# summary of breeding lines dataset

can.yellow.seeded.2016

# diversity collections - black seeded:(NAM-13 and NAM-23)

can.black.seeded.2016<-subset_samples(physeqgenus.2016.subset,Canola.Lines=="NAM-13"|Canola.Lines=="NAM-23")

# summary of diversity collections dataset

can.black.seeded.2016

# Remove zero sum taxa from each data set

can.yellow.seeded.2016 = prune_taxa(taxa_sums(can.yellow.seeded.2016) > 0, can.yellow.seeded.2016)
can.black.seeded.2016 =  prune_taxa(taxa_sums(can.black.seeded.2016) > 0, can.black.seeded.2016)

# Remove unclassified genera

can.yellow.seeded.2016 = subset_taxa(can.yellow.seeded.2016, Genus!="unclassified")
can.black.seeded.2016 = subset_taxa(can.black.seeded.2016, Genus!="unclassified")

# summary of the filltered datasets

can.yellow.seeded.2016
can.black.seeded.2016

# transform to compositionality

library(microbiome)

can.yellow.seeded.2016.comp<-transform(can.yellow.seeded.2016, "compositional")
can.black.seeded.2016.comp<-transform(can.black.seeded.2016, "compositional")

# conver to dataframe
#chnage the phyloseq object to dataframe.
# breeding lines

m<-phyloseq_to_df(can.yellow.seeded.2016.comp, addtax = T, addtot = F, addmaxrank = F,
                  sorting = "abundance")

# diversity collections

n<-phyloseq_to_df(can.black.seeded.2016.comp, addtax = T, addtot = F, addmaxrank = F,
                  sorting = "abundance")

# save as csv the melted. Open this file and transpose it so that you have samples in a row
# breeding lines (Yellow seeded) and diversity collections (black seeded )dataframes

write.csv(m, "can.yellow.seeded.2016.comp.df.csv")
write.csv(n, "Can.black.seeded.2016.comp.df.csv")

# load the .txt fiels with relative abundance 

data=read.table(file = 'can.yellow.seeded.2016.comp.df.txt',sep="\t",header=T, row.names = 1, check.names = F)
data1=read.table(file = 'Can.black.seeded.2016.comp.df.txt',sep="\t",header=T, row.names = 1, check.names = F)

# generate edgelist document for breeding lines - yellow seeded 

data.rowsum <- apply(data, 1, sum)
data.norm <- data/data.rowsum

# default iterations = 1000 and metric = spearman

ccrepe.output <- ccrepe(x = data.norm)

# considering only positive significant correlations

ccrepe.output.sig<-ifelse(ccrepe.output$sim.score > 0 & ccrepe.output$p.values<0.05,1,0)

# consider only the upper triangular matrix

ccrepe.output.sig.upper <- graph.adjacency(ccrepe.output.sig,mode="upper")

# remove loops which occur due to the diagonal matrix

ccrepe.output.sig.upper <- simplify(ccrepe.output.sig.upper) 

# from igraph package

ccrepe.output.sig.upper.edges<-get.edgelist(ccrepe.output.sig.upper) 

write.table(ccrepe.output.sig.upper.edges,paste("edgelist",'yellow_seed_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

# generate edgelist document for diversity collections - black seeded

data.rowsum1 <- apply(data1, 1, sum)

data.norm1 <- data1/data.rowsum1

# default iterations = 1000 and metric = spearman

ccrepe.output1 <- ccrepe(x = data.norm1)

# considering only positive significant correlations

ccrepe.output.sig1<-ifelse(ccrepe.output1$sim.score > 0 & ccrepe.output1$p.values<0.05,1,0) 

# consider only the upper triangular matrix

ccrepe.output.sig.upper1 <- graph.adjacency(ccrepe.output.sig1,mode="upper")

# remove loops which occur due to the diagonal matrix

ccrepe.output.sig.upper1 <- simplify(ccrepe.output.sig.upper1)

# from igraph package

ccrepe.output.sig.upper.edges1<-get.edgelist(ccrepe.output.sig.upper1)

write.table(ccrepe.output.sig.upper.edges1,paste("edgelist",'black_seed_ccrepe.output.sig.upper.edges',sep='_'),sep='\t',quote=F, row.names = F, col.names = F)

####


#### for the between genotype comparisons the following edgelist files created above are explored in the Netshift tool for further netwrok inferences 

yellow_seed_ccrepe.output.sig.upper.edges
black_seed_ccrepe.output.sig.upper.edges


# Across growth stage and between genotype netwrok compariosons using NetShift tool
# Kuntal, B.K., Chandrakar, P., Sadhu, S. et al. ‘NetShift’: a methodology for understanding ‘driver microbes’ from healthy and disease microbiome datasets. ISME J 13, 442–454 (2019). https://doi.org/10.1038/s41396-018-0291-x

#1. Across growth stage comparions

# - access the NetShift tool at https://web.rniapps.net/netshift/

# - To compare the network changes from vegetative to flowering stage. Input the vegetative network as control and flowering in case network
# - for flowerign to maturity changes - input flowering network as control and maturity netwrok as case
# - for vegetative to mturity changes - input vegetative netwrok as control and maturity as case

#2. Between genotypes: 
# - diversity collection network used as control and breeding line network as case.


#### ........... #### ..............#### ..............####

## Metabolic pathwasy and fuctional feature inference for the diver genera identified 

# Malo Le Boulch, Patrice Déhais, Sylvie Combes, Géraldine Pascal, The MACADAM database: a MetAboliC pAthways DAtabase for Microbial taxonomic groups for mining potential metabolic capacities of archaeal and bacterial taxonomic groups, Database, Volume 2019, 2019, baz049, https://doi.org/10.1093/database/baz049

# access MACADAM database at http://macadam.toulouse.inra.fr/

# use the driver genera idnetied in each of the growth stage and genotype comparisons
# For a precise query on the input genera, the taxonomic rank was specified to genus level. This strict taxonomic specification allows queries only on target taxonomic names and not beyond the specified taxonomic rank.
# No specific metabolic pathway or functional feature was specified in the query to get all possible metabolic pathways and functional features associated with each genus.
# To return only complete metabolic pathways, a minimum completeness score of 1 was used. 

# For each list of potential driver genera the MACADAM database outputs a list of functional (from FAPROTAX and IJSEM) and pathway (from MetaCyc and MicroCyc) information including the number of organisms that have the function and pathways and a list of input genera that have the target function and pathways.