library(gplots)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(dplyr)
library(ape)
library(cluster)
#library(reshape)
#library(ggtree)
library(data.table)
# Load the tab-separated input table. Column 1 holds the row labels; every
# remaining column is treated as data.
allele.data <- read.csv(
  "Diversity_of_CRISPR_loci_in_Escherichia.tsv",
  sep = "\t"
) # , fileEncoding = "UTF-16LE")

# Fail early with a clear message instead of an obscure indexing error when the
# table has no data columns.
if (ncol(allele.data) < 2L) {
  stop(
    "Expected a label column followed by at least one data column.",
    call. = FALSE
  )
}

rnames <- allele.data[, 1] # labels from column 1

# Convert every column after the first into a matrix. Negative indexing with
# drop = FALSE keeps a data frame even when there is a single data column, so
# data.matrix() always receives a data frame and the column names survive.
# NOTE(review): data.matrix() turns non-numeric columns into integer codes
# (see ?data.matrix) -- confirm that is intended for this table.
mat_allele.data <- data.matrix(allele.data[, -1, drop = FALSE])
rownames(mat_allele.data) <- rnames # assign row names

# Same preparation for a second table; baps.data is not defined in the code
# shown here, so this stays disabled.
# rnames <- baps.data[, 1]
# mat_baps.data <- data.matrix(baps.data[, -1, drop = FALSE])
# rownames(mat_baps.data) <- rnames

# Quick sanity check of the resulting matrix.
head(mat_allele.data)
dim(mat_allele.data)
# Allelic profiles heatmap: rows are clustered with complete linkage, columns
# keep their input order, and row/column labels are hidden.
#
# Variant with row annotations; it needs mat_mic.data, which is not defined in
# the code shown here:
# ap.hmap <- pheatmap(
#   mat_allele.data,
#   clustering_method = "complete",
#   annotation_row = as.data.frame(mat_mic.data),
#   cluster_rows = TRUE, cluster_cols = FALSE,
#   show_colnames = FALSE, show_rownames = FALSE
# )
ap.hmap <- pheatmap(
  mat_allele.data,
  clustering_method = "complete",
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  show_colnames = FALSE,
  show_rownames = FALSE
)
# Pairwise Gower dissimilarities between the rows of mat_allele.data. Computed
# once and reused: the "dissimilarity" object feeds hclust(), its full-matrix
# form feeds the heatmap.
# NOTE(review): mat_allele.data is a numeric matrix, so daisy() handles every
# column as an interval-scaled variable. If the values are nominal identifiers
# rather than quantities, confirm whether they should be factors instead.
cgMLST_dissim_mat_for_clust <- daisy(mat_allele.data, metric = "gower")
cgMLST_dissim_mat <- as.matrix(cgMLST_dissim_mat_for_clust)
head(cgMLST_dissim_mat)

# Heatmap of the dissimilarity matrix, clustered on both axes, labels hidden.
gd.hmap <- pheatmap(
  cgMLST_dissim_mat,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = FALSE,
  show_colnames = FALSE
)

# Alternatives tried earlier, kept for reference:
# d <- dist(mat_allele.data, method = "manhattan") # gave 6 clusters
# h_clust <- hclust(d, method = "ward.D") # ward clustering gave 6 clusters
# h_clust <- hclust(d, method = "complete") # gave too many clusters

# Hierarchical clustering on the Gower dissimilarities; the same tree is also
# converted to a phylo object instead of being recomputed.
# NOTE(review): ?hclust distinguishes "ward.D" from "ward.D2"; the output file
# name below records "ward.D", so the method is left unchanged -- confirm it is
# the intended one.
h_clust <- hclust(cgMLST_dissim_mat_for_clust, method = "ward.D")
cgMLST_tree.newClustering <- as.phylo(h_clust)

# Number of groups to cut the tree into; used for the dendrogram boxes, the
# cluster assignment and the output file name so the three cannot drift apart.
n_clusters <- 12L

plot(h_clust)
rect.hclust(h_clust, k = n_clusters)

# Extract the cluster membership of every row.
clusters <- cutree(h_clust, k = n_clusters)

# Save the assignments to file.
write.csv(
  clusters,
  file = sprintf(
    "Diversity_of_CRISPR_loci_in_Escherichia.gower.ward.D.results.%dgroups.csv",
    n_clusters
  )
)