In [1]:
library(gplots)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(dplyr)
library(ape)
library(cluster)
#library(reshape)
#library(ggtree)
library(data.table)
Attaching package: ‘gplots’


The following object is masked from ‘package:stats’:

    lowess



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


In [2]:
allele.data <- read.csv("Diversity_of_CRISPR_loci_in_Escherichia.tsv", sep = '\t')#, fileEncoding="UTF-16LE")
In [3]:
rnames <- allele.data[,1]                            # assign labels in column 1 to "rnames"
mat_allele.data <- data.matrix(allele.data[,2:ncol(allele.data)])  # transform column 2-5 into a matrix
rownames(mat_allele.data) <- rnames                  # assign row names

#rnames <- baps.data[,1]                            # assign labels in column 1 to "rnames"
#mat_baps.data <- data.matrix(baps.data[,2:ncol(baps.data)])  # transform column 2-5 into a matrix
#rownames(mat_baps.data) <- rnames                  # assign row names

head(mat_allele.data)
dim(mat_allele.data)
A matrix: 6 × 7 of type int
X2.1X2.2X2.3X2.2.3X4.1X4.2X4.1.2
SMS-3-5230 00001
101.1 73160002
H10407 43 80002
K12143 70002
ATCC 8739223290004
53638103 50004
  1. 100
  2. 7
In [4]:
#allelic profiles heatmap
#ap.hmap <- pheatmap(mat_allele.data, clustering_method = "complete", annotation_row = as.data.frame(mat_mic.data), cluster_rows=1, cluster_cols = 0, show_colnames=0, show_rownames=0)
ap.hmap <- pheatmap(mat_allele.data, clustering_method = "complete", cluster_rows=1, cluster_cols = 0, show_colnames=0, show_rownames=0)
In [5]:
cgMLST_dissim_mat <- as.matrix(daisy(mat_allele.data, metric='gower'))
head(cgMLST_dissim_mat)
Warning message in daisy(mat_allele.data, metric = "gower"):
“binary variable(s) 2 treated as interval scaled”
A matrix: 6 × 100 of type dbl
SMS-3-5101.1H10407K12ATCC 873953638IAI1HS55989SE11EC63 (B2)EC64 (B2)EC65 (B2)EC66 (B2)EC67 (B1)EC68 (B1)EC69 (B1)EC70 (B1)EC71 (B1)EC72 (B1)
SMS-3-50.00000000.326436780.301313630.248768470.37619050.315106730.32742200.29047620.345484400.27963880.20555560.18095240.32777780.15238100.301642040.322167490.229392450.277668310.3062397370.27290640
101.10.32643680.000000000.053694580.077668310.19261080.125615760.11527090.43596060.019047620.10558290.37961140.29786540.50183360.26929390.034318560.033825940.097044330.048768470.0487684730.05353038
H104070.30131360.053694580.000000000.052545160.24630540.100492610.16896550.41083740.044170770.15927750.32591680.24417080.44813900.21559930.019376030.087520530.071921180.033497540.0049261080.03825944
K120.24876850.077668310.052545160.000000000.20361250.086042690.12627260.39638750.096715930.11658460.36860970.28686370.49083200.25829230.052873560.073399010.019376030.028899840.0574712640.02413793
ATCC 87390.37619050.192610840.246305420.203612480.00000000.175369460.07733990.48571430.211658460.10607550.57222220.43333330.69444440.46190480.226929390.158784890.203940890.212807880.2413793100.20804598
536380.31510670.125615760.100492610.086042690.17536950.000000000.15517240.31034480.144663380.20262730.39685280.25796390.51907500.28653530.100821020.130870280.085714290.076847290.1054187190.08160920
In [6]:
gd.hmap <- pheatmap(cgMLST_dissim_mat, cluster_rows=1, cluster_cols = 1, show_rownames=0, show_colnames=0)
In [18]:
#d <- dist(mat_allele.data, method = "manhattan") #gave 6 clusters
#h_clust <- hclust(d, method = "ward.D") #ward clustering gave 6 clusters
#h_clust <- hclust(d, method = "complete") #complete clustering gave too many clusters
cgMLST_dissim_mat_for_clust <- daisy(mat_allele.data, metric='gower')
h_clust <- hclust(cgMLST_dissim_mat_for_clust, method = "ward.D") #ward clustering gave 6 clusters
cgMLST_tree.newClustering <- as.phylo(hclust(cgMLST_dissim_mat_for_clust, method = "ward.D"))
plot(h_clust)
rect.hclust(h_clust,k=12)
Warning message in daisy(mat_allele.data, metric = "gower"):
“binary variable(s) 2 treated as interval scaled”
In [19]:
#extract clusters
clusters <- cutree(h_clust,k=12)
#save to file
write.csv(file = "Diversity_of_CRISPR_loci_in_Escherichia.gower.ward.D.results.12groups.csv",clusters)