K-Means Clustering

R12: K-Means Clustering
1. Example 1
x1 <- c(1, 2, 3, 4, 7, 8, 10)
x2 <- c(8, 2, 3, 1, 11, 8, 10)
X <- cbind(x1, x2)
plot(X, pch=16)
identify(X, labels=1:7)
[1] 1 2 3 4 5 6 7
X.km <- kmeans(X,2)
X.km
K-means clustering with 2 clusters of sizes 3, 4
Cluster means:
         x1        x2
1 8.333333 9.666667
2 2.500000 3.500000
Clustering vector:
[1] 2 2 2 2 1 1 1
Within cluster sum of squares by cluster:
[1] 9.333333 34.000000
Available components:
[1] "cluster" "centers" "withinss" "size"
points(X,pch=X.km$cluster+1, col=X.km$cluster+1)
points(X.km$centers, col=2:3, pch=2:3, cex=1.5)
2. Random starts give different solutions
library(MASS)
mu3 <- c(4.5, 4.5)
Sigma2 <- matrix(c(2.25,1.5,1.5,2.25),nrow=2)
unimodal <- mvrnorm(50, mu=mu3, Sigma=Sigma2)
plot(unimodal, pch=16)
uni.3m1 <- kmeans(unimodal,3)  # Random starts
uni.3m2 <- kmeans(unimodal,3)  # different answers
plot(unimodal, pch=uni.3m1$cluster, col=uni.3m1$cluster)
points(unimodal, pch=uni.3m2$cluster+3, col=uni.3m2$cluster)
table(data.frame(km1=uni.3m1$cluster,km2=uni.3m2$cluster))
km2
km1 1 2 3
1 0 8 0
2 12 8 0
3 3 0 19
uni.3m1$withinss
[1] 15.55542 25.67524 46.35496
sum(uni.3m1$withinss)
[1] 87.58563
uni.3m2$withinss
[1] 16.73345 32.24931 37.93183
sum(uni.3m2$withinss)
[1] 86.9146
The nstart argument tells kmeans to try that many random starts and keep the best.
With 20 or 25 random starts, you’ll generally find the overall best solution unless
your sample size is really big.
uni.3m3 <- kmeans(unimodal,3, nstart=25)
uni.3m4 <- kmeans(unimodal,3, nstart=25)
plot(unimodal, pch=uni.3m3$cluster, col=uni.3m3$cluster)
points(unimodal, pch=uni.3m4$cluster+3, col=uni.3m4$cluster)
table(data.frame(km1=uni.3m3$cluster,km2=uni.3m4$cluster))
km2
km1 1 2 3
1 24 0 0
2 0 12 0
3 0 0 14
uni.3m3$withinss
[1] 50.90327 11.01443 23.77887
sum(uni.3m3$withinss)
[1] 85.69657
3. Swiss Canton Data
swiss.4m <- kmeans(scale(swiss), 4, nstart=25)
pairs(swiss, pch=swiss.4m$cluster, col=swiss.4m$cluster)
library(rgl)
plot3d(swiss.pca$x[,1:3],type="s",size=.25, col=swiss.4m$cluster)
swiss.4m$withinss
[1] 45.231535 10.191962 37.298639  9.664499
sum(swiss.4m$withinss)
[1] 102.3866
4. Number of Clusters – Two possible approaches
1) Scree plot: within group sum of squares versus the number of clusters
2) The “Pseudo-F” Statistic: large pseudo-F (adjusted between group SS/ adjusted within group SS)
indicates the efficiency of the partition.
# Pseudo-F statistic for choosing the number of k-means clusters.
#
# For each candidate cluster count in `k`, fits kmeans() with `ns` random
# starts and computes pseudo-F = (B / (k - 1)) / (W / (n - k)), where B and
# W are the between- and within-cluster sums of squares and n is the number
# of observations.  Larger pseudo-F indicates a more efficient partition.
#
# X  - numeric matrix (or data frame) of observations, one row per case.
# k  - vector of candidate numbers of clusters (each should be >= 2).
# ns - number of random starts handed to kmeans() (default 25).
#
# Returns a list with components:
#   k  - the candidate cluster counts (as supplied),
#   W  - total within-cluster SS for each k,
#   pF - the pseudo-F statistic for each k.
pseudoF <- function(X, k, ns = 25) {
  nk <- length(k)
  n <- nrow(X)
  # Total sum of squares about the grand centroid (center only, no scaling).
  # Named `totss` rather than `T` so the TRUE alias is not masked.
  totss <- sum(scale(X, scale = FALSE)^2)
  W <- rep(totss, nk)
  for (i in seq_len(nk)) {
    cli <- kmeans(X, k[i], nstart = ns)
    W[i] <- sum(cli$withinss)
  }
  B <- totss - W
  # Vectorized over k: each candidate gets its own degrees of freedom.
  pF <- (B / (k - 1)) / (W / (n - k))
  list(k = k, W = W, pF = pF)
}
pseudoF(scale(swiss), 2:6)
$k
[1] 2 3 4 5 6
$W
[1] 178.14377 124.20515 102.38664  85.22272  72.55739
$pF
[1] 24.71897 26.88686 24.30452 23.50502 22.99186
swiss.3m <- kmeans(scale(swiss), 3)
sum(swiss.3m$withinss)
[1] 124.2051
5. Comparisons with Hierarchical Clustering and Interpretation
pairs(swiss, pch=swiss.3m$cluster, col=swiss.3m$cluster)
plot3d(swiss.pca$x[,1:3],type="s",size=.25, col=swiss.3m$cluster)
swiss.hc.w <- hclust(dist(scale(swiss)), "ward")
plot(swiss.hc.w)
table(data.frame( km3=swiss.3m$cluster, hc3=cutree(swiss.hc.w,3)))
hc3
km3 1 2 3
1 0 16 0
2 23 0 0
3 1 0 7
pairs(swiss.3m$centers, pch=1:3, col=1:3)
swiss.3m$centers
    Fertility Agriculture Examination  Education   Catholic Infant.Mortality
1  0.8331491  0.65426591  -0.8839264 -0.4527862  1.3189394        0.2857993
2 -0.0914650  0.01020332   0.1294049 -0.2871779 -0.7980284       -0.0698400
3 -1.4033364 -1.33786636   1.3958136  1.7312087 -0.3435471       -0.3708087
6. Predictions
mu1 <- c(3,3)
mu2 <- c(6,6)
Sigma1 <- matrix(c(1,0,0,1),nrow=2)
bimodal <- rbind(mvrnorm(25,mu=mu1,Sigma=Sigma1),
mvrnorm(25,mu=mu2,Sigma=Sigma1))
plot(bimodal, pch=16)
bim.2m <- kmeans(bimodal, 2)
plot(bimodal, pch=bim.2m$cluster, col = bim.2m$cluster)
points(bim.2m$centers, pch=3, col=1:2)
points(unimodal, pch=4, col=4)
# Assign new observations to the nearest fitted k-means cluster centre.
#
# km   - a fitted kmeans object (only its $centers component is used).
# data - matrix or data frame of new observations whose columns match the
#        variables the clustering was fitted on.
#
# Returns an integer vector giving, for each row of `data`, the index
# (1..k) of the closest centre in Euclidean distance.
predict.kmeans <- function(km, data) {
  k <- nrow(km$centers)
  # Full distance matrix of centres + data, keeping only the
  # data-rows-vs-centre-columns block.  drop = FALSE preserves the matrix
  # shape when `data` has a single row (the original collapsed to a
  # vector there and apply() failed).
  d <- as.matrix(dist(rbind(km$centers, data)))[-seq_len(k), seq_len(k), drop = FALSE]
  apply(d, 1, which.min)
}
pred.uni.bim <- predict.kmeans(bim.2m, unimodal)
points(unimodal, pch=pred.uni.bim+2, col=pred.uni.bim)
7. Validation
iris0 <- iris[,1:4]
s <- sample(150, 75)
iris1 <- iris0[s,]
iris2 <- iris0[-s,]
# Calibration Set
# Validation Set
pseudoF(iris1, 2:6)
$k
[1] 2 3 4 5 6
$W
[1] 88.63962 37.73809 24.54750 19.22262 15.91015
$pF
[1] 222.5698 306.3637 322.3484 309.2311 297.4934
iris1.3m <- kmeans(iris1, 3, 25)
pairs(iris1, pch= iris1.3m$cluster, col=iris1.3m$cluster)
pred.iris2.iris1 <- predict.kmeans(iris1.3m, iris2)
iris2.3m <- kmeans(iris2, 3, 25)
pairs(iris2, pch=pred.iris2.iris1, col=iris2.3m$cluster)
table(data.frame(S1=pred.iris2.iris1,S2=iris2.3m$cluster))
S2
S1   1  2  3
1 24 0 0
2 0 6 27
3 0 18 0
iris1.4m <- kmeans(iris1, 4, 25)
pairs(iris1, pch=iris1.4m$cluster, col=iris1.4m$cluster)
pred.iris2.iris1 <- predict.kmeans(iris1.4m, iris2)
iris2.4m <- kmeans(iris2, 4, 25)
pairs(iris2, pch=pred.iris2.iris1, col=iris2.4m$cluster)
table(data.frame(S1=pred.iris2.iris1,S2=iris2.4m$cluster))
S2
S1   1  2  3  4
1 0 20 11 0
2 0 0 4 0
3 13 3 0 0
4 0 0 0 24
iris1.2m <- kmeans(iris1, 2, 25)
pred.iris2.iris1 <- predict.kmeans(iris1.2m, iris2)
iris2.2m <- kmeans(iris2, 2, 25)
pairs(iris2, pch=pred.iris2.iris1, col=iris2.2m$cluster)
table(data.frame(S1=pred.iris2.iris1,S2=iris2.2m$cluster))
S2
S1   1  2
1 27 0
2 0 48
# K Means Clustering
# 1. Tiny hand-made example: 7 points in 2 dimensions.
x1 <- c(1, 2, 3, 4, 7, 8, 10)
x2 <- c(8, 2, 3, 1, 11, 8, 10)
X <- cbind(x1, x2)
plot(X, pch=16)
# identify() is interactive: click points on the open plot to label them
# 1..7, then right-click / press Esc to finish.
identify(X, labels=1:7)
# Partition the 7 points into 2 clusters (single random start).
X.km <- kmeans(X,2)
X.km
# Overlay membership; +1 shifts past pch/col 1 (black filled dot).
points(X,pch=X.km$cluster+1, col=X.km$cluster+1)
# Mark the two cluster centres with larger distinct symbols.
points(X.km$centers, col=2:3, pch=2:3, cex=1.5)
# 2. Random starts give different solutions --------------------------------
library(MASS)

# One elliptical Gaussian cloud: there are no "true" clusters here, so
# k-means with k = 3 has many near-equivalent local optima.
mu3 <- c(4.5, 4.5)
Sigma2 <- matrix(c(2.25,1.5,1.5,2.25),nrow=2)
unimodal <- mvrnorm(50, mu=mu3, Sigma=Sigma2)
plot(unimodal, pch=16)

# Two independent single-start fits: random starts, so (usually) different
# answers.  (In the original the second assignment had been fused into the
# first line's comment, leaving uni.3m2 undefined.)
uni.3m1 <- kmeans(unimodal,3)  # Random starts
uni.3m2 <- kmeans(unimodal,3)  # different answers
plot(unimodal, pch=uni.3m1$cluster, col=uni.3m1$cluster)
points(unimodal, pch=uni.3m2$cluster+3, col=uni.3m2$cluster)
# Cross-tabulate the two labelings to see where they disagree.
table(data.frame(km1=uni.3m1$cluster,km2=uni.3m2$cluster))
uni.3m1$withinss
sum(uni.3m1$withinss)
uni.3m2$withinss
sum(uni.3m2$withinss)

# nstart=25 runs 25 random starts and keeps the best, so repeated fits
# should now agree (up to label permutation).
uni.3m3 <- kmeans(unimodal,3, nstart=25)
uni.3m4 <- kmeans(unimodal,3, nstart=25)
plot(unimodal, pch=uni.3m3$cluster, col=uni.3m3$cluster)
points(unimodal, pch=uni.3m4$cluster+3, col=uni.3m4$cluster)
table(data.frame(km1=uni.3m3$cluster,km2=uni.3m4$cluster))
uni.3m3$withinss
sum(uni.3m3$withinss)
# 3. Swiss Canton Data ------------------------------------------------------
swiss.4m <- kmeans(scale(swiss), 4, nstart=25)
pairs(swiss, pch=swiss.4m$cluster, col=swiss.4m$cluster)
library(rgl)
# NOTE(review): swiss.pca is never created in this script -- presumably a
# PCA of swiss from an earlier session; confirm before running.
plot3d(swiss.pca$x[,1:3],type="s",size=.25, col=swiss.4m$cluster)
swiss.4m$withinss
sum(swiss.4m$withinss)

# Number of clusters?
# "Pseudo-F" Statistic (this line was a bare string + symbol in the
# original -- a syntax error; it was meant as a comment).
source("pseudoF.txt")   # assumes pseudoF.txt is in the working directory
pseudoF
pseudoF(scale(swiss), 2:6)
swiss.3m <- kmeans(scale(swiss), 3)
sum(swiss.3m$withinss)
pairs(swiss, pch=swiss.3m$cluster, col=swiss.3m$cluster)
plot3d(swiss.pca$x[,1:3],type="s",size=.25, col=swiss.3m$cluster)

# Compare to hierarchical results.  hclust's "ward" was renamed "ward.D"
# in R 3.1.0; "ward.D" is the same criterion the transcript used.
swiss.hc.w <- hclust(dist(scale(swiss)), "ward.D")
plot(swiss.hc.w)
table(data.frame(km3=swiss.3m$cluster,hc3=cutree(swiss.hc.w,3)))

# Interpretations: compare the three standardized cluster centres.
pairs(swiss.3m$centers, pch=1:3, col=1:3)
swiss.3m$centers
# 6. Predictions ------------------------------------------------------------
# Rebuild the bimodal data set (two well-separated Gaussian clusters) used
# in the lecture; it was generated earlier in the session but never defined
# in this script, so bim.2m below would otherwise fail.  Requires MASS
# (loaded earlier) for mvrnorm().
mu1 <- c(3,3)
mu2 <- c(6,6)
Sigma1 <- matrix(c(1,0,0,1),nrow=2)
bimodal <- rbind(mvrnorm(25,mu=mu1,Sigma=Sigma1),
                 mvrnorm(25,mu=mu2,Sigma=Sigma1))

bim.2m <- kmeans(bimodal, 2)
plot(bimodal, pch=bim.2m$cluster, col = bim.2m$cluster)
points(bim.2m$centers, pch=3, col=1:2)
points(unimodal, pch=4, col=4)   # overlay the unimodal cloud for contrast

# predict.kmeans assigns new observations to the nearest fitted centre.
source("predict.kmeans.txt")     # assumes the file is in the working dir
predict.kmeans
pred.uni.bim <- predict.kmeans(bim.2m, unimodal)
pred.uni.bim
points(unimodal, pch=pred.uni.bim+2, col=pred.uni.bim)
# 7. Validation -------------------------------------------------------------
iris0 <- iris[,1:4]
s <- sample(150, 75)
iris1 <- iris0[s,]    # Calibration Set
iris2 <- iris0[-s,]   # Validation Set
pseudoF(iris1, 2:6)

# NOTE: kmeans(x, centers, iter.max, nstart) -- the original passed 25 as
# the bare third argument, which is iter.max, NOT nstart.  Name it so 25
# random starts are actually used, as the lecture intends.
iris1.3m <- kmeans(iris1, 3, nstart = 25)
pairs(iris1, pch=iris1.3m$cluster, col=iris1.3m$cluster)
# Predict validation-set memberships from calibration centres, then
# cluster the validation set independently and compare.
pred.iris2.iris1 <- predict.kmeans(iris1.3m, iris2)
iris2.3m <- kmeans(iris2, 3, nstart = 25)
pairs(iris2, pch=pred.iris2.iris1, col=iris2.3m$cluster)
table(data.frame(S1=pred.iris2.iris1,S2=iris2.3m$cluster))

# Repeat with k = 4.
iris1.4m <- kmeans(iris1, 4, nstart = 25)
pairs(iris1, pch=iris1.4m$cluster, col=iris1.4m$cluster)
pred.iris2.iris1 <- predict.kmeans(iris1.4m, iris2)
iris2.4m <- kmeans(iris2, 4, nstart = 25)
pairs(iris2, pch=pred.iris2.iris1, col=iris2.4m$cluster)
table(data.frame(S1=pred.iris2.iris1,S2=iris2.4m$cluster))

# Repeat with k = 2.
iris1.2m <- kmeans(iris1, 2, nstart = 25)
pred.iris2.iris1 <- predict.kmeans(iris1.2m, iris2)
iris2.2m <- kmeans(iris2, 2, nstart = 25)
pairs(iris2, pch=pred.iris2.iris1, col=iris2.2m$cluster)
table(data.frame(S1=pred.iris2.iris1,S2=iris2.2m$cluster))