Chapter6 Clustering


6.1 Dendrogram

EFA_feature = EFA_with_score %>%
  dplyr::select(PA1, PA3,PA5)

# Run a cluster analysis on a distance matrix and using the Ward method
c<- hclust(dist(EFA_feature), method="ward.D2") 

# Dendrogram 
library(dendextend)
plot(set(as.dendrogram(c),  
         "branches_k_color", # to highlight the cluster solution with a color
         k = 3),
     ylab = "Distance",
     main = "Dendrogram",
     cex = 0.2)

6.2 K-Means

set.seed(42)
EFA_kmeans <- kmeans(EFA_feature , centers = 3)
EFA_kmeans
## K-means clustering with 3 clusters of sizes 137, 144, 119
## 
## Cluster means:
##      PA1    PA3    PA5
## 1  0.638 -0.477  0.722
## 2 -0.952 -0.453 -0.467
## 3  0.417  1.096 -0.266
## 
## Clustering vector:
##   [1] 2 1 3 2 1 1 3 2 3 2 2 3 2 1 3 3 1 3 2 2 2 2 1 1 3 3 3 3 3 3 2
##  [32] 1 1 3 3 3 1 1 1 1 1 3 2 2 3 3 3 3 3 2 3 3 2 1 2 3 2 3 3 3 1 2
##  [63] 2 2 1 2 2 1 1 1 1 2 2 1 1 2 2 1 2 3 3 2 2 2 3 2 2 1 3 2 3 3 2
##  [94] 3 1 1 1 2 2 2 2 3 2 2 2 1 3 3 1 3 3 3 2 1 2 2 1 2 2 2 3 2 3 1
## [125] 2 2 2 2 1 2 1 1 3 2 2 1 1 1 1 1 1 1 2 2 1 2 3 3 2 2 1 2 3 1 1
## [156] 2 3 2 1 3 3 1 1 2 3 2 1 1 2 3 1 2 2 3 3 1 1 3 3 1 3 2 1 1 3 1
## [187] 1 2 2 3 1 1 2 2 3 2 2 1 3 1 1 2 3 2 1 1 2 1 2 2 3 1 2 1 1 2 3
## [218] 1 2 1 1 2 1 2 1 2 3 2 1 2 2 1 1 3 3 1 3 1 2 1 2 3 3 2 3 2 3 1
## [249] 2 2 3 2 1 1 3 2 3 2 2 1 1 3 3 2 1 1 2 2 3 2 1 3 1 2 1 1 1 1 1
## [280] 3 3 2 1 1 3 2 2 1 3 2 3 2 2 1 1 2 1 3 3 3 1 2 1 1 2 1 1 3 3 3
## [311] 2 2 2 3 2 1 2 3 1 1 2 1 2 3 2 3 2 2 2 3 1 2 3 3 3 3 2 2 1 2 1
## [342] 1 1 1 3 2 2 3 1 2 3 2 1 2 3 3 3 3 3 1 1 1 2 1 3 3 2 1 1 2 3 1
## [373] 1 3 2 3 2 2 1 3 1 1 3 1 1 3 3 2 1 2 1 3 1 3 3 2 2 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 203 268 204
##  (between_SS / total_SS =  43.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# factor plot 
fviz_cluster(EFA_kmeans, 
             data = EFA_feature) + 
  theme_bw()

6.3 Interpretation of the results

6.3.1 Heatmap Table

# Average for each cluster with one step
vars_cluster_agg = aggregate(vars_cluster[, 2:31],
          by = list(cluster = EFA_kmeans$cluster), 
          FUN = mean)
# reshape
df <- vars_cluster_agg %>% 
  gather(variable, value, -cluster)  # to transfrom from wide to long format
library(gt)
library(scales)

# reshpae, long to wide (cluster )
df_wider = df%>%
  dplyr::mutate(cluster = paste0("Cluster", cluster)) %>%
  dplyr::mutate(., across(where(is.numeric), round, 2)) %>%
  spread(., key=cluster, value =value)

# cluster columns
clusterCols = c("Cluster1", "Cluster2", "Cluster3")

# color
colfunc <- colorRampPalette(c("darkblue", "lightgrey"))

# DT table 
DT::datatable(df_wider, options = list(pageLength = 15)) %>%
  formatStyle("Cluster1",
              backgroundColor = styleEqual(sort(unique(df_wider$Cluster1),
                                                decreasing = TRUE),
                                           colfunc(length(
                                             unique(df_wider$Cluster1)
                                           )))) %>%
  formatStyle("Cluster2",
              backgroundColor = styleEqual(sort(unique(df_wider$Cluster2),
                                                decreasing = TRUE),
                                           colfunc(length(
                                             unique(df_wider$Cluster2)
                                           )))) %>%
  formatStyle("Cluster3",
              backgroundColor = styleEqual(sort(unique(df_wider$Cluster3),
                                                decreasing = TRUE),
                                           colfunc(length(
                                             unique(df_wider$Cluster3)
                                           )))) %>%
  formatStyle(clusterCols, color = "white")

6.3.2 Histogram

df = vars_cluster %>%
  gather(variable, value, -cluster)  %>%
  as.data.frame()

# factorize cluster 
df$cluster <- factor(df$cluster)

# plot
vars_cluster_hist = ggplot(df, aes(value, 
                     fill = cluster, 
                     color = cluster)) +
  geom_histogram(alpha = 0.3, position = "identity")  +
  facet_wrap( ~ variable, scales = "free",ncol = 6)  +
  theme_economist_white() 

vars_cluster_hist_path = file.path(plotDir, "vars_cluster_hist.png")

ggsave(
  filename = vars_cluster_hist_path,
  plot = vars_cluster_hist,
  width = 3000,
  height = 3000,
  units = "px"
)