Chapter 7 Demographic


7.1 Merge cluster data with demographic data

# Combine the selected microvan columns (column 2 and columns 33-39) with the
# cluster assignment stored in EFA_kmeans$cluster, and move `cluster` to the
# first column.
demo_cluster <- dplyr::select(
  as.data.frame(
    cbind(microvan[, c(2, 33:39)], cluster = EFA_kmeans$cluster)
  ),
  cluster,
  dplyr::everything()
)

# Long format: every non-cluster column is stacked into `variable` / `value`
# pairs, keeping `cluster` as the identifier. Used by the faceted plots below.
df <- as.data.frame(
  tidyr::gather(demo_cluster, variable, value, -cluster)
)

# Interactive table of the merged (wide) data, 10 rows per page.
DT::datatable(demo_cluster, options = list(pageLength = 10))

7.2 Histogram

# Treat the cluster label as categorical so it maps to discrete fill/colour
# scales in the plots.
df$cluster <- factor(df$cluster)

# Fix the facet order of the variables.
df$variable <- factor(
  df$variable,
  levels = c(
    "mvliking", "age", "income", "miles",
    "educ", "numkids", "recycle", "female"
  )
)

# Overlaid per-cluster histograms, one facet per variable. Each facet gets
# its own x and y scale. `bins = 30` states ggplot2's default explicitly, so
# the plot is unchanged but the "`stat_bin()` using `bins = 30`" message is
# no longer emitted; tune `bins`/`binwidth` here if a different resolution
# is wanted.
demo_cluster_hist <- ggplot(
  df,
  aes(x = value, fill = cluster, color = cluster)
) +
  geom_histogram(alpha = 0.3, position = "identity", bins = 30) +
  facet_wrap(~ variable, scales = "free", ncol = 4) +
  theme_economist_white()

demo_cluster_hist_path <- file.path(plotDir, "demo_cluster_hist.png")

# Write the figure to disk as a 2400 x 1800 px PNG.
ggsave(
  filename = demo_cluster_hist_path,
  plot = demo_cluster_hist,
  width = 2400,
  height = 1800,
  units = "px"
)
## `stat_bin()` using `bins = 30`. Pick better value with
## `binwidth`.

7.3 Ridge plot

# Ridge plot: one binned ridgeline (20 bins) per cluster, faceted by variable
# with free scales in a 4-column layout. Axis titles are blanked out.
demo_cluster_ridge <- ggplot(
  df,
  aes(y = cluster, x = value, fill = cluster)
) +
  geom_density_ridges(alpha = 0.3, stat = "binline", bins = 20) +
  facet_wrap(~ variable, scales = "free", ncol = 4) +
  xlab("") +
  ylab("") +
  theme_economist_white()

demo_cluster_ridge_path <- file.path(plotDir, "demo_cluster_ridge.png")

# Write the figure to disk as a 2400 x 1800 px PNG.
ggsave(
  filename = demo_cluster_ridge_path,
  plot = demo_cluster_ridge,
  width = 2400,
  height = 1800,
  units = "px"
)

7.3.1 Count of Subjects in Clusters

# Number of subjects in each cluster: one row per cluster, ordered by
# cluster, with the tally in `n_subjects`. `count()` replaces the original
# group_by + mutate(n()) + distinct + arrange chain and returns an ungrouped
# result, so no stray grouping leaks into later steps.
demo_cluster_count <- demo_cluster %>%
  dplyr::count(cluster, name = "n_subjects")

# DT table (row names hidden). `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
DT::datatable(demo_cluster_count, rownames = FALSE)

7.3.2 Stat

7.3.2.1 female

library(ggstatsplot)

# Working copy of the merged data with cluster (1-3) and female (0/1)
# converted to factors for the categorical comparisons below.
DF <- demo_cluster
DF$cluster <- factor(DF$cluster, levels = c(1, 2, 3))
DF$female <- factor(DF$female, levels = c(0, 1))
custom_colors <- c("#B2182B", "white", "#4D4D4D")

# Open a 400 x 600 PNG device; the plot printed below is drawn into it.
png_name <- "female"
female_path <- file.path(plotDir, paste0(png_name, ".png"))
png(filename = female_path, width = 400, height = 600)

# female vs cluster bar chart with ggstatsplot's statistical annotations.
# (type = "nonparametric" and a ggthemes::theme_tufte ggtheme were tried and
# left disabled.)
ggbarstats(data = DF, x = cluster, y = female, title = png_name) +
  scale_color_manual(values = custom_colors)

# Close the device so the file is flushed to disk.
dev.off()
## quartz_off_screen 
##                 2

7.3.2.2 Education

# Education vs cluster bar chart (x = educ, y = cluster), shown inline rather
# than written to a file.
# (type = "nonparametric" and a ggthemes::theme_tufte ggtheme were tried and
# left disabled.)
ggbarstats(data = DF, x = educ, y = cluster, title = "educ") +
  scale_color_manual(values = custom_colors)

#### Recycle

# The name drives both the output file and the plot title. It was previously
# "female" (copy-paste slip), which overwrote female.png with the recycle
# chart and mislabelled it.
png_name <- "recycle"
recycle_path <- file.path(plotDir, paste0(png_name, ".png"))

# Open a 400 x 600 PNG device; the plot printed below is drawn into it.
png(
  filename = recycle_path,
  width = 400,
  height = 600
)

# recycle vs cluster bar chart (x = recycle, y = cluster).
# (type = "nonparametric" and a ggthemes::theme_tufte ggtheme were tried and
# left disabled.)
ggbarstats(
  data  = DF,
  y     = cluster,
  x     = recycle,
  title = png_name
)

# Close the device so the file is flushed to disk.
dev.off()
## quartz_off_screen 
##                 2
#ggbarstats(
 # data         = DF,
  #type = "nonparametric",
  #y            = recycle,
  #x            = cluster,
  #title = png_name,

  # ggtheme      = ggthemes::theme_tufte(base_size = 12)
#)

7.3.3 Kruskal–Wallis H test

# Kruskal-Wallis H test of each demographic variable against cluster.
# Each result is converted to a plain data frame so the rows can be stacked
# into one summary table.
kwh_test_educ <- DF %>%
  rstatix::kruskal_test(educ ~ cluster) %>%
  as.data.frame()

kwh_test_numkids <- DF %>%
  rstatix::kruskal_test(numkids ~ cluster) %>%
  as.data.frame()

kwh_test_mvliking <- DF %>%
  rstatix::kruskal_test(mvliking ~ cluster) %>%
  as.data.frame()

kwh_test_age <- DF %>%
  rstatix::kruskal_test(age ~ cluster) %>%
  as.data.frame()

kwh_test_income <- DF %>%
  rstatix::kruskal_test(income ~ cluster) %>%
  as.data.frame()

kwh_test_miles <- DF %>%
  rstatix::kruskal_test(miles ~ cluster) %>%
  as.data.frame()

kwh_test_female <- DF %>%
  rstatix::kruskal_test(female ~ cluster) %>%
  as.data.frame()

kwh_test_recycle <- DF %>%
  rstatix::kruskal_test(recycle ~ cluster) %>%
  as.data.frame()

# Stack all eight results. numkids was computed above but previously left out
# of this list, so it never appeared in the summary table.
kwh_test <- dplyr::bind_rows(
  kwh_test_educ,
  kwh_test_numkids,
  kwh_test_mvliking,
  kwh_test_age,
  kwh_test_income,
  kwh_test_miles,
  kwh_test_female,
  kwh_test_recycle
)

DT::datatable(kwh_test)

7.3.4 Factor vs Cluster