xwMOOC 자동차

1. 초기품질지수와 차량 내구성 조사 비교
2. IQS 대 VDS
3. 자동차 제조사 군집분석

1. 초기품질지수와 차량 내구성 조사 비교

초기품질지수(IQS), 차량 내구성 조사(VDS)를 조합해서 주요 제조사별로 교차분석을 수행해 보자.

2. IQS 대 VDS

2.1. 환경설정 및 데이터 가져오기

초기품질지수(IQS), 차량 내구성 조사(VDS) 교차분석에 필요한 팩키지와 데이터를 가져온다.

# 0. 환경설정 ---------------
library(tidyverse)
library(rvest)
library(forcats)
library(ggpubr)
library(extrafont)
loadfonts()
library(gridExtra)
library(lubridate)

# 1. 데이터 가져오기 ---------------

iqs_df <- readRDS("data_processed/iqs_df.rds")
vds_df <- readRDS("data_processed/vds_df.rds")

2.2. 데이터 전처리

IQS, VDS 데이터프레임에 “구분” 필드를 추가해서 구분할 수 있도록 하고 bind_rows 함수로 결합시킨다. 그리고 나서 고급차가 아닌 일반 대중차를 대량생산하는 주요 자동차 제조사를 식별하는 필드를 추가한다.

# 2. 데이터 전처리 ---------------

iqs_df <- iqs_df %>% 
  mutate(구분 = "IQS") %>% 
  rename(산업평균 = IQS산업평균)

vds_df <- vds_df %>% 
  mutate(구분 = "VDS") %>% 
  rename(산업평균 = IQS산업평균)

iqs_vds_df <- bind_rows(iqs_df, vds_df)

iqs_vds_df <- iqs_vds_df %>% 
  mutate(현대기아 = case_when(
    stringr::str_detect(제조사, "Kia") ~ "기아",
    stringr::str_detect(제조사, "Volkswagen") ~ "폭스바겐",
    stringr::str_detect(제조사, "Hyundai") ~ "현대",
    stringr::str_detect(제조사, "Toyota") ~ "도요타",
    stringr::str_detect(제조사, "Industry Average") ~ "산업평균",
    TRUE ~ "해외업체"
  )) %>% 
  mutate(현대기아 = factor(현대기아, levels=c("기아", "현대", "도요타", "폭스바겐", "산업평균", "해외업체")))

2.3. IQS, VDS 표

각 회사를 대표하는 색상을 정의하고 나서 연도별 IQS, VDS 문제갯수를 도표로 작성한다.

# 3. 시각화 ---------------
## 3.0. 색상 팔레트 설정
hkmc_cols <- c(기아 = "#ff0000",
               현대 = "#0000ff",
               폭스바겐 = "#4286f4",
               도요타 ="#ad1f1f",
               산업평균 = "#000000",
               해외업체 = "#b7b7b7")

## 3.0. 주요 메이커 추출
key_auto_maker <- c("BMW", "Ford", "GMC", "Honda", "Hyundai", "Industry Average","Kia","Mercedes-Benz", "Toyota", "Volkswagen")

## 3.1. 표
iqs_vds_df %>% spread(연도, 문제갯수) %>% 
  DT::datatable()

2.3. IQS, VDS 시각화

각 회사를 대표하는 색상도 반영하고, 연도별 IQS, VDS 문제갯수를 동적 그래프로 살펴볼 수 있도록 도식화한다.

## 3.2. 시각화
iqs_vds_gg <- iqs_vds_df %>% 
  mutate(연도 = make_date(year=연도)) %>% 
  filter(제조사 %in% key_auto_maker) %>% 
  ggplot(aes(x=연도, y=문제갯수, group=제조사, color=현대기아)) +
    geom_line() +
    geom_point() +
    scale_color_manual(values=hkmc_cols) +
    facet_wrap(~구분, scale="free") +
    theme_pubr(base_family="NanumGothic") +
    labs(x="", y="결함갯수", color="제조사")

ggplotly(iqs_vds_gg)

3. 자동차 제조사 군집분석

수많은 제조업체가 있는데 초기품질지수와 차량내구성 점수를 바탕으로 결함갯수, 즉 품질이 유사한 업체를 군집분석을 통해 확인해보자.

3.1. 군집분석 도구상자 및 데이터

먼저 군집분석에 필요한 관련 팩키지와 함께 IQS, VDS 데이터를 준비하자.

# 0. 추가 환경설정 ---------------
# library(factoextra)
# library(FactoMineR)
# library(clustertend)
# library(NbClust)
# library(clValid)
# library(mclust)
# library(pheatmap)
# library(ggthemes)
# library(extrafont)
# library(gridExtra)

# 1. 데이터 가져오기 ---------------

iqs_df <- readRDS("data_processed/iqs_df.rds")
vds_df <- readRDS("data_processed/vds_df.rds")

3.2. IQS, VDS 군집데이터 전처리

먼저 군집분석에 필요한 관련 팩키지와 함께 IQS, VDS 데이터를 준비하자. 결측값이 있는 경우는 각 제조사 평균 IQS, VDS 결함갯수로 채워넣고, Fiat는 군집분석을 통해서 이상점으로 파악되어 분석에서 제외한다.

# 2. 데이터 전처리 ---------------
## 2.1. IQS 폭넓은 데이터
iqs_df <- iqs_df %>% 
  mutate(구분 = "IQS") %>% 
  rename(산업평균 = IQS산업평균)

iqs_spread_df <- iqs_df %>% spread(연도, 문제갯수) %>% 
  filter(!제조사 %in% c("CadiMac", "Genesis", "Suzuki")) %>% 
  mutate(제조사평균 = rowMeans(.[-1:-7], na.rm=TRUE)) %>% 
  mutate(`2011` = ifelse(is.na(`2011`), 제조사평균, `2011`), 
         `2012` = ifelse(is.na(`2012`), 제조사평균, `2012`), 
         `2013` = ifelse(is.na(`2013`), 제조사평균, `2013`), 
         `2014` = ifelse(is.na(`2014`), 제조사평균, `2014`), 
         `2015` = ifelse(is.na(`2015`), 제조사평균, `2015`), 
         `2016` = ifelse(is.na(`2016`), 제조사평균, `2016`), 
         `2017` = ifelse(is.na(`2017`), 제조사평균, `2017`)) %>% 
  select(-제조사평균, -산업평균, -현대기아)

## 2.2. VDS 폭넓은 데이터
vds_df <- vds_df %>% 
  mutate(구분 = "VDS") %>% 
  rename(산업평균 = IQS산업평균)

vds_spread_df <- vds_df %>% spread(연도, 문제갯수) %>% 
  filter(!제조사 %in% c("SAAB", "smart", "Suzuki")) %>% 
  mutate(제조사평균 = rowMeans(.[-1:-7], na.rm=TRUE)) %>%
  mutate(`2011` = ifelse(is.na(`2011`), 제조사평균, `2011`), 
         `2012` = ifelse(is.na(`2012`), 제조사평균, `2012`), 
         `2013` = ifelse(is.na(`2013`), 제조사평균, `2013`), 
         `2014` = ifelse(is.na(`2014`), 제조사평균, `2014`), 
         `2015` = ifelse(is.na(`2015`), 제조사평균, `2015`), 
         `2016` = ifelse(is.na(`2016`), 제조사평균, `2016`), 
         `2017` = ifelse(is.na(`2017`), 제조사평균, `2017`)) %>% 
  select(-제조사평균, -산업평균, -현대기아)

## 2.3. 군집분석용 데이터 -----------------------
iqs_vds_spread_df <- inner_join(iqs_spread_df, vds_spread_df, by=c("제조사")) %>% 
  select(-구분.x, -구분.y) %>% 
  filter(제조사 != "Fiat")

3.3. IQS, VDS 군집 알고리즘

IQS, VDS 결함갯수 데이터를 바탕으로 군집분석을 수행하기 위해서 순차적으로 최적 군집 알고리즘을 구축해 나간다.

군집은 존재하는가?
군집이 존재한다면 몇개일까?
데이터에 적합한 최적 군집 알고리즘은 무엇일까?

# 3. 군집분석 ---------------
## 3.1. 군집분석 데이터
iqs_vds_cl_df <- iqs_vds_spread_df %>% 
  select(-제조사) %>% 
  as.data.frame()

row.names(iqs_vds_cl_df) <- iqs_vds_spread_df$제조사

## 3.2. 군집은 존재하는가?
iqs_vds_cl_df %>% 
  dist() %>% 
  fviz_dist(., show_labels = FALSE) +
  labs(title = "IQS, VDS 자동차 품질 군집분석") +
  coord_fixed()

iqs_vds_cl_df %>% 
  clustertend::hopkins(., nrow(iqs_vds_cl_df) -1)

$H
[1] 0.3337565

## 3.3. 군집이 존재한다면 몇개일까? ------------------
### 3.3.1. 팔꿈치 방법
elbow_g <- iqs_vds_cl_df %>% 
  fviz_nbclust(., kmeans, method = "wss") +
  theme_few(base_family = "NanumGothic") +
  geom_vline(xintercept = 3, linetype = 2) +
  scale_y_continuous(labels = scales::comma ) +
  labs(x="군집갯수(k)", y="전체 군집내 제곱합", title="최적 군집갯수", subtitle = "팔꿈치 방법(Elbow method)") 

### 3.3.2. 실루엣 방법
silhouette_g <- iqs_vds_cl_df %>% 
  fviz_nbclust(., kmeans, method = "silhouette") +
  theme_few(base_family = "NanumGothic") +
  geom_vline(xintercept = 2, linetype = 2) +
  scale_y_continuous(labels = scales::comma ) +
  labs(x="군집갯수(k)", y="평균 실루엣 폭", 
       title="최적 군집갯수", subtitle = "실루엣 방법(silhouette method)") 

### 3.3.3. 갭 방법
gap_g <- iqs_vds_cl_df %>% 
  fviz_nbclust(., kmeans, nstart = 25, method = "gap_stat", nboot = 50) +
  theme_few(base_family = "NanumGothic") +
  geom_vline(xintercept = 2, linetype = 2) +
  scale_y_continuous(labels = scales::comma ) +
  labs(x="군집갯수(k)", y="갭 통계량(k)", 
       title="최적 군집갯수", subtitle = "갭 방법(gap method)") 

grid.arrange(elbow_g, silhouette_g, gap_g, nrow=3)

### 3.3.4. NbClust 방법

iqs_vds_cl_df %>% 
  NbClust(., distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans") %>% 
  fviz_nbclust() +
  scale_y_continuous(labels = scales::comma ) +
  labs(x="군집갯수(k)", y="추천 빈도수", 
       title="최적 군집갯수", subtitle = "NbClust 방법")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot.

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 7 proposed 2 as the best number of clusters 
* 8 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 2 proposed 8 as the best number of clusters 
* 4 proposed 9 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  3 
 
 
******************************************************************* 
Among all indices: 
===================
* 2 proposed  0 as the best number of clusters
* 1 proposed  1 as the best number of clusters
* 7 proposed  2 as the best number of clusters
* 8 proposed  3 as the best number of clusters
* 1 proposed  4 as the best number of clusters
* 1 proposed  5 as the best number of clusters
* 2 proposed  8 as the best number of clusters
* 4 proposed  9 as the best number of clusters

Conclusion
=========================
* According to the majority rule, the best number of clusters is  3 .

## 3.4. 최적 군집 알고리즘 선택 ------------------
clmethods <- c("hierarchical", "kmeans", "diana", "fanny", "model", "sota", "pam", "clara", "agnes")

clust_algo <- iqs_vds_cl_df %>% 
  as.matrix %>%
  clValid(., nClust = 2:4,
          clMethods = clmethods, validation = "internal")

summary(clust_algo)


Clustering Methods:
 hierarchical kmeans diana fanny model sota pam clara agnes 

Cluster sizes:
 2 3 4 

Validation Measures:
                                 2       3       4
                                                  
hierarchical Connectivity   5.4536 13.2325 15.8825
             Dunn           0.3683  0.4130  0.4130
             Silhouette     0.3876  0.3095  0.2510
kmeans       Connectivity  10.9016 12.8798 15.6964
             Dunn           0.1978  0.4668  0.4773
             Silhouette     0.3241  0.3145  0.2609
diana        Connectivity  16.0913 23.5175 24.9464
             Dunn           0.2084  0.2282  0.2688
             Silhouette     0.2791  0.1856  0.1804
fanny        Connectivity  16.0913      NA      NA
             Dunn           0.2084      NA      NA
             Silhouette     0.2791      NA      NA
model        Connectivity  29.5702 12.8798 19.7175
             Dunn           0.1256  0.4668  0.3402
             Silhouette     0.0797  0.3145  0.2159
sota         Connectivity  16.0913 23.5175 28.9710
             Dunn           0.2084  0.2282  0.2870
             Silhouette     0.2791  0.1856  0.2042
pam          Connectivity   7.4262 12.8798 23.2282
             Dunn           0.3306  0.4668  0.3402
             Silhouette     0.3058  0.3145  0.2430
clara        Connectivity   7.4262 12.8798 23.2282
             Dunn           0.3306  0.4668  0.3402
             Silhouette     0.3058  0.3145  0.2430
agnes        Connectivity   5.4536 13.2325 15.8825
             Dunn           0.3683  0.4130  0.4130
             Silhouette     0.3876  0.3095  0.2510

Optimal Scores:

             Score  Method       Clusters
Connectivity 5.4536 hierarchical 2       
Dunn         0.4773 kmeans       4       
Silhouette   0.3876 hierarchical 2

3.4. 최종 IQS, VDS 군집 분석

통계 알고리즘이 제안하는 군집 갯수와 알고리즘과 자동차 제조사 정보를 종합하여 군집은 3개로 놓고, 계층적 군집알고리즘을 적합시킨다.

군집 1: 렉서스, 포르쉐, 도요타가 주요 제조사로 포함된 고품질 고가 차량 제조사
군집 2: 현대, 기아, BMW, GM 등이 포함된 중간 품질 대중차량 제조사
군집 3: 미니, 랜드로버, 폭스바겐이 포함된 품질문제가 심각한 차량 제조사

## 3.5. 최적 군집 알고리즘 적합: 계층적 군집 3개 ------------------

iqs_vds_hclust <- iqs_vds_cl_df %>% 
  get_dist(method = "euclidean") %>% 
  hclust(method = "ward.D2")

## 3.6. 군집분석 시각화 ------------------
### 3.6.1. 수목도(dendogram) 
fviz_dend(iqs_vds_hclust, k = 3,
          cex = 0.5,
          k_colors = c("#2E9FDF", "#FC4E07"),
          color_labels_by_k = TRUE,
          rect = TRUE) +
  theme_void(base_family="NanumGothic") + 
  labs(title="자동차 제조사 품질", subtitle="계층적 군집분석 알고리즘 - 군집 3개")

### 3.6.2. 군집 시각화
iqs_vds_grp <- cutree(iqs_vds_hclust, k = 3)

fviz_cluster(list(data = iqs_vds_cl_df, cluster = iqs_vds_grp),
             palette = c("#2E9FDF", "#FC4E07", "#0daf05"),
             ellipse.type = "convex", 
             repel = TRUE, 
             show.clust.cent = FALSE, ggtheme = theme_minimal(base_family = "NanumGothic")) +
  labs(title="자동차 제조사 품질", subtitle="계층적 군집분석 알고리즘 - 군집 3개")