1 Machine Learning Predictive Model Fundamentals [^best-algorithm] [^applied-predictive-modeling]

# 0. Environment setup ------
library(caret)
library(tidyverse)
library(janitor)
library(doSNOW)

# 1. Data ------
data(GermanCredit)

# 2. Data preprocessing ------
## Clean variable names -----
credit_dat <- GermanCredit %>% 
  clean_names() %>% 
  as_tibble()

## Select the variables used in the predictive model -----
all_variables <- names(credit_dat)
remove_variables <- names(credit_dat)[nearZeroVar(credit_dat)]

credit_df <- credit_dat[ , setdiff(all_variables, remove_variables)]
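
## (Sketch, not part of the original run) the near-zero-variance decision can be audited
## with saveMetrics = TRUE, which reports the frequency ratio and percent-unique per column
nzv_metrics <- nearZeroVar(credit_dat, saveMetrics = TRUE)
nzv_metrics[nzv_metrics$nzv, ]   # columns flagged (and dropped) above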

# 3. Predictive models ------
## 3.1. Parallel processing setup
num_cores <- parallel::detectCores()
start_time <- Sys.time()

cl <- makeCluster(num_cores, type = "SOCK")
registerDoSNOW(cl)

## 3.2. Training vs. validation/test split
train_test_index <- createDataPartition(credit_df$class, p = 0.7, list = FALSE)

train <- credit_df[train_test_index, ]
test <- credit_df[-train_test_index, ]
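
## (Sketch) createDataPartition() stratifies on class, so the Bad/Good mix should be
## nearly identical in the two splits; a quick check:
prop.table(table(train$class))
prop.table(table(test$class))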

## 3.3. Cross-validation folds for model development/validation ------
cv_folds <- createMultiFolds(train$class, k = 10, times = 5)

cv_ctrl <- trainControl(method = "cv", number = 10,
                        index = cv_folds,  # 10-fold x 5 repeats supplied explicitly; index overrides method/number
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE, # essential: class probabilities are required for ROC/Sens/Spec
                        verboseIter = TRUE)
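
## (Note, added) twoClassSummary() treats the first factor level as the event of interest;
## for GermanCredit that level is "Bad", so metric = "Sens" below optimises detection of
## bad credit risks, matching the 'Positive' class reported by confusionMatrix() later on
levels(credit_df$class)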

ranger_tune_grid <- expand.grid(
  mtry = c(2, 16, 31, 48),
  splitrule = c("gini", "extratrees"),
  min.node.size = c(5, 10)
)

glmnet_tune_grid <- expand.grid(
  alpha = 0:1,
  lambda = seq(0.0001, 1, length=10)
)

## 3.4. Fit the predictive models
### glmnet
gc_glmnet_model <- train(class ~ ., data = train,
                  method = "glmnet",
                  metric = "Sens",
                  preProcess = c("zv", "center", "scale", "spatialSign"),
                  tuneGrid = glmnet_tune_grid,
                  trControl = cv_ctrl)
Aggregating results
Selecting tuning parameters
Fitting alpha = 1, lambda = 1e-04 on full training set
### ranger
gc_ranger_model <- train(class ~ ., data = train,
                  method = "ranger",
                  metric = "Sens",
                  preProcess = c("zv", "center", "scale", "spatialSign"),
                  tuneGrid = ranger_tune_grid,
                  # tuneLength = 7,
                  trControl = cv_ctrl)
Aggregating results
Selecting tuning parameters
Fitting mtry = 48, splitrule = extratrees, min.node.size = 5 on full training set
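
## (Sketch) inspect the winning hyperparameters and the resampled tuning profiles
gc_glmnet_model$bestTune
gc_ranger_model$bestTune
plot(gc_ranger_model)   # Sens across the mtry / splitrule / min.node.size grid
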
# 4. Model comparison -----
model_list <- list(
  glmnet = gc_glmnet_model,
  rf = gc_ranger_model
)

resamps <- resamples(model_list)

summary(resamps)

Call:
summary.resamples(object = resamps)

Models: glmnet, rf 
Number of resamples: 50 

ROC 
            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glmnet 0.6375121 0.7482993 0.7750243 0.7739359 0.8058795 0.8960155    0
rf     0.5850340 0.7103984 0.7570457 0.7508649 0.7947036 0.8824101    0

Sens 
            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glmnet 0.3333333 0.4285714 0.4761905 0.4933333 0.5595238 0.6666667    0
rf     0.1904762 0.3452381 0.4285714 0.4142857 0.4761905 0.6190476    0

Spec 
            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glmnet 0.7346939 0.8367347 0.8571429 0.8612245 0.8979592 0.9387755    0
rf     0.7755102 0.8571429 0.8775510 0.8865306 0.9183673 0.9795918    0
dotplot(resamps, metric = "Sens")
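
## (Sketch) a paired comparison of the resampled metrics; diff() on a resamples object
## runs paired t-tests over the 50 shared resamples
resamp_diff <- diff(resamps)
summary(resamp_diff)
bwplot(resamp_diff)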

# 5. Model performance evaluation -----
gc_pred_class <- predict(gc_glmnet_model, newdata = test, type="raw")
## Confusion matrix -----
confusionMatrix(gc_pred_class, test$class)
Confusion Matrix and Statistics

          Reference
Prediction Bad Good
      Bad   38   25
      Good  52  185
                                        
               Accuracy : 0.7433        
                 95% CI : (0.69, 0.7918)
    No Information Rate : 0.7           
    P-Value [Acc > NIR] : 0.056076      
                                        
                  Kappa : 0.3316        
 Mcnemar's Test P-Value : 0.003047      
                                        
            Sensitivity : 0.4222        
            Specificity : 0.8810        
         Pos Pred Value : 0.6032        
         Neg Pred Value : 0.7806        
             Prevalence : 0.3000        
         Detection Rate : 0.1267        
   Detection Prevalence : 0.2100        
      Balanced Accuracy : 0.6516        
                                        
       'Positive' Class : Bad           
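
## (Sketch, assumes the pROC package is installed) test-set ROC/AUC from class
## probabilities; "Bad" stays the positive class, matching the confusion matrix above
gc_pred_prob <- predict(gc_glmnet_model, newdata = test, type = "prob")
gc_test_roc  <- pROC::roc(response = test$class, predictor = gc_pred_prob$Bad,
                          levels = c("Good", "Bad"))
pROC::auc(gc_test_roc)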
                                        
mc_total_time <- Sys.time() - start_time
mc_total_time
Time difference of 4.867497 mins
stopCluster(cl)