기계학습 예측모형 기본기 [^best-algorithm] [^applied-predictive-modeling]
# 0. 환경설정 ------
library(caret)
library(tidyverse)
library(janitor)
library(doSNOW)
# 1. Data ------
# GermanCredit is a dataset bundled with caret; data() loads it into the
# workspace.
data(GermanCredit)
# 2. Data preprocessing ------
## Clean variable names -----
# clean_names() normalizes the column names to snake_case.
# as_tibble() replaces tbl_df(), which has been deprecated since dplyr 1.0.0.
credit_dat <- GermanCredit %>%
  clean_names() %>%
  as_tibble()
## Select the variables used by the predictive models -----
# nearZeroVar() returns the positions of (near-)constant columns; those
# columns are dropped and every other column is kept.
all_variables <- names(credit_dat)
remove_variables <- all_variables[nearZeroVar(credit_dat)]
credit_df <- credit_dat[, setdiff(all_variables, remove_variables)]
# 3. Predictive models ------
## 3.1. Parallel backend setup
# detectCores() is an exported function, so `::` is the right accessor.
# It can return NA when the core count cannot be determined; fall back to a
# single worker in that case so makeCluster() still receives a valid count.
num_cores <- parallel::detectCores()
if (is.na(num_cores)) {
  num_cores <- 1L
}
# start_time is read again at the end of the script to report elapsed time.
start_time <- Sys.time()
cl <- makeCluster(num_cores, type = "SOCK")
registerDoSNOW(cl)
## 3.2. Training vs. validation/test split
# Fix the RNG seed so that the random partition (and the random steps that
# follow it in this script) can be reproduced from run to run.
set.seed(1234)
# 70% of the rows go to training, stratified on the outcome `class`.
train_test_index <- createDataPartition(credit_df$class, p = 0.7, list = FALSE)
# NOTE: `train` shadows caret::train() as a data object; function calls to
# train() still resolve to the caret function.
train <- credit_df[train_test_index, ]
test <- credit_df[-train_test_index, ]
## 3.3. Resampling scheme for model development/validation ------
# 10-fold cross-validation repeated 5 times (50 resamples). The same fold
# indices are handed to every model so their resamples are directly comparable.
cv_folds <- createMultiFolds(train$class, k = 10, times = 5)
# method/number/repeats are declared to match the folds built above; the
# explicit `index` is what actually drives the resampling.
cv_ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  index = cv_folds,
  summaryFunction = twoClassSummary,
  classProbs = TRUE, # very important: twoClassSummary needs class probabilities
  verboseIter = TRUE
)
# Tuning grid for ranger: every combination of mtry, split rule and minimum
# node size (4 x 2 x 2 = 16 candidates). Column names carry no leading dot,
# matching the glmnet grid below and the tuning-parameter names themselves.
ranger_tune_grid <- expand.grid(
  mtry = c(2, 16, 31, 48),
  splitrule = c("gini", "extratrees"),
  min.node.size = c(5, 10)
)
# Tuning grid for glmnet: alpha in {0, 1} crossed with 10 evenly spaced
# lambda values between 0.0001 and 1 (20 candidates).
# `length.out` is spelled out instead of relying on partial matching.
glmnet_tune_grid <- expand.grid(
  alpha = 0:1,
  lambda = seq(0.0001, 1, length.out = 10)
)
## 3.4. Fit the predictive models
### glmnet
# Regularized logistic regression tuned over glmnet_tune_grid; the candidate
# with the best resampled sensitivity ("Sens") is selected.
gc_glmnet_model <- train(
  class ~ .,
  data = train,
  method = "glmnet",
  trControl = cv_ctrl,
  tuneGrid = glmnet_tune_grid,
  preProcess = c("zv", "center", "scale", "spatialSign"),
  metric = "Sens"
)
Aggregating results
Selecting tuning parameters
Fitting alpha = 1, lambda = 1e-04 on full training set
### ranger
# Random forest (ranger) tuned over ranger_tune_grid with the same resampling
# control, preprocessing steps and selection metric as the glmnet model.
gc_ranger_model <- train(
  class ~ .,
  data = train,
  method = "ranger",
  trControl = cv_ctrl,
  tuneGrid = ranger_tune_grid,
  # tuneLength = 7,
  preProcess = c("zv", "center", "scale", "spatialSign"),
  metric = "Sens"
)
Aggregating results
Selecting tuning parameters
Fitting mtry = 48, splitrule = extratrees, min.node.size = 5 on full training set
# 4. Model comparison -----
# Collect the fitted models under the labels used in the comparison output,
# then gather and summarize their resampling results.
model_list <- setNames(
  list(gc_glmnet_model, gc_ranger_model),
  c("glmnet", "rf")
)
resamps <- resamples(x = model_list)
summary(object = resamps)
Call:
summary.resamples(object = resamps)
Models: glmnet, rf
Number of resamples: 50
ROC
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glmnet 0.6375121 0.7482993 0.7750243 0.7739359 0.8058795 0.8960155 0
rf 0.5850340 0.7103984 0.7570457 0.7508649 0.7947036 0.8824101 0
Sens
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glmnet 0.3333333 0.4285714 0.4761905 0.4933333 0.5595238 0.6666667 0
rf 0.1904762 0.3452381 0.4285714 0.4142857 0.4761905 0.6190476 0
Spec
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glmnet 0.7346939 0.8367347 0.8571429 0.8612245 0.8979592 0.9387755 0
rf 0.7755102 0.8571429 0.8775510 0.8865306 0.9183673 0.9795918 0
# Plot the resampled sensitivity of each model.
dotplot(x = resamps, metric = "Sens")
# 5. Model performance evaluation -----
# Predicted class labels on the held-out test set, from the glmnet model.
gc_pred_class <- predict(
  gc_glmnet_model,
  newdata = test,
  type = "raw"
)
## Confusion matrix -----
# Compare the predictions against the observed classes of the test set.
confusionMatrix(
  data = gc_pred_class,
  reference = test$class
)
Confusion Matrix and Statistics
Reference
Prediction Bad Good
Bad 38 25
Good 52 185
Accuracy : 0.7433
95% CI : (0.69, 0.7918)
No Information Rate : 0.7
P-Value [Acc > NIR] : 0.056076
Kappa : 0.3316
Mcnemar's Test P-Value : 0.003047
Sensitivity : 0.4222
Specificity : 0.8810
Pos Pred Value : 0.6032
Neg Pred Value : 0.7806
Prevalence : 0.3000
Detection Rate : 0.1267
Detection Prevalence : 0.2100
Balanced Accuracy : 0.6516
'Positive' Class : Bad
# Total elapsed time since start_time was recorded.
mc_total_time <- Sys.time() - start_time
# Release the worker processes created by makeCluster() so they are not left
# running, and switch foreach back to the sequential backend so later
# %dopar% calls do not try to use the stopped cluster.
stopCluster(cl)
foreach::registerDoSEQ()
mc_total_time
Time difference of 4.867497 mins