Women’s e-commerce clothing reviews 데이터를 바탕으로 텍스트 데이터를 예측모형에 Feature로 넣어 예측력을 향상시키는 방안을 살펴보자.
캐글 Women’s e-commerce clothing reviews 데이터는 총 11개 변수와 23,486개 관측점으로 구성되어 있다. Recommended IND를 라벨(목표변수)로 두고 예측모형을 구축해보자.
# Tidy the raw review table in three explicit steps:
#   1. normalise the column names,
#   2. keep only rows with no missing value in any column,
#   3. expose the recommendation flag under the short name `y`.
cloth_dat <- clean_names(cloth_dat)
cloth_dat <- filter(cloth_dat, complete.cases(cloth_dat))
cloth_dat <- rename(cloth_dat, y = recommended_ind)
# Build the modelling frame: a two-level outcome plus age and three
# categorical descriptors of the garment.
cloth_df <- cloth_dat %>%
  # Narrow to the columns this step touches before converting types.
  select(
    y, age, title, review_text,
    division_name, department_name, class_name
  ) %>%
  mutate_if(is.character, as.factor) %>%
  mutate(
    # "yes" (coded 1) is listed first so it becomes the first factor level.
    y = factor(y, levels = c(1, 0), labels = c("yes", "no")),
    # Keep the 9 most frequent classes; the rest are lumped together.
    class = fct_lump(class_name, 9)
  ) %>%
  # The free-text columns (title, review_text) are dropped here.
  select(
    y, age,
    division = division_name,
    department = department_name,
    class
  )
# 2. 예측모형 -----
## 2.1. 훈련/시험 데이터 분할 ------
library(caret)
# Split the modelling frame 50/50 into training and test sets.
# createDataPartition() samples at random, so the seed is fixed to make the
# split (and every metric reported downstream) reproducible between runs.
set.seed(777)
xy_index <- createDataPartition(cloth_df$y, times = 1, p = 0.5, list = FALSE)
train_df <- cloth_df[xy_index, ]
test_df <- cloth_df[-xy_index, ]
## 2.2. 모형 개발/검증 데이터셋 준비 ------
# Resampling plan shared by every model: one repeat of 5-fold
# cross-validation whose fold indices are built once from the training
# labels, so all models are evaluated on identical resamples.
cv_folds <- createMultiFolds(y = train_df$y, k = 5, times = 1)

cv_cntrl <- trainControl(
  index = cv_folds,
  method = "repeatedcv",
  number = 5,
  repeats = 1,
  # Down-sample within each resample to counter the class imbalance.
  sampling = "down",
  # Class probabilities are required by twoClassSummary.
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)
## 2.3. 예측모형 적합 ------
library(doSNOW)
# Fit the candidate models on a 4-worker socket cluster and time the run.
start.time <- Sys.time()

cl <- makeCluster(4, type = "SOCK")
registerDoSNOW(cl)

# tryCatch(..., finally = ) guarantees the workers are shut down even when
# one of the train() calls fails; the assignments inside the braces are still
# evaluated in the calling environment, so cloth_glm / cloth_rf are created
# exactly where they were before.
tryCatch(
  {
    # Logistic regression, selected on sensitivity.
    # NOTE(review): caret's "glm" method has no tuning parameters, so
    # tuneLength presumably has no effect here; kept for parity with the
    # ranger call.
    cloth_glm <- train(
      y ~ .,
      data = train_df,
      method = "glm",
      family = "binomial",
      metric = "Sens",
      trControl = cv_cntrl,
      tuneLength = 7
    )

    # Random forest via ranger, selected on sensitivity.
    # `num.trees` is spelled out in full; the previous `num.tree` only worked
    # through partial argument matching.
    cloth_rf <- train(
      y ~ .,
      data = train_df,
      method = "ranger",
      metric = "Sens",
      num.trees = 100,
      importance = "permutation",
      trControl = cv_cntrl,
      tuneLength = 7
    )
  },
  finally = stopCluster(cl)
)

total.time <- Sys.time() - start.time
total.time
Time difference of 31.08911 secs
# 3. 예측모형 성능 -----
## GLM
# Predicted class probabilities on the held-out set. A row is labelled "yes"
# when its "yes" probability exceeds its "no" probability; `prob` keeps the
# "yes" probability.
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
glm_pred_df <- predict(cloth_glm, newdata = test_df, type = "prob") %>%
  as_tibble() %>%
  mutate(
    class = factor(ifelse(yes > no, "yes", "no"), levels = c("yes", "no")),
    prob = yes
  )

# Predicted labels first, reference labels second.
confusionMatrix(glm_pred_df$class, test_df$y)
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 3881 756
no 4162 1031
Accuracy : 0.4997
95% CI : (0.4898, 0.5096)
No Information Rate : 0.8182
P-Value [Acc > NIR] : 1
Kappa : 0.0342
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.4825
Specificity : 0.5769
Pos Pred Value : 0.8370
Neg Pred Value : 0.1985
Prevalence : 0.8182
Detection Rate : 0.3948
Detection Prevalence : 0.4717
Balanced Accuracy : 0.5297
'Positive' Class : yes
## Random forest
# Predicted class labels on the held-out set, stored in a one-column tibble.
# The tibble is built directly with the column name `class`, instead of
# passing a bare vector through the deprecated tbl_df() and renaming the
# auto-generated `value` column.
rf_pred_df <- tibble(class = predict(cloth_rf, newdata = test_df))

# Predicted labels first, reference labels second.
confusionMatrix(rf_pred_df$class, test_df$y)
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 3917 794
no 4126 993
Accuracy : 0.4995
95% CI : (0.4896, 0.5094)
No Information Rate : 0.8182
P-Value [Acc > NIR] : 1
Kappa : 0.0247
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.4870
Specificity : 0.5557
Pos Pred Value : 0.8315
Neg Pred Value : 0.1940
Prevalence : 0.8182
Detection Rate : 0.3985
Detection Prevalence : 0.4792
Balanced Accuracy : 0.5213
'Positive' Class : yes