1 Movie Review Data

We obtain English-language movie review polarity (positive/negative) data and convert it into a data frame. What is distinctive is that each positive or negative review is stored as a separate plain-text file, so the files first need to be reshaped into a form from which a predictive model can be built.

library(tidyverse)
library(tidytext)
library(glue)

## Negative review files
neg_files <- list.files("data/review_polarity/txt_sentoken/neg/")
neg_path_files <- glue("data/review_polarity/txt_sentoken/neg/{neg_files}")

neg_file <- map(neg_path_files, read_file)
neg_dat <- tibble(filename = neg_files, review = neg_file)

neg_df <- neg_dat %>% 
  mutate(polarity = "neg") %>% 
  mutate(review = map_chr(review, unlist)) %>% 
  select(polarity, review)

## Positive review files
pos_files <- list.files("data/review_polarity/txt_sentoken/pos/")
pos_path_files <- glue("data/review_polarity/txt_sentoken/pos/{pos_files}")

pos_file <- map(pos_path_files, read_file)
pos_dat <- tibble(filename = pos_files, review = pos_file)

pos_df <- pos_dat %>% 
  mutate(polarity = "pos") %>% 
  mutate(review = map_chr(review, unlist)) %>% 
  select(polarity, review)

## Combine into a single data frame
review_df <- bind_rows(neg_df, pos_df) %>% 
  mutate(polarity = factor(polarity))   # classification engines expect a factor outcome

review_df %>% 
  group_by(polarity) %>% 
  sample_n(1) %>% 
  DT::datatable()
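
As a quick sanity check (a sketch not in the original run), the combined data frame can be tallied by class; both polarities should contribute the same number of reviews, consistent with the 1,500-row training set reported further below.

## Sanity check: number of reviews per class (not part of the original run)
review_df %>% 
  count(polarity)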

2 Polarity Prediction Model

2.1 Training/Test Data

library(tidymodels)
review_split <- initial_split(review_df)

review_train <- training(review_split)
review_test <- testing(review_split)
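
Note that initial_split() defaults to a 3:1 split with no stratification; passing strata = polarity is an option if you want the class balance preserved in both partitions. A quick check of the resulting split (a sketch, not from the original run):

review_split                       # prints the train/test/total row counts
review_train %>% count(polarity)   # class balance in the training set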

2.2 Feature Engineering

library(textrecipes)

review_rec <- recipe(polarity ~ review, review_train) %>% 
  step_tokenize(review) %>% 
  step_stopwords(review) %>% 
  step_tokenfilter(review, max_tokens = 300) %>% 
  step_tfidf(review) %>% 
  step_normalize(all_predictors())

review_prep <- prep(review_rec)

review_prep
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Training data contained 1500 data points and no missing data.

Operations:

Tokenization for review [trained]
Stop word removal for review [trained]
Text filtering for review [trained]
Term frequency-inverse document frequency with review [trained]
Centering and scaling for tfidf_review_10, ... [trained]
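
To see what the prepped recipe actually produces, the processed training data can be pulled back out with bake() (new_data = NULL returns the training set retained by prep(); juice(review_prep) is the older equivalent). This is a sketch not shown in the original post, and the exact column count depends on the tokens kept by step_tokenfilter():

review_prep %>% 
  bake(new_data = NULL) %>%   # processed training data retained by prep()
  dim()                       # rows of training data x (outcome + up to 300 tfidf_* columns)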

2.3 Model Fitting

library(workflows)

lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_recipe(review_rec) %>%
  add_model(lasso_spec)

lasso_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps

● step_tokenize()
● step_stopwords()
● step_tokenfilter()
● step_tfidf()
● step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = tune()
  mixture = 1

Computational engine: glmnet 

2.4 Hyperparameter Tuning

## Enable multicore parallel processing
# all_cores <- parallel::detectCores(logical = TRUE)
# 
# library(doParallel)
# cl <- makePSOCKcluster(all_cores)
# registerDoParallel(cl)

doParallel::registerDoParallel()

## Resampling setup (stratified bootstrap resamples)
review_folds <- bootstraps(review_train, strata = polarity)
review_folds
# Bootstrap sampling using stratification 
# A tibble: 25 x 2
   splits             id         
   <list>             <chr>      
 1 <split [1.5K/546]> Bootstrap01
 2 <split [1.5K/565]> Bootstrap02
 3 <split [1.5K/554]> Bootstrap03
 4 <split [1.5K/561]> Bootstrap04
 5 <split [1.5K/556]> Bootstrap05
 6 <split [1.5K/540]> Bootstrap06
 7 <split [1.5K/573]> Bootstrap07
 8 <split [1.5K/555]> Bootstrap08
 9 <split [1.5K/549]> Bootstrap09
10 <split [1.5K/548]> Bootstrap10
# … with 15 more rows
## Hyperparameter grid search
library(tune)

lambda_grid <- grid_regular(penalty(), levels = 40)

lasso_grid <- tune_grid(
  lasso_wf,
  resamples = review_folds,
  grid = lambda_grid,
  metrics = metric_set(roc_auc, ppv, npv)
)

lasso_grid %>%
  collect_metrics()
# A tibble: 120 x 6
    penalty .metric .estimator  mean     n std_err
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
 1 1.00e-10 npv     binary     0.690    25 0.00644
 2 1.00e-10 ppv     binary     0.687    25 0.00554
 3 1.00e-10 roc_auc binary     0.739    25 0.00655
 4 1.80e-10 npv     binary     0.690    25 0.00644
 5 1.80e-10 ppv     binary     0.687    25 0.00554
 6 1.80e-10 roc_auc binary     0.739    25 0.00655
 7 3.26e-10 npv     binary     0.690    25 0.00644
 8 3.26e-10 ppv     binary     0.687    25 0.00554
 9 3.26e-10 roc_auc binary     0.739    25 0.00655
10 5.88e-10 npv     binary     0.690    25 0.00644
# … with 110 more rows
lasso_grid %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5, show.legend = FALSE) +
  facet_wrap(~.metric) +
  scale_x_log10()

2.5 Final Model and Performance

# Select the best penalty by ROC AUC
best_auc <- lasso_grid %>%
  select_best("roc_auc")
best_auc
# A tibble: 1 x 1
  penalty
    <dbl>
1  0.0160
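
An alternative worth noting: for a lasso, a sparser model can be chosen with the one-standard-error rule, which keeps the largest penalty whose ROC AUC is within one standard error of the best. This is a sketch and is not used in the original fit:

best_1se <- lasso_grid %>% 
  select_by_one_std_err(desc(penalty), metric = "roc_auc")  # simplest model within 1 SE of the best
# finalize_workflow(lasso_wf, best_1se) could then be used in place of best_auc
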
final_lasso <- finalize_workflow(lasso_wf, best_auc)
final_lasso
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps

● step_tokenize()
● step_stopwords()
● step_tokenfilter()
● step_tfidf()
● step_normalize()

── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = 0.0160371874375133
  mixture = 1

Computational engine: glmnet 
review_final <- last_fit(final_lasso, review_split)

review_final %>%
  collect_metrics()
# A tibble: 2 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.746
2 roc_auc  binary         0.831
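
Beyond accuracy and ROC AUC, the held-out predictions saved by last_fit() can be turned into a confusion matrix (a sketch; the resulting counts are not reproduced here):

review_final %>% 
  collect_predictions() %>%                              # test-set predictions from last_fit()
  conf_mat(truth = polarity, estimate = .pred_class)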