영화 평론의 찬반/긍부정(polarity)이 표시된 영문 데이터를 얻어 이를 데이터프레임으로 변환시킨다. 특징적인 것은 긍정·부정 영화 평론이 각각 텍스트 파일로 담겨 있다는 점이며, 이를 예측 모형을 구축할 수 있는 형태로 변환시킨다.
## Negative reviews: read every file under neg/ into a labelled data frame
neg_files <- list.files("data/review_polarity/txt_sentoken/neg/")
neg_path_files <- glue("data/review_polarity/txt_sentoken/neg/{neg_files}")

# One list element per file, each produced by read_file() on that path.
neg_file <- neg_path_files %>%
  map(read_file)

neg_dat <- tibble(filename = neg_files, review = neg_file)

# Flatten the list-column to character and keep only the label and the text.
neg_df <- neg_dat %>%
  mutate(
    polarity = "neg",
    review = map_chr(review, unlist)
  ) %>%
  select(polarity, review)
## Positive reviews: read every file under pos/ into a labelled data frame
pos_files <- list.files("data/review_polarity/txt_sentoken/pos/")
pos_path_files <- glue("data/review_polarity/txt_sentoken/pos/{pos_files}")

# One list element per file, each produced by read_file() on that path.
pos_file <- pos_path_files %>%
  map(read_file)

pos_dat <- tibble(filename = pos_files, review = pos_file)

# Flatten the list-column to character and keep only the label and the text.
pos_df <- pos_dat %>%
  mutate(
    polarity = "pos",
    review = map_chr(review, unlist)
  ) %>%
  select(polarity, review)
## Combined data frame: negative rows first, then positive rows
review_df <- bind_rows(neg_df, pos_df)

# Peek at one randomly drawn review per polarity class.
# The pipeline previously ended on a dangling `%>%`, which fused it with the
# next assignment; it is terminated here and the grouping is dropped.
review_df %>%
  group_by(polarity) %>%
  sample_n(1) %>%
  ungroup()

## Train/test split
# `review_train` is used by the recipe and the resampling below but was never
# defined. NOTE(review): a stratified split with the default 3/4 proportion is
# assumed, which is consistent with the 1500 training rows reported by the
# prepped recipe -- confirm against the original analysis.
review_split <- initial_split(review_df, strata = polarity)
review_train <- training(review_split)
review_test <- testing(review_split)
# Text preprocessing recipe for predicting polarity from the review text:
# tokenize, drop stop words, keep at most 300 tokens, compute tf-idf, then
# center and scale the resulting predictors.
# The final step_normalize() was missing (the pipeline ended on a dangling
# `%>%`); the printed recipe and workflow both list it as the fifth step.
review_rec <- recipe(polarity ~ review, data = review_train) %>%
  step_tokenize(review) %>%
  step_stopwords(review) %>%
  step_tokenfilter(review, max_tokens = 300) %>%
  step_tfidf(review) %>%
  step_normalize(all_predictors())

# Estimate the recipe on the training data and print the trained steps.
review_prep <- prep(review_rec)
review_prep
Data Recipe
role #variables
outcome 1
predictor 1
Training data contained 1500 data points and no missing data.
Tokenization for review [trained]
Stop word removal for review [trained]
Text filtering for review [trained]
Term frequency-inverse document frequency with review [trained]
Centering and scaling for tfidf_review_10, ... [trained]
# Lasso logistic regression (mixture = 1) with the penalty left to be tuned.
# The engine call was missing after a dangling `%>%`; the printed workflow
# reports "Computational engine: glmnet".
lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

# Bundle the preprocessing recipe and the model specification.
# add_model() was missing after a dangling `%>%`; the printed workflow shows
# "Model: logistic_reg()".
lasso_wf <- workflow() %>%
  add_recipe(review_rec) %>%
  add_model(lasso_spec)

lasso_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
● step_tokenize()
● step_stopwords()
● step_tokenfilter()
● step_tfidf()
● step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Main Arguments:
penalty = tune()
mixture = 1
Computational engine: glmnet
## Multi-core parallel processing (left disabled; uncomment to enable)
# all_cores <- parallel::detectCores(logical = TRUE)
# library(doParallel)
# cl <- makePSOCKcluster(all_cores)
# registerDoParallel(cl)

## Resampling setup: bootstrap resamples of the training set, stratified by
## polarity
review_folds <- review_train %>%
  bootstraps(strata = polarity)
# Bootstrap sampling using stratification
# A tibble: 25 x 2
splits id
<list> <chr>
1 <split [1.5K/546]> Bootstrap01
2 <split [1.5K/565]> Bootstrap02
3 <split [1.5K/554]> Bootstrap03
4 <split [1.5K/561]> Bootstrap04
5 <split [1.5K/556]> Bootstrap05
6 <split [1.5K/540]> Bootstrap06
7 <split [1.5K/573]> Bootstrap07
8 <split [1.5K/555]> Bootstrap08
9 <split [1.5K/549]> Bootstrap09
10 <split [1.5K/548]> Bootstrap10
# … with 15 more rows
## Hyperparameter search
# Regular grid of 40 candidate penalty values.
lambda_grid <- grid_regular(penalty(), levels = 40)

# Tune the lasso workflow over the resamples. The call previously lacked the
# workflow argument and its closing parenthesis.
lasso_grid <- tune_grid(
  lasso_wf,
  resamples = review_folds,
  grid = lambda_grid,
  metrics = metric_set(roc_auc, ppv, npv)
)

# Summarise each metric per penalty across resamples (40 penalties x 3
# metrics = 120 rows, matching the printed output). The pipeline previously
# ended on a dangling `%>%`.
lasso_grid %>%
  collect_metrics()
# A tibble: 120 x 6
penalty .metric .estimator mean n std_err
<dbl> <chr> <chr> <dbl> <int> <dbl>
1 1.00e-10 npv binary 0.690 25 0.00644
2 1.00e-10 ppv binary 0.687 25 0.00554
3 1.00e-10 roc_auc binary 0.739 25 0.00655
4 1.80e-10 npv binary 0.690 25 0.00644
5 1.80e-10 ppv binary 0.687 25 0.00554
6 1.80e-10 roc_auc binary 0.739 25 0.00655
7 3.26e-10 npv binary 0.690 25 0.00644
8 3.26e-10 ppv binary 0.687 25 0.00554
9 3.26e-10 roc_auc binary 0.739 25 0.00655
10 5.88e-10 npv binary 0.690 25 0.00644
# … with 110 more rows
# A tibble: 1 x 1
1 0.0160
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
● step_tokenize()
● step_stopwords()
● step_tokenfilter()
● step_tfidf()
● step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)
Main Arguments:
penalty = 0.0160371874375133
mixture = 1
Computational engine: glmnet
# A tibble: 2 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.746
2 roc_auc binary 0.831