전통적인 통계기반의 예측모형을 사용하는 것이 아니라 최근에 개발된 딥러닝을 적용시켜 뉴스그룹(뉴스기사) 분류작업을 수행해보자.
전통적인 통계기반 예측모형을 대신할 수 있는 딥러닝 계열 모형은 여러 가지가 있지만, 그중에서도 워드2벡(word2vec)을 가장 먼저 꼽는다.
# -*- coding: utf-8 -*-
## Environment setup
import warnings
# Suppress every warning for the rest of the script.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import nltk
# Import the author's custom helper functions
import sys
# Put ./code at the front of the module search path
# (presumably where text_normalizer lives -- confirm against the repo layout).
sys.path.insert(0, './code')
import text_normalizer as tn
# Load the newsgroup data from data/clean_newsgroups.csv.
data_df = pd.read_csv('data/clean_newsgroups.csv')
from sklearn.model_selection import train_test_split
# Split articles, numeric labels and label names together (33% held out for
# testing, fixed seed) so the three arrays stay row-aligned.
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(
    np.array(data_df['Clean Article']),
    np.array(data_df['Target Label']),
    np.array(data_df['Target Name']),
    test_size=0.33,
    random_state=42,
)
train_corpus.shape, test_corpus.shape
# Tokenize every article with the tokenizer provided by text_normalizer.
tokenized_train = list(map(tn.tokenizer.tokenize, train_corpus))
tokenized_test = list(map(tn.tokenizer.tokenize, test_corpus))
워드2벡(word2vec)이 포함된 라이브러리로 gensim을 선택했다. 다음 명령어를 사용해서 gensim 라이브러리를 설치한다.
$ pip install -U gensim
import gensim
# build word2vec model
w2v_num_features = 100
# gensim 4 renamed the `size` / `iter` keyword arguments to `vector_size` /
# `epochs`; the old names raise TypeError there. Pick the spelling that matches
# the installed release so the script runs on both gensim 3.x and 4.x.
if int(gensim.__version__.split('.')[0]) >= 4:
    _w2v_compat_kwargs = {'vector_size': w2v_num_features, 'epochs': 5}
else:
    _w2v_compat_kwargs = {'size': w2v_num_features, 'iter': 5}
# sg=1 selects the skip-gram architecture; tokens seen fewer than 2 times are dropped.
w2v_model = gensim.models.Word2Vec(tokenized_train, window=100,
                                   min_count=2, sample=1e-3, sg=1, workers=10,
                                   **_w2v_compat_kwargs)
def document_vectorizer(corpus, model, num_features):
    """Represent each tokenized document as the mean of its word vectors.

    Args:
        corpus: Iterable of documents, each an iterable of word tokens.
        model: Embedding model exposing ``model.wv``; ``model.wv[word]`` must
            return the vector of a vocabulary word, and the vocabulary is read
            from ``model.wv.index_to_key`` (gensim >= 4) or
            ``model.wv.index2word`` (gensim 3.x).
        num_features: Length of each word vector.

    Returns:
        A float64 array with one row per document. Words missing from the
        vocabulary are ignored; a document with no known word yields an
        all-zero row. An empty corpus yields an array of shape
        ``(0, num_features)``.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed ``index2word`` to ``index_to_key``; support both so the
    # function works regardless of the installed release.
    vocab_words = getattr(keyed_vectors, 'index_to_key', None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    # A set gives O(1) membership tests inside the per-word loop.
    vocabulary = set(vocab_words)

    def average_word_vectors(words, model, vocabulary, num_features):
        """Return the mean vector of the in-vocabulary words (zeros if none)."""
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0
        for word in words:
            if word in vocabulary:
                nwords += 1
                feature_vector += model.wv[word]
        # Skip the division when nothing matched to avoid dividing by zero.
        if nwords:
            feature_vector /= nwords
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    if not features:
        # Keep the result 2-D even when there are no documents.
        return np.empty((0, num_features), dtype="float64")
    return np.array(features)
# Build one averaged Word2Vec vector per document for the train and test splits.
avg_wv_train_features = document_vectorizer(tokenized_train, w2v_model, w2v_num_features)
avg_wv_test_features = document_vectorizer(tokenized_test, w2v_model, w2v_num_features)
# Report the resulting feature-matrix shapes.
print(f'Word2Vec model:> Train features shape: \t {avg_wv_train_features.shape}\n \
Test features shape: \t {avg_wv_test_features.shape}')
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
# SGD-trained linear classifier with hinge loss and L2 penalty on the
# averaged Word2Vec features.
svm = SGDClassifier(max_iter=500, random_state=42, penalty='l2', loss='hinge')
# 5-fold cross-validation on the training split; cross_val_score fits clones
# of the estimator, so it is independent of the final fit below.
svm_w2v_cv_scores = cross_val_score(estimator=svm, X=avg_wv_train_features,
                                    y=train_label_names, cv=5)
svm_w2v_cv_mean_score = svm_w2v_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_w2v_cv_scores)
print('Mean CV Accuracy:', svm_w2v_cv_mean_score)
# Fit on the full training split, then score on the held-out test split.
svm.fit(avg_wv_train_features, train_label_names)
svm_w2v_test_score = svm.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', svm_w2v_test_score)
# feature engineering with GloVe model
# Each article is passed through tn.nlp and the resulting object's `.vector`
# becomes its feature row (presumably a spaCy pipeline with GloVe vectors --
# confirm in text_normalizer).
train_nlp = list(map(tn.nlp, train_corpus))
train_glove_features = np.array([doc.vector for doc in train_nlp])
test_nlp = list(map(tn.nlp, test_corpus))
test_glove_features = np.array([doc.vector for doc in test_nlp])
print('GloVe model:> Train features shape:', train_glove_features.shape,
      ' Test features shape:', test_glove_features.shape)
# Same SGD hinge-loss / L2 classifier, this time on the GloVe document vectors.
svm = SGDClassifier(max_iter=500, random_state=42, penalty='l2', loss='hinge')
# 5-fold cross-validation on the training split; cross_val_score fits clones
# of the estimator, so it is independent of the final fit below.
svm_glove_cv_scores = cross_val_score(estimator=svm, X=train_glove_features,
                                      y=train_label_names, cv=5)
svm_glove_cv_mean_score = svm_glove_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_glove_cv_scores)
print('Mean CV Accuracy:', svm_glove_cv_mean_score)
# Fit on the full training split, then score on the held-out test split.
svm.fit(train_glove_features, train_label_names)
svm_glove_test_score = svm.score(test_glove_features, test_label_names)
print('Test Accuracy:', svm_glove_test_score)
from gensim.models.fasttext import FastText
ft_num_features = 1000
# gensim 4 renamed the `size` / `iter` keyword arguments to `vector_size` /
# `epochs`; the old names raise TypeError there. Pick the spelling that matches
# the installed release so the script runs on both gensim 3.x and 4.x.
if int(gensim.__version__.split('.')[0]) >= 4:
    _ft_compat_kwargs = {'vector_size': ft_num_features, 'epochs': 5}
else:
    _ft_compat_kwargs = {'size': ft_num_features, 'iter': 5}
# sg decides whether to use the skip-gram model (1) or CBOW (0)
ft_model = FastText(tokenized_train, window=100,
                    min_count=2, sample=1e-3, sg=1, workers=10,
                    **_ft_compat_kwargs)
# Build one averaged FastText vector per document for the train and test splits.
avg_ft_train_features = document_vectorizer(tokenized_train, ft_model, ft_num_features)
avg_ft_test_features = document_vectorizer(tokenized_test, ft_model, ft_num_features)
# Report the resulting feature-matrix shapes.
print('FastText model:> Train features shape:', avg_ft_train_features.shape,
      ' Test features shape:', avg_ft_test_features.shape)
# Same SGD hinge-loss / L2 classifier, this time on the averaged FastText features.
svm = SGDClassifier(max_iter=500, random_state=42, penalty='l2', loss='hinge')
# 5-fold cross-validation on the training split; cross_val_score fits clones
# of the estimator, so it is independent of the final fit below.
svm_ft_cv_scores = cross_val_score(estimator=svm, X=avg_ft_train_features,
                                   y=train_label_names, cv=5)
svm_ft_cv_mean_score = svm_ft_cv_scores.mean()
print('CV Accuracy (5-fold):', svm_ft_cv_scores)
print('Mean CV Accuracy:', svm_ft_cv_mean_score)
# Fit on the full training split, then score on the held-out test split.
svm.fit(avg_ft_train_features, train_label_names)
svm_ft_test_score = svm.score(avg_ft_test_features, test_label_names)
print('Test Accuracy:', svm_ft_test_score)
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with two hidden layers of 512 ReLU units, trained with
# Adam and early stopping on the averaged FastText features.
# NOTE(review): scikit-learn documents `learning_rate` as only used when
# solver='sgd', so 'adaptive' likely has no effect with Adam -- kept as-is.
mlp = MLPClassifier(solver='adam', alpha=1e-5, learning_rate='adaptive', early_stopping=True,
                    activation='relu', hidden_layer_sizes=(512, 512), random_state=42)
mlp.fit(avg_ft_train_features, train_label_names)
# Store the MLP result under its own name so the SVM FastText score held in
# `svm_ft_test_score` is not overwritten.
mlp_ft_test_score = mlp.score(avg_ft_test_features, test_label_names)
print('Test Accuracy:', mlp_ft_test_score)