Let's build a classifier that sorts news articles into 20 newsgroups.
## Environment Setup
# -*- coding: utf-8 -*-
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
## Importing Libraries
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import the author's custom text normalization module
import sys
sys.path.insert(0, './code')
import text_normalizer as tn
The English-language newsgroup dataset is loaded from sklearn.datasets using the fetch_20newsgroups() function. The data_labels_map dictionary is then built by converting a list into a dictionary, following these steps:
data.target_names (list) → enumerate(data.target_names) → dict(enumerate(data.target_names)) (dictionary)
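As a minimal sketch of that conversion (using a hypothetical three-name list rather than the full set of 20 target names):
# toy example of turning a list of names into a {number: name} dictionary
names = ['alt.atheism', 'comp.graphics', 'sci.space']
label_map = dict(enumerate(names))
print(label_map)  # {0: 'alt.atheism', 1: 'comp.graphics', 2: 'sci.space'}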
data = fetch_20newsgroups(subset='all', shuffle=True,
remove=('headers', 'footers', 'quotes'))
data_labels_map = dict(enumerate(data.target_names))
A list comprehension maps each numeric label to its name in target_names; these, together with the raw article text and the numeric target_labels, are collected into a Python dictionary and stored as a pandas DataFrame.
corpus, target_labels, target_names = (data.data, data.target, [data_labels_map[label] for label in data.target])
data_df = pd.DataFrame({'Article': corpus, 'Target Label': target_labels, 'Target Name': target_names})
print(data_df.shape)
data_df.head(10)
total_nulls = data_df[data_df.Article.str.strip() == ''].shape[0]
print(f"깡통 문서: {total_nulls}")
data_df = data_df[~(data_df.Article.str.strip() == '')]
data_df.shape
The nltk stopword list is loaded, and the negation words no and not are removed from that list so they stay in the text and remain available for bigram-based negation handling in sentiment analysis. The corpus is then cleaned with the custom normalize_corpus() function. Because normalization takes quite a while, the %%time cell magic (magic command) is used to measure how long the cleaning takes.
%%time
import nltk
stopword_list = nltk.corpus.stopwords.words('english')
# keep the negation words (no, not) so bi-gram negations are preserved
stopword_list.remove('no')
stopword_list.remove('not')
# normalize our corpus
norm_corpus = tn.normalize_corpus(corpus=data_df['Article'], html_stripping=True, contraction_expansion=True,
accented_char_removal=True, text_lower_case=True, text_lemmatization=True,
text_stemming=False, special_char_removal=True, remove_digits=True,
stopword_removal=True, stopwords=stopword_list)
data_df['Clean Article'] = norm_corpus
data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
data_df.head(10)
Rows matching the regular expression r'^(\s?)+$' (empty or whitespace-only strings) are replaced with NA, and any row containing NA is then dropped.
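As a quick check of what that pattern actually matches (a small sketch, not part of the original pipeline):
import re
pattern = r'^(\s?)+$'
print(bool(re.match(pattern, '')))        # True: empty string
print(bool(re.match(pattern, '   ')))     # True: whitespace only
print(bool(re.match(pattern, ' text ')))  # False: real content is kept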
data_df = data_df.replace(r'^(\s?)+$', np.nan, regex=True)
data_df.info()
data_df = data_df.dropna().reset_index(drop=True)
data_df.info()
Because cleaning takes a long time, the cleaned data is saved to a .csv file so it can be reloaded later.
data_df.to_csv('data/clean_newsgroups.csv', index=False, encoding='utf-8')
The data is split into training and test sets with train_test_split(). If an error of the form UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa9 in position 24574: invalid start byte occurs, specify encoding='utf-8' when saving with .to_csv() above and pass the same encoding when reading the file back with .read_csv().
data_df = pd.read_csv('data/clean_newsgroups.csv', encoding="utf-8")
from sklearn.model_selection import train_test_split
train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
train_test_split(np.array(data_df['Clean Article']), np.array(data_df['Target Label']),
np.array(data_df['Target Name']), test_size=0.33, random_state=42)
train_corpus.shape, test_corpus.shape
The train_label_names ndarray is passed to a Counter to compute class frequencies, which are stored as a dictionary; the same is done for test_label_names, and the two counts are combined into a DataFrame and sorted.
from collections import Counter
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))
(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
ascending=False))
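As an optional sanity check (a sketch that is not part of the original analysis), the share of each class that ended up in the training set can be added to confirm that the random split keeps class proportions roughly constant at about 67%:
# hypothetical extra column: fraction of each class assigned to the training set
dist_df = pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
                       columns=['Target Label', 'Train Count', 'Test Count'])
dist_df['Train Share'] = dist_df['Train Count'] / (dist_df['Train Count'] + dist_df['Test Count'])
dist_df.sort_values(by='Train Share').head()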
The news text is converted into a bag-of-words (BoW) feature matrix X for article classification; the same transformation is applied to both the training and test sets. Alternatively, the text can be converted to TF-IDF features and fed into the predictive models.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
# build BOW features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)
# transform test articles into features
cv_test_features = cv.transform(test_corpus)
print(f'BOW model:> \n Train features shape: \t {cv_train_features.shape},\n Test features shape: \t {cv_test_features.shape}')
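To peek at the vocabulary the vectorizer learned (a small sketch; get_feature_names_out() requires scikit-learn 1.0 or later, older versions expose get_feature_names() instead):
# number of unique terms and a few sample feature names from the BOW vectorizer
print(len(cv.vocabulary_))
print(cv.get_feature_names_out()[:10])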
from sklearn.feature_extraction.text import TfidfVectorizer
# build TFIDF features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)
# transform test articles into features
tv_test_features = tv.transform(test_corpus)
print(f'TFIDF model:> \n Train features shape: \t {tv_train_features.shape},\n Test features shape: \t {tv_test_features.shape}')
%%time
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)
%%time
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)
%%time
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)
%%time
from sklearn.linear_model import SGDClassifier
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)
%%time
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)
%%time
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score],
['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score],
['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score],
['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score],
['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score],
['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score]],
columns=['Model', 'CV Score (TF)', 'Test Score (TF)'],
).T
%%time
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)
%%time
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)
%%time
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)
%%time
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)
%%time
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)
%%time
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score,
mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score,
lr_tfidf_cv_mean_score, lr_tfidf_test_score],
['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score,
svm_tfidf_cv_mean_score, svm_tfidf_test_score],
['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score,
svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score,
rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score,
gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
columns=['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
).T
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
('mnb', MultinomialNB())
])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]
}
gs_mnb = GridSearchCV(mnb_pipeline, param_grid, cv=5, verbose=2)
gs_mnb = gs_mnb.fit(train_corpus, train_label_names)
gs_mnb.best_estimator_.get_params()
cv_results = gs_mnb.cv_results_
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
'params': cv_results['params'],
'cv score (mean)': cv_results['mean_test_score'],
'cv score (std)': cv_results['std_test_score']}
)
results_df = results_df.sort_values(by=['rank'], ascending=True)
pd.set_option('display.max_colwidth', 100)
results_df
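The same information is also available directly from the fitted GridSearchCV object through its standard attributes (a minimal sketch):
# best hyper-parameter combination and its mean cross-validation accuracy
print(gs_mnb.best_params_)
print(gs_mnb.best_score_)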
best_mnb_test_score = gs_mnb.score(test_corpus, test_label_names)
print(f'Test Accuracy : {best_mnb_test_score}')
import model_evaluation_utils as meu
mnb_predictions = gs_mnb.predict(test_corpus)
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=mnb_predictions)
meu.display_classification_report(true_labels=test_label_names,
predicted_labels=mnb_predictions, classes=unique_classes)
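If the author's model_evaluation_utils module is not available, roughly equivalent numbers can be produced with scikit-learn's built-in metrics (a minimal sketch, not the author's original helper):
from sklearn.metrics import accuracy_score, classification_report
print('Accuracy:', accuracy_score(test_label_names, mnb_predictions))
print(classification_report(test_label_names, mnb_predictions))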
label_data_map = {v:k for k, v in data_labels_map.items()}
label_map_df = pd.DataFrame(list(label_data_map.items()), columns=['Label Name', 'Label Number'])
label_map_df
unique_class_nums = label_map_df['Label Number'].values
mnb_prediction_class_nums = [label_data_map[item] for item in mnb_predictions]
meu.display_confusion_matrix_pretty(true_labels=test_label_nums,
predicted_labels=mnb_prediction_class_nums, classes=unique_class_nums)
unique_classes = label_map_df['Label Name'].values
meu.display_confusion_matrix_pretty(true_labels=test_label_names,
predicted_labels=mnb_predictions, classes=unique_classes)
label_map_df[label_map_df['Label Number'].isin([0, 15, 19])]
train_idx, test_idx = train_test_split(np.array(range(len(data_df['Article']))), test_size=0.33, random_state=42)
test_idx
predict_probas = gs_mnb.predict_proba(test_corpus).max(axis=1)
test_df = data_df.iloc[test_idx]
test_df['Predicted Name'] = mnb_predictions
test_df['Predicted Confidence'] = predict_probas
test_df.head()
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'talk.religion.misc') & (test_df['Predicted Name'] == 'soc.religion.christian')]
.sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'talk.religion.misc') & (test_df['Predicted Name'] == 'alt.atheism')]
.sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df
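Because gs_mnb is a full Pipeline (TfidfVectorizer plus MultinomialNB), it can also score previously unseen text directly; the sentence below is purely hypothetical, and in the workflow above it would first be passed through tn.normalize_corpus():
# hypothetical new article; the pipeline handles vectorization internally
new_doc = ['NASA announced a new mission to study the rings of Saturn']
print(gs_mnb.predict(new_doc))
print(gs_mnb.predict_proba(new_doc).max())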