We load the tic-tac-toe dataset from GitHub, prepare it with one-hot encoding as the feature-engineering step, and then predict the game outcome with a RandomForestClassifier model.
# Load the modules used for the predictive model
import pandas as pd                                           # dataset handling
from sklearn.preprocessing import LabelEncoder                # feature engineering
from sklearn.model_selection import train_test_split          # train/test split
from sklearn.feature_selection import SelectKBest, f_classif  # feature selection
from sklearn.ensemble import RandomForestClassifier           # predictive model
from sklearn.metrics import accuracy_score                    # performance measurement
from sklearn.model_selection import GridSearchCV              # hyperparameter search
from sklearn.pipeline import Pipeline                         # pipeline
from sklearn.metrics import f1_score, make_scorer             # evaluation metrics
from sklearn.metrics import confusion_matrix                  # confusion matrix
# Load the raw data
ttt_pd = pd.read_csv("https://raw.githubusercontent.com/datasets/tic-tac-toe/master/data/tic-tac-toe.csv")
# Feature engineering: one-hot encode the categorical board columns
ttt_pd = pd.get_dummies(ttt_pd)
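pd.get_dummies() one-hot encodes every categorical column in a single call. The same transformation can be written with scikit-learn's OneHotEncoder, which is convenient when the encoding has to live inside a Pipeline. A minimal sketch, assuming the raw CSV is reloaded into a separate frame for illustration:
# Equivalent one-hot encoding with scikit-learn's OneHotEncoder (sketch)
from sklearn.preprocessing import OneHotEncoder
raw = pd.read_csv("https://raw.githubusercontent.com/datasets/tic-tac-toe/master/data/tic-tac-toe.csv")
board_cols = [c for c in raw.columns if c != 'class']     # the nine board-cell columns
enc = OneHotEncoder(handle_unknown='ignore')
X_encoded = enc.fit_transform(raw[board_cols]).toarray()  # (n_samples, 27): 9 cells x 3 symbols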
# Separate the predictors from the label, then split into train/test sets
features, labels = ttt_pd.drop(columns='class'), ttt_pd['class']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=7)
# Dictionary for comparing model performance
accuracies = {}
# Random Forest ------------------------------
rf_model = RandomForestClassifier(random_state=77).fit(
    X_train, y_train)
C:\Users\STATKC~1\ANACON~1\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
"10 in version 0.20 to 100 in 0.22.", FutureWarning)
rf_predictions = rf_model.predict(X_test)
pd.Series(rf_predictions).value_counts()
True 191
False 97
dtype: int64
accuracies['rf'] = accuracy_score(y_test, rf_predictions)
accuracies
{'rf': 0.9236111111111112}
The model predicts on X_test, and we measure its accuracy against the held-out labels (y_test).
from sklearn.metrics import accuracy_score, precision_score, recall_score
model_metrics = {}
model_metrics['rf: Accuracy'] = accuracy_score(y_test, rf_predictions)
model_metrics['rf: Precision'] = precision_score(y_test, rf_predictions)
model_metrics['rf: Recall'] = recall_score(y_test, rf_predictions)
pd.DataFrame([model_metrics])
   rf: Precision  rf: Recall  rf: Accuracy
0        0.91623    0.966851      0.923611
Using the confusion_matrix() function, we can compare where the model predicted correctly and where its errors occurred.
from sklearn.metrics import confusion_matrix
ttt_cm = confusion_matrix(y_test, rf_predictions)
print(ttt_cm)
[[ 91 16]
[ 6 175]]
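The raw confusion_matrix() output carries no row or column labels. A small sketch that wraps it in a labeled DataFrame (the class order follows rf_model.classes_, which confusion_matrix() also uses by default) makes the False/True breakdown easier to read:
# Label the confusion matrix: rows are actual classes, columns are predictions
ttt_cm_df = pd.DataFrame(ttt_cm,
                         index=['actual: ' + str(c) for c in rf_model.classes_],
                         columns=['predicted: ' + str(c) for c in rf_model.classes_])
print(ttt_cm_df)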
Now let's relate a hyperparameter to model performance. First, vary the number of trees in the random forest from 10 to 90 in steps of 10, fit each of the nine resulting models, and track how accuracy changes.
test_scores, train_scores = [], []
for i in range(10, 100, 10):
    rf_model = RandomForestClassifier(n_estimators=i, random_state=777)
    rf_model.fit(X_train, y_train)
    train_preds = rf_model.predict(X_train)
    test_preds = rf_model.predict(X_test)
    train_scores.append(round(accuracy_score(y_train, train_preds), 1))
    test_scores.append(round(accuracy_score(y_test, test_preds), 1))
ttt_rf_df = pd.DataFrame({'train': train_scores,
                          'test': test_scores})
ttt_rf_df
train test
0 1.0 0.9
1 1.0 0.9
2 1.0 0.9
3 1.0 1.0
4 1.0 1.0
5 1.0 0.9
6 1.0 0.9
7 1.0 0.9
8 1.0 0.9
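Plotting both columns against the tree counts makes the gap between training and test accuracy easier to see; a minimal matplotlib sketch:
# Plot train vs. test accuracy over the number of trees
import matplotlib.pyplot as plt
n_trees_grid = list(range(10, 100, 10))
plt.plot(n_trees_grid, ttt_rf_df['train'], marker='o', label='train accuracy')
plt.plot(n_trees_grid, ttt_rf_df['test'], marker='o', label='test accuracy')
plt.xlabel('n_estimators')
plt.ylabel('accuracy')
plt.legend()
plt.show()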
Next, build a model by sampling the hyperparameters at random and fitting with the sampled values.
import random
# Number of trees
n_trees = list(range(10, 100, 10))
# Maximum tree depth
max_depth = [4, 8, 12, 16]
# Minimum observations required to split a node
min_obs_split = [3, 5, 7, 9]
# Maximum number of features
max_features = [3, 5, 7, 9]
rf_model = RandomForestClassifier(n_estimators=random.choice(n_trees),
                                  max_depth=random.choice(max_depth),
                                  min_samples_split=random.choice(min_obs_split),
                                  max_features=random.choice(max_features))
rf_model.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=16, max_features=7, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=9,
min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
train_preds = rf_model.predict(X_train)
test_preds = rf_model.predict(X_test)
print(' Train accuracy: {0:.2}'.format(round(accuracy_score(y_train, train_preds), 1)), "\n",
      'Test accuracy: {0:.2}'.format(round(accuracy_score(y_test, test_preds), 1)))
 Train accuracy: 1.0
 Test accuracy: 0.9
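A single random draw is only one sample from the hyperparameter space. Repeating the draw and keeping the best result is random search written by hand; a sketch, with a 20-draw budget chosen arbitrarily (in practice the selection score should come from a validation split or cross-validation, which is exactly what RandomizedSearchCV below automates):
# Hand-rolled random search: draw hyperparameters repeatedly, keep the best
best_score, best_params = 0.0, None
for _ in range(20):                       # search budget (an assumption)
    params = {'n_estimators': random.choice(n_trees),
              'max_depth': random.choice(max_depth),
              'min_samples_split': random.choice(min_obs_split),
              'max_features': random.choice(max_features)}
    model = RandomForestClassifier(random_state=777, **params).fit(X_train, y_train)
    score = accuracy_score(y_test, model.predict(X_test))
    if score > best_score:
        best_score, best_params = score, params
print(best_score, best_params)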
Let's use the RandomizedSearchCV() class to search for good hyperparameters more efficiently than an exhaustive grid search.
from sklearn.model_selection import RandomizedSearchCV
param_list = {"n_estimators": list(range(10, 100, 10)),
              "max_depth": [4, 8, 12, 16],
              "max_features": [3, 5, 7, 9],
              "min_samples_split": [3, 5, 7, 9]}
rf_model = RandomForestClassifier(random_state=777)
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_list,
    n_iter=10,
    cv=5,
    scoring=make_scorer(accuracy_score))
rf_random_search.fit(X_train, y_train)
RandomizedSearchCV(cv=5, error_score='raise-deprecating',
estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
oob_score=False, random_state=777, verbose=0, warm_start=False),
fit_params=None, iid='warn', n_iter=10, n_jobs=None,
param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90], 'max_depth': [4, 8, 12, 16], 'max_features': [3, 5, 7, 9], 'min_samples_split': [3, 5, 7, 9]},
pre_dispatch='2*n_jobs', random_state=None, refit=True,
return_train_score='warn', scoring=make_scorer(accuracy_score),
verbose=0)
## Selected hyperparameters
rf_random_search.best_params_
{'n_estimators': 80, 'min_samples_split': 3, 'max_features': 9, 'max_depth': 12}
## Train/test performance
train_preds = rf_random_search.predict(X_train)
test_preds = rf_random_search.predict(X_test)
print(' Train accuracy: {0:.2}'.format(round(accuracy_score(y_train, train_preds), 1)), "\n",
      'Test accuracy: {0:.2}'.format(round(accuracy_score(y_test, test_preds), 1)))
 Train accuracy: 1.0
 Test accuracy: 1.0
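Beyond best_params_, RandomizedSearchCV records per-candidate cross-validation scores in cv_results_; a short sketch for inspecting the ten sampled candidates, best mean CV score first:
# Inspect the sampled candidates, ranked by mean cross-validation score
cv_df = pd.DataFrame(rf_random_search.cv_results_)
cols = ['param_n_estimators', 'param_max_depth', 'param_max_features',
        'param_min_samples_split', 'mean_test_score', 'rank_test_score']
print(cv_df[cols].sort_values('rank_test_score'))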