1 구글 추세

2 DL 프레임워크 선택 1

3 케라스

3.1 케라스 R 코드

# 0. Load the keras package (R bindings to the TensorFlow backend)
library(keras)

# 1. Define the model architecture:
#    a single dense unit with a sigmoid activation on one input
#    is mathematically identical to logistic regression.
logistic_reg <- keras_model_sequential() %>%
  layer_dense(units = 1,
              input_shape = 1,
              activation = "sigmoid")

# 2. Compile the model: binary cross-entropy loss (the logistic
#    regression negative log-likelihood), Adam optimizer with
#    learning rate 0.01, accuracy reported during training.
logistic_reg %>% 
  compile(
    loss = "binary_crossentropy",
    optimizer = optimizer_adam(learning_rate = 0.01),
    metrics = list("accuracy")
  )

# 3. Inspect the model: 2 trainable parameters (1 weight + 1 bias)
summary(logistic_reg)
Model: "sequential"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
================================================================================
dense (Dense)                       (None, 1)                       2           
================================================================================
Total params: 2
Trainable params: 2
Non-trainable params: 0
________________________________________________________________________________
# Load the data.
# NOTE(review): read_rds() comes from readr — presumably loaded
# elsewhere in the document (e.g. via library(tidyverse)); confirm.
lr_raw <- read_rds("data/lr_tbl.rds")

# Preprocessing: standardize the predictor (mean 0, sd 1).
# scale() returns a 1-column matrix; that is harmless here because
# the column is converted to a matrix for keras below anyway.
lr_tbl <- lr_raw %>% 
  mutate(학습시간 = scale(학습시간))

# Select predictor (x) and outcome (y) and convert to the
# matrix form that keras::fit() expects.
x_train <- as.matrix(lr_tbl %>% select(학습시간))
y_train <- as.matrix(lr_tbl %>% select(입학여부))

# Train the model: 500 full passes over the training data,
# no validation hold-out, silent (verbose = 0) output.
history <- fit(
  logistic_reg,
  x = x_train,
  y = y_train,
  epochs = 500,
  validation_split = 0,
  verbose = 0
)

# Training-history curves (loss and accuracy per epoch)
plot(history)

# Fitted logistic-regression coefficients: the dense layer's
# kernel (slope) and bias (intercept) variables.
logistic_reg$weights
[[1]]
<tf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[1.3267483]], dtype=float32)>

[[2]]
<tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([0.03166157], dtype=float32)>

3.2 GLM 통계 R 코드

# For comparison, fit the same model with a classical GLM
# (binomial family = logistic regression) on the same
# standardized predictor.
glm_fit <- glm(입학여부  ~ scale(학습시간 ), data = lr_tbl, family = binomial)
glm_fit

Call:  glm(formula = 입학여부 ~ scale(학습시간), family = binomial, 
    data = lr_tbl)

Coefficients:
    (Intercept)  scale(학습시간)  
         0.1165           2.2677  

Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
Null Deviance:      27.73 
Residual Deviance: 16.06    AIC: 20.06

3.3 성능 비교

tensorflow/keras 프레임워크 사용

library(yardstick)

# Predict admission with the keras model: probability first, then a
# hard 0/1 class at the 0.5 threshold; convert both the truth column
# and the predicted class to factors so yardstick can consume them.
keras_pred_tbl <- lr_tbl %>%
  mutate(
    predicted_tf = predict(logistic_reg, x_train),
    class_tf     = factor(ifelse(predicted_tf < 0.5, 0, 1)),
    입학여부     = factor(입학여부)
  )

# Confusion matrix for the keras model
# (dnn labels: rows = predicted class, columns = truth)
keras_pred_tbl %>% 
  conf_mat(truth = 입학여부, estimate = class_tf,
           dnn = c("예측값", "참값"))
      참값
예측값 0 1
     0 8 3
     1 2 7
# Overall classification accuracy of the keras model
keras_pred_tbl %>% 
  accuracy(truth = 입학여부, estimate = class_tf)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary          0.75

GLM 로지스틱 모형 사용

# Refit the GLM — identical to the fit in section 3.2 — so this
# comparison section is self-contained.
glm_fit <- glm(입학여부  ~ scale(학습시간 ), data = lr_tbl, family = binomial)
glm_fit

Call:  glm(formula = 입학여부 ~ scale(학습시간), family = binomial, 
    data = lr_tbl)

Coefficients:
    (Intercept)  scale(학습시간)  
         0.1165           2.2677  

Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
Null Deviance:      27.73 
Residual Deviance: 16.06    AIC: 20.06
# GLM predictions: response-scale probabilities, thresholded at 0.5
# into 0/1 classes; truth and predicted class as factors for yardstick.
lr_pred_tbl <- lr_tbl %>%
  mutate(
    predicted_lr = predict(glm_fit, newdata = lr_tbl, type = "response"),
    class_lr     = factor(ifelse(predicted_lr < 0.5, 0, 1)),
    입학여부     = factor(입학여부)
  )

# Confusion matrix for the GLM classifier
lr_pred_tbl %>%
  conf_mat(truth = 입학여부, estimate = class_lr,
           dnn = c("예측값", "참값"))
      참값
예측값 0 1
     0 8 2
     1 2 8
# Overall classification accuracy of the GLM model
lr_pred_tbl %>% 
  accuracy(truth = 입학여부, estimate = class_lr)
# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary           0.8

3.4 배포

# Export the trained model to HDF5 format for deployment
save_model_hdf5(logistic_reg, 'data/keras_lr_model.h5')

# Reload the model from disk (as a consumer of the artifact would)
library(keras)
lr_model <- load_model_hdf5('data/keras_lr_model.h5')

# Verify the reloaded weights match the trained model's weights
lr_model$weights
[[1]]
<tf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[1.3267483]], dtype=float32)>

[[2]]
<tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([0.03166157], dtype=float32)>
# Predict on new data.
# NOTE(review): scale() here standardizes the NEW values by their OWN
# mean/sd, not by the training data's mean/sd. For correct deployment,
# new observations should be transformed with the scaling parameters
# (center/scale attributes) saved from the training set.
predict(lr_model, scale(c(0.1, 5, 10)))
          [,1]
[1,] 0.2157518
[2,] 0.5056816
[3,] 0.7962232
 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com