1 선형 회귀모형

회귀분석 사례

library(tidyverse)
library(rvest)

# Wikipedia article whose ".wikitable" holds the height/weight example
# data used throughout the simple-linear-regression section.
data_url <- "https://en.wikipedia.org/wiki/Simple_linear_regression"

# Scrape the first wikitable from the page.
# NOTE(review): this depends on the live page keeping exactly one
# leading ".wikitable" with the expected layout — confirm if it breaks.
data_raw <- read_html(x = data_url) %>% 
  html_elements(".wikitable") %>% 
  html_table() %>% 
  .[[1]] 

# Reshape the wide wiki table into two numeric columns:
# 신장 (height, in metres) and 몸무게 (weight, in kg).
# Rows whose x1 label contains "Height" become 신장; the rest 몸무게.
reg_tbl <- data_raw %>% 
  janitor::clean_names() %>% 
  pivot_longer(-x1) %>% 
  mutate(구분 = ifelse(str_detect(x1, "Height"), "신장", "몸무게")) %>% 
  select(name, 구분, value) %>% 
  pivot_wider(names_from = 구분, values_from = value)  %>% 
  select(신장, 몸무게)

# Ensure the cache directory exists before writing.
fs::dir_create("data")

# Cache the scraped table so later chunks can run offline.
reg_tbl %>% 
  write_rds("data/reg_tbl.rds")

1.1 모형 개발 전

1.1.1 원데이터

# Reload the cached regression data and show it as an interactive table.
reg_tbl <- read_rds("data/reg_tbl.rds")

reactable::reactable(reg_tbl)

1.1.2 요약 통계량

# Summary statistics (n, missingness, mean, sd, quantiles, histogram).
skimr::skim(reg_tbl)
Data summary
Name Piped data
Number of rows 15
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
신장 0 1 1.65 0.11 1.47 1.56 1.65 1.74 1.83 ▇▇▇▇▇
몸무게 0 1 62.08 7.04 52.21 56.52 61.29 67.19 74.46 ▇▆▆▆▃

1.1.3 시각화

# Scatterplot of the raw data.
# FIX: heights in this dataset are 1.47-1.83, i.e. metres, so the
# x-axis unit label was corrected from "(cm)" to "(m)".
reg_tbl %>% 
  ggplot(aes(x = 신장, y = 몸무게)) +
    geom_point() +
    labs(title = "선형 회귀모형 예제 데이터",
         x = "신장 (m)",
         y = "몸무게 (kg)") +
    theme_light()

1.2 수식으로 표현

1.2.1 점화식 수식

1.2.1.1 수식

신장(\(X\))과 몸무게(\(Y\)) 관계를 수식으로 표현

\[ \begin{equation} Y_i = \beta_0 + \beta_1 X_i + \epsilon_i \end{equation} \]

데이터를 통해 신장(\(X\))과 몸무게(\(Y\)) 관계를 추정한 수식

\[ \begin{equation} \hat{몸무게}_i = \hat{\beta}_0 + \hat{\beta}_1 {신장}_i \end{equation} \]

1.2.1.2 미지수 추정

미지수 \(\hat{\beta}_0\), \(\hat{\beta}_1\) 를 데이터를 이용하여 추정

\[ \begin{equation} \hat{\beta}_1 = \frac{\sum(X_i - \bar{X}) (Y_i - \bar{Y})} {\sum(X_i - \bar{X})^2} \end{equation} \]

\[ \begin{equation} \hat{\beta}_0 = \bar{Y} - \hat{\beta}_1 \bar{X} \end{equation} \]

1.2.1.3 위키 사례

\[ \begin{align*} \mathrm{몸무게}_{i} &= \beta_{0} + \beta_{1} \times \mathrm{신장}_{i} + \epsilon \end{align*} \]

\[ \begin{align*} \mathrm{몸무게}_{i} &= -39.06 + 61.27 \times \mathrm{신장}_{i} + \epsilon \end{align*} \]

1.2.2 행렬 수식

1.2.2.1 행렬 표현

\[ \left[ \begin{array}{c} y_1 \\ y_2 \\ \vdots \\ y_n \end{array} \right] = \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix} \times \left[ \begin{array}{c} \beta_0 \\ \beta_1 \end{array} \right] + \left[ \begin{array}{c} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{array} \right] \]

\[ \begin{gather} Y = X \beta + \epsilon \end{gather} \]

1.2.2.2 행렬 방정식 풀기

\[ \begin{gather} Y = X \beta + \epsilon \\ \beta = (X^{T}X)^{-1}X^{T}Y \end{gather} \]

1.2.2.3 행렬 프로그래밍 코드

# Response vector ------------------------------------------------------
Y <- reg_tbl$몸무게

# Design matrix: a column of ones (intercept) plus the predictor -------
intercept <- rep(1, length(Y))
X <- cbind(beta_0 = intercept, beta_1 = reg_tbl$신장)

# Normal equations: beta-hat = (X'X)^{-1} X'Y
betas <- solve(t(X) %*% X) %*% t(X) %*% Y

betas
            [,1]
beta_0 -39.06196
beta_1  61.27219

1.2.2.4 회귀분석 프로그램

lm(몸무게 ~ 신장, data = reg_tbl)

Call:
lm(formula = 몸무게 ~ 신장, data = reg_tbl)

Coefficients:
(Intercept)         신장  
     -39.06        61.27  

1.3 최적화 함수

1.3.1 오차 함수

\[ \min_{\beta} \sum_{i=1}^{n} \epsilon_i^2 = \min_{\beta} \sum_{i=1}^{n}(\beta_0 + \beta_1 \times x_i - y_i)^2 \]

1.3.2 오차 최소화

# reg_tbl
# # A tibble: 15 × 2
#     신장 몸무게
#    <dbl>  <dbl>
#  1  1.47   52.2
#  2  1.5    53.1
#  3  1.52   54.5
#  4  1.55   55.8

# Sum-of-squared-errors loss for simple linear regression.
# par  : numeric vector c(intercept, slope)
# data : data frame with columns 신장 (predictor) and 몸무게 (response)
loss_fn <- function(par, data) {
  fitted_y <- par[1] + par[2] * data$신장
  resid_y <- data$몸무게 - fitted_y
  sum(resid_y^2)
}

# Minimise the SSE loss with BFGS starting from (intercept, slope) =
# (0, 1); maxit raises the iteration cap, trace/REPORT = 1 print the
# objective value at every iteration.
optim(par = c(0, 1), fn = loss_fn, method = "BFGS", data = reg_tbl,
      control = list(maxit = 1000, 
                     trace = TRUE,
                     REPORT = 1))
initial  value 55443.106500 
iter   2 value 807.932602
iter   3 value 173.357708
iter   4 value 141.165075
iter   5 value 9.051769
iter   6 value 7.534280
iter   7 value 7.491106
iter   8 value 7.490620
iter   9 value 7.490609
iter  10 value 7.490559
iter  10 value 7.490558
iter  10 value 7.490558
final  value 7.490558 
converged
$par
[1] -39.06195  61.27219

$value
[1] 7.490558

$counts
function gradient 
      22       10 

$convergence
[1] 0

$message
NULL

1.4 회귀모형 검정

# Scatterplot with the fitted least-squares line and its equation.
# FIX 1: `..eq.label..` dot-dot notation is deprecated in ggplot2 >= 3.4;
#        use after_stat(eq.label) instead. The surrounding paste() with a
#        single argument was a no-op and is dropped.
# FIX 2: x-axis unit corrected from "(cm)" to "(m)" (heights are metres).
reg_tbl %>% 
  ggplot(aes(x = 신장, y = 몸무게)) +
    geom_point() +
    labs(title = "선형 회귀모형 예제 데이터",
         x = "신장 (m)",
         y = "몸무게 (kg)") +
    theme_light() +
    geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
    ggpmisc::stat_poly_eq(aes(label = after_stat(eq.label)),
               label.x.npc = "right", label.y.npc = 0.15,
               eq.with.lhs = "italic(hat(y))~`=`~",
               eq.x.rhs = "~italic(x)",
               formula = y ~ x, parse = TRUE, size = 5) 

2 이항 회귀모형

2.1 데이터

library(tidyverse)
library(rvest)

# Wikipedia article whose ".wikitable" holds the study-hours /
# admission example data for logistic regression.
lr_data_url <- "https://en.wikipedia.org/wiki/Logistic_regression"

# Scrape the first wikitable from the page.
# NOTE(review): depends on the live page layout — confirm if it breaks.
lr_data_raw <- read_html(x = lr_data_url) %>% 
  html_elements(".wikitable") %>% 
  html_table() %>% 
  .[[1]] 

# Reshape the wide wiki table into two columns:
# 학습시간 (study hours) and 입학여부 (admitted, 0/1).
lr_tbl <- lr_data_raw %>% 
  janitor::clean_names() %>% 
  pivot_longer(-x1) %>% 
  mutate(구분 = ifelse(str_detect(x1, "Hours"), "학습시간", "입학여부")) %>% 
  select(name, 구분, value) %>% 
  pivot_wider(names_from = 구분, values_from = value)  %>% 
  select(학습시간, 입학여부)

# fs::dir_create("data")  # already created in the linear-regression section

# Cache the scraped table, then read it back.
lr_tbl %>% 
  write_rds("data/lr_tbl.rds")
lr_tbl <-read_rds("data/lr_tbl.rds")

# Interactive table of the admission data.
lr_tbl %>% 
  reactable::reactable()

2.2 요약 통계량

# Summary statistics for the admission data.
skimr::skim(lr_tbl)
Data summary
Name Piped data
Number of rows 20
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
학습시간 0 1 2.79 1.51 0.5 1.69 2.62 4.06 5.5 ▇▇▆▅▅
입학여부 0 1 0.50 0.51 0.0 0.00 0.50 1.00 1.0 ▇▁▁▁▇

2.3 시각화

# Pass/fail (0/1) against study hours with a fitted logistic curve
# (glm, binomial family); the y axis is formatted as a percentage.
ggplot(lr_tbl, aes(x = 학습시간, y = 입학여부)) +
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE) +
  labs(title = "학습시간과 입학확률",
       x = "학습시간",
       y = "합격확률 (%)") +
  theme_light() +
  scale_y_continuous(labels = scales::percent)

2.4 내장 모형 활용

# Fit a logistic regression of admission (0/1) on study hours.
adm_lr <- glm(입학여부 ~ 학습시간, data = lr_tbl, family = "binomial")

# Show the fitted model (coefficients, deviance, AIC).
print(adm_lr)

Call:  glm(formula = 입학여부 ~ 학습시간, family = "binomial", 
    data = lr_tbl)

Coefficients:
(Intercept)     학습시간  
     -4.078        1.505  

Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
Null Deviance:      27.73 
Residual Deviance: 16.06    AIC: 20.06

2.5 합격 예측 시각화

library(crosstalk)

# Prediction grid: study hours from 1 to 5 in steps of 0.1.
crosstalk_lr_raw <- tibble(학습시간 = seq(from = 1, to = 5, by = 0.1))

# Predicted admission probability on the grid, joined with the observed
# data (by 학습시간), with the outcome relabelled in Korean.
crosstalk_lr_tbl <- crosstalk_lr_raw %>%
  mutate(합격확률 = predict(adm_lr, newdata = crosstalk_lr_raw, type = "response")) %>%
  left_join(lr_tbl) %>%
  mutate(입학여부 = factor(입학여부, levels = c(0, 1), labels = c("불합격", "합격")))

# Predicted curve plus the observed 0/1 outcomes; the factor is mapped
# back to 0/1 on the y axis so both layers share one scale.
crosstalk_lr_g <- ggplot(crosstalk_lr_tbl, aes(x = 학습시간, y = 합격확률)) +
  geom_point() +
  geom_point(aes(y = as.numeric(입학여부) - 1, color = 입학여부)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE) +
  labs(title = "학습시간과 입학확률",
       x = "학습시간",
       y = "합격확률 (%)") +
  theme_light() +
  scale_y_continuous(labels = scales::percent)

# Interactive version of the plot.
plotly::ggplotly(crosstalk_lr_g)

2.6 직접 구현

최우추정량(MLE)을 찾는 것은 음의 로그 우도(Negative Log-Likelihood) 값을 최소화하는 것과 동일하기 때문에, General-purpose optimization 함수에 목적함수를 정의하고 모수 초기값과 함께 넣어 반복적으로 근사시켜 모수를 계산한다.

\[ NLL(y) = -{\log(p(y))} \]

\[ \min_{\theta} \sum_y {-\log(p(y;\theta))} \]

\[ \max_{\theta} \prod_y p(y;\theta) \]

# Logistic (sigmoid) function: maps any real number into (0, 1).
sigmoid_fn <- function(x) {
  1 / (1 + exp(-x))
}

# Negative log-likelihood of a logistic regression model.
#
# par           : numeric coefficient vector; when include_alpha is TRUE,
#                 par[1] is the intercept and the rest are slopes.
# data          : data frame holding the predictors and the response.
# y             : name (string) of the 0/1 response column in `data`.
# include_alpha : include an intercept term? (FIX: default was `T`,
#                 which is reassignable; replaced with TRUE.)
#
# Returns the scalar -sum(y*log(p) + (1-y)*log(1-p)), suitable for
# minimisation by optim()/optimx().
neg_log_likelihood <- function(par, data, y, include_alpha = TRUE) {
  
  # Split data into predictor columns and the response column.
  x <- data[, names(data) != y]
  y_data <- data[, y]

  # 1. Linear combination of predictors and coefficients.
  if (include_alpha) {

    # Slopes only: beta_1 * x_1 + beta_2 * x_2 + ...
    linear_combination <- mapply("*", x, par[2:length(par)])

    # Add the intercept: alpha + beta_1 * x_1 + beta_2 * x_2 + ...
    theta <- rowSums(linear_combination) + par[1]
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  # 2. Success probability via the logistic function.
  p <- sigmoid_fn(theta)

  # 3. Negative log-likelihood of the Bernoulli outcomes.
  value <- -sum(y_data * log(p) + (1 - y_data) * log(1 - p))

  return(value)
}

library(optimx)

# Minimise the negative log-likelihood with Nelder-Mead, starting from
# (intercept, slope) = (0, 0). itnmax caps iterations at 100 and
# trace = TRUE prints the optimizer's progress.
lr_opt <- optimx(
  par = c(0,0),
  fn = neg_log_likelihood,
  data = lr_tbl,
  y = "입학여부",
   method = "Nelder-Mead",
  include_alpha = TRUE,
  itnmax = 100,
  control = list(trace = TRUE, all.methods = FALSE)
)
fn is  fn 
Looking for method =  Nelder-Mead 
Function has  2  arguments
Analytic gradient not made available.
Analytic Hessian not made available.
Scale check -- log parameter ratio= -Inf   log bounds ratio= NA 
Method:  Nelder-Mead 
  Nelder-Mead direct search function minimizer
function value for initial parameters = 13.862944
  Scaled convergence tolerance is 2.06574e-07
Stepsize computed as 0.100000
BUILD              3 13.887933 13.096825
EXTENSION          5 13.862944 12.582216
EXTENSION          7 13.096825 12.067907
EXTENSION          9 12.582216 11.285614
LO-REDUCTION      11 12.067907 11.285614
LO-REDUCTION      13 11.874408 11.285614
EXTENSION         15 11.469089 10.348151
LO-REDUCTION      17 11.285614 10.348151
EXTENSION         19 10.370274 8.914547
LO-REDUCTION      21 10.348151 8.914547
EXTENSION         23 9.266657 8.226456
REFLECTION        25 8.914547 8.030679
LO-REDUCTION      27 8.259889 8.030679
LO-REDUCTION      29 8.226456 8.030679
LO-REDUCTION      31 8.114076 8.030679
HI-REDUCTION      33 8.053435 8.030679
HI-REDUCTION      35 8.052924 8.030679
LO-REDUCTION      37 8.038012 8.030679
HI-REDUCTION      39 8.033349 8.030679
LO-REDUCTION      41 8.031439 8.030144
HI-REDUCTION      43 8.030679 8.030087
HI-REDUCTION      45 8.030144 8.029950
HI-REDUCTION      47 8.030087 8.029938
HI-REDUCTION      49 8.029950 8.029917
HI-REDUCTION      51 8.029938 8.029881
LO-REDUCTION      53 8.029917 8.029881
HI-REDUCTION      55 8.029890 8.029881
LO-REDUCTION      57 8.029889 8.029880
HI-REDUCTION      59 8.029881 8.029880
HI-REDUCTION      61 8.029880 8.029879
HI-REDUCTION      63 8.029880 8.029879
REFLECTION        65 8.029879 8.029879
Exiting from Nelder Mead minimizer
    67 function evaluations used
Post processing for method  Nelder-Mead 
Successful convergence! 
Compute Hessian approximation at finish of  Nelder-Mead 
Compute gradient approximation at finish of  Nelder-Mead 
Save results from method  Nelder-Mead 
$par
[1] -4.076953  1.504453

$value
[1] 8.029879

$message
NULL

$convcode
[1] 0

$fevals
function 
      67 

$gevals
gradient 
      NA 

$nitns
[1] NA

$kkt1
[1] TRUE

$kkt2
[1] TRUE

$xtimes
user.self 
     0.08 

Assemble the answers
# Pull the estimated coefficients (p1 = intercept, p2 = slope) for the
# Nelder-Mead run out of the optimx result table.
as.data.frame(lr_opt[, 1:5]) %>%
  rownames_to_column("method") %>%
  filter(method == "Nelder-Mead") %>%
  select(method, p1, p2)
       method        p1       p2
1 Nelder-Mead -4.076953 1.504453

glm() 함수로 구현한 것과 값이 동일한지 상호확인한다.

adm_lr$coefficients
(Intercept)    학습시간 
  -4.077713    1.504645 

2.7 예측값 생성

# Predicted probabilities from logistic-regression coefficients.
# x   : data frame of predictor columns.
# par : coefficients; if there are more coefficients than predictor
#       columns, the first one is treated as the intercept.
predict_fn <- function(x, par) {

  has_intercept <- ncol(x) < length(par)

  if (has_intercept) {
    slopes <- par[2:length(par)]
    theta <- rowSums(mapply("*", x, slopes)) + as.numeric(par[1])
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  sigmoid_fn(theta)
}

# Probabilities from the hand-rolled model: lr_opt[, 1:2] holds the
# Nelder-Mead estimates (p1 = intercept, p2 = slope).
custom_pred <- predict_fn(lr_tbl %>% select(학습시간),  lr_opt[, 1:2])

# Side-by-side comparison of the custom predictions and glm()'s.
lr_tbl  %>% 
  mutate(자체제작_예측 = custom_pred) %>% 
  mutate(예측 = predict(adm_lr, newdata = lr_tbl %>% select(학습시간), type ="response")) %>% 
  reactable::reactable()

3 신경망

library(nnet)
library(NeuralNetTools)

# English column names for the nnet formula and a factor response.
lr_nn_tbl <- lr_tbl %>%
  rename(adm_yn = 입학여부, learning_hour = 학습시간) %>%
  mutate(adm_yn = as.factor(adm_yn))

# Single-hidden-layer network with 2 hidden units. NOTE(review): no
# random seed is set, so initial weights (and hence the fit) vary
# between runs.
adm_nn <- nnet(adm_yn ~ learning_hour,
               data = lr_nn_tbl,
               size = 2,
               softmax = FALSE)
# weights:  7
initial  value 14.452720 
iter  10 value 8.853278
iter  20 value 8.047945
iter  30 value 7.576388
iter  40 value 7.018420
iter  50 value 6.241457
iter  60 value 6.193520
iter  70 value 6.185768
iter  80 value 6.185419
iter  90 value 6.184381
iter 100 value 6.184270
final  value 6.184270 
stopped after 100 iterations
# Diagram of the fitted network structure and weights.
plotnet(adm_nn)

# Confusion matrix: observed class (rows) vs. predicted class (columns).
cm <- table(lr_nn_tbl$adm_yn, predict(adm_nn, lr_nn_tbl, type="class"))

cat("\n신경망 모형 오차행렬(Confusion matrix): \n")

신경망 모형 오차행렬(Confusion matrix): 
print(cm) 
   
     0  1
  0 10  0
  1  4  6
 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com