library(tidyverse)
library(rvest)
<- "https://en.wikipedia.org/wiki/Simple_linear_regression"
data_url
<- read_html(x = data_url) %>%
data_raw html_elements(".wikitable") %>%
html_table() %>%
1]]
.[[
reg_tbl <- data_raw %>%
  janitor::clean_names() %>%
  pivot_longer(-x1) %>%
  mutate(구분 = ifelse(str_detect(x1, "Height"), "신장", "몸무게")) %>%
  select(name, 구분, value) %>%
  pivot_wider(names_from = 구분, values_from = value) %>%
  select(신장, 몸무게)
fs::dir_create("data")

reg_tbl %>%
  write_rds("data/reg_tbl.rds")

reg_tbl <- read_rds("data/reg_tbl.rds")

reg_tbl %>%
  reactable::reactable()

reg_tbl %>%
  skimr::skim()
Name | Piped data
Number of rows | 15
Number of columns | 2
Column type frequency: numeric | 2
Group variables | None

Variable type: numeric

skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist
---|---|---|---|---|---|---|---|---|---|---
신장 | 0 | 1 | 1.65 | 0.11 | 1.47 | 1.56 | 1.65 | 1.74 | 1.83 | ▇▇▇▇▇
몸무게 | 0 | 1 | 62.08 | 7.04 | 52.21 | 56.52 | 61.29 | 67.19 | 74.46 | ▇▆▆▆▃
reg_tbl %>%
  ggplot(aes(x = 신장, y = 몸무게)) +
  geom_point() +
  labs(title = "선형 회귀모형 예제 데이터",
       x = "신장 (m)",
       y = "몸무게 (kg)") +
  theme_light()
The relationship between height (\(X\)) and weight (\(Y\)) expressed as an equation:
\[ \begin{equation} Y_i = \beta_0 + \beta_1 X_i + \epsilon_i \end{equation} \]
The height (\(X\)) and weight (\(Y\)) relationship estimated from the data:
\[ \begin{equation} \hat{몸무게}_i = \hat{\beta}_0 + \hat{\beta}_1 {신장}_i \end{equation} \]
The unknowns \(\hat{\beta}_0\) and \(\hat{\beta}_1\) are estimated from the data:
\[ \begin{equation} \hat{\beta}_1 = \frac{\sum(X_i - \bar{X}) (Y_i - \bar{Y})} {\sum(X_i - \bar{X})^2} \end{equation} \]
\[ \begin{equation} \hat{\beta}_0 = \bar{Y} - \hat{\beta}_1 \bar{X} \end{equation} \]
\[ \mathrm{몸무게}_{i} = \beta_{0} + \beta_{1} \times \mathrm{신장}_{i} + \epsilon_i \]
\[ \mathrm{몸무게}_{i} = -39.06 + 61.27 \times \mathrm{신장}_{i} + \epsilon_i \]
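These two coefficients can be reproduced directly from the closed-form formulas above. A minimal sketch, assuming the reg_tbl data loaded earlier (x_bar, y_bar, beta_0_hat, beta_1_hat are illustrative names):

# Sketch: evaluate the closed-form OLS formulas on reg_tbl
x_bar <- mean(reg_tbl$신장)
y_bar <- mean(reg_tbl$몸무게)

beta_1_hat <- sum((reg_tbl$신장 - x_bar) * (reg_tbl$몸무게 - y_bar)) /
  sum((reg_tbl$신장 - x_bar)^2)
beta_0_hat <- y_bar - beta_1_hat * x_bar

c(beta_0 = beta_0_hat, beta_1 = beta_1_hat)
# expected: approximately -39.06 and 61.27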
\[ \left[ \begin{array}{c} y_1 \\ y_2 \\ \vdots \\ y_n \end{array} \right] = \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix} \times \left[ \begin{array}{c} \beta_0 \\ \beta_1 \end{array} \right] + \left[ \begin{array}{c} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{array} \right] \]
\[ \begin{gather} Y = X \beta + \epsilon \end{gather} \]
\[ \begin{gather} Y = X \beta + \epsilon \\ \hat{\beta} = (X^{T}X)^{-1}X^{T}Y \end{gather} \]
# Y -----------
Y <- reg_tbl$몸무게

# X -----------
intercept <- rep(1, length(Y))
X <- cbind(beta_0 = intercept, beta_1 = reg_tbl$신장)

betas <- solve(t(X) %*% X) %*% t(X) %*% Y

betas

            [,1]
beta_0 -39.06196
beta_1  61.27219
lm(몸무게 ~ 신장, data = reg_tbl)
Call:
lm(formula = 몸무게 ~ 신장, data = reg_tbl)
Coefficients:
(Intercept) 신장
-39.06 61.27
\[ \min_{\beta} \sum_{i=1}^{n} \epsilon_i^2 = \min_{\beta} \sum_{i=1}^{n}(\beta_0 + \beta_1 \times x_i - y_i)^2 \]
# reg_tbl
# # A tibble: 15 × 2
#     신장 몸무게
#    <dbl>  <dbl>
#  1  1.47   52.2
#  2  1.5    53.1
#  3  1.52   54.5
#  4  1.55   55.8
loss_fn <- function(par, data){
  sum((par[1] + par[2] * data$신장 - data$몸무게)^2)
}

optim(par = c(0, 1), fn = loss_fn, method = "BFGS", data = reg_tbl,
      control = list(maxit = 1000,
                     trace = TRUE,
                     REPORT = 1))
initial value 55443.106500
iter 2 value 807.932602
iter 3 value 173.357708
iter 4 value 141.165075
iter 5 value 9.051769
iter 6 value 7.534280
iter 7 value 7.491106
iter 8 value 7.490620
iter 9 value 7.490609
iter 10 value 7.490559
iter 10 value 7.490558
iter 10 value 7.490558
final value 7.490558
converged
$par
[1] -39.06195 61.27219
$value
[1] 7.490558
$counts
function gradient
22 10
$convergence
[1] 0
$message
NULL
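As a cross-check, the optim() minimizer can be compared against lm()'s coefficients. A minimal sketch re-running the optimization without the trace (opt_fit is an illustrative name):

# Sketch: optim() and lm() should agree to several decimal places
opt_fit <- optim(par = c(0, 1), fn = loss_fn, method = "BFGS", data = reg_tbl)
all.equal(unname(opt_fit$par),
          unname(coef(lm(몸무게 ~ 신장, data = reg_tbl))),
          tolerance = 1e-4)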
reg_tbl %>%
  ggplot(aes(x = 신장, y = 몸무게)) +
  geom_point() +
  labs(title = "선형 회귀모형 예제 데이터",
       x = "신장 (m)",
       y = "몸무게 (kg)") +
  theme_light() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ x) +
  ggpmisc::stat_poly_eq(aes(label = paste(..eq.label.., sep = "~~~")),
                        label.x.npc = "right", label.y.npc = 0.15,
                        eq.with.lhs = "italic(hat(y))~`=`~",
                        eq.x.rhs = "~italic(x)",
                        formula = y ~ x, parse = TRUE, size = 5)
library(tidyverse)
library(rvest)
<- "https://en.wikipedia.org/wiki/Logistic_regression"
lr_data_url
<- read_html(x = lr_data_url) %>%
lr_data_raw html_elements(".wikitable") %>%
html_table() %>%
1]]
.[[
lr_tbl <- lr_data_raw %>%
  janitor::clean_names() %>%
  pivot_longer(-x1) %>%
  mutate(구분 = ifelse(str_detect(x1, "Hours"), "학습시간", "입학여부")) %>%
  select(name, 구분, value) %>%
  pivot_wider(names_from = 구분, values_from = value) %>%
  select(학습시간, 입학여부)
# fs::dir_create("data")

lr_tbl %>%
  write_rds("data/lr_tbl.rds")

lr_tbl <- read_rds("data/lr_tbl.rds")

lr_tbl %>%
  reactable::reactable()

lr_tbl %>%
  skimr::skim()
Name | Piped data
Number of rows | 20
Number of columns | 2
Column type frequency: numeric | 2
Group variables | None

Variable type: numeric

skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist
---|---|---|---|---|---|---|---|---|---|---
학습시간 | 0 | 1 | 2.79 | 1.51 | 0.5 | 1.69 | 2.62 | 4.06 | 5.5 | ▇▇▆▅▅
입학여부 | 0 | 1 | 0.50 | 0.51 | 0.0 | 0.00 | 0.50 | 1.00 | 1.0 | ▇▁▁▁▇
lr_tbl %>%
  ggplot(aes(x = 학습시간, y = 입학여부)) +
  geom_point() +
  geom_smooth(method = "glm",
              method.args = list(family = "binomial"),
              se = FALSE) +
  labs(title = "학습시간과 입학확률",
       x = "학습시간",
       y = "합격확률 (%)") +
  theme_light() +
  scale_y_continuous(labels = scales::percent)
adm_lr <- glm(입학여부 ~ 학습시간, family = "binomial", data = lr_tbl)

adm_lr
Call: glm(formula = 입학여부 ~ 학습시간, family = "binomial",
data = lr_tbl)
Coefficients:
(Intercept) 학습시간
-4.078 1.505
Degrees of Freedom: 19 Total (i.e. Null); 18 Residual
Null Deviance: 27.73
Residual Deviance: 16.06 AIC: 20.06
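The fitted model can also be used for point predictions. A minimal sketch, using an arbitrary value of 4 study hours:

# Sketch: estimated admission probability for 4 hours of study
predict(adm_lr, newdata = tibble(학습시간 = 4), type = "response")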
library(crosstalk)
crosstalk_lr_raw <- tibble(학습시간 = seq(from = 1, to = 5, by = 0.1))

crosstalk_lr_tbl <- crosstalk_lr_raw %>%
  mutate(합격확률 = predict(adm_lr, newdata = crosstalk_lr_raw, type = "response")) %>%
  left_join(lr_tbl) %>%
  mutate(입학여부 = factor(입학여부, levels = c(0, 1), labels = c("불합격", "합격")))
crosstalk_lr_g <- crosstalk_lr_tbl %>%
  ggplot(aes(x = 학습시간, y = 합격확률)) +
  geom_point() +
  geom_point(aes(x = 학습시간, y = as.numeric(입학여부) - 1, color = 입학여부)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE) +
  labs(title = "학습시간과 입학확률",
       x = "학습시간",
       y = "합격확률 (%)") +
  theme_light() +
  scale_y_continuous(labels = scales::percent)

plotly::ggplotly(crosstalk_lr_g)
Finding the maximum likelihood estimate (MLE) is equivalent to minimizing the negative log-likelihood. We therefore define the negative log-likelihood as a function, initialize the parameters, and hand both to a general-purpose optimizer, which iteratively converges to the parameter estimates.
\[ NLL(y) = -{\log(p(y))} \]
\[ \min_{\theta} \sum_y {-\log(p(y;\theta))} \]
\[ \max_{\theta} \prod_y p(y;\theta) \]
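For intuition, the same principle can be demonstrated on an intercept-only Bernoulli model: the probability that minimizes the NLL is the sample mean of 입학여부. A minimal sketch (nll_bernoulli is a hypothetical helper, separate from the functions below):

# Sketch: Bernoulli negative log-likelihood for a single probability p
nll_bernoulli <- function(p, y) -sum(y * log(p) + (1 - y) * log(1 - p))

# The minimizer should be close to mean(lr_tbl$입학여부) = 0.5
optimize(nll_bernoulli, interval = c(0.01, 0.99), y = lr_tbl$입학여부)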
sigmoid_fn <- function(x){
  1 / (1 + exp(-x))
}
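A quick check that sigmoid_fn maps any real input into the (0, 1) interval:

# Sketch: sigmoid values at a few reference points
sigmoid_fn(c(-2, 0, 2))
# approximately 0.119, 0.500, 0.881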
neg_log_likelihood <- function(par, data, y, include_alpha = TRUE) {

  # Split predictors (x) and response (y_data)
  x      <- data[, names(data) != y]
  y_data <- data[, y]

  # 1. Linear combination
  if (include_alpha) {
    # Linear combination: beta_1 * x_1 + beta_2 * x_2 + ...
    linear_combination <- mapply("*", x, par[2:length(par)])
    # Add the alpha (intercept) term: alpha + beta_1 * x_1 + beta_2 * x_2 + ...
    theta <- rowSums(linear_combination) + par[1]
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  # 2. Probability via the sigmoid
  p <- sigmoid_fn(theta)
  # p <- exp(theta) / (1 + exp(theta))

  # 3. Negative log-likelihood
  value <- -sum(y_data * log(p) + (1 - y_data) * log(1 - p))

  return(value)
}
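A useful sanity check: for a binomial model with 0/1 responses, the residual deviance equals twice the negative log-likelihood, so evaluating this function at glm()'s coefficients should return roughly half of the 16.06 reported above:

# Sketch: NLL at the glm() estimates; expect approximately 16.06 / 2 = 8.03
neg_log_likelihood(par = coef(adm_lr), data = lr_tbl, y = "입학여부")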
library(optimx)

lr_opt <- optimx(
  par = c(0, 0),
  fn = neg_log_likelihood,
  data = lr_tbl,
  y = "입학여부",
  method = "Nelder-Mead",
  include_alpha = TRUE,
  itnmax = 100,
  control = list(trace = TRUE, all.methods = FALSE)
)
fn is fn
Looking for method = Nelder-Mead
Function has 2 arguments
Analytic gradient not made available.
Analytic Hessian not made available.
Scale check -- log parameter ratio= -Inf log bounds ratio= NA
Method: Nelder-Mead
Nelder-Mead direct search function minimizer
function value for initial parameters = 13.862944
Scaled convergence tolerance is 2.06574e-07
Stepsize computed as 0.100000
BUILD 3 13.887933 13.096825
EXTENSION 5 13.862944 12.582216
EXTENSION 7 13.096825 12.067907
EXTENSION 9 12.582216 11.285614
LO-REDUCTION 11 12.067907 11.285614
LO-REDUCTION 13 11.874408 11.285614
EXTENSION 15 11.469089 10.348151
LO-REDUCTION 17 11.285614 10.348151
EXTENSION 19 10.370274 8.914547
LO-REDUCTION 21 10.348151 8.914547
EXTENSION 23 9.266657 8.226456
REFLECTION 25 8.914547 8.030679
LO-REDUCTION 27 8.259889 8.030679
LO-REDUCTION 29 8.226456 8.030679
LO-REDUCTION 31 8.114076 8.030679
HI-REDUCTION 33 8.053435 8.030679
HI-REDUCTION 35 8.052924 8.030679
LO-REDUCTION 37 8.038012 8.030679
HI-REDUCTION 39 8.033349 8.030679
LO-REDUCTION 41 8.031439 8.030144
HI-REDUCTION 43 8.030679 8.030087
HI-REDUCTION 45 8.030144 8.029950
HI-REDUCTION 47 8.030087 8.029938
HI-REDUCTION 49 8.029950 8.029917
HI-REDUCTION 51 8.029938 8.029881
LO-REDUCTION 53 8.029917 8.029881
HI-REDUCTION 55 8.029890 8.029881
LO-REDUCTION 57 8.029889 8.029880
HI-REDUCTION 59 8.029881 8.029880
HI-REDUCTION 61 8.029880 8.029879
HI-REDUCTION 63 8.029880 8.029879
REFLECTION 65 8.029879 8.029879
Exiting from Nelder Mead minimizer
67 function evaluations used
Post processing for method Nelder-Mead
Successful convergence!
Compute Hessian approximation at finish of Nelder-Mead
Compute gradient approximation at finish of Nelder-Mead
Save results from method Nelder-Mead
$par
[1] -4.076953 1.504453
$value
[1] 8.029879
$message
NULL
$convcode
[1] 0
$fevals
function
67
$gevals
gradient
NA
$nitns
[1] NA
$kkt1
[1] TRUE
$kkt2
[1] TRUE
$xtimes
user.self
0.08
Assemble the answers
lr_opt[, 1:5] %>%
  as.data.frame() %>%
  rownames_to_column("method") %>%
  filter(method == "Nelder-Mead") %>%
  select(method, p1, p2)

       method        p1       p2
1 Nelder-Mead -4.076953 1.504453
Cross-check that the values match those estimated by the glm() function.
adm_lr$coefficients

(Intercept)    학습시간
  -4.077713    1.504645
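A compact side-by-side view of the two sets of estimates; a sketch using the p1/p2 columns of the optimx result above:

# Sketch: optimx vs. glm() estimates in one table
tibble(
  term   = c("(Intercept)", "학습시간"),
  optimx = unlist(lr_opt["Nelder-Mead", c("p1", "p2")]),
  glm    = unname(coef(adm_lr))
)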
predict_fn <- function(x, par){
  # If par has one more element than x has columns, treat par[1] as the intercept
  if (ncol(x) < length(par)) {
    theta <- rowSums(mapply("*", x, par[2:length(par)])) + as.numeric(par[1])
  } else {
    theta <- rowSums(mapply("*", x, par))
  }
  prob <- sigmoid_fn(theta)
  return(prob)
}
custom_pred <- predict_fn(lr_tbl %>% select(학습시간), lr_opt[, 1:2])

lr_tbl %>%
  mutate(자체제작_예측 = custom_pred) %>%
  mutate(예측 = predict(adm_lr, newdata = lr_tbl %>% select(학습시간), type = "response")) %>%
  reactable::reactable()
library(nnet)
library(NeuralNetTools)
lr_nn_tbl <- lr_tbl %>%
  rename(adm_yn = 입학여부,
         learning_hour = 학습시간) %>%
  mutate(adm_yn = as.factor(adm_yn))

adm_nn <- nnet(adm_yn ~ learning_hour,
               size = 2,
               softmax = FALSE,
               data = lr_nn_tbl)
# weights: 7
initial value 14.452720
iter 10 value 8.853278
iter 20 value 8.047945
iter 30 value 7.576388
iter 40 value 7.018420
iter 50 value 6.241457
iter 60 value 6.193520
iter 70 value 6.185768
iter 80 value 6.185419
iter 90 value 6.184381
iter 100 value 6.184270
final value 6.184270
stopped after 100 iterations
plotnet(adm_nn)
cm <- table(lr_nn_tbl$adm_yn, predict(adm_nn, lr_nn_tbl, type = "class"))

cat("\n신경망 모형 오차행렬(Confusion matrix): \n")
print(cm)

신경망 모형 오차행렬(Confusion matrix):

     0  1
  0 10  0
  1  4  6
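Overall accuracy follows directly from the confusion matrix; note that nnet() starts from random initial weights, so the exact counts can vary between runs:

# Sketch: accuracy = correct predictions / all predictions
sum(diag(cm)) / sum(cm)
# (10 + 6) / 20 = 0.8 for the run shown above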
Written by Kwangchun Lee, Data Scientist
kwangchun.lee.7@gmail.com