\[ \frac{1}{1 + e^{-\beta_0 - \beta_1 X_1 - \cdots - \beta_n X_n}} = \text{output} \]
Regression model
\[ z = \left ( \sum_i w_i \times x_i + b \right ) \]
Visualization
library(tidyverse)

## Linear regression model
reg_func <- function(x) { x }

ggplot(data.frame(x = c(-4, 4)), mapping = aes(x = x)) +
  geom_hline(yintercept = 0, color = 'red') +
  geom_vline(xintercept = 0, color = 'red') +
  stat_function(fun = reg_func, colour = "dodgerblue3") +
  ggtitle('Regression function') +
  scale_x_continuous(name = 'x') +
  scale_y_continuous(name = 'f(x)') +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
Sigmoid (binomial regression model)
\[ \sigma(z) = \left ( \frac{1} {1 + e^{-z}} \right ) \]
Visualization
plot_activation_function <- function(f, title, range){
  ggplot(data.frame(x = range), mapping = aes(x = x)) +
    geom_hline(yintercept = 0, color = 'red', alpha = 1/4) +
    geom_vline(xintercept = 0, color = 'red', alpha = 1/4) +
    stat_function(fun = f, colour = "dodgerblue3") +
    ggtitle(title) +
    scale_x_continuous(name = 'x') +
    scale_y_continuous(name = 'f(x)') +
    theme_light() +
    theme(plot.title = element_text(hjust = 0.5))
}

f <- function(x){ 1 / (1 + exp(-x)) }
plot_activation_function(f, 'Sigmoid (binomial regression)', c(-4, 4))
Tanh (Hyperbolic Tangent)
\[ \tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}} = \frac{1 - e^{-2x}}{1 + e^{-2x}} \]
Visualization
tanh_func <- function(x){ tanh(x) }
plot_activation_function(tanh_func, 'TanH', c(-4, 4))
Softmax (multiclass regression model)
\[ \sigma(z_i) = \frac{e^{z_i}}{\sum_{j=1}^K e^{z_j}} \quad \text{for } i = 1, 2, \dots, K \]
Visualization
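Softmax maps a length-\(K\) score vector to a probability vector rather than a scalar, so it cannot be passed to plot_activation_function directly. A minimal sketch (my own illustration, not from the original text): compute softmax with the standard max-subtraction stability trick, then visualize one component while holding the other logits at zero.

softmax <- function(z) {
  z_stable <- z - max(z)                 # guard against overflow in exp()
  exp(z_stable) / sum(exp(z_stable))
}
# First component as its logit varies, the other two logits fixed at 0 (K = 3)
softmax_first <- function(x) sapply(x, function(z) softmax(c(z, 0, 0))[1])
plot_activation_function(softmax_first, 'Softmax (first component, K = 3)', c(-4, 4))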
ReLU (Rectified Linear Unit)
\[ \mathrm{ReLU}(z) = \max(0, z) \]
Visualization
rec_lu_func <- function(x){ ifelse(x < 0, 0, x) }
plot_activation_function(rec_lu_func, 'ReLU', c(-4, 4))
Continuous loss functions
L1 loss (mean absolute error):
\[ \sum_{i=1}^{D}|x_i-y_i| \]
L2 loss (sum of squared errors):
\[ \sum_{i=1}^{D}(x_i-y_i)^2 \]
Huber loss:
\[ L_{\delta} = \begin{cases} \frac{1}{2}(y - \hat{y})^{2} & \text{if } |y - \hat{y}| < \delta \\ \delta \left( |y - \hat{y}| - \frac{1}{2}\delta \right) & \text{otherwise} \end{cases} \]
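A minimal R sketch of the three losses above; the function names and the \(\delta = 1\) default are illustrative choices, not from the text:

mae_loss <- function(y, y_hat) sum(abs(y - y_hat))       # L1
sse_loss <- function(y, y_hat) sum((y - y_hat)^2)        # L2
huber_loss <- function(y, y_hat, delta = 1) {
  r <- abs(y - y_hat)
  sum(ifelse(r < delta, 0.5 * r^2, delta * (r - 0.5 * delta)))
}
y <- c(1.0, 2.0, 3.0); y_hat <- c(1.1, 1.8, 4.5)
c(mae_loss(y, y_hat), sse_loss(y, y_hat), huber_loss(y, y_hat))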
Categorical loss functions
Binary cross-entropy:
\[ -{(y\log(p) + (1 - y)\log(1 - p))} \]
Multiclass cross-entropy:
\[ -\sum_{c=1}^M y_{o,c}\log(p_{o,c}) \]
Negative log-likelihood:
\[ NLL(y) = -\log(p(y)) \]
Minimizing the summed negative log-likelihood over the parameters \(\theta\)
\[ \min_{\theta} \sum_y -\log(p(y;\theta)) \]
is equivalent to maximizing the likelihood:
\[ \max_{\theta} \prod_y p(y;\theta) \]
Hinge loss:
\[ \max(0, 1 - y \cdot \hat{y}) \]
KL divergence:
\[ KL(\hat{y} \,\|\, y) = \sum_{c=1}^{M}\hat{y}_c \log{\frac{\hat{y}_c}{y_c}} \]
JS divergence:
\[ JS(\hat{y} \,\|\, y) = \frac{1}{2}\left( KL\left(y \,\Big\|\, \frac{y+\hat{y}}{2}\right) + KL\left(\hat{y} \,\Big\|\, \frac{y+\hat{y}}{2}\right) \right) \]
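A minimal sketch of two of the categorical losses above, binary cross-entropy and KL divergence; the eps guard against log(0) is my own addition:

binary_cross_entropy <- function(y, p, eps = 1e-12) {
  -(y * log(p + eps) + (1 - y) * log(1 - p + eps))
}
kl_divergence <- function(y_hat, y, eps = 1e-12) {
  sum(y_hat * log((y_hat + eps) / (y + eps)))
}
binary_cross_entropy(1, 0.9)             # confident and correct: small loss
binary_cross_entropy(1, 0.1)             # confident and wrong: large loss
kl_divergence(c(0.7, 0.3), c(0.5, 0.5))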
Chain rule
\[\frac{dy}{dx} = \frac{dy}{du} \cdot \frac{du}{dx}\] For example, differentiating \(f(x) = (x^3 + 2x)^4\) gives
\[ f'(x) = 4 \times (x^3 + 2x)^3 \times \frac{d}{dx}(x^3 + 2x) \\ = 4 \times (x^3 + 2x)^3 \times (3x^2 + 2) \]
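The result above can be checked numerically; a minimal sketch using a central-difference approximation (my own illustration, not part of the original text):

f <- function(x) (x^3 + 2 * x)^4
f_prime <- function(x) 4 * (x^3 + 2 * x)^3 * (3 * x^2 + 2)   # chain-rule derivative
num_deriv <- function(f, x, h = 1e-6) (f(x + h) - f(x - h)) / (2 * h)
f_prime(1.5)
num_deriv(f, 1.5)   # agrees with f_prime(1.5) to several decimal places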
The partial derivative of the loss function \(E\) with respect to a weight \(w_{ij}^k\) can be defined as follows.
\[ \frac{\partial E}{ \partial w} = \delta_{j}^{k} o_{i}^{k-1} \]
The partial derivative corresponding to each input and output pair can then be defined as
\[ \frac{ \partial E(X,\theta)}{\partial w_{ij}^k} = \frac{1}{N} \sum_{d=1}^{N}\frac{\partial}{\partial w_{ij}^k} \left ( \frac{1}{2} (\hat{y}_d - y_d)^2 \right ) = \frac{1}{N} \sum_{d=1}^{N}\frac{\partial E_d}{\partial w_{ij}^k} \] Accordingly, the weight update can be computed as follows.
\[\Delta w_{ij}^k = - \alpha \frac{ \partial E(X,\theta)}{\partial w_{ij}^k} \]
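Before the full implementation below, a toy one-parameter sketch of this update rule on the quadratic loss \(E(w) = (w - 3)^2\); the loss and the value of \(\alpha\) are illustrative choices, not from the text:

grad_E <- function(w) 2 * (w - 3)   # dE/dw for E(w) = (w - 3)^2
w <- 0
alpha <- 0.1
for (i in 1:50) {
  w <- w - alpha * grad_E(w)        # Delta w = -alpha * dE/dw
}
w                                   # approaches the minimizer w = 3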
\[ x \beta = \beta_0 + \beta_1 X_1 + \cdots + \beta_n X_n \] \[ \text{activation function} = \frac{\exp(x\beta)}{1 + \exp(x\beta)} = \frac{1}{1 + \exp(-x\beta)} \]
\[ \text{probability} = \frac{\exp( \beta_0 + \beta_1 X_1 + \cdots + \beta_n X_n )}{1 + \exp( \beta_0 + \beta_1 X_1 + \cdots + \beta_n X_n )} = \frac{1}{1 + e^{-\beta_0 - \beta_1 X_1 - \cdots - \beta_n X_n}} \]
The task is to implement a classifier that, given photographs of cats and dogs as data, labels each image as a dog (1) or a cat (0). The data is available from the Kaggle "Dogs vs. Cats - Create an algorithm to distinguish dogs from cats" website.
library(magick)
library(tidyverse)

train_imgs <- fs::dir_ls("data/dogs-vs-cats/train/")

dogs_imgs <- train_imgs[str_detect(train_imgs, "dog\\.")][sample(5)]
cats_imgs <- train_imgs[str_detect(train_imgs, "cat\\.")][sample(5)]

show_imgs <- c(dogs_imgs, cats_imgs)

cats_dogs <- map(show_imgs, image_read) %>%
  image_join()

cats_dogs[1:5] %>%
  image_resize("70x70") %>%
  image_append(stack = FALSE)

cats_dogs[6:10] %>%
  image_resize("70x70") %>%
  image_append(stack = FALSE)
library(tidyverse)

# Define the activation function (sigmoid) -----------
sigmoid <- function(x){
  1 / (1 + exp(-x))
}

# Initialize parameters ----------------------
initialize_with_zeros <- function(dim){
  w = matrix(0, nrow = dim, ncol = 1)
  b = 0
  return(list(w, b))
}

# Forward propagation, cost computation, then backpropagation ----
propagate <- function(w, b, X, Y){
  m <- ncol(X)
  # Forward propagation
  A <- sigmoid((t(w) %*% X) + b)
  # Cost (error) computation
  cost <- (-1 / m) * sum(Y * log(A) + (1 - Y) * log(1 - A))
  # Backpropagation
  dw <- (1 / m) * (X %*% t(A - Y))
  db <- (1 / m) * rowSums(A - Y)
  grads <- list(dw, db)
  return(list(grads, cost))
}

# Optimization (gradient descent) ----------------
optimize <- function(w, b, X, Y, num_iter, learning_rate, print_cost = FALSE) {
  costs <- list()
  for (i in 1:num_iter) {
    # Compute gradients and cost
    fwd_bwd <- propagate(w, b, X, Y)
    grads <- fwd_bwd[[1]]
    cost <- fwd_bwd[[2]]
    # Extract derivatives
    dw <- matrix(grads[[1]])
    db <- grads[[2]]
    # Update parameters
    w <- w - learning_rate * dw
    b <- b - learning_rate * db
    # Record the cost every 100 iterations
    if (i %% 100 == 0) {
      costs[[i]] <- cost
    }
    # Print the cost every 500 iterations
    if ((print_cost == TRUE) & (i %% 500 == 0)) {
      cat(sprintf("Cost after iteration %d: %06f\n", i, costs[[i]]))
    }
    params <- list(w, b)
    grads <- list(dw, db)
  }
  return(list(params, grads, costs))
}

# Prediction -------------------------
pred <- function(w, b, X) {
  m <- ncol(X)
  Y_prediction <- matrix(0, nrow = 1, ncol = m)
  # Compute the predicted cat/dog probability
  A <- sigmoid((t(w) %*% X) + b)
  # Label as dog when the predicted probability exceeds 50%, otherwise cat
  for (i in 1:ncol(A)) {
    if (A[1, i] > 0.5) {
      Y_prediction[1, i] = 1
    } else {
      Y_prediction[1, i] = 0
    }
  }
  return(Y_prediction)
}

# Neural network model -------------------------
nn_model <- function(X_train, Y_train, X_test, Y_test,
                     num_iter, learning_rate, print_cost = FALSE){
  # Initialize parameters to zero
  w <- initialize_with_zeros(nrow(X_train))[[1]]
  b <- initialize_with_zeros(nrow(X_train))[[2]]
  # Minimize the cost with gradient descent
  optFn_output <- optimize(w, b, X_train, Y_train,
                           num_iter, learning_rate, print_cost)
  parameters <- optFn_output[[1]]
  grads <- optFn_output[[2]]
  costs <- optFn_output[[3]]
  # Store the fitted parameters w, b
  w <- as.matrix(parameters[[1]])
  b <- parameters[[2]]
  # Predict on training/test data
  pred_train = pred(w, b, X_train)
  pred_test = pred(w, b, X_test)
  # Report training/test accuracy
  cat(sprintf("train accuracy: %#.2f \n", mean(pred_train == Y_train) * 100))
  cat(sprintf("test accuracy: %#.2f \n", mean(pred_test == Y_test) * 100))
  res = list("costs" = costs,
             "pred_train" = pred_train,
             "pred_test" = pred_test,
             "w" = w,
             "b" = b,
             "learning_rate" = learning_rate,
             "num_iter" = num_iter)
  return(res)
}
Since the input data are images, they must first be converted into numbers the machine can compute with.
raw_img <- image_read("data/dogs-vs-cats/train/cat.1.jpg")
# raw_mat <- image_data(raw_img, 'rgba')
raw_tiff <- image_convert(raw_img, "tiff")
raw_array <- as.integer(raw_tiff[[1]])

dim(raw_array)
raw_array[, , 1]
library(EBImage)

file_path_train <- "data/dogs-vs-cats/train"
file_path_test <- "data/dogs-vs-cats/test"

library(pbapply)

height = 64
width = 64
channels = 3

extract_feature <- function(dir_path, width, height) {
  img_size <- width * height
  images <- list.files(dir_path)
  label <- ifelse(grepl("dog", images), 1, 0)
  print(paste("Processing", length(images), "images"))
  feature_list <- pblapply(images, function(imgname) {
    img <- readImage(file.path(dir_path, imgname))
    img_resized <- EBImage::resize(img, w = width, h = height)
    img_matrix <- matrix(reticulate::array_reshape(img_resized, (width * height * channels)),
                         nrow = width * height * channels)
    img_vector <- as.vector(t(img_matrix))
    return(img_vector)
  })
  feature_matrix <- do.call(rbind, feature_list)
  return(list(t(feature_matrix), label))
}
# data_train <- extract_feature(file_path_train, width = 64, height = 64)
# data_train %>%
#   write_rds("data/dogs-vs-cats/data_train.rds")
data_train <- read_rds("data/dogs-vs-cats/data_train.rds")
trainx <- data_train[[1]]
trainy <- data_train[[2]]
dim(trainx)

# data_test <- extract_feature(file_path_test, width = 64, height = 64)
# data_test %>%
#   write_rds("data/dogs-vs-cats/data_test.rds")
data_test <- read_rds("data/dogs-vs-cats/data_test.rds")
testx <- data_test[[1]]
testy <- data_test[[2]]

# Data preprocessing --------------------
trainx <- scale(trainx)
testx <- scale(testx)
# Model fitting ---------------------
model <- nn_model(trainx,
                  trainy,
                  testx,
                  testy,
                  num_iter = 5000,
                  learning_rate = 0.01,
                  print_cost = TRUE)
# Cost after iteration 500: 0.011374
# Cost after iteration 1000: 0.005671
# Cost after iteration 1500: 0.003772
# Cost after iteration 2000: 0.002825
# Cost after iteration 2500: 0.002257
# Cost after iteration 3000: 0.001880
# Cost after iteration 3500: 0.001610
# Cost after iteration 4000: 0.001408
# Cost after iteration 4500: 0.001251
# Cost after iteration 5000: 0.001126
# train accuracy: 100.00
# test accuracy: 62.00
x <- seq(100, 5000, by = 100)   # costs were recorded every 100 iterations
y <- unlist(model$costs)
smoothingSpline <- smooth.spline(x, y, spar = 0.35)

plot(NULL, type = "n", xlab = "Iterations", ylab = "Cost",
     xlim = c(1, 5000), ylim = c(0, 1),
     xaxt = "n", yaxt = "n", cex.lab = 0.7)
lines(smoothingSpline, col = 'deepskyblue4')
axis(side = 1, col = "black", cex.axis = 0.7)
axis(side = 2, col = "black", cex.axis = 0.7)
legend(1550, 0.9, inset = 0.001, c('Learning rate = 0.01'), cex = 0.6)
Vary the hyperparameters to improve image-classification accuracy.
learning_rates <- c(0.01, 0.002, 0.005)
models <- list()
smoothingSpline <- list()

plot(NULL, type = "n", xlab = "Iterations", ylab = "Cost",
     xlim = c(1, 5000), ylim = c(0, 1), xaxt = "n", yaxt = "n", cex.lab = 0.7)
for (i in 1:length(learning_rates)) {
  cat(sprintf("Learning rate: %#.3f \n", learning_rates[i]))
  models[[i]] <- nn_model(trainx,
                          trainy,
                          testx,
                          testy,
                          num_iter = 5000,
                          learning_rate = learning_rates[i],
                          print_cost = F)
  cat('\n-------------------------------------------------------\n')
  x <- seq(100, 5000, by = 100)
  y <- unlist(models[[i]]$costs)
  smoothingSpline = smooth.spline(x, y, spar = 0.35)
  lines(smoothingSpline, col = i + 2, lwd = 2)
}
axis(side = 1, col = "black", cex.axis = 0.7)
axis(side = 2, col = "black", cex.axis = 0.7)
legend("topright", inset = 0.001,
       c('Learning rate = 0.01', 'Learning rate = 0.002', 'Learning rate = 0.005'),
       lwd = c(2, 2, 2),
       lty = c(1, 1, 1),
       col = c('green3', 'blue', 'cyan'),
       cex = 0.6)
# Learning rate: 0.010
# train accuracy: 100.00
# test accuracy: 62.00
# -------------------------------------------------------
# Learning rate: 0.002
# train accuracy: 100.00
# test accuracy: 60.00
# -------------------------------------------------------
# Learning rate: 0.005
# train accuracy: 100.00
# test accuracy: 62.00
# -------------------------------------------------------
The Convolutional Neural Network, a neural network architecture specialized for image recognition, uses the convolution operation to preserve spatial image information while drastically reducing the amount of computation compared to conventional neural network architectures.
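To make the convolution operation concrete, a minimal 2D convolution sketch (my own illustration, not this chapter's implementation). Note how a single 3x3 kernel's nine shared weights slide over the whole image, which is where the parameter and computation savings come from compared with a fully connected layer:

conv2d <- function(img, kernel) {
  kh <- nrow(kernel); kw <- ncol(kernel)
  oh <- nrow(img) - kh + 1               # output height ("valid" padding)
  ow <- ncol(img) - kw + 1               # output width
  out <- matrix(0, oh, ow)
  for (i in 1:oh) {
    for (j in 1:ow) {
      out[i, j] <- sum(img[i:(i + kh - 1), j:(j + kw - 1)] * kernel)
    }
  }
  out
}
img <- matrix(1:25, nrow = 5)            # toy 5x5 "image"
edge_kernel <- matrix(c(-1, -1, -1,
                        -1,  8, -1,
                        -1, -1, -1), nrow = 3, byrow = TRUE)
conv2d(img, edge_kernel)                 # 3x3 feature map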
“You shall know a word by the company it keeps”
John Rupert Firth