COVerAGE-DB: A database of COVID-19 cases and deaths by age 데이터가 공개되어 코로나19로 인한 사망에 대한 많은 정보를 얻을 수 있다.
COVerAGE-DB - R 팩키지가 있어 바로 데이터를 분석할 수 있게 실시간으로 정보를 제공하고 있다.
library(tidyverse)
# remotes::install_github("eshom/covid-age-data")
library(covidAgeData)
library(lubridate)
# inputDB <- download_covid("inputDB", progress = FALSE)
<- read_csv("inputDB.zip",
inputDB skip = 1,
col_types = "cccccciccdc")
<- subset_covid(inputDB, Country = "South Korea", Region = "All") %>%
korea_tbl # 변수명과 자료형 작업
as_tibble() %>%
::clean_names(.) %>%
janitormutate(date = dmy(date)) %>%
# 분석에 필수 변수만 추리는 작업
select(date, sex, age, measure, value) %>%
# 변수 가독성 향상 작업
mutate(sex = case_when(sex == 'b' ~ "남녀",
== 'f' ~ "여성",
sex == 'm' ~ "남성"),
sex age = case_when(age == '0' ~ "00~09",
== '10' ~ "10~19",
age == '20' ~ "20~29",
age == '30' ~ "30~39",
age == '40' ~ "40~49",
age == '50' ~ "50~59",
age == '60' ~ "60~69",
age == '70' ~ "70~79",
age == '80' ~ "80~"),
age measure = ifelse(measure == "Cases", "확진", "사망")) %>%
# 결측값 제거
filter(!is.na(age))
korea_tbl
# A tibble: 10,836 x 5
date sex age measure value
<date> <chr> <chr> <chr> <dbl>
1 2020-03-04 남녀 00~09 확진 34
2 2020-03-04 남녀 10~19 확진 233
3 2020-03-04 남녀 20~29 확진 1575
4 2020-03-04 남녀 30~39 확진 631
5 2020-03-04 남녀 40~49 확진 790
6 2020-03-04 남녀 50~59 확진 1051
7 2020-03-04 남녀 60~69 확진 646
8 2020-03-04 남녀 70~79 확진 260
9 2020-03-04 남녀 80~ 확진 108
10 2020-03-04 남녀 00~09 사망 0
# … with 10,826 more rows
앞선 데이터 전처리 이후 대한민국 코로나 19 확진 및 사망 남녀 연령별로 살펴보기 위해 데이터를 확인한다.
%>%
korea_tbl ::skim(.) skimr
Name | Piped data |
Number of rows | 10836 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
character | 3 |
Date | 1 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
sex | 0 | 1 | 2 | 2 | 0 | 3 | 0 |
age | 0 | 1 | 3 | 5 | 0 | 9 | 0 |
measure | 0 | 1 | 2 | 2 | 0 | 2 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
date | 0 | 1 | 2020-03-04 | 2021-04-11 | 2020-06-23 | 398 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
value | 0 | 1 | 1454.75 | 3050.77 | 0 | 4 | 170 | 1316.25 | 20197 | ▇▁▁▁▁ |
library(timetk)
%>%
korea_tbl filter(date >= ymd("2020-07-01")) %>%
filter(measure == "사망") %>%
group_by(age) %>%
plot_time_series(date, value,
.facet_ncol = 3,
.interactive = FALSE,
.facet_scales = "free",
.line_alpha = 1,
.smooth = FALSE,
.smooth_size = 0.5,
.smooth_alpha = 0.3) +
scale_y_continuous(labels = scales::comma_format(), limits = c(0,1200)) +
scale_x_date(date_labels = "%y-%m") +
theme_bw(base_family = "NanumGothic") +
labs(title = "연령대별 누적사망자")
%>%
korea_tbl filter(date >= ymd("2020-07-01")) %>%
filter(measure == "확진") %>%
group_by(age) %>%
plot_time_series(date, value,
.facet_ncol = 3,
.interactive = FALSE,
.facet_scales = "free",
.line_alpha = 1,
.smooth = FALSE,
.smooth_size = 0.5,
.smooth_alpha = 0.3) +
scale_y_continuous(labels = scales::comma_format(), limits = c(0, 20000)) +
scale_x_date(date_labels = "%y-%m") +
theme_bw(base_family = "NanumGothic") +
labs(title = "연령대별 누적 확진자")
Age-specific case fatality ratio (ASCFR)
%>%
korea_tbl pivot_wider(names_from = measure, values_from = value) %>%
filter(date >= ymd("2020-07-01")) %>%
mutate(ASCFR = 사망 / 확진) %>%
filter(!is.na(ASCFR)) %>%
ggplot(aes(x = age, y = ASCFR, group = date, color = date)) +
geom_line(alpha = .2) +
scale_y_log10(labels = scales::percent) +
theme_bw(base_family = "NanumGothic") +
labs(x = "",
title = "연령대별 사망율 비율",
color = "월")
나이를 20으로 끊어 5개 집단으로 나누고 확진자 추이를 살펴보자.
library(colorspace)
%>%
korea_tbl separate(age, into = c("age", "etc"), sep = "~", convert = TRUE) %>%
pivot_wider(names_from = measure, values_from = value) %>%
mutate(age20 = age - age %% 20) %>%
group_by(date, age20) %>%
summarize(확진 = sum(확진)) %>%
group_by(age20) %>%
arrange(date) %>%
mutate(new = 확진 - lead(확진)) %>%
ungroup() %>%
group_by(date) %>%
mutate(N = sum(new),
frac = new / N) %>%
ungroup() %>%
ggplot(aes(x = date,
y = frac,
fill = as.factor(age20))) +
geom_area() +
scale_fill_discrete_sequential("Emrld") +
theme_bw(base_family = "NanumGothic") +
labs(x = "",
y = "점유율",
title = "연령대별 확진자 점유 추세",
fill = "연령대(20년)") +
scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
scale_x_date(date_labels = "%y-%m")
library(wpp2019)
data(popM)
<- popM %>%
kor_pop filter(str_detect(name, "Republic of Korea")) %>%
select(Age = age, Population = `2020`) %>%
mutate(Population = Population * 1000,
Age = as.character(Age)) %>%
separate(Age,
into = c("Age",NA),
sep = "-") %>%
mutate(Age = ifelse(Age == "100+", 100, as.integer(Age)),
Age = Age - Age %% 10) %>%
group_by(Age) %>%
summarize(Population = sum(Population), .groups = "drop") %>%
::clean_names()
janitor
kor_pop
# A tibble: 11 x 2
age population
<dbl> <dbl>
1 0 2133051
2 10 2464113
3 20 3553988
4 30 3708100
5 40 4202235
6 50 4251059
7 60 3143765
8 70 1594721
9 80 564807
10 90 49428
11 100 587
<- korea_tbl %>%
kor_pop_mortality separate(age, into = c("age", "etc"), sep = "~", convert = TRUE) %>%
select(-etc) %>%
pivot_wider(names_from = measure, values_from = value) %>%
left_join(kor_pop)
%>%
kor_pop_mortality filter(date >= ymd("2020-07-01")) %>%
mutate(확진율 = 확진 / population) %>%
ggplot(aes(x = date, y = 확진율, group = age, color = as.factor(age))) +
geom_line() +
scale_color_discrete_sequential("Magenta") +
theme_bw(base_family = "NanumGothic") +
labs(x = "",
y = "확진율",
title = "연령대별 확진율 추세",
color = "연령대") +
scale_y_log10(labels = scales::percent) +
scale_x_date(date_labels = "%y-%m")
%>%
kor_pop_mortality filter(date >= ymd("2020-07-01")) %>%
mutate(사망율 = 사망 / population) %>%
ggplot(aes(x = date, y = 사망율, group = age, color = as.factor(age))) +
geom_line() +
scale_color_discrete_sequential("Magenta") +
theme_bw(base_family = "NanumGothic") +
labs(x = "",
y = "사망율",
title = "연령대별 사망율 추세",
color = "연령대") +
scale_y_log10(labels = scales::percent) +
scale_x_date(date_labels = "%y-%m")
4 people take part in a vaccination program, to be given a vaccine that requires 2 doses to be effective against the disease.
In our data:
library(countrycode)
<- read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv",
vaccine col_types= "ccDddddddddd")
<- vaccine %>%
vaccine_kor mutate(ISO2 = countrycode(iso_code,
origin = 'iso3c',
destination = 'iso2c')) %>%
select(location,
ISO2,
date,
total_vaccinations,
people_vaccinated,%>%
people_fully_vaccinated) filter(str_detect(str_to_lower(location), "korea"))
%>%
vaccine_kor pivot_longer(cols = total_vaccinations:people_fully_vaccinated,
names_to = "접종구분",
values_to = "접종자수") %>%
mutate(접종자수 = ifelse(is.na(접종자수), 0, 접종자수)) %>%
ggplot(aes(x = date, y = 접종자수, group = 접종구분)) +
geom_line() +
facet_wrap( ~ 접종구분) +
theme_bw(base_family = "NanumGothic") +
labs(x = "",
y = "접종자수",
title = "시간별 접종자 증가 추세") +
scale_y_continuous(labels = scales::comma) +
scale_x_date(date_labels = "%y-%m")
https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations
<- read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations-by-manufacturer.csv",
v_manufacture col_types= "cDcd")
<- v_manufacture %>%
v_manufacture_tbl arrange(location, vaccine) %>%
group_by(location, vaccine) %>%
slice(n()) %>%
ungroup()
%>%
v_manufacture_tbl ggplot(aes(x = vaccine, y = total_vaccinations)) +
geom_col() +
facet_wrap(~location, scales = "free_x") +
theme_bw(base_family = "NanumGothic") +
labs(x = "",
y = "백신 확보량",
title = "국가별 백신 확보량") +
scale_y_continuous(labels = scales::comma)
데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com