1 연령별 코로나 사망자1

COVerAGE-DB: A database of COVID-19 cases and deaths by age 데이터가 공개되어 코로나19로 인한 사망에 대한 많은 정보를 얻을 수 있다.

2 R 팩키지

COVerAGE-DB - R 팩키지가 있어 바로 데이터를 분석할 수 있게 실시간으로 정보를 제공하고 있다.

library(tidyverse)
# remotes::install_github("eshom/covid-age-data")
library(covidAgeData)
library(lubridate)

# inputDB <- download_covid("inputDB", progress = FALSE)
inputDB <-  read_csv("inputDB.zip",
                     skip = 1,
                     col_types = "cccccciccdc")

korea_tbl <- subset_covid(inputDB, Country = "South Korea", Region = "All") %>% 
  # 변수명과 자료형 작업
  as_tibble() %>% 
  janitor::clean_names(.) %>% 
  mutate(date = dmy(date)) %>%
  # 분석에 필수 변수만 추리는 작업
  select(date, sex, age, measure, value) %>% 
  # 변수 가독성 향상 작업
  mutate(sex = case_when(sex == 'b' ~ "남녀",
                         sex == 'f' ~ "여성",
                         sex == 'm' ~ "남성"),
         age = case_when(age == '0'  ~ "00~09",
                         age == '10' ~ "10~19",
                         age == '20' ~ "20~29",
                         age == '30' ~ "30~39",
                         age == '40' ~ "40~49",
                         age == '50' ~ "50~59",
                         age == '60' ~ "60~69",
                         age == '70' ~ "70~79",
                         age == '80' ~ "80~"),
         measure = ifelse(measure == "Cases", "확진", "사망")) %>% 
  # 결측값 제거  
  filter(!is.na(age))

  
korea_tbl
# A tibble: 10,836 x 5
   date       sex   age   measure value
   <date>     <chr> <chr> <chr>   <dbl>
 1 2020-03-04 남녀  00~09 확진       34
 2 2020-03-04 남녀  10~19 확진      233
 3 2020-03-04 남녀  20~29 확진     1575
 4 2020-03-04 남녀  30~39 확진      631
 5 2020-03-04 남녀  40~49 확진      790
 6 2020-03-04 남녀  50~59 확진     1051
 7 2020-03-04 남녀  60~69 확진      646
 8 2020-03-04 남녀  70~79 확진      260
 9 2020-03-04 남녀  80~   확진      108
10 2020-03-04 남녀  00~09 사망        0
# … with 10,826 more rows

3 한국

앞선 데이터 전처리 이후 대한민국 코로나 19 확진 및 사망 남녀 연령별로 살펴보기 위해 데이터를 확인한다.

korea_tbl %>%
  skimr::skim(.)
Data summary
Name Piped data
Number of rows 10836
Number of columns 5
_______________________
Column type frequency:
character 3
Date 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sex 0 1 2 2 0 3 0
age 0 1 3 5 0 9 0
measure 0 1 2 2 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2020-03-04 2021-04-11 2020-06-23 398

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
value 0 1 1454.75 3050.77 0 4 170 1316.25 20197 ▇▁▁▁▁

4 탐색적 데이터분석

4.1 연령대별 누적 사망자

library(timetk)

korea_tbl %>%
  filter(date >= ymd("2020-07-01")) %>% 
  filter(measure == "사망") %>% 
  group_by(age) %>% 
  plot_time_series(date, value, 
                   .facet_ncol   = 3, 
                   .interactive  = FALSE,
                   .facet_scales = "free",
                   .line_alpha   = 1,
                   .smooth       = FALSE, 
                   .smooth_size  = 0.5,
                   .smooth_alpha = 0.3) +
  scale_y_continuous(labels = scales::comma_format(), limits = c(0,1200)) +
  scale_x_date(date_labels = "%y-%m") +
  theme_bw(base_family = "NanumGothic") +
  labs(title = "연령대별 누적사망자")

4.2 연령대별 누적 확진자

korea_tbl %>%
  filter(date >= ymd("2020-07-01")) %>% 
  filter(measure == "확진") %>% 
  group_by(age) %>% 
  plot_time_series(date, value, 
                   .facet_ncol   = 3, 
                   .interactive  = FALSE,
                   .facet_scales = "free",
                   .line_alpha   = 1,
                   .smooth       = FALSE, 
                   .smooth_size  = 0.5,
                   .smooth_alpha = 0.3) +
  scale_y_continuous(labels = scales::comma_format(), limits = c(0, 20000)) +
  scale_x_date(date_labels = "%y-%m") +
  theme_bw(base_family = "NanumGothic") +
  labs(title = "연령대별 누적 확진자")

5 ASCFR

Age-specific case fatality ratio (ASCFR)

korea_tbl %>% 
  pivot_wider(names_from = measure, values_from = value) %>% 
  filter(date >= ymd("2020-07-01")) %>% 
  mutate(ASCFR = 사망 / 확진) %>%
  filter(!is.na(ASCFR)) %>%
  ggplot(aes(x = age, y = ASCFR, group = date, color = date)) +
  geom_line(alpha = .2) +
  scale_y_log10(labels = scales::percent) +
  theme_bw(base_family = "NanumGothic") +
  labs(x = "", 
       title = "연령대별 사망율 비율",
       color = "월")

6 연령대별 확진자 추이

나이를 20으로 끊어 5개 집단으로 나누고 확진자 추이를 살펴보자.

library(colorspace)

korea_tbl %>% 
  separate(age, into = c("age", "etc"), sep = "~", convert = TRUE) %>% 
  pivot_wider(names_from = measure, values_from = value) %>% 
  mutate(age20 = age - age %% 20) %>%
  group_by(date, age20) %>%
  summarize(확진 = sum(확진)) %>%
  group_by(age20) %>%
  arrange(date) %>%
  mutate(new = 확진 - lead(확진)) %>%
  ungroup() %>%
  group_by(date) %>%
  mutate(N = sum(new),
         frac = new / N) %>%
  ungroup() %>%
  ggplot(aes(x = date,
             y = frac,
             fill = as.factor(age20))) +
  geom_area() +
  scale_fill_discrete_sequential("Emrld") + 
  theme_bw(base_family = "NanumGothic") +
  labs(x     = "", 
       y     = "점유율",
       title = "연령대별 확진자 점유 추세",
       fill  = "연령대(20년)") +
  scale_y_continuous(labels = scales::percent, limits = c(0,1)) +
  scale_x_date(date_labels = "%y-%m")

7 인구 데이터

library(wpp2019)

data(popM)

kor_pop <- popM %>%
  filter(str_detect(name, "Republic of Korea")) %>% 
  select(Age = age, Population = `2020`) %>%
  mutate(Population = Population * 1000,
         Age = as.character(Age)) %>%
  separate(Age,
           into = c("Age",NA),
           sep = "-") %>%
  mutate(Age = ifelse(Age == "100+", 100, as.integer(Age)),
         Age = Age - Age %% 10) %>%
  group_by(Age) %>%
  summarize(Population = sum(Population), .groups = "drop") %>% 
  janitor::clean_names()

kor_pop
# A tibble: 11 x 2
     age population
   <dbl>      <dbl>
 1     0    2133051
 2    10    2464113
 3    20    3553988
 4    30    3708100
 5    40    4202235
 6    50    4251059
 7    60    3143765
 8    70    1594721
 9    80     564807
10    90      49428
11   100        587

7.1 연령대별 확진율

kor_pop_mortality <- korea_tbl %>% 
  separate(age, into = c("age", "etc"), sep = "~", convert = TRUE) %>% 
  select(-etc) %>% 
  pivot_wider(names_from = measure, values_from = value) %>% 
  left_join(kor_pop)

kor_pop_mortality %>% 
  filter(date >= ymd("2020-07-01")) %>% 
  mutate(확진율 = 확진 / population) %>%
  ggplot(aes(x = date, y = 확진율, group = age, color = as.factor(age))) +
  geom_line() +
  scale_color_discrete_sequential("Magenta") +
  theme_bw(base_family = "NanumGothic") +
  labs(x     = "", 
       y     = "확진율",
       title = "연령대별 확진율 추세",
       color = "연령대") +
  scale_y_log10(labels = scales::percent) +
  scale_x_date(date_labels = "%y-%m")

7.2 연령대별 사망률

kor_pop_mortality %>% 
  filter(date >= ymd("2020-07-01")) %>% 
  mutate(사망율 = 사망 / population) %>%
  ggplot(aes(x = date, y = 사망율, group = age, color = as.factor(age))) +
  geom_line() +
  scale_color_discrete_sequential("Magenta") +
  theme_bw(base_family = "NanumGothic") +
  labs(x     = "", 
       y     = "사망율",
       title = "연령대별 사망율 추세",
       color = "연령대") +
  scale_y_log10(labels = scales::percent) +
  scale_x_date(date_labels = "%y-%m")

8 백신

4 people take part in a vaccination program, to be given a vaccine that requires 2 doses to be effective against the disease.

  • Dina has received 2 doses;
  • Joel has received 1 dose;
  • Tommy has received 1 dose;
  • Ellie has not received any dose.

In our data:

  • The total number of doses administered (total_vaccinations) will be equal to 4 (2 + 1 + 1);
  • The total number of people vaccinated (people_vaccinated) will be equal to 3 (Dina, Joel, Tommy);
  • The total number of people fully vaccinated (people_fully_vaccinated) will be equal to 1 (Dina).
library(countrycode)

vaccine <- read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv",
                  col_types= "ccDddddddddd")

vaccine_kor <- vaccine %>% 
  mutate(ISO2 = countrycode(iso_code,
                             origin = 'iso3c',
                             destination = 'iso2c')) %>%
  select(location,
         ISO2,
         date,
         total_vaccinations,
         people_vaccinated,
         people_fully_vaccinated) %>% 
  filter(str_detect(str_to_lower(location), "korea"))

vaccine_kor %>% 
  pivot_longer(cols = total_vaccinations:people_fully_vaccinated, 
               names_to = "접종구분", 
               values_to = "접종자수") %>% 
  mutate(접종자수 = ifelse(is.na(접종자수), 0, 접종자수)) %>% 
  ggplot(aes(x = date, y = 접종자수, group = 접종구분)) + 
    geom_line() +
    facet_wrap( ~ 접종구분) +
    theme_bw(base_family = "NanumGothic") +
    labs(x     = "", 
         y     = "접종자수",
         title = "시간별 접종자 증가 추세") +
    scale_y_continuous(labels = scales::comma) +
    scale_x_date(date_labels = "%y-%m")

8.1 백신 제조사

https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations

v_manufacture <- read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations-by-manufacturer.csv",
                  col_types= "cDcd")

v_manufacture_tbl <- v_manufacture %>% 
  arrange(location, vaccine) %>% 
  group_by(location, vaccine) %>% 
  slice(n()) %>% 
  ungroup()

v_manufacture_tbl %>% 
  ggplot(aes(x = vaccine, y = total_vaccinations)) +
    geom_col() +
    facet_wrap(~location, scales = "free_x") +
      theme_bw(base_family = "NanumGothic") +
    labs(x     = "", 
         y     = "백신 확보량",
         title = "국가별 백신 확보량") +
    scale_y_continuous(labels = scales::comma)

 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com