1 서울 데이터

나무위키 제21대 국회의원 선거/여론조사 문서는 등록된 여론조사가 많아지면서 최근 시도별 문서로 쪼개지게 되었다. 잘 되었다. 모듈 형태로 개발해서 테스트하기 딱 좋은 상황이 되었다.

1.1 특정 여론조사

나무위키 제21대 국회의원 선거/여론조사에서 특정 여론조사를 선정하여 이를 긁어내어 데이터프레임으로 만들어보자.

# Save a PNG screenshot of the single table element matched by the CSS
# selector on the Seoul polling page.
webshot2::webshot(
  url = "https://namu.wiki/w/제21대 국회의원 선거/여론조사/서울특별시",
  file = "data/one-sample-seoul.png",
  selector = '#app > div > div:nth-child(2) > article > div:nth-child(5) > div:nth-child(2) > div > div > div:nth-child(7) > div:nth-child(8) > table'
)

전체 제21대 국회의원 선거/여론조사에서 특정 선거구 “서울특별시 종로구”를 선정하고, 해당 데이터를 스크래핑하여 데이터프레임으로 변환시킨다.

library(tidyverse)
library(rvest)

# Sys.setlocale("LC_ALL", "C")

# Download and parse the Seoul polling page.
seoul_html <- read_html("https://namu.wiki/w/제21대 국회의원 선거/여론조사/서울특별시")

# Print how many `div.wiki-table-wrap` nodes the page contains.
length(html_nodes(seoul_html, 'div.wiki-table-wrap'))

# Take the 8th table wrapper and collect the text of every
# `div.wiki-paragraph` node inside it as a character vector.
seoul_txt <- html_text(
  html_nodes(
    html_nodes(seoul_html, 'div.wiki-table-wrap')[8],
    'div.wiki-paragraph'
  )
)

# Sys.setlocale("LC_ALL", "Korean")

## 여론조사 구성요소별로 분리
### 여론조사개요
## Split the scraped text into the components of one poll
### Poll overview: the first element is the title line. Everything before its
### first comma is the polling agency; the remainder holds method and date.
survey_title <- seoul_txt[1]
survey_overview <- str_split_fixed(survey_title, ",", n = 2)

# One date pattern shared by extraction and removal so the two cannot drift
# apart. The earlier single-digit `[0-9]월` pattern did not match months
# 10-12, which left the date NA and the date text inside the survey method.
survey_date_pattern <- "[0-9]{4}년.*일"

survey_agency <- survey_overview[1]
survey_date <- str_extract(survey_overview[2], pattern = survey_date_pattern)
survey_method <- str_remove(
  survey_overview[2],
  pattern = str_c(",\\s", survey_date_pattern)
)

survey_main <- seoul_txt[-1]

### Candidate support: the elements that contain a percent sign
has_percent <- str_detect(survey_main, "%")
survey_value <- survey_main[has_percent]

### Party labels and candidate names: the elements without a percent sign
survey_text <- survey_main[!has_percent]

# The "무당층" (unaffiliated) entry closes the run of party labels; the
# candidate names follow it.
index_vec <- str_detect(survey_text, "무당층")
location_무당층 <- which(index_vec)

if (length(location_무당층) != 1) {
  stop(
    "Expected exactly one '무당층' entry in the table text, found ",
    length(location_무당층),
    call. = FALSE
  )
}

# seq_len() based indexing stays correct when "무당층" is the last element;
# `(location + 1):length(x)` would count backwards there.
survey_party <- survey_text[seq_len(location_무당층)]

### Candidate names ("무당층" has no candidate, so it is appended as its own)
survey_candidate <- c(survey_text[-seq_len(location_무당층)], "무당층")


# Data frame: one row per party entry; the scalar poll metadata is recycled.

seoul_sample_df <- tibble(
  정당 = survey_party,
  후보자명 = survey_candidate,
  지지율 = survey_value,
  조사업체 = survey_agency,
  조사방식 = survey_method,
  조사일자 = survey_date
)

write_rds(seoul_sample_df, "data/seoul_sample_df.rds")

제대로 스크래핑이 되었는지 확인한다.

# Reload the saved sample and render it as an interactive table for a
# visual check.
seoul_sample_df <- read_rds("data/seoul_sample_df.rds")

DT::datatable(seoul_sample_df)

1.2 여론조사 함수

앞서 작성한 코드를 바탕으로, 표 번호를 지정하면 해당 여론조사 결과를 데이터프레임으로 추출하는 함수를 만들어본다.

# Parse the Seoul polling page again so this section runs on its own;
# scrape_table() below reads from this object.
seoul_html <- "https://namu.wiki/w/제21대 국회의원 선거/여론조사/서울특별시" %>%
  read_html()


#' Scrape one poll table from the parsed polling page into a tibble
#'
#' Selects the `table_number`-th `div.wiki-table-wrap` node of `html`, reads
#' the text of the `div.wiki-paragraph` nodes inside it, and splits that text
#' into the poll title (agency, method, date), the support figures (elements
#' containing "%"), the party labels (up to and including "무당층") and the
#' candidate names (everything after "무당층").
#'
#' @param table_number Position of the table wrapper to scrape among the
#'   `div.wiki-table-wrap` nodes of the page.
#' @param html Parsed page to read from. Defaults to the global `seoul_html`,
#'   so existing `scrape_table(n)` calls behave as before.
#' @return A tibble with the character columns 정당, 후보자명, 지지율,
#'   조사업체, 조사방식 and 조사일자, one row per party entry.
#' @details Signals an error when the table text does not contain exactly one
#'   "무당층" entry, or when `tibble()` rejects columns of incompatible
#'   lengths.
scrape_table <- function(table_number, html = seoul_html) {

  table_nodes <- html_nodes(html, 'div.wiki-table-wrap')
  namu_txt <- html_text(html_nodes(table_nodes[table_number], 'div.wiki-paragraph'))

  ## Split the scraped text into the components of one poll
  ### Poll overview: text before the first comma of the title is the agency;
  ### the remainder holds the survey method and the survey date.
  overview <- str_split_fixed(namu_txt[1], ",", n = 2)

  # One date pattern shared by extraction and removal. The removal step used
  # to require a single-digit month, so for months 10-12 the date stayed
  # inside the survey method.
  date_pattern <- "[0-9]{4}년.*일"

  survey_agency <- overview[1]
  survey_date <- str_extract(overview[2], pattern = date_pattern)
  survey_method <- str_remove(overview[2], pattern = str_c(",\\s", date_pattern))

  survey_main <- namu_txt[-1]

  ### Candidate support: the elements that contain a percent sign
  has_percent <- str_detect(survey_main, "%")
  survey_value <- survey_main[has_percent]

  ### Party labels and candidate names: the elements without a percent sign
  survey_text <- survey_main[!has_percent]

  # "무당층" (unaffiliated) closes the run of party labels.
  party_end <- which(str_detect(survey_text, "무당층"))

  if (length(party_end) != 1) {
    stop(
      "Expected exactly one '무당층' entry in table ", table_number,
      ", found ", length(party_end),
      call. = FALSE
    )
  }

  # seq_len() based indexing stays correct when "무당층" is the last element;
  # `(party_end + 1):length(x)` would count backwards there.
  survey_party <- survey_text[seq_len(party_end)]

  ### Candidate names ("무당층" has no candidate, so it is appended as its own)
  survey_candidate <- c(survey_text[-seq_len(party_end)], "무당층")

  # Data frame: one row per party entry; the scalar poll metadata is recycled.
  tibble(
    정당 = survey_party,
    후보자명 = survey_candidate,
    지지율 = survey_value,
    조사업체 = survey_agency,
    조사방식 = survey_method,
    조사일자 = survey_date
  )
}

# Try the function on the 8th table wrapper and print the resulting tibble.
scrape_table(table_number = 8)

# A tibble: 3 x 6
  정당   후보자명 지지율 조사업체             조사방식 조사일자          
  <chr>  <chr>    <chr>  <chr>                <chr>    <chr>             
1 민주   이낙연   50.6%  조원씨앤아이 조사[5] ars      2020년 2월 17~18일
2 통합   황교안   35.8%  조원씨앤아이 조사[5] ars      2020년 2월 17~18일
3 무당층 무당층   9.6%   조원씨앤아이 조사[5] ars      2020년 2월 17~18일

1.3 반복

앞서 만든 함수를 활용하여, 페이지에 있는 모든 여론조사 표에서 결과를 추출하도록 반복문을 작성한다.

반복을 돌리기에 앞서 반복을 돌릴 색인값을 찾아낸다.

# Wrap scrape_table() so that a table which cannot be parsed yields the
# string "Error" instead of aborting the whole run.
possibly_scrap_table <- possibly(scrape_table, otherwise = "Error")

# Number of table wrappers on the page, i.e. how many indices to iterate over.
seoul_length <- length(html_nodes(seoul_html, 'div.wiki-table-wrap'))

# seq_len() gives an empty index when no table wrappers are found, whereas
# 1:seoul_length would iterate over c(1, 0).
seoul_full_list <- map(seq_len(seoul_length), possibly_scrap_table)
names(seoul_full_list) <- seq_len(seoul_length)

write_rds(seoul_full_list, "data/seoul_full_list.rds")

서울 지역 여론조사 중 스크래핑이 제대로 수행된 것만 추출하여 데이터를 정제한다.

# Reload the per-table scraping results saved earlier.
seoul_full_list <- read_rds("data/seoul_full_list.rds")

# Turn the list into a name/value table, flag the elements that are data
# frames (map_dbl stores the flag as 1/0), keep only those rows, and expand
# the kept tibbles into ordinary rows.
seoul_full_df <- unnest(
  filter(
    mutate(
      enframe(seoul_full_list),
      check = map_dbl(value, is.data.frame)
    ),
    check == TRUE
  ),
  value
)

# Render the combined result as an interactive table.
DT::datatable(seoul_full_df)

# Persist the cleaned data frame.
write_rds(seoul_full_df, "data/seoul_full_df.rds")

대한민국 제21대 국회의원 선거

선거구별 지지율 데이터

Tidyverse Korea

2020-04-05

1 서울 데이터

1.1 특정 여론조사

1.2 여론조사 함수

1.3 반복

2 전국