중앙선거관리위원회 → 중앙선거여론조사심의위원회 → 알림마당 → 여론조사기관 등록현황 메뉴에서 2020-03-21 기준 “기관등록 현황”과 “기관등록 취소현황”을 확인할 수 있다.
# 0. 환경설정 -----
library(tidyverse)
library(rvest)
library(httr)
library(glue)
# Download and parse the first listing page. The locale is switched to "C"
# for the download and back to "Korean" right afterwards.
# NOTE(review): presumably a workaround for encoding problems when parsing
# the page on Korean Windows -- confirm.
# The `[1] ...` lines are the console output echoed by each Sys.setlocale().
Sys.setlocale(category = "LC_ALL", locale = "C")
[1] "C"
first_url <- "https://www.nesdc.go.kr/portal/content/onvy/list.do"
first_html <- read_html(first_url)
Sys.setlocale(category = "LC_ALL", locale = "Korean")
[1] "LC_COLLATE=Korean_Korea.949;LC_CTYPE=Korean_Korea.949;LC_MONETARY=Korean_Korea.949;LC_NUMERIC=C;LC_TIME=Korean_Korea.949"
## Phone numbers
# Start from the children of the listing container on the first page.
전화번호 <- html_nodes(first_html, xpath = '//*[@id="frm"]/div[3]/div')
전화번호 <- xml_children(전화번호)
# NOTE(review): this XPath begins with "//", which in XPath addresses the
# whole document rather than the current node; ".//" may have been
# intended -- confirm against the page before changing.
전화번호 <- xml_find_all(전화번호, "//span[@class='col ws']")
# Pull every phone-number-shaped token out of the span texts.
전화번호 <- str_extract_all(
  xml_text(전화번호),
  "([0-9]{2,3}-)?[0-9]{3,4}-[0-9]{3,4}",
  simplify = TRUE
)
# simplify = TRUE gives a character matrix; keep only the non-empty cells,
# which also flattens the result to a plain character vector.
전화번호 <- 전화번호[전화번호 != ""]
## Registration dates
# Start from the children of the listing container on the first page.
등록일자 <- html_nodes(first_html, xpath = '//*[@id="frm"]/div[3]/div')
등록일자 <- xml_children(등록일자)
# NOTE(review): this XPath begins with "//", which in XPath addresses the
# whole document rather than the current node; ".//" may have been
# intended -- confirm against the page before changing.
등록일자 <- xml_find_all(등록일자, "//span[@class='col ws']")
# Pull every YYYY-MM-DD token out of the span texts.
등록일자 <- str_extract_all(
  xml_text(등록일자),
  "[0-9]{4}-[0-9]{2}-[0-9]{2}",
  simplify = TRUE
)
# simplify = TRUE gives a character matrix; keep only the non-empty cells,
# which also flattens the result to a plain character vector.
등록일자 <- 등록일자[등록일자 != ""]
# Attributes of every child node of the listing container on the first page.
# NOTE(review): "row tr" lands in the `ns` argument of xml_attrs(); it looks
# like a leftover class name rather than a namespace -- confirm.
link_list <- xml_attrs(
  xml_children(html_nodes(first_html, xpath = '//*[@id="frm"]/div[3]/div')),
  "row tr"
)

# Keep only the nodes that carry an href and turn each relative href into an
# absolute detail-page URL (single column: link).
link_df <- enframe(map(link_list, "href")) %>%
  filter(!map_lgl(value, is.null)) %>%
  unnest(value) %>%
  transmute(link = glue("https://www.nesdc.go.kr{value}"))
#' Scrape one detail page into a wide data frame.
#'
#' Reads the page at `link`, takes the table found at a fixed XPath and
#' spreads its first column (`X1`) into column names, using the second
#' column (`X2`) as the values.
#'
#' The locale is switched to "C" while the page is read and set to "Korean"
#' when the function exits -- also when the download or the parsing fails,
#' so an error no longer leaves the session stuck in the "C" locale.
#'
#' @param link URL of a detail page.
#' @return The scraped table after `spread(X1, X2)`.
convert_df <- function(link) {
  Sys.setlocale("LC_ALL", "C")
  # Registered right after the switch so the locale is reset on every exit
  # path, not only on success.
  on.exit(Sys.setlocale("LC_ALL", "Korean"), add = TRUE)

  link %>%
    read_html() %>%
    html_node(xpath = '/html/body/section/div/div/div[1]/table') %>%
    html_table(fill = TRUE) %>%
    spread(X1, X2)
}
# Scrape the detail page behind every first-page link and row-bind the
# results, then show them as an interactive table.
company_df <- map_dfr(link_df[["link"]], convert_df)
DT::datatable(company_df)
# URLs of listing pages 1 to 9, stored as a list named link_1 ... link_9.
# Only the trailing pageIndex query parameter varies between pages.
nesdc_page <- map(1:9, function(page_index) {
  glue("https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&edate=&searchVO=egovframework.rte.nesdc.bbs.vo.SearchVO@386025ce[sdate=,edate=,xdate=,searchTime=,searchCnd=,searchWrd=,pageIndex=1,pageUnit=10,pageSize=10,firstIndex=0,lastIndex=10,recordCountPerPage=10,rowNo=0,bbsNm=,delCode=0,orgTableNm=,tableNm=,orgPath=,newPath=,atchFileId=,orgFileNm=,fileSn=0,fileNm=,totalRows=0,totalCnt=0,siteName=,currentSize=0,totCnt=0,userIp=,siteId=,programId=,targetMethod=,pageQueryString=%3Cnull%3E,menuNo=%3Cnull%3E,imgUrl=,publish=,useAt=Y,saveType=,userId=,telecomCd=,pollGubuncd=,usePurpose=]&searchWrd=&searchCnd=&pageIndex={page_index}")
})
names(nesdc_page) <- as.character(glue("link_{1:9}"))
# Preview the page URLs as a tibble: element name, glue value and the same
# URL as a plain character column.
nesdc_page %>%
  enframe(name = "name", value = "value") %>%
  mutate(link = map_chr(value, as.character))
# A tibble: 9 x 3
name value link
<chr> <list> <chr>
1 link_1 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
2 link_2 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
3 link_3 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
4 link_4 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
5 link_5 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
6 link_6 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
7 link_7 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
8 link_8 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
9 link_9 <glue> https://www.nesdc.go.kr/portal/content/onvy/list.do?&sdate=&eda~
# Collect the detail-page links of every listing page in `nesdc_page`.
# The loop runs over the pages actually present in `nesdc_page` instead of a
# hard-coded 1:9, and the result list is allocated once up front.
company_url_list <- vector("list", length(nesdc_page))
for (i in seq_along(nesdc_page)) {
  # Attributes of every child node of the listing container on page i.
  # NOTE(review): "row tr" lands in the `ns` argument of xml_attrs(); it
  # looks like a leftover class name rather than a namespace -- confirm.
  page_list <- nesdc_page[[i]] %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="frm"]/div[3]/div') %>%
    xml_children() %>%
    xml_attrs("row tr")

  # Keep only the nodes that carry an href and build absolute URLs.
  line_df <- map(page_list, "href") %>%
    enframe() %>%
    mutate(type = map_lgl(value, is.null)) %>%
    filter(!type) %>%
    unnest(value) %>%
    mutate(link = glue("https://www.nesdc.go.kr{value}")) %>%
    select(link)

  company_url_list[[i]] <- line_df
}
## Detail-page URL of every company
# Flatten the per-page link tables into a single vector of URLs.
company_url_chr <- unlist(company_url_list)

## Detail information of every company
# Scrape each detail page, row-bind the results and parse the registration
# date column into Date values.
company_df <- as_tibble(map_df(company_url_chr, convert_df))
company_df <- mutate(company_df, 등록일자 = lubridate::ymd(등록일자))

# Store the scraped table for later use.
write_rds(company_df, "data/company_df.rds")
# Number of registered companies per calendar month.
# Counting rows per "YYYY-MM" label gives the same totals as counting per
# day and then summing within the month.
reg_df <- company_df %>%
  mutate(연월 = format(등록일자, "%Y-%m")) %>%
  count(연월, name = "업체수") %>%
  # Represent each month by its 15th day so it can sit on a date axis.
  mutate(연월 = lubridate::ymd(glue("{연월}-15")))
# Monthly number of newly registered polling agencies, with the three
# election dates marked by dashed vertical lines and labelled arrows.
#
# Each arrow gets its own one-row data frame with inherit.aes = FALSE.
# Mapping constants inside aes() while inheriting `reg_df` would draw the
# same arrow once per row of `reg_df`, stacking identical curves on top of
# each other.
registration_g <- reg_df %>%
  ggplot(aes(x = 연월, y = 업체수)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = lubridate::ymd("2017-05-09"), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = lubridate::ymd("2018-06-13"), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = lubridate::ymd("2020-04-15"), color = "blue", linetype = "dashed") +
  theme_bw() +
  labs(x = "", y = "여론조사기관 등록업체수", title = "중앙선거여론조사심의위원회 등록업체현황") +
  # Arrow and label for the 2018-06-13 marker.
  geom_curve(
    data = tibble(
      x = lubridate::ymd("2018-06-13"), y = 5,
      xend = lubridate::ymd("2018-09-13"), yend = 10
    ),
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    curvature = 0.35, angle = 50, size = 1, color = "blue",
    arrow = arrow(type = "closed", length = unit(0.35, "cm"))
  ) +
  annotate("label", x = lubridate::ymd("2018-09-13"), y = 11, label = "지방선거") +
  # Arrow and label for the 2017-05-09 marker.
  geom_curve(
    data = tibble(
      x = lubridate::ymd("2017-05-09"), y = 5,
      xend = lubridate::ymd("2017-09-09"), yend = 20
    ),
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    curvature = 0.35, angle = 50, size = 1, color = "blue",
    arrow = arrow(type = "closed", length = unit(0.35, "cm"))
  ) +
  annotate("label", x = lubridate::ymd("2017-09-09"), y = 21, label = "대통령 선거") +
  # Arrow and label for the 2020-04-15 marker; the label sits to the left,
  # hence the negative curvature.
  geom_curve(
    data = tibble(
      x = lubridate::ymd("2020-04-15"), y = 5,
      xend = lubridate::ymd("2019-12-09"), yend = 15
    ),
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE,
    curvature = -0.35, angle = 50, size = 1, color = "blue",
    arrow = arrow(type = "closed", length = unit(0.35, "cm"))
  ) +
  annotate("label", x = lubridate::ymd("2019-12-09"), y = 16, label = "국회의원 선거")

registration_g
# Interactive version of the monthly registration chart: the tooltip of each
# point lists the agencies registered in that month, one per line.
reg_plotly_g <- ggplot(
  data = company_df %>%
    mutate(연월 = format(등록일자, "%Y-%m")) %>%
    group_by(연월) %>%
    summarise(
      업체수 = n(),
      조사기관명 = glue_collapse(조사기관명, sep = "\n")
    ) %>%
    # Represent each month by its 15th day so it can sit on a date axis.
    mutate(연월 = lubridate::ymd(glue("{연월}-15"))),
  mapping = aes(x = 연월, y = 업체수, text = 조사기관명, group = 1)
) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = lubridate::ymd("2017-05-09"), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = lubridate::ymd("2018-06-13"), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = lubridate::ymd("2020-04-15"), color = "blue", linetype = "dashed") +
  theme_bw() +
  labs(x = "", y = "여론조사기관 등록업체수", title = "중앙선거여론조사심의위원회 등록업체현황")

# Render through plotly, showing only the `text` aesthetic in the tooltip.
plotly::ggplotly(p = reg_plotly_g, tooltip = "text")