xwMOOC 데이터 제품
국회의원(20대) - 유튜브
작업흐름
데이터 긁어오기 1
제20대 국회의원 목록
가장 먼저 제20대 국회의원 목록이 필요하다. 이를 위해서 위키백과사전에서 대한민국 제20대 국회의원 목록 (지역구별)를 긁어오자.
위키백과사전에서 URL을 확인했으니 rvest 팩키지를 활용하여 xpath를 찾아 이를 서울특별시부터, 제주도까지 긁어온다.
# 0. 환경설정 -----
library(httr)
library(rvest)
library(tidyverse)
library(urltools)
library(glue)
library(extrafont)
loadfonts()
Sys.setlocale("LC_ALL", "C")
# 1. Scrape the list of 20th National Assembly members (by district) from
#    Korean Wikipedia: one HTML table per provincial-level division. -----
url <- "https://ko.wikipedia.org/wiki/대한민국_제20대_국회의원_목록_(지역구별)"

# 17 provincial-level tables are scraped below (original allocated 19 slots;
# only 17 are ever filled).
cm_list <- vector("list", length = 17)

for (idx in seq_len(17)) {
  # Tables 3..19 on the page hold the per-province member lists,
  # hence the idx + 2 offset inside the XPath.
  cm_list[[idx]] <- GET(url = url) %>%
    read_html() %>%
    html_node(xpath = glue('//*[@id="mw-content-text"]/div/table[', idx + 2, ']')) %>%
    html_table(fill = TRUE) %>%
    as_tibble()
}

# Restore the Korean locale (it was set to "C" during setup for scraping).
Sys.setlocale("LC_ALL", "Korean")
## 1.2. Per-province member tables -----
# Keep the 17 scraped tables and label each one with its province name,
# in the same order they were scraped (Seoul first, Jeju last).
cm_list <- cm_list[1:17]
names(cm_list) <- c("서울특별시", "부산광역시", "대구광역시", "인천광역시",
                    "광주광역시", "대전광역시", "울산광역시", "세종특별자치시",
                    "경기도", "강원도", "충청북도", "충청남도", "전라북도",
                    "전라남도", "경상북도", "경상남도", "제주특별자치도")
## 2.3. List --> data frame -----
# Pull each column of interest out of every provincial table and flatten
# into one character vector per field.
name_v     <- map(cm_list, "이름")     %>% flatten %>% unlist %>% as.character()
precinct_v <- map(cm_list, "선거구")   %>% flatten %>% unlist %>% as.character()
party_v    <- map(cm_list, "소속정당") %>% flatten %>% unlist %>% as.character()
geo_v      <- map(cm_list, "관할구역") %>% flatten %>% unlist %>% as.character()
num_v      <- map(cm_list, "선수")     %>% flatten %>% unlist %>% as.character()

# Assemble the member data frame. Names look like "한글명(한자...)", so split
# on the opening parenthesis; rows without a hanja part are scraped
# header/noise rows and are dropped.
cm_df <- data.frame(이름 = name_v,
                    선거구 = precinct_v,
                    정당 = party_v,
                    지역 = geo_v,
                    선수 = num_v) %>%
  separate(이름, into = c("한글명", "한자"), sep = "\\(") %>%
  filter(!is.na(한자))

DT::datatable(cm_df)

cm_df %>% write_rds("data/cm_df.rds")
국회의원 사이트 정보
국회의원 명단이 준비되었다면 다음으로 사이트에서 유튜브 주소를 가져온다. 이를 위해서 국회의원 한명을 대상으로 사이트 정보를 제대로 가져오는지 준비한다.
# 1. Data: Naver people info -----
## 1.1. Naver search URL -----
url <- "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query="
people_name <- "김병욱"

naver_url <- glue({url}, {people_name})

## 1.2. Scrape the Naver people-info box -----
### People info
# Fetch the page once and reuse the parsed document for all three
# extractions (the original fetched the same URL three times).
naver_html <- read_html(naver_url)

# Field labels (<dt>) of the people-info definition list.
key <- naver_html %>%
  html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
  html_nodes('dt') %>%
  html_text()

# Field values (<dd>); the first <dd> does not correspond to a label,
# drop it so keys and values align.
value <- naver_html %>%
  html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
  html_nodes('dd') %>%
  html_text() %>%
  .[-1]

# All hyperlinks inside the info box (official site / SNS addresses).
site <- naver_html %>%
  html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
  html_nodes('dd') %>%
  html_nodes("a") %>%
  html_attr("href")

# One row per person: one column per info field, plus the name and a
# list-column holding all linked URLs.
p_info_df <- tibble(key = key,
                    value = value) %>%
  spread(key, value) %>%
  mutate(`이름` = "김병욱") %>%
  mutate(site = list(site))

DT::datatable(p_info_df)
특정 국회의원 사이트를 받아오는 스크립트를 함수로 작성한다.
# 1. Naver people-info function -----
## 1.1. Look up by name
# Scrape the Naver people-info box for a given person name.
# Returns a one-row tibble with one column per info field, plus the
# person's name (이름) and a list-column `site` of linked URLs.
people_info <- function(people_name) {
  ## Naver search URL -----
  url <- "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query="
  naver_url <- glue({url}, {people_name})

  ## Scrape the people-info box -----
  # Fetch once; reuse the parsed document for keys, values and links
  # (the original issued three separate network requests).
  naver_html <- read_html(naver_url)

  key <- naver_html %>%
    html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
    html_nodes('dt') %>%
    html_text()

  # First <dd> does not correspond to a label; drop it to align with key.
  value <- naver_html %>%
    html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
    html_nodes('dd') %>%
    html_text() %>%
    .[-1]

  site <- naver_html %>%
    html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
    html_nodes('dd') %>%
    html_nodes("a") %>%
    html_attr("href")

  p_info_df <- tibble(key = key,
                      value = value) %>%
    spread(key, value) %>%
    mutate(`이름` = people_name) %>%
    mutate(site = list(site))

  return(p_info_df)
}
## 1.2. Ambiguous names (lookup by explicit URL) -----
# Same as people_info(), but takes an explicit Naver disambiguation URL —
# needed when a plain name search matches several people.
people_url_info <- function(url, people_name) {
  ## Naver search URL -----
  naver_url <- glue(url, people_name)

  ## Scrape the people-info box -----
  ### People info
  # Fetch once; reuse the parsed document for keys, values and links.
  naver_html <- read_html(naver_url)

  key <- naver_html %>%
    html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
    html_nodes('dt') %>%
    html_text()

  # First <dd> does not correspond to a label; drop it to align with key.
  value <- naver_html %>%
    html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
    html_nodes('dd') %>%
    html_text() %>%
    .[-1]

  site <- naver_html %>%
    html_node(xpath = '//*[@id="people_info_z"]/div[2]/div/dl') %>%
    html_nodes('dd') %>%
    html_nodes("a") %>%
    html_attr("href")

  p_info_df <- tibble(key = key,
                      value = value) %>%
    spread(key, value) %>%
    mutate(`이름` = people_name) %>%
    mutate(site = list(site))

  return(p_info_df)
}
# 2. Naver people data -----
# Smoke-test both scrapers: lookup by plain name, and lookup via an
# explicit person-disambiguation URL (for names matching several people).
people_info("김병욱")
people_url_info("https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99291&query=", "진영")
특정 국회의원 사이트를 받아오는 스크립트를 함수로 작성한 것을 활용하는 단계로 넘어간다. 앞서 입수한 국회의원 명단을 넣어 데이터를 긁어온다.
# 2. Assembly member roster -----
cm_name_v <- cm_df$한글명

# 3. Naver info for every member -----
np_list <- vector("list", length = length(cm_name_v))

for (i in seq_along(cm_name_v)) {
  # Some lookups fail (no info box, ambiguous name); swallow the error and
  # leave the slot NULL — missing members are patched up in section 3.1.
  tryCatch({
    np_list[[i]] <- people_info(cm_name_v[i])
    # cat(i, ": ", cm_name_v[i], "\n")
  }, error = function(e) {})
}

names(np_list) <- cm_name_v
## 3.1. Patch missing members -----
### Members whose scrape came back empty
np_missing_v <- map_int(np_list, length)
np_missing_v <- np_missing_v[np_missing_v == 0]

# Hand-curated Naver disambiguation URLs for members whose plain-name
# search failed (common names matching several people).
np_missing_df <- tribble(
  ~"이름", ~"url",
  "진영", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99291&query=",
  "김영호", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=145649&query=",
  "김성태", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=154130&query=",
  "이훈", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=285873&query=",
  "최명길", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=96414&query=",
  "김정훈", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99295&query=",
  "안상수", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=95772&query=",
  "윤상현", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=140452&query=",
  "김경진", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=152472&query=",
  "최경환", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99359&query=",
  "이장우", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=125793&query=",
  "이상민", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99176&query=",
  "김영진", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=311980&query=",
  "김진표", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=97441&query=",
  "김상희", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=151651&query=",
  "김성원", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=314223&query=",
  "김철민", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=178883&query=",
  "조정식", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99207&query=",
  "김정우", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=3143802&query=",
  "이현재", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=122137&query=",
  "정성호", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99194&query=",
  "김종민", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=113207&query=",
  "이정현", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=154105&query=",
  "박준영", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=100016&query=",
  "이철우", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=123634&query=",
  "김재원", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99363&query=",
  "최경환", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=248894&query=",
  "이주영", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=126184&query=",
  "김재경", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=99387&query=",
  "엄용수", "https://search.naver.com/search.naver?where=nexearch&sm=tab_etc&mra=bjky&pkid=1&os=125837&query="
)
### Fetch data for the missing members
np_missing_list <- vector("list", length = nrow(np_missing_df))

for (i in seq_len(nrow(np_missing_df))) {
  # Column 2 is the disambiguation URL, column 1 the member's name.
  tryCatch({
    np_missing_list[[i]] <- people_url_info(as.character(np_missing_df[i, 2]),
                                            as.character(np_missing_df[i, 1]))
    # cat(i, ": ", as.character(np_missing_df[i, 1]), "\n")
  }, error = function(e) {})
}

names(np_missing_list) <- np_missing_df$이름

listviewer::jsonedit(np_missing_list)
## 3.2. Combine the two result sets -----
np_comp_list <- append(np_list, np_missing_list)

np_comp_list <- plyr::compact(np_comp_list)  # drop NULL entries (failed scrapes)

listviewer::jsonedit(np_comp_list)

## 3.3. Save the combined member data -----
np_comp_list %>% write_rds("data/np_comp_list.rds")
데이터 정제
긁어온 데이터를 SNS별로 정당별로 정제하여 데이터프레임으로 깔끔하게 정제해 둔다.
# Tidy the scraped data into one row per (member, linked site).
np_comp_dat <- tibble(
  name = names(np_comp_list),
  site = map(np_comp_list, "site"))

np_comp_dat <- np_comp_dat %>%
  mutate(num_url = map_int(site, length)) %>%
  filter(num_url > 0) %>%
  unnest(site) %>%
  unnest(site) %>%
  # Drop Naver-internal search links; keep only external sites.
  filter(!str_detect(site, "^?where=|https://people.search.naver.com/"))

# Classify each URL into an SNS platform; anything unmatched is treated
# as the member's official site.
np_comp_df <- np_comp_dat %>%
  mutate(sns = case_when(str_detect(site, "blog.naver.com/|^http://blog") ~ "네이버 블로그",
                         str_detect(site, "www.facebook.com") ~ "페이스북",
                         str_detect(site, "twitter.com") ~ "트위터",
                         str_detect(site, "youtube.com") ~ "유튜브",
                         str_detect(site, "instagram.com/") ~ "인스타그램",
                         str_detect(site, "m.post") ~ "네이버 포스트",
                         str_detect(site, "blog.daum.net") ~ "다음 블로그",
                         str_detect(site, "band.us") ~ "밴드",
                         str_detect(site, "cafe.daum.net") ~ "다음 카페",
                         str_detect(site, "cafe.naver") ~ "네이버 카페",
                         str_detect(site, "kakao") ~ "카카오",
                         str_detect(site, "tistory") ~ "티스토리",
                         TRUE ~ "공식사이트"))

# Attach party affiliation from the Wikipedia roster; 정당 may list past
# parties, so the current party is the trailing match of the known names.
np_comp_df <- left_join(np_comp_df, cm_df %>% select(한글명, 정당), by = c("name" = "한글명")) %>%
  mutate(current_party = str_extract(정당, "(국민의당|무소속|더불어민주당|바른미래당|민중당|자유한국당|민주평화당|대한애국당|정의당|새누리당)$"))
SNS 분석
정당별 SNS 활용
국회의원이 선호하는 SNS
library(DT)

# Overall SNS preference across all members, as counts and percentages.
np_comp_df %>%
  count(sns, sort = TRUE) %>%
  mutate(pcnt = n / sum(n)) %>%
  DT::datatable() %>%
  DT::formatPercentage("pcnt", digits = 1)
정당별 선호하는 SNS
# SNS counts by party: bars grouped by SNS platform within each party.
np_comp_df %>%
  filter(sns %in% c("공식사이트", "네이버 블로그", "유튜브", "인스타그램", "트위터", "페이스북")) %>%
  filter(current_party %in% c("더불어민주당", "자유한국당", "바른미래당", "민주평화당")) %>%
  count(sns, current_party, sort = TRUE) %>%
  # Fix factor levels so platforms and parties plot in a stable order.
  mutate(sns = factor(sns, levels = c("공식사이트", "페이스북", "네이버 블로그", "유튜브", "인스타그램", "트위터"))) %>%
  mutate(current_party = factor(current_party, levels = c("더불어민주당", "자유한국당", "바른미래당", "민주평화당"))) %>%
  ggplot(aes(x = current_party, y = n, fill = sns)) +
    geom_col(position = "dodge") +
    coord_flip() +
    labs(x = "", y = "", fill = "SNS") +
    theme_minimal(base_family = "NanumGothic")
# Same data flipped: parties compared within each SNS platform, colored
# with each party's brand color.
np_comp_df %>%
  filter(sns %in% c("공식사이트", "네이버 블로그", "유튜브", "인스타그램", "트위터", "페이스북")) %>%
  filter(current_party %in% c("더불어민주당", "자유한국당", "바른미래당", "민주평화당")) %>%
  count(sns, current_party, sort = TRUE) %>%
  mutate(current_party = factor(current_party, levels = c("더불어민주당", "자유한국당", "바른미래당", "민주평화당"))) %>%
  ggplot(aes(x = sns, y = n, fill = current_party)) +
    geom_col(position = "dodge") +
    coord_flip() +
    labs(x = "", y = "", fill = "정당") +
    theme_minimal(base_family = "NanumGothic") +
    scale_fill_manual(values = c("더불어민주당" = "blue", "민주평화당" = "green", "바른미래당" = "cyan", "자유한국당" = "red"))
유튜브 트래픽 2
유튜브 채널ID 알아내기
# Reload the scraped data from disk.
np_comp_list <- read_rds("data/np_comp_list.rds")

# NOTE(review): this path has no ".rds" extension and np_comp_df is never
# written in this script — confirm the file actually exists on disk.
np_comp_df <- read_rds("data/np_comp_df")

# Extract YouTube links and reduce each one to a channel identifier.
np_comp_yt_df <- np_comp_df %>%
  filter(str_detect(site, "youtube")) %>%
  select(name, site) %>%
  mutate(channel = str_replace(site, "^(http|https)://www.youtube.com/", "")) %>%
  mutate(channel_id = case_when(
    str_detect(channel, "^(c/|user/)") ~ str_replace(channel, "c/|user/", ""),
    str_detect(channel, "^channel/") ~ str_replace(channel, "channel/", ""),
    TRUE ~ channel)) %>%
  mutate(channel_len = str_length(channel_id))

# Short identifiers (< 20 chars) are legacy usernames, not channel IDs;
# they must be converted before querying the YouTube API.
user_v <- np_comp_yt_df %>%
  filter(channel_len < 20) %>%
  pull(channel_id)
유튜브 ID를 채널로 바꾸기
library(RSelenium)

# 1. Drive a browser to a username -> channel-ID converter page -----
rem_driver <- rsDriver(browser = "chrome", port = 77L)
remdrv_client <- rem_driver[["client"]]

remdrv_client$navigate("http://johnnythetank.github.io/youtube-channel-name-converter/")

# Input box of the converter page.
yt_web_element <- remdrv_client$findElement('class', 'form-control')

# Manual test: convert one username.
yt_web_element$clearElement()
yt_web_element$sendKeysToElement(list('maumgil88', key = 'enter'))

# Result element holding the converted channel ID.
yt_channel_web_element <- remdrv_client$findElement('class', 'ng-binding')

yt_channel_web_element$getElementText() %>% unlist()
다음 코드는 동작을 하지 않네요… ㅎㅎ
yt_web_element <- remdrv_client$findElement('class', 'form-control')

# Convert a YouTube username to a channel ID via the converter page
# driven above. On warning or error, fall back to returning the input
# name unchanged.
# Fix: the original assigned channel_id inside the tryCatch handlers,
# where the assignment is local to the handler and lost, so return()
# errored on failure — assign the tryCatch result instead.
query_channel_id <- function(yt_name) {
  channel_id <- tryCatch({
    yt_web_element$clearElement()
    yt_web_element$sendKeysToElement(list(yt_name, key = 'enter'))
    yt_channel_web_element <- remdrv_client$findElement('class', 'ng-binding')
    yt_channel_web_element$getElementText()
  }, warning = function(w) {
    yt_name
  }, error = function(e) {
    yt_name
  })
  return(channel_id)
}

query_channel_id('2haechan')
library(tuber) # devtools::install_github("soodoku/tuber", build_vignettes = TRUE)
# (Knitted output) Error in library(tuber): there is no package called 'tuber'

# OAuth against the YouTube Data API; yt_app_id / yt_app_pw must be
# defined beforehand (they are not set anywhere in this script).
yt_oauth(yt_app_id, yt_app_pw)
# (Knitted output) Error: could not find function "yt_oauth"

# Sanity check: statistics for a single known video.
get_stats(video_id = "peLTHX-rBxM")
# (Knitted output) Error: could not find function "get_stats"
# Full-length (24-char) identifiers are proper YouTube channel IDs.
channel_id <- np_comp_yt_df %>%
  filter(channel_len == 24) %>%
  pull(channel_id)

# Query channel statistics one channel at a time, keyed by channel ID.
yt_congress_stat <- list()

for (channel in channel_id) {
  print(channel)
  yt_congress_stat[[channel]] <- get_channel_stats(channel_id = channel)
}
# (Knitted output) Error: could not find function "get_channel_stats"
# because the tuber package failed to load above.

listviewer::jsonedit(yt_congress_stat)

# Flatten the per-channel statistics into a data frame.
yt_congress_stat_df <- tibble(
  channel_id = channel_id,
  view_count = map(yt_congress_stat, ~.$statistics$viewCount) %>% unlist %>% as.integer,
  video_count = map(yt_congress_stat, ~.$statistics$videoCount) %>% unlist %>% as.integer,
  subscriber_count = map(yt_congress_stat, ~.$statistics$subscriberCount) %>% unlist %>% as.integer,
  comment_count = map(yt_congress_stat, ~.$statistics$commentCount) %>% unlist %>% as.integer
)
# (Knitted output) Error: Tibble columns must have compatible sizes —
# yt_congress_stat was empty because the API calls above failed.

# Join the statistics back onto the member/channel table.
np_comp_yt_df <- np_comp_yt_df %>%
  filter(channel_len == 24) %>%
  left_join(yt_congress_stat_df)

# Distribution of channel view counts.
yt_congress_stat_df %>%
  ggplot(aes(x = view_count)) +
  geom_density()

# Channels ranked by views, most-viewed first.
yt_congress_stat_df %>%
  arrange(-view_count)