1 차기 대선주자

JTBC가 리얼미터에 2021-04-11 요청한 여론조사결과 등록현황 상세보기에 나온 데이터를 다운로드하자.

library(tidyverse)
library(magick)

# Fetch image 1.png of the registered poll report from nesdc.go.kr and preview it.
page1_url <- "https://www.nesdc.go.kr/files/result/202107/FILE_202104120333254180.pdf.files/1.png"
page1_path <- "data/nesdc/jtbc/20210401/1.png"

# download.file() does not create missing directories, so make sure the
# destination folder exists first (a silent no-op when it is already there).
dir.create(dirname(page1_path), recursive = TRUE, showWarnings = FALSE)

# mode = "wb" writes the PNG as binary so it is not corrupted on Windows.
download.file(url = page1_url, destfile = page1_path, mode = "wb")

test_img <- image_read(page1_path)

# Scale to 30% for display only; test_img itself is left unchanged.
test_img %>%
  image_resize("30%")

library(rvest)

# Parse the HTML viewer page that wraps the report images, then print the
# parsed document to inspect its top-level nodes.
report_page_url <- "https://www.nesdc.go.kr/files/result/202107/FILE_202104120333254180.pdf.htm"
sample_html <- read_html(report_page_url)

sample_html
{html_document}
<html lang="ko" xmlns="http://www.w3.org/1999/xhtml" xmlns:svg="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office">
[1] <head>\n<meta content="IE=8,IE=10" http-equiv="X-UA-Compatible">\n<meta c ...
[2] <body>\n\t\t\n\t\t<div id="wrap" oncontextmenu="return false">\n\t\t\t<!- ...
  # html_nodes(xpath = '//*[@id="img-center-pane"]')
  # html_nodes(xpath = '//*[@id="page-area0"]')
  # html_nodes(css = '#page-area0')

2 차기대선 주자 선호도1

2.1 다운로드

# Fetch image 6.png of the report (the page used for table extraction below).
page6_url <- "https://www.nesdc.go.kr/files/result/202107/FILE_202104120333254180.pdf.files/6.png"
page6_path <- "data/nesdc/jtbc/20210401/6.png"

# download.file() does not create missing directories, so make sure the
# destination folder exists first (a silent no-op when it is already there).
dir.create(dirname(page6_path), recursive = TRUE, showWarnings = FALSE)

# mode = "wb" writes the PNG as binary so it is not corrupted on Windows.
download.file(url = page6_url, destfile = page6_path, mode = "wb")

main_img <- image_read(page6_path)

# Display at full size; main_img itself is left unchanged.
main_img %>%
  image_resize("100%")

2.2 이미지 전처리작업

library(tesseract)

# Baseline: OCR the unprocessed page image with the Korean language model and
# look at the first 20 recognised words (word, confidence, bounding box).
korean_engine <- tesseract::tesseract("kor")
table_text <- tesseract::ocr_data(
  "data/nesdc/jtbc/20210401/6.png",
  engine = korean_engine
)
head(table_text, 20)
# A tibble: 20 x 3
   word         confidence bbox           
   <chr>             <dbl> <chr>          
 1 "14"               44.1 65,57,96,73    
 2 ".7086"            30.3 459,66,482,75  
 3 "李④린"            93.3 487,65,506,76  
 4 "?\u0080"          93.3 514,65,523,76  
 5 "??"               79.4 524,65,533,76  
 6 "二쇱옄"           96.8 535,65,553,76  
 7 "?좏샇??"          93.3 561,65,592,76  
 8 "議?"              93.3 598,66,607,75  
 9 "??"               93.3 608,65,619,76  
10 "|"                89.1 621,65,622,76  
11 "??"               92.7 630,65,639,76  
12 "1"                45.8 641,65,656,76  
13 "??"               93.2 647,56,655,85  
14 "議곗궗"           97.0 662,65,683,76  
15 "媛쒖슂"           92.0 689,65,709,76  
16 "吏\u0080??"       54.9 89,100,105,107 
17 "-"                89.5 562,88,578,118 
18 "090"              52.7 617,106,621,108
19 "??"               62.4 168,129,184,133
20 "??"                0   179,119,195,150
# Prepare the table region of the page image for digit OCR.
main_processed_img <- main_img %>%
  # Crop a 700x820 region whose top-left corner sits at (280, 125). Argument
  # names are written out in full rather than relying on partial matching of
  # `x`/`y` to `x_off`/`y_off`.
  image_crop(geometry_area(width = 700, height = 820, x_off = 280, y_off = 125)) %>%
  # Make near-white pixels (fuzz = 10) transparent, then fill with pure white.
  image_transparent("white", fuzz = 10) %>%
  image_background("white") %>%
  # Thinning with a 20x1 rectangle kernel is applied on the negated image and
  # the result is negated back; presumably this removes horizontal table
  # rules before OCR -- confirm visually on the printed image below.
  image_negate() %>%
  image_morphology(method = "Thinning", kernel = "Rectangle:20x1+0+0^<") %>%
  image_negate()

main_processed_img

2.3 텍스트 추출

# Characters tesseract is allowed to emit: digits only. A plain string is
# sufficient here; there is nothing to interpolate, so glue::glue() is not
# needed.
whitelist <- "1234567890"

# English engine restricted to the digit whitelist.
digit_engine <- tesseract(
  language = "eng",
  options = list(tessedit_char_whitelist = whitelist)
)

# OCR the pre-processed table image; the result overwrites the earlier
# Korean-engine `table_text`.
table_text <- tesseract::ocr_data(main_processed_img, engine = digit_engine)

table_text %>% head(20)
# A tibble: 20 x 3
   word    confidence bbox           
   <chr>        <dbl> <chr>          
 1 4816           0   28,3,112,15    
 2 2637773        0   29,16,400,29   
 3 2244           0   29,29,431,45   
 4 93             0   32,45,431,59   
 5 223            0   28,59,256,72   
 6 51            57.1 390,61,399,69  
 7 09             0   27,73,187,86   
 8 190           37.3 387,76,403,84  
 9 2243           0   29,88,328,101  
10 3              0   29,101,431,117 
11 2              0   28,117,431,131 
12 38             0   101,133,112,141
13 35             0   101,148,112,156
14 3785           0   99,160,331,173 
15 23328          0   33,174,431,189 
16 639224         0   32,189,431,203 
17 5              0   392,205,431,218
18 6433           0   32,217,325,230 
19 6             45.6 392,220,431,232
20 5656           0   32,232,399,245 

2.4 칼럼 추출

# Isolate a single narrow column of the table so OCR sees one number per row.
main_column_img <- main_img %>%
  # Crop a 70x820 strip whose top-left corner sits at (280, 125). Argument
  # names are written out in full rather than relying on partial matching of
  # `x`/`y` to `x_off`/`y_off`.
  image_crop(geometry_area(width = 70, height = 820, x_off = 280, y_off = 125)) %>%
  # Make near-white pixels (fuzz = 5) transparent, then fill with pure white.
  image_transparent("white", fuzz = 5) %>%
  image_background("white") %>%
  # Thinning with a 20x1 rectangle kernel is applied on the negated image and
  # the result is negated back; presumably this removes horizontal table
  # rules before OCR -- confirm visually on the printed image below.
  image_negate() %>%
  image_morphology(method = "Thinning", kernel = "Rectangle:20x1+0+0^<") %>%
  image_negate()

main_column_img

# Run digit-only OCR (English engine limited to `whitelist`) on the cropped
# column strip and print every recognised word with its confidence and box.
column_engine <- tesseract(
  language = "eng",
  options = list(tessedit_char_whitelist = whitelist)
)
column_text <- tesseract::ocr_data(main_column_img, engine = column_engine)

column_text
# A tibble: 52 x 3
   word  confidence bbox         
   <chr>      <dbl> <chr>        
 1 48          89.5 29,4,40,12   
 2 2           82.5 29,18,40,26  
 3 29          54.6 29,33,40,41  
 4 9           93.3 32,47,37,55  
 5 42          17.2 29,61,40,69  
 6 150         90.4 27,76,43,84  
 7 20          95.9 29,90,40,98  
 8 35          83.1 29,105,40,113
 9 41          71.0 29,119,39,127
10 55          69.0 29,133,40,141
# ... with 42 more rows
 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com