JTBC가 리얼미터에 2021-04-11 요청한 여론조사결과 등록현황 상세보기에 나온 데이터를 다운로드하자.
library(tidyverse)
library(magick)
# Download page 1 of the report as a PNG, load it with magick, and preview
# it at 30% size.
# Make sure the destination folder exists first; download.file() fails when
# the target directory is missing.
dir.create("data/nesdc/jtbc/20210401", recursive = TRUE, showWarnings = FALSE)

download.file(
  url = "https://www.nesdc.go.kr/files/result/202107/FILE_202104120333254180.pdf.files/1.png",
  destfile = "data/nesdc/jtbc/20210401/1.png",
  mode = "wb" # binary mode so the PNG is not corrupted on Windows
)

test_img <- image_read("data/nesdc/jtbc/20210401/1.png")

# Display a reduced-size copy; test_img itself is left unchanged.
test_img %>%
  image_resize("30%")
library(rvest)
# Parse the HTML page served at the report's ".pdf.htm" URL and print the
# resulting document so its top-level structure can be inspected.
sample_html <- read_html(
  "https://www.nesdc.go.kr/files/result/202107/FILE_202104120333254180.pdf.htm"
)

sample_html
{html_document}
<html lang="ko" xmlns="http://www.w3.org/1999/xhtml" xmlns:svg="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office">
[1] <head>\n<meta content="IE=8,IE=10" http-equiv="X-UA-Compatible">\n<meta c ...
[2] <body>\n\t\t\n\t\t<div id="wrap" oncontextmenu="return false">\n\t\t\t<!- ...
# html_nodes(xpath = '//*[@id="img-center-pane"]')
# html_nodes(xpath = '//*[@id="page-area0"]')
# html_nodes(css = '#page-area0')
# Download page 6 of the report as a PNG, load it with magick, and display
# it at full size.
# Make sure the destination folder exists first; download.file() fails when
# the target directory is missing.
dir.create("data/nesdc/jtbc/20210401", recursive = TRUE, showWarnings = FALSE)

download.file(
  url = "https://www.nesdc.go.kr/files/result/202107/FILE_202104120333254180.pdf.files/6.png",
  destfile = "data/nesdc/jtbc/20210401/6.png",
  mode = "wb" # binary mode so the PNG is not corrupted on Windows
)

main_img <- image_read("data/nesdc/jtbc/20210401/6.png")

# Display only; main_img itself is left unchanged.
main_img %>%
  image_resize("100%")
library(tesseract)
# Run OCR on the raw page-6 image with the Korean ("kor") language data and
# keep the per-word result (word, confidence, bounding box).
table_text <- tesseract::ocr_data(
  "data/nesdc/jtbc/20210401/6.png",
  engine = "kor"
)

# Preview the first 20 recognised words.
table_text %>%
  head(20)
# A tibble: 20 x 3
word confidence bbox
<chr> <dbl> <chr>
1 "14" 44.1 65,57,96,73
2 ".7086" 30.3 459,66,482,75
3 "李④린" 93.3 487,65,506,76
4 "?\u0080" 93.3 514,65,523,76
5 "??" 79.4 524,65,533,76
6 "二쇱옄" 96.8 535,65,553,76
7 "?좏샇??" 93.3 561,65,592,76
8 "議?" 93.3 598,66,607,75
9 "??" 93.3 608,65,619,76
10 "|" 89.1 621,65,622,76
11 "??" 92.7 630,65,639,76
12 "1" 45.8 641,65,656,76
13 "??" 93.2 647,56,655,85
14 "議곗궗" 97.0 662,65,683,76
15 "媛쒖슂" 92.0 689,65,709,76
16 "吏\u0080??" 54.9 89,100,105,107
17 "-" 89.5 562,88,578,118
18 "090" 52.7 617,106,621,108
19 "??" 62.4 168,129,184,133
20 "??" 0 179,119,195,150
# Crop a 700x820 region of the page (offset 280 px from the left, 125 px
# from the top) and clean it up before OCR.
main_processed_img <- main_img %>%
  # Arguments are spelled out in full; the short forms `x`/`y` only worked
  # through partial matching of `x_off`/`y_off`.
  image_crop(geometry_area(width = 700, height = 820, x_off = 280, y_off = 125)) %>%
  # Turn near-white pixels transparent, then fill them with pure white.
  image_transparent("white", fuzz = 10) %>%
  image_background("white") %>%
  # Morphology is applied on the inverted image and the result is inverted
  # back; the 20x1 rectangle kernel presumably targets horizontal rule
  # lines of the table -- TODO confirm visually.
  image_negate() %>%
  image_morphology(method = "Thinning", kernel = "Rectangle:20x1+0+0^<") %>%
  image_negate()

main_processed_img
# Characters tesseract is allowed to emit: digits only.
whitelist <- glue::glue("1234567890")

# OCR the cleaned-up crop with the English engine restricted to the digit
# whitelist, keeping the per-word result (word, confidence, bounding box).
table_text <- tesseract::ocr_data(
  main_processed_img,
  engine = tesseract(
    language = "eng",
    options = list(tessedit_char_whitelist = whitelist)
  )
)

# Preview the first 20 recognised tokens.
table_text %>%
  head(20)
# A tibble: 20 x 3
word confidence bbox
<chr> <dbl> <chr>
1 4816 0 28,3,112,15
2 2637773 0 29,16,400,29
3 2244 0 29,29,431,45
4 93 0 32,45,431,59
5 223 0 28,59,256,72
6 51 57.1 390,61,399,69
7 09 0 27,73,187,86
8 190 37.3 387,76,403,84
9 2243 0 29,88,328,101
10 3 0 29,101,431,117
11 2 0 28,117,431,131
12 38 0 101,133,112,141
13 35 0 101,148,112,156
14 3785 0 99,160,331,173
15 23328 0 33,174,431,189
16 639224 0 32,189,431,203
17 5 0 392,205,431,218
18 6433 0 32,217,325,230
19 6 45.6 392,220,431,232
20 5656 0 32,232,399,245
# Crop a narrow 70x820 strip of the page (offset 280 px from the left,
# 125 px from the top) and clean it up before OCR.
main_column_img <- main_img %>%
  # Arguments are spelled out in full; the short forms `x`/`y` only worked
  # through partial matching of `x_off`/`y_off`.
  image_crop(geometry_area(width = 70, height = 820, x_off = 280, y_off = 125)) %>%
  # Turn near-white pixels transparent, then fill them with pure white.
  image_transparent("white", fuzz = 5) %>%
  image_background("white") %>%
  # Morphology is applied on the inverted image and the result is inverted
  # back; the 20x1 rectangle kernel presumably targets horizontal rule
  # lines of the table -- TODO confirm visually.
  image_negate() %>%
  image_morphology(method = "Thinning", kernel = "Rectangle:20x1+0+0^<") %>%
  image_negate()

main_column_img
# OCR the single-column crop with the English engine restricted to the
# characters in `whitelist`, keeping the per-word result (word, confidence,
# bounding box).
column_text <- tesseract::ocr_data(
  main_column_img,
  engine = tesseract(
    language = "eng",
    options = list(tessedit_char_whitelist = whitelist)
  )
)

column_text
# A tibble: 52 x 3
word confidence bbox
<chr> <dbl> <chr>
1 48 89.5 29,4,40,12
2 2 82.5 29,18,40,26
3 29 54.6 29,33,40,41
4 9 93.3 32,47,37,55
5 42 17.2 29,61,40,69
6 150 90.4 27,76,43,84
7 20 95.9 29,90,40,98
8 35 83.1 29,105,40,113
9 41 71.0 29,119,39,127
10 55 69.0 29,133,40,141
# ... with 42 more rows
데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com