1 이미지 속 테이블

스택오버플로우 질문 "extracting tables from jpeg into a dataframe in R"에 소개된 이미지를 대상으로, 기계판독이 불가능한 이미지 속 표를 추출해 보자. 먼저 webshot2 팩키지를 통해 이미지를 다운로드 받아 둔다.

library(tidyverse)
library(webshot2)

# Save a capture of the image URL to fig/table-in-image.png using webshot2.
webshot(
  url = "https://pbs.twimg.com/media/Dv3pIsIUwAEdu--.jpg:large",
  file = "fig/table-in-image.png"
)

magick 팩키지를 사용해서 이미지가 제대로 다운로드 되었는지 확인한다. webshot2로 다운로드 받은 파일은 무슨 이유인지 OCR 인식이 잘 되지 않으니, fig/table.jpg 파일을 OCR 인식에 바로 사용한다.

library(magick)

# Load fig/table.jpg as a magick image and display it to verify the file.
table_img <- "fig/table.jpg" %>%
  image_read()
table_img

2 테이블 OCR

library(tesseract)

# Run OCR on the unprocessed image with the English engine. The result has one
# row per recognized word, with its confidence and bounding box.
table_text <- "fig/table.jpg" %>%
  tesseract::ocr_data(engine = "eng")

# Preview the first 20 recognized words.
head(table_text, 20)
                word confidence          bbox
1              Tilt}   4.128876    15,3,61,27
2                 PE  27.541054   99,8,117,19
3              Price  86.028587  176,8,212,19
4             Change  87.016716  228,7,284,22
5             Corr30  90.272118  294,8,343,19
6               HV30  91.880241  353,8,392,19
7             HV30HL  90.931671  402,8,460,19
8              IVX30  89.034782  470,8,513,19
9              IVXHL  87.536232  523,8,567,19
10           Put-Vol  82.518509  585,7,640,19
11          Call-Vol  82.518509  660,7,716,19
12           Vol-PCR  54.457203  726,7,787,19
13            Put-O1  18.291786  809,8,856,19
14               fo)  43.144173  885,3,908,27
15               ro)   6.140968  917,3,937,27
16               Me)  27.496712  937,3,973,27
17                 3  20.106567 973,8,1013,19
18              Rely  23.390289    5,32,35,45
19 10/8/2018/112.82|  19.747498  75,21,232,60
20            -0.12|  19.747498 235,21,297,60

3 이미지 전처리

magick 팩키지의 다양한 기능을 활용하여 텍스트와 배경이 잘 구분되도록 작업한다.

# Clean the image step by step so the text separates from the background.
# Each step overwrites table_img_processed with the next stage of the result.

# Make pixels close to white (fuzz = 82) transparent, then fill the
# transparent area with a white background.
table_img_processed <- image_transparent(table_img, "white", fuzz = 82)
table_img_processed <- image_background(table_img_processed, "white")

# Invert, apply a "Thinning" morphology with a 20x1 rectangle kernel, and
# invert back.
# NOTE(review): the wide, flat kernel looks intended to erase horizontal
# ruling lines while keeping the glyphs - confirm against the rendered image.
table_img_processed <- image_negate(table_img_processed)
table_img_processed <- image_morphology(
  table_img_processed,
  method = "Thinning",
  kernel = "Rectangle:20x1+0+0^<"
)
table_img_processed <- image_negate(table_img_processed)

# Crop with offsets x = 80, y = 25 (width/height given as 0).
# NOTE(review): presumably this removes the header row and the left label
# column - confirm.
table_img_processed <- image_crop(
  table_img_processed,
  geometry_area(0, 0, 80, 25)
)

table_img_processed

이미지 전처리 작업이 끝난 이미지를 대상으로 OCR 작업을 수행한다.

# Characters the OCR engine is allowed to emit for this numeric table:
# digits, minus sign, the "|" column separator, "/" (dates), "," and ".".
# tessedit_char_whitelist is a literal set of characters, not a regular
# expression, so the period must not be escaped: writing "\\." would add the
# backslash itself to the allowed characters and let stray "\" into the text.
whitelist <- "1234567890-|/,."

# OCR the preprocessed image with an English engine restricted to the
# whitelist. The result is the recognized text as a character vector.
table_text <- tesseract::ocr(
  table_img_processed,
  engine = tesseract(
    language = "eng",
    options = list(tessedit_char_whitelist = whitelist)
  )
)

table_text %>% head(20)
[1] "10/8/2018|112.82|-0.12|0.11/.040,55|0.06|0.46/ 566|156|3.63|143,115|28,439|5.03\n10/5/2018|112.94|-0.44|0.16|0.04|0.58|.06|0.43|1,672|2,236|0.75|143,091|28,385|5.04\n10/4/2018|113.38|-0.47|0.10|0.04|0.55|0.06|0.46|27,212|4,966|5.48|142,867|27,384|5.22\n10/3/2018|113.85|-0.67|0,00|0.04|0.51|0.05|0.29|23,522|5,812|4.05|131,320|25,340|5.18\n10/2/2018|114.42|-0.51|0.01|.04|9.44|9.05|0.191 470|994|9.47|121,260|25,901|4.68\n9/28/2018|114.93|-0.08||0.07|0.04|9.37|0.05|0.11|20,076|394|50.95|121,437|25,341|4.79\n9/27/2018|114.93|-0.08|0.07|.04|9.37|.05|0.00/1,270/2,876|0.44|/111,445|24,163|4.61\n9/26/2018|114.86|0.41|0.02|.04|9.39|.05|0.11|40,34|228|277.22|111,477|24,120|4.62\n9/25/2018|114.45|0.00|0.11|.04|0.34|0.0510.291 254|198|1.28|91,430|24,055|3.80\n9/24/2018|114.45|-0.26|0.11|.04|0.34|/0.05|0.25|23,236/2,908|7.99|91,306|24,604|3,71\n9/21/2018|114.7110.02|0.02|.04|0.38|0.05|0.27/22,444|/11,742/1.91198,292|46,553|2.11\n9/20/2018|114.69|0.30|0.01|.04|0.34|0.05|0.23|50,462|296|270.48|98,359|43,700|2.25\n9/19/2018|114.39|-0.17|-0.05|.04|0.31/0.05|0.2112,604/5,254|.50|73,313|45,603|1.61\n9/18/2018|114.56|-0.47|-0.07|.04|0.34|0.05/0.20| 370|118|3.14|-72,826|43,078|1.69\n9/17/2018|115.03|-0.07|0.01|0.03|9.28|9.05|0.18| 156|4,262|0,04|72,859|43,050|1.69\n9/14/2018|115.10|-0.14|0.04/.04|0.32|9.05|0.231 5|130|4.52|72,835|40,926|1.78\n9/13/2018|115.24|0.24|0.05|.04|0.32|.06|0.36 420 460|0.9172,69740,945|1.,78\n9/12/2018|115.00|0.23|0.06|0.04|0.48|0.05|0.12| 3| 16|2.38|72,582|40,962|1.77\n9/11/2018|114,77|-0.26|0.11|0.04|0.51|/0.05|0.20 10|956|.7|72,573|40,96211.77\n9/10/2018|115.03|0.38|0.15|.04|0.50/.05|0.19 108 46|2.3572,53240,916|1.77\n9/7/2018|114,.65|-0.48|0.10|0.04|0.45|0.05|9.271 68| 22|3.09|72,507|40,899|1.77\n9/6/2018|115.13|0.32|0.06|0.04|0,39|0.05|9.27| 72\\| 26|0.84|72,503|40,894|1.77\n9/5/2018|114,81|-0.07|0.13|0.04|0.36|0.05|9,22| 
954|3,010|0.32|72,499|40,878||1.77\n9/4/201|114,.88|-0.56|0.16|0.04|0.40|0.05|9.20/1,140|276|4.|72,280|40,943|1.76\n8/31/2018|115.44|-0.17|0.11|0.04|9.37|.04|0.201 136| 96|1.42|72,223|40,941|1.76\n8/30/2018|115.61|0.11|0.12|.04|9.40|0.0510.101 72\\| 24|3.072,298|42,173|1.71\n8/29/2018|115.50|-0.12|0.11|0.04|9.41|.4|0.201 60|2,248|.03|72,188|42,166|1.71\n8/22/2018|115.62|-0.44|0.13|0.04|9.40/0.05|0.131 1|262|1.98|72,172|42,147|1.,71\n8/27/2018|116.06|-0.18|0.12|0.04|0.34|0.05|0.101 63|254|2.51|-72,081||42,250|1.71\n8/24/2018|116.24|0.23|0.17|.04|0.34|0.05|0.22 254|2,314|0.21|74,958|42,157|1.71\n8/23/2018|116.01|0.04|.14|.04|0.32|0.05|0.221 64|568|1.14|71,903|41,864|1,72\n8/21/2018|115.98|-0.29|0.13|0.04|9.36|0.05|0.31/ 312|402|0.78|74,592|41,759|1.,71\n8/20/2018|116.27|0.30|0.13|.04|0.33|.05|0.17/7,398/2,242/3.30/74,465|41,607/1.72\n8/17/2018|115.97|0.10|0.13|.04|9.31/.05|0.18| 54|750|0.79|-77,711|45,405|1.71\n8/16/2018|115.87|0.15|0.15|.04|9.31/.05|0.25| 66|1,240|0.05|77,530|49,093|1.58\n8/14/2018|115.48|0.15|0.17|.04|9.38|0.05|0.18| 4| 20|240|77,527|48,489|1.60\n8/13/2018|115.33|-0.03|0.13|.04|0.42|0.05|0.15|1,270/120|10.58|77,524|\\48,489|1.60\n8/10/2018|115.36|0.19|0.13|.04|9.42|0.05|0.22 362|1,600|0.35|76,923|48,497|1.59\n/9/2018|115.17|0.09|0.15|0.04|0.42|0,04|0.20/30,856||2,448|1260/76,882|47,837|1.61\n3/7/2018|115.08|-0.18|0.05|0.04|0.46|0.05|0.201 166|6|0.49|71,417|47,363|1.51\n8/6/2018|115.62|0.11|0.11|0.04|0.44|0.05|9.271 524|516|1.02|71,134|46,069|1.54\n3/3/2018|115.51|0.36|0.10|0.04|0,44|0.05|9.27| 36|750|0.05|70,949|45,952|1.54\n3/2/2018|115.51|0,36|0.10|0.04|0.449.05|0.171 698| 6|116.33|70,948|45,833|1.55\n/1/2018|115.01|-0.75|0.07|0.04|0.55|0.05|9.26 8,882 240|37.0172,39645,832|1.58\n7/31/2018|115.76|0.36|0.05|0.04|9.36|0.05|0.18| 16|326|291/71,548|45,875|1.56\n7/30/2018|115.40|-0.16|0.03|0.04|0.33//0.05|0.19|19,4601 90|216.22|72,455|46,617|1.53\n7/27/2018|115.56|0.15|0.01|.04|0.33|0.05|0.21 
1,5344,404|0,35-64,039-46,607|1.37\n7/26/2018|115.41|0.11|0.05|.04|9.41|.05|0.25|2,016/152/13.26|64,111|46,247|1.39\n7/25/2018|115.30|0.15|0.07|.04|9.42|.05/0.24/ 22|44|.05|63,182|46,206|1.37\n7/24/2018|115.15|0.36|0.05|.04|9.41|0.05|0.26| 230|228|1.01|63,171|46,094|1.37\n7/23/2018|114.79|-0.46|0.01|.04|9.38|.06|0.33|5,476|2,924|1.97|63,068|46,014|1.37\n7/20/2018|115.25|-0.34|0.01|.04|9.31/0.05|0.191 20|248|0.55|62,824|57,321|1.10\n7/19/2018|115.59|0.18|-0.01|.04|9.31/0.05|0.201 556|134|4.15|64,676|61,609|1.05\n7/18/2018|115.41|-0.02|-0.08|.04|9.37|0.05|0.22 22| 36|0.61|64,551|61,603|1.05\n7/17/2018|115.51|-0.19|-0.09|0.04|9.40|0.05|0.241 72|234|032|65,153|61,598||1.06\n7/16/2018|115.51|-0.19|-0.09|0.04|9.40/0.050.241 40| 30|133/65,135|61,602||1.06\n7/13/2018|115.70|0.09|-0.21|0.04|0.50|.05|0.19| 34| 38|0.9|65,223|61,591|1.06\n7/12/2018|115.61|0.25|-0.19|.04|0.50|.05|0.18| 4|204|.02|65,209|61,585|1.06\n7/11/2018|115.36|0.231-0.31/0.04|0.53/0.05|9.22\\ 63|938|0.08|65,109|61,516|1.06\n"

4 후처리 작업

텍스트 후처리 작업을 통해 데이터 프레임으로 추출한다.

# Turn the OCR text into a data frame: one row per text line, with the first
# five "|"-separated fields as columns, rendered as an interactive table.
table_text %>%
  # The OCR text ends with "\n"; omit_empty = TRUE drops the empty trailing
  # piece so it does not become a blank row in the result.
  stringi::stri_split_fixed(pattern = "\n", omit_empty = TRUE) %>%
  # Keep the lines of the first (only) OCR result.
  .[[1]] %>%
  # Split every line into its fields; empty fields are kept so that column
  # positions are not shifted.
  stringi::stri_split_fixed(pattern = "|") %>%
  # Fields missing from a short line come out as NA.
  purrr::map_df(~ tibble::tibble(
    Date = .[1],
    Price = .[2],
    Change = .[3],
    Corr30 = .[4],
    HV30 = .[5]
  )) %>%
  DT::datatable()
 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com