데이터 과학

데이터테이블

학습 목표

  • dplyr 과 hflights 팩키지 1

데이터 분석 시작

dplyr, hflights 팩키지를 불러와서 head, summary 명령어로 데이터를 살펴본다.

# Attach the two packages this walkthrough relies on.
library(dplyr)
library(hflights)
# Print the first rows of hflights for a quick look at its columns.
head(hflights)
FALSE      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
FALSE 5424 2011     1          1         6    1400    1500            AA
FALSE 5425 2011     1          2         7    1401    1501            AA
FALSE 5426 2011     1          3         1    1352    1502            AA
FALSE 5427 2011     1          4         2    1403    1513            AA
FALSE 5428 2011     1          5         3    1405    1507            AA
FALSE 5429 2011     1          6         4    1359    1503            AA
FALSE      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
FALSE 5424       428  N576AA                60      40      -10        0    IAH
FALSE 5425       428  N557AA                60      45       -9        1    IAH
FALSE 5426       428  N541AA                70      48       -8       -8    IAH
FALSE 5427       428  N403AA                70      39        3        3    IAH
FALSE 5428       428  N492AA                62      44       -3        5    IAH
FALSE 5429       428  N262AA                64      45       -7       -1    IAH
FALSE      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
FALSE 5424  DFW      224      7      13         0                         0
FALSE 5425  DFW      224      6       9         0                         0
FALSE 5426  DFW      224      5      17         0                         0
FALSE 5427  DFW      224      9      22         0                         0
FALSE 5428  DFW      224      9       9         0                         0
FALSE 5429  DFW      224      6      13         0                         0
# Per-column overview: quantiles, mean and NA counts for the numeric columns,
# length/class/mode for the character columns.
summary(hflights)
FALSE       Year          Month          DayofMonth      DayOfWeek    
FALSE  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
FALSE  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
FALSE  Median :2011   Median : 7.000   Median :16.00   Median :4.000  
FALSE  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948  
FALSE  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
FALSE  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000  
FALSE                                                                 
FALSE     DepTime        ArrTime     UniqueCarrier        FlightNum   
FALSE  Min.   :   1   Min.   :   1   Length:227496      Min.   :   1  
FALSE  1st Qu.:1021   1st Qu.:1215   Class :character   1st Qu.: 855  
FALSE  Median :1416   Median :1617   Mode  :character   Median :1696  
FALSE  Mean   :1396   Mean   :1578                      Mean   :1962  
FALSE  3rd Qu.:1801   3rd Qu.:1953                      3rd Qu.:2755  
FALSE  Max.   :2400   Max.   :2400                      Max.   :7290  
FALSE  NA's   :2905   NA's   :3066                                    
FALSE    TailNum          ActualElapsedTime    AirTime         ArrDelay      
FALSE  Length:227496      Min.   : 34.0     Min.   : 11.0   Min.   :-70.000  
FALSE  Class :character   1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000  
FALSE  Mode  :character   Median :128.0     Median :107.0   Median :  0.000  
FALSE                     Mean   :129.3     Mean   :108.1   Mean   :  7.094  
FALSE                     3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000  
FALSE                     Max.   :575.0     Max.   :549.0   Max.   :978.000  
FALSE                     NA's   :3622      NA's   :3622    NA's   :3622     
FALSE     DepDelay          Origin              Dest              Distance     
FALSE  Min.   :-33.000   Length:227496      Length:227496      Min.   :  79.0  
FALSE  1st Qu.: -3.000   Class :character   Class :character   1st Qu.: 376.0  
FALSE  Median :  0.000   Mode  :character   Mode  :character   Median : 809.0  
FALSE  Mean   :  9.445                                         Mean   : 787.8  
FALSE  3rd Qu.:  9.000                                         3rd Qu.:1042.0  
FALSE  Max.   :981.000                                         Max.   :3904.0  
FALSE  NA's   :2905                                                            
FALSE      TaxiIn           TaxiOut         Cancelled       CancellationCode  
FALSE  Min.   :  1.000   Min.   :  1.00   Min.   :0.00000   Length:227496     
FALSE  1st Qu.:  4.000   1st Qu.: 10.00   1st Qu.:0.00000   Class :character  
FALSE  Median :  5.000   Median : 14.00   Median :0.00000   Mode  :character  
FALSE  Mean   :  6.099   Mean   : 15.09   Mean   :0.01307                     
FALSE  3rd Qu.:  7.000   3rd Qu.: 18.00   3rd Qu.:0.00000                     
FALSE  Max.   :165.000   Max.   :163.00   Max.   :1.00000                     
FALSE  NA's   :3066      NA's   :2947                                         
FALSE     Diverted       
FALSE  Min.   :0.000000  
FALSE  1st Qu.:0.000000  
FALSE  Median :0.000000  
FALSE  Mean   :0.002853  
FALSE  3rd Qu.:0.000000  
FALSE  Max.   :1.000000  
FALSE 

데이터프레임을 데이터테이블로 변환

tbl_df 함수를 사용해서 데이터프레임(data.frame)을 dplyr의 데이터테이블(tbl_df, 현재의 티블 tibble)로 변환한다. 여기서 말하는 데이터테이블은 data.table 팩키지의 data.table 객체와는 다른 것으로, 내부를 살펴보면 데이터프레임을 기본으로 하면서 개발자 관점에서 더욱 쉽고, 빠르고, 직관적으로 데이터를 처리할 수 있는 자료형이다.

# Wrap the data frame with tbl_df() so that printing it shows only the first
# rows and the columns that fit on screen.
# NOTE(review): tbl_df() is reported as deprecated in newer dplyr releases in
# favour of as_tibble() — confirm against the installed dplyr version.
hflights <- tbl_df(hflights)

# Auto-print the converted object.
hflights
FALSE Source: local data frame [227,496 x 21]
FALSE 
FALSE     Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
FALSE    (int) (int)      (int)     (int)   (int)   (int)         (chr)
FALSE 1   2011     1          1         6    1400    1500            AA
FALSE 2   2011     1          2         7    1401    1501            AA
FALSE 3   2011     1          3         1    1352    1502            AA
FALSE 4   2011     1          4         2    1403    1513            AA
FALSE 5   2011     1          5         3    1405    1507            AA
FALSE 6   2011     1          6         4    1359    1503            AA
FALSE 7   2011     1          7         5    1359    1509            AA
FALSE 8   2011     1          8         6    1355    1454            AA
FALSE 9   2011     1          9         7    1443    1554            AA
FALSE 10  2011     1         10         1    1443    1553            AA
FALSE ..   ...   ...        ...       ...     ...     ...           ...
FALSE Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
FALSE   (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
FALSE   (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
FALSE   CancellationCode (chr), Diverted (int)
# Pull the UniqueCarrier column out as a standalone vector.
carriers <- hflights[["UniqueCarrier"]]

변수값 재부호화(recoding)

데이터 작업 중에 가장 많이 수행하는 작업 중 하나가 변수 내부 값을 다른 값으로 재부호화하는 것이다. 예를 들어, 남자 → Male, 여자 → Female처럼 한국어를 영어로 재부호화하거나, KO → 한국, CN → 중국처럼 코드화된 것을 개발자가 이해하기 쉽게 새롭게 재부호화하는 것이다. car 라이브러리 recode 함수 등 다양한 재부호화 함수가 존재한다.

데이터프레임을 재부호화하는 방법을 살펴보자. 초간략 코드화된 항공사 코드를 개발자가 이해하기 쉬운 코드로 바꿔보자.

# Lookup table: two-letter carrier code (element name) -> readable carrier
# label (element value).
lut <- c(
  "AA" = "American",
  "AS" = "Alaska",
  "B6" = "JetBlue",
  "CO" = "Continental",
  "DL" = "Delta",
  "OO" = "SkyWest",
  "UA" = "United",
  "US" = "US_Airways",
  "WN" = "Southwest",
  "EV" = "Atlantic_Southeast",
  "F9" = "Frontier",
  "FL" = "AirTran",
  "MQ" = "American_Eagle",
  "XE" = "ExpressJet",
  "YV" = "Mesa"
)

# Indexing the table by the code column translates every row in one step;
# a code with no entry in the table comes back as NA.
# unname() strips the names that name-based indexing attaches to its result,
# so the column is stored as a plain character vector rather than carrying
# the old code as a name on every element.
hflights$UniqueCarrier <- unname(lut[hflights$UniqueCarrier])

# Compact column-by-column overview to confirm the recoding.
glimpse(hflights)
FALSE Observations: 227,496
FALSE Variables: 21
FALSE $ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
FALSE $ Month             (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
FALSE $ DayofMonth        (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
FALSE $ DayOfWeek         (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
FALSE $ DepTime           (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
FALSE $ ArrTime           (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
FALSE $ UniqueCarrier     (chr) "American", "American", "American", "America...
FALSE $ FlightNum         (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
FALSE $ TailNum           (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
FALSE $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
FALSE $ AirTime           (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
FALSE $ ArrDelay          (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
FALSE $ DepDelay          (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
FALSE $ Origin            (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
FALSE $ Dest              (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
FALSE $ Distance          (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
FALSE $ TaxiIn            (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
FALSE $ TaxiOut           (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
FALSE $ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
FALSE $ CancellationCode  (chr) "", "", "", "", "", "", "", "", "", "", "", ...
FALSE $ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...

결과를 일별하는데 glimpse 함수가 요긴하게 사용될 수 있다. 또다른 사례로 운항취소 코드를 개발자가 이해하기 쉬운 코드로 재부호화해보자.

# Lookup table: cancellation code (element name) -> readable reason (value).
# "E" does not occur in the raw column; it is the placeholder assigned below
# to flights whose code is the empty string, i.e. flights with no reason code.
# NOTE(review): "FFA" looks like a typo for "FAA" — left unchanged in case
# later code matches on the literal; confirm before renaming.
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA", "D" = "security",
         "E" = "not cancelled")

# Name-based indexing never matches "", so without this step every flight
# with a blank code would be recoded to NA instead of "not cancelled".
hflights$CancellationCode[hflights$CancellationCode == ""] <- "E"

# Translate the codes; unname() drops the per-element names that the lookup
# attaches, leaving a plain character column.
hflights$CancellationCode <- unname(lut[hflights$CancellationCode])

# Compact column-by-column overview to confirm the recoding.
glimpse(hflights)
FALSE Observations: 227,496
FALSE Variables: 21
FALSE $ Year              (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
FALSE $ Month             (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
FALSE $ DayofMonth        (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
FALSE $ DayOfWeek         (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
FALSE $ DepTime           (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
FALSE $ ArrTime           (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
FALSE $ UniqueCarrier     (chr) "American", "American", "American", "America...
FALSE $ FlightNum         (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
FALSE $ TailNum           (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
FALSE $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
FALSE $ AirTime           (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
FALSE $ ArrDelay          (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
FALSE $ DepDelay          (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
FALSE $ Origin            (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
FALSE $ Dest              (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
FALSE $ Distance          (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
FALSE $ TaxiIn            (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
FALSE $ TaxiOut           (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
FALSE $ Cancelled         (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
FALSE $ CancellationCode  (chr) "not cancelled", "not cancelled", "not cance...
FALSE $ Diverted          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...