- charlatan: Make Fake Data — 파이썬 faker 팩키지에서 영감을 받음!!!
- fakir: Provide fake datasets that can be used to teach R
- fabricatr: Imagine your data before you collect it
- fakeR: Simulates Data from a Data Frame of Different Variable Types
- SimMultiCorrData: Simulation of Correlated Data with Multiple Variable Types
- bindata: Generation of Artificial Binary Data

charlatan 팩키지는 파이썬 faker 팩키지에서 영감을 받아 R 팩키지로 제작되었다. 한국어 가짜 데이터를 제작하는데 파이썬 팩키지 개발 내용을 R 팩키지로 반영시키면 큰 의미가 있을 듯 싶다.
# charlatan is on GitHub; install once with devtools if not available.
# devtools::install_github("ropensci/charlatan")
library(charlatan)
# Generate 30 fake phone numbers using the Korean (ko_KR) locale.
ch_generate('phone_number', n = 30, locale = "ko_KR")
# A tibble: 30 x 1
phone_number
<chr>
1 018-269-1069
2 042-390-7854
3 063-882-2100
4 044-179-4388
5 070-4934-4944
6 055-018-9953
7 052-199-5905
8 041-944-2944
9 041-056-2129
10 062-006-6959
# ... with 20 more rows
위경도 범위를 대한민국 영역으로 한정하여 가짜 위경도 좌표를 만들어낸다.
library(tidyverse)
library(leaflet)

# Draw 10 random positions restricted to a bounding box roughly covering
# the Korean peninsula (lon 124-132, lat 33-43), then bind the list of
# coordinate pairs into a two-column tibble named (lon, lat).
locations_df <- do.call(rbind, ch_position(10, bbox = c(124, 33, 132, 43))) %>%
  as_tibble() %>%
  set_names(c("lon", "lat"))

# Alternative: draw longitude and latitude independently (no bbox control).
# locations_df <- tibble(lon = ch_lon(n = 10), lat = ch_lat(n = 10))

# Show the fake locations on an interactive leaflet map.
locations_df %>%
  leaflet() %>%
  addProviderTiles("Stamen.Watercolor") %>%
  addMarkers(~lon, ~lat)
fakir 팩키지는 charlatan 팩키지에 영감을 받아 R 교육용으로 개발된 팩키지로 실제 데이터 과학 교육에 사용될 수 있다. 한가지 문제는 프랑스를 중심으로 데이터가 제작되었다는 한계가 있다.
# fakir is on GitHub; install once with devtools if not available.
# devtools::install_github("ThinkR-open/fakir")
library(fakir)
# Generate a fake support-ticket/client dataset with 10 rows (25 columns).
fake_ticket_client(vol = 10)
# A tibble: 10 x 25
ref num_client first last job age region id_dpt departement
<chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
1 DOSS~ 79 Jovan O'Ke~ Gene~ 22 <NA> 23 Creuse
2 DOSS~ 69 Miss Lean~ Emer~ 68 Midi-~ 82 Tarn-et-Ga~
3 DOSS~ 120 Odell Stok~ Engi~ 24 <NA> 63 Puy-de-Dome
4 DOSS~ 31 Loren Lars~ <NA> NA Midi-~ 32 Gers
5 DOSS~ 59 Mayb~ Maye~ Furt~ 18 Breta~ 56 Morbihan
6 DOSS~ 118 Jama~ Ober~ Engi~ 18 Ile-d~ 77 Seine-et-M~
7 DOSS~ 77 Lee Scha~ Admi~ NA Ile-d~ 94 Val-de-Mar~
8 DOSS~ 65 Deme~ Auer Cont~ 21 Poito~ 86 Vienne
9 DOSS~ 141 Wilf~ Harv~ Educ~ 53 Breta~ 29 Finistere
10 DOSS~ 182 Addy~ Nien~ Earl~ 65 Breta~ 29 Finistere
# ... with 16 more variables: cb_provider <chr>, name <chr>, entry_date <dttm>,
# fidelity_points <dbl>, priority_encoded <dbl>, priority <fct>,
# timestamp <date>, year <dbl>, month <dbl>, day <int>, supported <chr>,
# supported_encoded <int>, type <chr>, type_encoded <int>, state <fct>,
# source_call <fct>
# split = TRUE returns a list of two linked tables ($clients and $tickets)
# instead of one flat table; assign it, then print to inspect both parts.
tickets_db <- fake_ticket_client(vol = 100, split = TRUE)
tickets_db
$clients
# A tibble: 200 x 14
num_client first last job age region id_dpt departement cb_provider
* <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 1 Solo~ Hean~ Civi~ 53 Pays ~ 85 Vendee Diners Clu~
2 2 Karma Will~ Scie~ 81 Bourg~ 58 Nievre VISA 13 di~
3 3 Press Kulas Anim~ NA Aquit~ 33 Gironde <NA>
4 4 Laken McDe~ <NA> NA <NA> 2A Corse-du-S~ <NA>
5 5 Sydn~ Jask~ Hort~ 30 <NA> 27 Eure <NA>
6 6 Clay~ Runo~ Comm~ NA Langu~ 30 Gard Diners Clu~
7 7 Robe~ Purd~ Fina~ 60 Limou~ 87 Haute-Vien~ <NA>
8 8 Dr. Rona~ Astr~ 30 Pays ~ 44 Loire-Atla~ <NA>
9 9 Miss Alon~ Occu~ 18 Corse 2A Corse-du-S~ Diners Clu~
10 10 Vern~ Ondr~ Clin~ 19 Limou~ 19 Correze <NA>
# ... with 190 more rows, and 5 more variables: name <chr>, entry_date <dttm>,
# fidelity_points <dbl>, priority_encoded <dbl>, priority <fct>
$tickets
# A tibble: 100 x 10
ref num_client year month day timestamp supported type state
<chr> <chr> <dbl> <dbl> <int> <date> <chr> <chr> <fct>
1 DOSS~ 1 2015 1 8 2015-01-08 Non Inst~ Term~
2 DOSS~ 22 2018 10 30 2018-10-30 Non Inst~ Atte~
3 DOSS~ 9 2018 12 5 2018-12-05 Non Inst~ Term~
4 DOSS~ 8 2018 12 19 2018-12-19 Non Box Atte~
5 DOSS~ 30 2019 1 5 2019-01-05 Oui Inst~ Inte~
6 DOSS~ 10 2019 1 17 2019-01-17 Oui Inst~ Atte~
7 DOSS~ 37 2019 2 14 2019-02-14 Non Ligne Atte~
8 DOSS~ 37 2019 4 6 2019-04-06 Non Box Atte~
9 DOSS~ 24 2019 4 14 2019-04-14 Non <NA> En c~
10 DOSS~ 12 2019 5 1 2019-05-01 Non Inst~ Atte~
# ... with 90 more rows, and 1 more variable: source_call <fct>
fake_products(10)
# A tibble: 10 x 8
name brand color price body_location category sent_from id
<chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
1 Step and Dis~ Larkin, ~ Pink 3 Waist Industri~ Taiwan 1
2 Biking Track~ Larkin, ~ OliveD~ 5 Waist Pets and~ United S~ 2
3 Wearable Tra~ Moen, Mo~ Beige 4 Feet Lifestyle Netherla~ 3
4 Multifunctio~ Weimann,~ Yellow~ 5 Head Lifestyle China 4
5 Action Camer~ Blanda, ~ Moccas~ 10 Brain Lifestyle Italy 5
6 Strapless He~ Mann and~ DeepPi~ 4 Head Medical Finland 6
7 Action Camer~ Mann and~ Maroon 5 Feet Entertai~ Finland 7
8 Microcontrol~ Moen, Mo~ Pink 3 Neck Fitness Finland 8
9 Microcontrol~ Moen, Mo~ Plum 5 Arms Awesome France 9
10 Wearable AUS~ Moen, Mo~ DarkRed 9 Torso Medical Italy 10
fake_visits(from = "2017-01-01", to = "2017-01-31")
# A tibble: 31 x 8
timestamp year month day home about blog contact
* <date> <dbl> <dbl> <int> <int> <int> <int> <int>
1 2017-01-01 2017 1 1 369 220 404 210
2 2017-01-02 2017 1 2 159 250 414 490
3 2017-01-03 2017 1 3 436 170 498 456
4 2017-01-04 2017 1 4 NA 258 526 392
5 2017-01-05 2017 1 5 362 NA 407 291
6 2017-01-06 2017 1 6 245 145 576 90
7 2017-01-07 2017 1 7 NA NA 484 167
8 2017-01-08 2017 1 8 461 103 441 NA
9 2017-01-09 2017 1 9 337 113 673 379
10 2017-01-10 2017 1 10 NA 169 308 139
# ... with 21 more rows
fake_sondage_answers(n = 10, split = TRUE)
$individus
# A tibble: 10 x 8
id_individu age sexe region id_departement nom_departement
<chr> <int> <chr> <chr> <chr> <chr>
1 ID-NYDZ-010 NA <NA> <NA> 09 Ariege
2 ID-PWLB-009 71 F Champ~ 10 Aube
3 ID-NMQG-001 42 M <NA> 05 Hautes-Alpes
4 ID-RJXN-002 71 O Poito~ 86 Vienne
5 ID-MROK-007 41 M Haute~ 27 Eure
6 ID-VMKS-004 33 O Aquit~ 40 Landes
7 ID-XEMZ-003 81 O Franc~ 70 Haute-Saone
8 ID-EUDQ-005 44 M Champ~ 10 <NA>
9 ID-DCIZ-008 92 O Basse~ 61 Orne
10 ID-KPUS-006 57 O Aquit~ 33 <NA>
# ... with 2 more variables: question_date <dttm>, year <dbl>
$answers
# A tibble: 30 x 5
id_individu type distance_km transport temps_trajet_en_heures
<chr> <chr> <dbl> <fct> <dbl>
1 ID-NYDZ-010 travail 12.2 voiture 0.15
2 ID-NYDZ-010 commerces 9.61 bus 1.01
3 ID-NYDZ-010 loisirs 549. avion 0.27
4 ID-PWLB-009 travail 11.9 voiture 0.14
5 ID-PWLB-009 commerces 27.4 voiture 0.34
6 ID-PWLB-009 loisirs 210. train 0.42
7 ID-NMQG-001 travail 2.38 velo 0.43
8 ID-NMQG-001 commerces 14.9 voiture 0.18
9 ID-NMQG-001 loisirs 446. train 0.89
10 ID-RJXN-002 travail 6.18 mobylette 0.75
# ... with 20 more rows
fabricatr 팩키지는 데이터를 수집하기 전에 수집된 데이터가 어떤 모양일지 미리 모의실험을 통해서 데이터를 얻은 후에 이를 후속 작업에 활용하는 것을 지원한다. 이는 특별히 의미가 클 것으로 생각됩니다.
# fabricatr is on CRAN; install once if not available.
# install.packages("fabricatr")
library(fabricatr)

# Simulate a two-level (hierarchical) dataset: 2 parties at the top level,
# then one row per incumbent member nested within each party.
house_members <- fabricate(
  party_id = add_level(
    # Party level: names, ideology score, governing status, seat counts.
    N = 2, party_names = c("민주당", "국민의 힘"), party_ideology = c(0.5, -0.5),
    in_power = c(1, 0), party_incumbents = c(180, 103)
  ),
  rep_id = add_level(
    # Member level: N expands to each party's seat count; members draw
    # ideology around their party mean, plus terms served and gender.
    N = party_incumbents, member_ideology = rnorm(N, party_ideology, sd = 0.5),
    terms_served = draw_count(N = N, mean = 4),
    female = draw_binary(N = N, prob = 0.19)
  )
) %>%
  as_tibble()

house_members
# A tibble: 283 x 9
party_id party_names party_ideology in_power party_incumbents rep_id
<chr> <chr> <dbl> <dbl> <dbl> <chr>
1 1 민주당 0.5 1 180 001
2 1 민주당 0.5 1 180 002
3 1 민주당 0.5 1 180 003
4 1 민주당 0.5 1 180 004
5 1 민주당 0.5 1 180 005
6 1 민주당 0.5 1 180 006
7 1 민주당 0.5 1 180 007
8 1 민주당 0.5 1 180 008
9 1 민주당 0.5 1 180 009
10 1 민주당 0.5 1 180 010
# ... with 273 more rows, and 3 more variables: member_ideology <dbl>,
# terms_served <int>, female <int>
fakeR 팩키지: USArrests 데이터셋은 미국 주별 주요 통계를 담고 있는 데이터셋이다. fakeR을 통해 시계열 뿐만 아니라 cross-sectional 데이터셋도 원데이터 특성을 반영한 가짜 데이터를 만들어 낼 수 있다.
library(fakeR)

# Tidy USArrests: promote rownames to a column, convert to tibble,
# and normalize column names to snake_case.
us_arrests_df <- USArrests %>%
  rownames_to_column(var = "state_name") %>%
  as_tibble() %>%
  janitor::clean_names()

# simulate_dataset() expects a plain data.frame; it produces fake data
# that preserves the statistical characteristics of the original columns.
fake_us_arrests_df <- simulate_dataset(us_arrests_df %>% as.data.frame()) %>%
  as_tibble()
[1] "Some unordered factors..."
[1] "Numeric variables. No ordered factors..."
# Join the original and simulated data side by side for visual comparison
# of real vs. fake column distributions.
USArrests %>%
  rownames_to_column(var = "state_name") %>%
  left_join(fake_us_arrests_df) %>%
  as_tibble()
# A tibble: 72 x 9
state_name Murder Assault UrbanPop Rape murder assault urban_pop rape
<chr> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alabama 13.2 236 58 21.2 NA NA NA NA
2 Alaska 10 263 48 44.5 5.29 161. 79.3 12.8
3 Alaska 10 263 48 44.5 9.13 105. 48.0 17.7
4 Alaska 10 263 48 44.5 10.6 239. 78.6 25.7
5 Alaska 10 263 48 44.5 12.8 272. 71.6 26.6
6 Alaska 10 263 48 44.5 10.4 201. 73.2 25.3
7 Arizona 8.1 294 80 31 NA NA NA NA
8 Arkansas 8.8 190 50 19.5 NA NA NA NA
9 California 9 276 91 40.6 NA NA NA NA
10 Colorado 7.9 204 78 38.7 NA NA NA NA
# ... with 62 more rows
데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com