GitHub
데이터 분석: 프로젝트 평가
연세대학교 2019년 성적제출과 성적확인은 다음 기간을 통해 이뤄진다.
Software Engineering for Data Science, 데이터 공학 과목을 이수한 팀 하나를 선택하여 GitHub 저장소를 가져온다. 데이터 공학 프로젝트 팀(TEAM 11대0)은 개발한 데이터 과학 제품을 AWS EC2 위에 올려 https://www.kickhelpers.com/ 웹사이트로 공개했다.
library(tidyverse)
library(fs)
library(here)
library(glue)
# Root directory under which every team project repository is cloned.
project_dir <- glue("{here()}/data/project")
# Team projects
## Team "11 to 0" (11대0) project --------------
team_11_repo_url <- "https://github.com/whoareyouwhoami/ProjectTellus.git"
# Directory the repository is cloned into
clone_team_11_dir <- glue("{project_dir}/team_11")
# Build the shell command. Both arguments are shell-quoted so the clone still
# works when the project path contains spaces or shell metacharacters.
clone_team_11_cmd <- glue(
  "git clone {shQuote(team_11_repo_url)} {shQuote(clone_team_11_dir)}"
)
# `git clone` fails when the destination already holds a repository, so only
# clone when no previous run has fetched it yet.
if (!dir.exists(file.path(clone_team_11_dir, ".git"))) {
  system(clone_team_11_cmd)
}
`fs` 패키지의 `dir_ls()` 함수로 GitHub 저장소에서 클론하여 가져온 디렉토리와 파일을 확인한다.
data/project/team_11/README.md
data/project/team_11/SW4DS_django
data/project/team_11/data_cleaning
data/project/team_11/documentation
data/project/team_11/modelling
data/project/team_11/requirements.txt
GitHub 저장소에서 데이터를 가져올 준비가 모두 완료된 경우 프로젝트 커밋 로그를 데이터프레임으로 변환시켜 데이터 분석 준비를 한다.
# Move into the cloned repository.
# NOTE(review): the command below already passes `-C <dir>` to git, so this
# setwd() looks redundant -- confirm nothing else relies on the working
# directory before removing it.
setwd(clone_team_11_dir)

# Output column name -> git pretty-format placeholder (without the leading
# "%"): committer date, abbreviated commit hash, abbreviated parent hashes,
# author name and subject line.
log_format_options <- c(
  datetime = "cd",
  commit = "h",
  parents = "p",
  author = "an",
  subject = "s"
)

# Separator written between the fields of each log line; the same value is
# used later to split the lines back into columns.
option_delim <- "\t"

# Prefix every placeholder with "%" and join them with the separator.
log_format <- glue_collapse(glue("%{log_format_options}"), sep = option_delim)

# Options for `git log`: one line per commit in the format above, with dates
# printed as "YYYY-MM-DD HH:MM:SS".
log_options <- glue('--pretty=format:"{log_format}" --date=format:"%Y-%m-%d %H:%M:%S"')

# Full command, run against the cloned repository.
log_cmd <- glue('git -C {clone_team_11_dir} log {log_options}')
log_cmd
git -C /Users/statkclee/swc/ingest-data/data/project/team_11 log --pretty=format:"%cd %h %p %an %s" --date=format:"%Y-%m-%d %H:%M:%S"
# Run `git log` and turn its output into a tibble with one row per commit.
# Each output line is split on the field delimiter into exactly
# length(log_format_options) pieces, so any extra delimiter inside the last
# field (the subject) stays part of that field.
team_11_logs <- system(log_cmd, intern = TRUE) %>%
  str_split_fixed(option_delim, length(log_format_options)) %>%
  # Name the matrix columns BEFORE converting: as_tibble() on a matrix without
  # column names goes through a deprecated name-repair path and warns.
  `colnames<-`(names(log_format_options)) %>%
  as_tibble() %>%
  # A commit can have several parents (merge commit); keep them as a list
  # column with one character vector of parent hashes per commit.
  mutate(parents = str_split(parents, " "))
team_11_logs
# A tibble: 169 x 5
datetime commit parents author subject
<chr> <chr> <list> <chr> <chr>
1 2019-12-13 19:44:59 1f20346 <chr [1]> dial0116 Delete Intro_PT.md
2 2019-12-13 19:44:11 c163759 <chr [1]> dial0116 Update README.md
3 2019-12-13 19:43:27 ec21f88 <chr [1]> dial0116 Update Intro_PT.ipynb
4 2019-12-13 19:42:19 e64f967 <chr [1]> dial0116 Add files via upload
5 2019-12-13 19:34:56 158d743 <chr [1]> dial0116 Add files via upload
6 2019-12-13 19:27:51 3d7e258 <chr [1]> dial0116 Add files via upload
7 2019-12-13 19:20:36 1f37324 <chr [1]> dial0116 Add files via upload
8 2019-12-13 19:16:56 83a6fe0 <chr [1]> dial0116 Update README.md
9 2019-12-13 19:11:28 d59ebd0 <chr [1]> dial0116 Update README.md
10 2019-12-13 18:03:23 4ff26e8 <chr [1]> dial0116 Update README.md
# … with 159 more rows
# Start with NA
team_11_logs <- team_11_logs %>% mutate(branch = NA_integer_)
# Create a boolean vector to represent free columns (1000 should be plenty!)
free_col <- rep(TRUE, 1000)
for (i in seq_len(nrow(team_11_logs) - 1)) { # - 1 to ignore root
# Check current branch col and assign open col if NA
branch <- team_11_logs$branch[i]
if (is.na(branch)) {
branch <- which.max(free_col)
free_col[branch] <- FALSE
team_11_logs$branch[i] <- branch
}
# Go through parents
parents <- team_11_logs$parents[[i]]
for (p in parents) {
parent_col <- team_11_logs$branch[team_11_logs$commit == p]
# If col is missing, assign it to same branch (if first parent) or new
# branch (if other)
if (is.na(parent_col)) {
parent_col <- if_else(p == parents[1], branch, which.max(free_col))
# If NOT missing this means a split has occurred. Assign parent the lowest
# and re-open both cols (parent closed at the end)
} else {
free_col[c(branch, parent_col)] <- TRUE
parent_col <- min(branch, parent_col)
}
# Close parent col and assign
free_col[parent_col] <- FALSE
team_11_logs$branch[team_11_logs$commit == p] <- parent_col
}
}
team_11_logs
# A tibble: 169 x 6
datetime commit parents author subject branch
<chr> <chr> <list> <chr> <chr> <int>
1 2019-12-13 19:44:… 1f20346 <chr [1]> dial0116 Delete Intro_PT.md 1
2 2019-12-13 19:44:… c163759 <chr [1]> dial0116 Update README.md 1
3 2019-12-13 19:43:… ec21f88 <chr [1]> dial0116 Update Intro_PT.ip… 1
4 2019-12-13 19:42:… e64f967 <chr [1]> dial0116 Add files via uplo… 1
5 2019-12-13 19:34:… 158d743 <chr [1]> dial0116 Add files via uplo… 1
6 2019-12-13 19:27:… 3d7e258 <chr [1]> dial0116 Add files via uplo… 1
7 2019-12-13 19:20:… 1f37324 <chr [1]> dial0116 Add files via uplo… 1
8 2019-12-13 19:16:… 83a6fe0 <chr [1]> dial0116 Update README.md 1
9 2019-12-13 19:11:… d59ebd0 <chr [1]> dial0116 Update README.md 1
10 2019-12-13 18:03:… 4ff26e8 <chr [1]> dial0116 Update README.md 1
# … with 159 more rows
# A tibble: 8 x 2
author n
<chr> <int>
1 YW mac15 70
2 JoJo 32
3 dial0116 16
4 jiwon12-31 15
5 whoareyouwhoami 13
6 JunghaYun 11
7 JOJO 7
8 Jiwon Park 5
GitHub url을 넣으면 다운로드 받는 함수를 제작해보자.
#' Clone a GitHub repository under the project data directory.
#'
#' The repository is cloned into `{here()}/data/project/<name>`, where
#' `<name>` is the last path component of `repo_url`. The target path is also
#' written to the global variable `clone_dir`, which the calling script reads
#' after each call.
#'
#' @param repo_url A single repository URL (character scalar).
#' @return The value returned by `system()` for the `git clone` command.
download_repo <- function(repo_url) {
  stopifnot(is.character(repo_url), length(repo_url) == 1L, !is.na(repo_url))

  project_dir <- glue("{here()}/data/project")

  # Repository name = last path component of the URL. Trailing slashes are
  # dropped first; otherwise the name would be empty and the repository would
  # be cloned straight into the shared project directory.
  repo_name <- str_extract(str_remove(repo_url, "/+$"), "[^/]*$")
  if (is.na(repo_name) || repo_name == "") {
    stop("Cannot derive a repository name from: ", repo_url, call. = FALSE)
  }

  # Directory the repository is cloned into. Assigned globally on purpose:
  # the calling script lists `clone_dir` right after each download.
  clone_dir <<- glue("{project_dir}/{repo_name}")

  # Build the shell command; both arguments are shell-quoted so URLs or paths
  # containing spaces or shell metacharacters are passed through intact.
  clone_cmd <- glue("git clone {shQuote(repo_url)} {shQuote(clone_dir)}")
  cat("cloning...", clone_cmd, "\n")
  system(clone_cmd)
}
# Download each team's repository and list the cloned files.
# download_repo() stores the target directory in the global `clone_dir`, so
# every fs::dir_ls(clone_dir) call lists the repository cloned just above it.

# Team "11 to 0" -----
team_url <- "https://github.com/whoareyouwhoami/ProjectTellus"
download_repo(team_url)
fs::dir_ls(clone_dir)
# Team "Yonsei Tour" -----
연세투어_url <- "https://github.com/yonseijaewon/yonsei-tour"
download_repo(연세투어_url)
fs::dir_ls(clone_dir)
# Team "4 Dollars" -----
four_달러_url <- "https://github.com/lhmlhm1111/Data_GongHak"
download_repo(four_달러_url)
fs::dir_ls(clone_dir)
# Team "Seoul Festival Encyclopedia" -----
seoul_festival_url <- "https://github.com/HGmin1159/Seoul_Festival"
download_repo(seoul_festival_url)
fs::dir_ls(clone_dir)