2 개념
alluvial plot
을 이해하는데 다음 기본 구성요소를 이해하는 것이 도움이 된다.
- strata, or stacked bar plots, located in parallel along a (plotting) axis of (variable) axes or dimensions
- alluvia, ribbons through strata that connect the categories of individual cases or cohorts at different axes
- lodes, subdivisions of strata by their intersections with alluvia
- flows, segments of alluvia between strata
# 사례: alluvial
2.1 우울증 데이터
Eric Green, “Alluvial Diagrams - Plotting pathways over time”, towards data science 블로그에 게시된 alluvial 그래프 소개에 포함된 케냐 학생 우울증 데이터를 가져와서 시각화해보자. 전통적인 방식으로 다음과 같이 작업을 하여 설명을 달아 논문이나 보고서에 제출하게 된다.
2018년에 나온 블로그글이다 보니 다소 문법이 오래되어 최신 R 문법에 맞춰 일부 코드를 수정하여 경고나 오류가 없도록 한다.
# load packages
library(tidyverse)
# get the data
<- read_csv("https://raw.githubusercontent.com/ericpgreen/JCPP2018/master/data%20and%20replication%20files/input/dat.csv")
data
<- data %>%
tidy_data # select the key variables for analysis
select(ID, w1group,
w1_54a, w1_54b, w1_54d, w1_54e, w1_54f,
w2_54a, w2_54b, w2_54d, w2_54e, w2_54f,
w3_54a, w3_54b, w3_54d, w3_54e, w3_54f,%>%
w4_54a, w4_54b, w4_54d, w4_54e, w4_54f) # replace any NaN with NA
mutate_at(vars(-ID, -w1group), ~ ifelse(is.nan(.), NA, .)) %>%
# rescale data range (1-4) to (0-3) by subtracting 1
mutate_at(vars(-ID, -w1group), funs(. - 1)) %>%
# reverse code variable _54a
mutate_at(vars(w1_54a, w2_54a, w3_54a, w4_54a),
funs(3 - .)) %>%
# construct depression severity
rowwise() %>%
mutate(depress.1 = mean(c(w1_54a, w1_54b, w1_54d, w1_54e, w1_54f),
na.rm=TRUE),
depress.2 = mean(c(w2_54a, w2_54b, w2_54d, w2_54e, w2_54f),
na.rm=TRUE),
depress.3 = mean(c(w3_54a, w3_54b, w3_54d, w3_54e, w3_54f),
na.rm=TRUE),
depress.4 = mean(c(w4_54a, w4_54b, w4_54d, w4_54e, w4_54f),
na.rm=TRUE)
%>%
) select(ID, w1group, depress.1, depress.2, depress.3, depress.4) %>%
pivot_longer(cols = depress.1:depress.4, names_to = "time", values_to = "depress") %>%
mutate(time = str_remove(time, "depress.")) %>%
# construct indicator of depression
mutate(dep16 = ifelse(depress >= 0.8, 1, 0)) %>%
# construct indicator of depression at baseline for every obs using dep16
group_by(ID) %>%
mutate(dep16Base = dep16[time == 1L]) %>%
ungroup() %>%
# categorize participants relative to baseline (time 1)
mutate(depStatus = case_when(dep16Base==1 & dep16==1 ~ "still depressed",
==1 & dep16==0 ~ "remission",
dep16Base==0 & dep16==1 ~ "became depressed",
dep16Base==0 & dep16==0 ~ "never depressed",
dep16BaseTRUE ~ "missing")) %>%
# labels
mutate(w1group = factor(w1group, levels=c(0, 1), labels=c("control", "intervention"))) %>%
# highlight two participants
mutate(highlight = case_when(ID == 2 ~ 1,
== 3 ~ 2,
ID TRUE ~ 3))
tidy_data
# A tibble: 3,340 x 8
ID w1group time depress dep16 dep16Base depStatus highlight
<dbl> <fct> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
1 1 control 1 1.2 1 1 still depressed 3
2 1 control 2 1 1 1 still depressed 3
3 1 control 3 0.6 0 1 remission 3
4 1 control 4 1.4 1 1 still depressed 3
5 2 intervention 1 0 0 0 never depressed 1
6 2 intervention 2 0.2 0 0 never depressed 1
7 2 intervention 3 0.6 0 0 never depressed 1
8 2 intervention 4 0.2 0 0 never depressed 1
9 3 control 1 1 1 1 still depressed 2
10 3 control 2 0.2 0 1 remission 2
# ... with 3,330 more rows
2.2 전통적인 시각화
%>%
tidy_data filter(highlight > 2) %>%
ggplot() +
# plot N-2 as a base layer
geom_line(aes(time, depress, color=factor(highlight), group=ID), size=0.5) +
# highlight two individuals and plot them second to appear on top
geom_line(data = tidy_data %>% filter(highlight < 3),
aes(time, depress, color=factor(highlight), group=ID), size=1) +
scale_color_manual(values=c("#ff0000", "#0000ff", "#D3D3D3")) +
scale_size_manual(values=c(1, 1, 0.5)) +
facet_wrap(~w1group) +
theme_bw() +
theme(legend.position="none")
2.3 alluvial 그래프
# convert back to wide for alluvial
<- tidy_data %>%
alluvial_data # long to wide
select(ID, w1group, time, depress, dep16) %>%
pivot_longer(depress:dep16, names_to = "variable", values_to = "value") %>%
# gather(key = variable, value = value, -ID, -w1group, -time) %>%
unite(variableT, variable, time, sep=".") %>%
spread(variableT, value) %>%
# alluvial prep
filter(complete.cases(.)) %>%
group_by(dep16.1, dep16.2, dep16.3, dep16.4) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate_at(vars(-n), funs(factor(., levels=c(0, 1), labels=c("not depressed",
"depressed"))))
# alluvial diagram
library(alluvial)
alluvial(alluvial_data[,1:4], freq=alluvial_data$n,
col = ifelse(alluvial_data$dep16.4 == "depressed", "#ff0000", "#D3D3D3"),
axis_labels = c("Year 1", "Year 2", "Year 3", "Year 4"),
cex = 0.7)