Jonas Kristoffer Lindeløv 님이 직접 R 코드를 짜서 2020년 R이 SPSS를 연간 인용수에 있어 따라잡을 것으로 예상했다. 이것이 맞는지를 확인해보자. 이를 위해서 rise-and-fall.R
스크립트를 Jobs로 실행시켜 구글 스칼러에서 인용수 데이터를 가져온다.
Google Scholar Scraper.R 파일에서 JASP
관련 부분을 생략하고 한국에서 많이 언급되는 통계 패키지를 중심으로 구글 스칼러에 등록된 소프트웨어를 살펴보자.
library(rvest)
library(httr)
library(tidyverse)
# Settings
= 2010:2019
years = list(
searches R = '"the R software" OR "the R project" OR "r-project.org" OR "R development core" OR "bioconductor" OR "lme4" OR "nlme" OR "lmeR function" OR "ggplot2" OR "Hmisc" OR "r function" OR "r package" OR "mass package" OR "plyr package" OR "mvtnorm"',
SPSS = 'SPSS -"SPSS Modeler" -"Amos"',
SAS = '"SAS Institute" -JMP -"Enterprise Miner"',
STATA = '("stata" "college station") OR "StataCorp" OR "Stata Corp" OR "Stata Journal" OR "Stata Press" OR "stata command" OR "stata module"'
)= c(1, 10) # Uniformly break between searches in this interval to prevent scholar from rejecting searches
sleep_interval = 'https://scholar.google.dk/scholar?hl=en&as_sdt=0%2C5&as_ylo=9999&as_yhi=9999&q='
scholar_prefix
###################
# HANDY FUNCTIONS #
###################
# Build the URL string
= function(software, year) {
get_url = gsub('9999', as.character(year), scholar_prefix) # Enter year
url_prefix = gsub(' ', '+', searches[[software]]) # Escape spaces
url_search = gsub('\"', '%22', url_search) # Escape quotes
url_search = paste(url_prefix, url_search, sep='')
url
url
}
# Do the web search
= function(url) {
get_html = read_html(url)
html #html = content(GET(url))
html
}
= function(html) {
extract_citations # Extract the citation number
= html %>%
hits_strings html_nodes(css='.gs_ab_mdw') %>% # Name of the class where we can find citation number
html_text()
= strsplit(hits_strings[2], ' ')[[1]][2] # Second hit, second "word"
hits_string = as.numeric(gsub(',', '', hits_string)) # As numeric, not string
hits_numeric
hits_numeric
}
= function(software, year) {
get_citations # Sleep to prevent HTTP error 503
= runif(1, sleep_interval[1], sleep_interval[2])
sleep_duration Sys.sleep(sleep_duration)
# Do the search
= get_url(software, year)
url = get_html(url)
html = extract_citations(html)
citations
# Status and return
print(sprintf('Got %i scholar citations in %i for %s', citations, year, software))
citations
}
#################
# DO THE SEARCH #
#################
= expand.grid(years, names(searches))
citation_history names(citation_history) = c('year', 'software')
= citation_history %>%
citation_history # filter(software == 'SAS') %>%
rowwise() %>%
mutate(
citations = get_citations(software, year)
)
# Save it so you don't have to repeat in case Scholar locks you out
write.csv(citation_history, 'data/citations.csv', row.names = F)
library(tidyverse)
library(ggthemes)
library(extrafont)
loadfonts()
<- read_csv("data/citations_2021.csv")
citation_df
%>%
citation_df mutate(year = lubridate::make_date(year = year)) %>%
ggplot(aes(x=year, y=citations, color=software)) +
geom_line() +
geom_point(size=2) +
theme_tufte(base_family = "NanumGothic") +
labs(x="", y="논문 인용수", title="SPSS 몰락과 R의 비약", color="통계팩키지") +
scale_y_sqrt(labels = scales::comma)
데이터 과학자 이광춘 저작
kwangchun.lee.7@gmail.com