1 The Disappearing SPSS

Jonas Kristoffer Lindeløv wrote his own R code and predicted that by 2020 R would catch up with SPSS in annual citations. Let's check whether that prediction holds. To do so, we run the rise-and-fall.R script as a background Job and pull the counts from Google Scholar.
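
For instance, the scrape can be launched as an RStudio background job straight from the console; a minimal sketch, assuming the script sits in the project root (the path and job name here are illustrative):

library(rstudioapi)

# Run the scraper in the background so the console stays free while it works
jobRunScript(
  path       = "rise-and-fall.R",
  name       = "Google Scholar citation scrape",
  workingDir = getwd(),   # so relative paths such as data/ resolve as expected
  importEnv  = FALSE
)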

Starting from the Google Scholar Scraper.R file, we drop the JASP-related part and, focusing on the statistical packages most often mentioned in Korea, look at how each piece of software shows up in Google Scholar.

library(rvest)
library(httr)
library(tidyverse)

# Settings
years = 2010:2019
searches = list(
  R = '"the R software" OR "the R project" OR "r-project.org" OR "R development core" OR "bioconductor" OR "lme4" OR "nlme" OR "lmeR function" OR "ggplot2" OR "Hmisc" OR "r function" OR  "r package" OR "mass package" OR "plyr package" OR "mvtnorm"',
  SPSS = 'SPSS -"SPSS Modeler" -"Amos"',
  SAS = '"SAS Institute" -JMP -"Enterprise Miner"',
  STATA = '("stata" "college station") OR "StataCorp" OR "Stata Corp" OR "Stata Journal" OR "Stata Press" OR "stata command" OR "stata module"'
)
sleep_interval = c(1, 10)  # Pause for a uniform random number of seconds in this interval between searches so Scholar does not reject the requests
scholar_prefix = 'https://scholar.google.dk/scholar?hl=en&as_sdt=0%2C5&as_ylo=9999&as_yhi=9999&q='


###################
# HANDY FUNCTIONS #
###################

# Build the URL string
get_url = function(software, year) {
  url_prefix = gsub('9999', as.character(year), scholar_prefix)  # Enter year
  url_search = gsub(' ', '+', searches[[software]])  # Escape spaces
  url_search = gsub('\"', '%22', url_search)  # Escape quotes
  url = paste(url_prefix, url_search, sep='')
  url
}
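
# Illustrative check (added comment, not in the original script): the URL built for
# one software/year pair should look like this:
#   get_url('SPSS', 2015)
#   #> "https://scholar.google.dk/scholar?hl=en&as_sdt=0%2C5&as_ylo=2015&as_yhi=2015&q=SPSS+-%22SPSS+Modeler%22+-%22Amos%22"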

# Do the web search
get_html = function(url) {
  html = read_html(url)
  #html = content(GET(url))
  html
}

extract_citations = function(html) {
  # Extract the citation number
  hits_strings = html %>%
    html_nodes(css='.gs_ab_mdw') %>%  # Name of the class where we can find citation number
    html_text()
  hits_string = strsplit(hits_strings[2], ' ')[[1]][2]  # Second hit, second "word"
  hits_numeric = as.numeric(gsub(',', '', hits_string))  # As numeric, not string
  hits_numeric
}

get_citations = function(software, year) {
  # Sleep to prevent HTTP error 503
  sleep_duration = runif(1, sleep_interval[1], sleep_interval[2])
  Sys.sleep(sleep_duration)
  
  # Do the search
  url = get_url(software, year)
  html = get_html(url)
  citations =  extract_citations(html)
  
  # Status and return
  print(sprintf('Got %i scholar citations in %i for %s', citations, year, software))
  citations
}


#################
# DO THE SEARCH #
#################
citation_history = expand.grid(years, names(searches))
names(citation_history) = c('year', 'software')

citation_history = citation_history %>%
  # filter(software == 'SAS') %>%
  rowwise() %>%
  mutate(
    citations = get_citations(software, year)
  )

# Save it so you don't have to repeat in case Scholar locks you out
write.csv(citation_history, 'data/citations.csv', row.names = F)
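
Even with the random pauses above, Google Scholar occasionally rejects a request outright. One way to make the loop more robust is to wrap each lookup in a retry; a minimal sketch (not part of the original script), using a hypothetical helper name:

# Retry a single lookup a few times before giving up (returns NA on repeated failure)
get_citations_safely = function(software, year, max_tries = 3) {
  for (attempt in seq_len(max_tries)) {
    result = tryCatch(get_citations(software, year), error = function(e) NULL)
    if (!is.null(result)) return(result)
    Sys.sleep(30)  # back off before retrying (e.g. after an HTTP 503)
  }
  NA_real_  # give up on this software/year pair and fill it in later
}

Inside the mutate() call above, get_citations_safely() could then be substituted for get_citations().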

2 Citation Analysis

library(tidyverse)
library(ggthemes)
library(extrafont)
loadfonts()

citation_df <- read_csv("data/citations_2021.csv")

citation_df %>% 
  mutate(year = lubridate::make_date(year = year)) %>% 
  ggplot(aes(x=year, y=citations, color=software)) +
    geom_line() +
    geom_point(size=2) +
    theme_tufte(base_family = "NanumGothic") +
    labs(x="", y="Citations", title="The Fall of SPSS and the Rise of R", color="Statistical package") +
    scale_y_sqrt(labels = scales::comma) 
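
To check Lindeløv's prediction directly rather than by eye, the crossover year can also be computed from the same data; a minimal sketch, assuming citation_df carries only the year, software and citations columns used in the plot:

# First year in which R collects more Scholar citations than SPSS
citation_df %>%
  select(year, software, citations) %>%
  filter(software %in% c("R", "SPSS")) %>%
  pivot_wider(names_from = software, values_from = citations) %>%
  filter(R > SPSS) %>%
  summarise(first_year_R_ahead = min(year))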


Written by Kwangchun Lee (이광춘), Data Scientist

kwangchun.lee.7@gmail.com