https://github.com/hongtaoh/32vis
Tip revision: 9960413711b0efb1f51ff7cce3548d259be8d8cb authored by Hongtao Hao on 24 May 2025, 20:13:11 UTC
Update README.md
Update README.md
Tip revision: 9960413
citation_analysis.R
library(ggplot2)
library(dplyr)
library(scales)
library(tidyverse)
library(ggpubr)
library(ggridges)
library(data.table)
library(reshape2)
library(stargazer)
library('car')
library("olsrr")
# Load the cleaned paper-level data set. The path is relative, so the script
# must be run from the directory it was written for.
df <- read.csv('../data/ht_class/ht_cleaned_paper_df.csv')

# Keep papers published in or before 2020. `which()` silently drops rows whose
# Year is NA; plain logical indexing (`df[df$Year <= 2020, ]`) would instead
# insert an all-NA row for each of them.
df <- df[which(df$Year <= 2020), ]

# Paper age in years relative to 2020 (non-negative given the filter above).
df$Year.Distance.from.2020 <- abs(df$Year - 2020)
########## CONFERENCE TRACKS
# ANOVA of citation counts with Conference as the only explanatory term.
fit <- aov(formula = Number.of.Citations ~ Conference, data = df)
summary(fit)

# Tukey HSD comparisons between conferences at the 95% confidence level.
# Recorded finding: all pairs are significant except for vis-vast.
conference_pairs <- TukeyHSD(fit, conf.level = 0.95)
conference_pairs
plot(conference_pairs)
###########
# Shift both citation counts by 0.99 (the `*_non_zero` columns) so that log10
# is finite for papers with zero citations, then store the log10 of the
# shifted counts. Later columns in the same mutate() may refer to earlier ones.
df <- df %>%
  mutate(
    citenum_non_zero = Number.of.Citations + 0.99,
    gcitenum_non_zero = Citation.Counts.on.Google.Scholar + 0.99,
    citenum_log10 = log10(citenum_non_zero),
    gcitenum_log10 = log10(gcitenum_non_zero)
  )
############ checking outliers
# core_vars<- c('Year',
# 'Conference',
# 'PaperType',
# 'Number.of.Authors',
# 'Cross.type.Collaboration',
# 'Cross.country.Collaboration',
# 'With.US.Authors',
# 'Award',
# 'Number.of.Citations'
# )
# independent_vars <- c('Year',
# 'Conference',
# 'PaperType',
# 'Number.of.Authors',
# 'Cross.type.Collaboration',
# 'Cross.country.Collaboration',
# 'With.US.Authors',
# 'Award')
# outcome_vars <- c('Number.of.Citations')
########
# Linear regression of the raw citation count (Number.of.Citations) on paper
# age, conference, paper type, number of authors, the collaboration variables
# and award status.
model <- lm(
  Number.of.Citations ~
    Year.Distance.from.2020 + Conference + PaperType +
    Number.of.Authors + Cross.type.Collaboration +
    Cross.country.Collaboration + With.US.Authors + Award,
  data = df
)

# #### Diagnosis
# y_hat_obj<-predict(model)
# u_obj<-resid(model)
# s_u_obj<-rstandard(model)
# qqnorm(s_u_obj)
# abline(0,0,col='red')
# # mean(u_invst)
# shapiro.test(u_obj)
#
# plot(y_hat_obj,s_u_obj)
# abline(0,0,col='red')

# Multicollinearity check: variance inflation factors should be below 5.
# NOTE(review): for terms with more than one degree of freedom (multi-level
# factors) car::vif() reports generalized VIFs; the quantity comparable to the
# usual rule of thumb is then GVIF^(1/(2*Df)) -- confirm which one is meant.
car::vif(model)

## ALL IN ONE
# Residuals-vs-fitted plot and collinearity diagnostics from olsrr.
ols_plot_resid_fit(model)
ols_coll_diag(model)

summary(model)

# jtools provides plot_summs() and theme_apa(), used from here on.
library(jtools)

# Keep the coefficient plot in a variable and hand it to ggsave() explicitly.
# Relying on ggsave()'s default `plot = last_plot()` would save whichever
# ggplot was touched last, which silently breaks if statements are reordered
# (ols_plot_resid_fit() above also produces ggplots).
openalex_plot <- plot_summs(
  model,
  plot.distributions = FALSE,
  scale = FALSE
) + theme_apa()
openalex_plot
# NOTE(review): no width/height is given, so the saved size follows the
# current graphics device; consider fixing it for reproducible figures.
ggsave('../results/figures/supp_reg_openalex.pdf', plot = openalex_plot)

# Print the regression table (stargazer's default output format).
stargazer(model)
# Same specification as `model`, but with the log10-transformed citation
# count (citenum_log10) as the outcome.
model1 <- lm(
  citenum_log10 ~
    Year.Distance.from.2020 + Conference + PaperType +
    Number.of.Authors + Cross.type.Collaboration +
    Cross.country.Collaboration + With.US.Authors + Award,
  data = df
)

summary(model1)

# Coefficient plot; the object is passed to ggsave() explicitly instead of
# relying on the implicit last_plot() global state.
openalex_log10_plot <- plot_summs(
  model1,
  plot.distributions = FALSE,
  scale = FALSE
) + theme_apa()
openalex_log10_plot
ggsave(
  '../results/figures/supp_reg_openalex_log10.pdf',
  plot = openalex_log10_plot
)

# Print the regression table (stargazer's default output format).
stargazer(model1)

# Multicollinearity and residual diagnostics. vif() is namespace-qualified to
# match the car::vif(model) call used for the first model.
car::vif(model1)

ols_plot_resid_fit(model1)
ols_coll_diag(model1)
# model2 = lm(gcitenum_log10 ~
# Year.Distance.from.2020 + Conference + PaperType
# + Number.of.Authors + Cross.type.Collaboration +
# Cross.country.Collaboration + With.US.Authors + Award,
# data = df)
#
# summary(model2)
#
# vif(model2)
#
# ols_plot_resid_fit(model2)
# ols_coll_diag(model2)
# Same specification as `model`, with the raw Google Scholar citation count
# (Citation.Counts.on.Google.Scholar) as the outcome.
model3 <- lm(
  Citation.Counts.on.Google.Scholar ~
    Year.Distance.from.2020 + Conference + PaperType +
    Number.of.Authors + Cross.type.Collaboration +
    Cross.country.Collaboration + With.US.Authors + Award,
  data = df
)

summary(model3)

# Multicollinearity check; namespace-qualified to match the car::vif(model)
# call used for the first model.
car::vif(model3)

# Coefficient plot; the object is passed to ggsave() explicitly instead of
# relying on the implicit last_plot() global state.
gscholar_plot <- plot_summs(
  model3,
  plot.distributions = FALSE,
  scale = FALSE
) + theme_apa()
gscholar_plot
ggsave('../results/figures/supp_reg_gscholar.pdf', plot = gscholar_plot)

# Print the regression table (stargazer's default output format).
stargazer(model3)
# df <- mutate(df, citenum_non_zero = Number.of.Citations + 0.99)
######################################### PLOTS
# ggplot(df, aes(x = citenum_non_zero, y = Award)) +
# geom_density_ridges() +
# scale_x_continuous(
# trans = 'log10',
# breaks=trans_breaks('log10', function(x) 10^x),
# labels=trans_format('log10', math_format(10^.x))
# )
#
# dff <- subset(df, select=c('citenum_non_zero',
# 'Cross.type.Collaboration',
# 'Cross.country.Collaboration',
# 'With.US.Authors',
# 'Award'))
#
# long <- melt(dff, id.vars = c('citenum_non_zero'))
# type <- paste(toString(long$variable),
# toString(long$value),
# sep = "_"
# )
# long$type = type
# ggplot(long, aes(x = citenum_non_zero, y = type)) +
# geom_density_ridges() +
# scale_x_continuous(
# trans = 'log10',
# breaks=trans_breaks('log10', function(x) 10^x),
# labels=trans_format('log10', math_format(10^.x))
# )