1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
library(ggplot2)
library(dplyr)
library(scales)
library(tidyverse)
library(ggpubr)
library(ggridges)
library(data.table)
library(reshape2)
library(stargazer)
library('car')
library("olsrr")

# Load the cleaned paper-level data and keep papers with Year <= 2020.
df <- read.csv("../data/ht_class/ht_cleaned_paper_df.csv")
# which() drops rows whose Year is NA; indexing with the bare logical vector
# would instead insert an all-NA row for every missing Year.
df <- df[which(df$Year <= 2020), ]
# Absolute distance in years between the publication year and 2020.
df$Year.Distance.from.2020 <- abs(df$Year - 2020)

########## CONFERENCE TRACKS

# One-way ANOVA of citation counts across conference tracks, followed by
# Tukey HSD pairwise comparisons.
#
# TukeyHSD() only computes comparisons for terms that are factors, and
# read.csv() returns character columns since R 4.0.0, so the grouping variable
# is converted explicitly. factor() uses the same sorted level order that
# model fitting derives from a character column, so later models that use
# Conference see the same coding.
df$Conference <- factor(df$Conference)
fit <- aov(Number.of.Citations ~ Conference, data = df)
summary(fit)

## Author's note from the original analysis (data dependent, not re-verified
## here): all pairs are significant except for vis-vast
# Compute the comparisons once and reuse them for the table and the plot.
conference_hsd <- TukeyHSD(fit, conf.level = 0.95)
conference_hsd
plot(conference_hsd)

###########
# Add 0.99 to each citation count (the *_non_zero columns) and take log10 of
# the shifted values; the shift keeps log10 finite for papers with 0 citations.
# mutate() evaluates its arguments in order, so the log columns can refer to
# the shifted columns created just above them in the same call.
df <- mutate(
  df,
  citenum_non_zero = Number.of.Citations + 0.99,
  gcitenum_non_zero = Citation.Counts.on.Google.Scholar + 0.99,
  citenum_log10 = log10(citenum_non_zero),
  gcitenum_log10 = log10(gcitenum_non_zero)
)

############ checking outliers
# core_vars<- c('Year', 
#               'Conference', 
#               'PaperType', 
#               'Number.of.Authors', 
#               'Cross.type.Collaboration', 
#               'Cross.country.Collaboration',
#               'With.US.Authors',
#               'Award',
#               'Number.of.Citations'
#               ) 
# indepdent_vars <- c('Year', 
#                     'Conference', 
#                     'PaperType', 
#                     'Number.of.Authors', 
#                     'Cross.type.Collaboration', 
#                     'Cross.country.Collaboration',
#                     'With.US.Authors',
#                     'Award') 
# outcome_vars <- c('Number.of.Citations')


########
# OLS fit of the raw citation count (Number.of.Citations) on the eight
# covariates listed below. Term order is kept as-is because it determines the
# order of the coefficients in every table and plot produced from this model.
model <- lm(
  formula = Number.of.Citations ~
    Year.Distance.from.2020 +
    Conference +
    PaperType +
    Number.of.Authors +
    Cross.type.Collaboration +
    Cross.country.Collaboration +
    With.US.Authors +
    Award,
  data = df
)

# #### Diagnosis
# y_hat_obj<-predict(model)
# u_obj<-resid(model)
# s_u_obj<-rstandard(model)
# qqnorm(s_u_obj)
# abline(0,0,col='red')
# # mean(u_invst)
# shapiro.test(u_obj)
# 
# plot(y_hat_obj,s_u_obj)
# abline(0,0,col='red')

# Variance inflation factors for `model`; values should be below 5.
car::vif(model)


## ALL IN ONE
# olsrr diagnostics: residual-vs-fitted plot and collinearity diagnostics.
ols_plot_resid_fit(model)
ols_coll_diag(model)

summary(model)

# Coefficient plot (jtools). The plot is kept in a variable so the object that
# is displayed is also the one handed to ggsave(), rather than relying on the
# implicit "last plot".
library(jtools)
openalex_coef_plot <- plot_summs(
  model,
  plot.distributions = FALSE,
  scale = FALSE
) +
  theme_apa()
openalex_coef_plot
ggsave("../results/figures/supp_reg_openalex.pdf", plot = openalex_coef_plot)

# Regression table via stargazer (default output format).
stargazer(model)

# OLS fit of the log10-transformed citation count (citenum_log10) on the eight
# covariates listed below. Term order determines coefficient order downstream.
model1 <- lm(
  formula = citenum_log10 ~
    Year.Distance.from.2020 +
    Conference +
    PaperType +
    Number.of.Authors +
    Cross.type.Collaboration +
    Cross.country.Collaboration +
    With.US.Authors +
    Award,
  data = df
)

summary(model1)

# Coefficient plot; the displayed object is passed to ggsave() explicitly.
openalex_log10_coef_plot <- plot_summs(
  model1,
  plot.distributions = FALSE,
  scale = FALSE
) +
  theme_apa()
openalex_log10_coef_plot
ggsave(
  "../results/figures/supp_reg_openalex_log10.pdf",
  plot = openalex_log10_coef_plot
)

# Regression table via stargazer (default output format).
stargazer(model1)

# Variance inflation factors for the log10 model.
vif(model1)

# olsrr diagnostics: residual-vs-fitted plot and collinearity diagnostics.
ols_plot_resid_fit(model1)
ols_coll_diag(model1)

# model2 = lm(gcitenum_log10 ~
#               Year.Distance.from.2020 + Conference + PaperType
#             + Number.of.Authors + Cross.type.Collaboration +
#               Cross.country.Collaboration + With.US.Authors + Award,
#             data = df)
# 
# summary(model2)
# 
# vif(model2)
# 
# ols_plot_resid_fit(model2)
# ols_coll_diag(model2)

# OLS fit of the Google Scholar citation count
# (Citation.Counts.on.Google.Scholar) on the eight covariates listed below.
# Term order determines coefficient order downstream.
model3 <- lm(
  formula = Citation.Counts.on.Google.Scholar ~
    Year.Distance.from.2020 +
    Conference +
    PaperType +
    Number.of.Authors +
    Cross.type.Collaboration +
    Cross.country.Collaboration +
    With.US.Authors +
    Award,
  data = df
)
summary(model3)

# Variance inflation factors for the Google Scholar model.
vif(model3)

# Coefficient plot; the displayed object is passed to ggsave() explicitly.
gscholar_coef_plot <- plot_summs(
  model3,
  plot.distributions = FALSE,
  scale = FALSE
) +
  theme_apa()
gscholar_coef_plot
ggsave("../results/figures/supp_reg_gscholar.pdf", plot = gscholar_coef_plot)

# Regression table via stargazer (default output format).
stargazer(model3)

# df <- mutate(df, citenum_non_zero = Number.of.Citations + 0.99)

######################################### PLOTS

# ggplot(df, aes(x = citenum_non_zero, y = Award)) + 
#   geom_density_ridges() + 
#   scale_x_continuous(
#     trans = 'log10',
#     breaks=trans_breaks('log10', function(x) 10^x),
#     labels=trans_format('log10', math_format(10^.x))
#   )
# 
# dff <- subset(df, select=c('citenum_non_zero', 
#                     'Cross.type.Collaboration', 
#                     'Cross.country.Collaboration',
#                     'With.US.Authors', 
#                     'Award'))
# 
# long <- melt(dff, id.vars = c('citenum_non_zero'))
# type <- paste(toString(long$variable), 
#                    toString(long$value),
#                    sep = "_"
#                      )
# long$type = type

# ggplot(long, aes(x = citenum_non_zero, y = type)) + 
#   geom_density_ridges() + 
#   scale_x_continuous(
#     trans = 'log10',
#     breaks=trans_breaks('log10', function(x) 10^x),
#     labels=trans_format('log10', math_format(10^.x))
#   )