"""Draw figure 7c: box plots of citation counts grouped by paper attributes.

Reads ``../data/ht_class/ht_cleaned_paper_df.csv`` and writes ``fig-7c.png``.
The code for figures a and b is kept, commented out, at the end of the file.
"""
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# Global matplotlib font settings, applied to every figure drawn below.
font = dict(family='Arial', weight='normal', size=6.5)
matplotlib.rc('font', **font)

# Load the cleaned paper table. The path is relative to the current working
# directory, so the script has to be started from the folder it expects.
df = pd.read_csv('../data/ht_class/ht_cleaned_paper_df.csv')

# Relabel boolean columns as 'Yes'/'No' so they read well as category labels.
# Snippet: https://stackoverflow.com/a/45196807
is_bool_col = df.dtypes == 'bool'
bool_cols = df.columns[is_bool_col]
yes_no = {True: 'Yes', False: 'No'}
df[bool_cols] = df[bool_cols].replace(yes_no)

# Abbreviate the paper-type codes for display.
paper_type_labels = {'J': 'Jor.', 'C': 'Con.'}
df['PaperType'] = df['PaperType'].replace(paper_type_labels)

# Keep only rows whose 'Year' is at or before the cutoff year.
cutoff_year = 2020
df = df.loc[df['Year'] <= cutoff_year]

# Column names of the two citation-count variables.
citenum = 'Number of Citations'
gcitenum = 'Citation Counts on Google Scholar'

# Grouping variables and shorter display names for some of them.
# NOTE(review): `vars` shadows the builtin of the same name; the name is kept
# as-is because it is defined at module level.
vars = [
    'Cross-type Collaboration',
    'Cross-country Collaboration',
    'With US Authors',
    'Award',
    'PaperType',
]
short_varname_dic = dict([
    ('Cross-type Collaboration', 'Cross-type'),
    ('Cross-country Collaboration', 'Cross-country'),
    ('With US Authors', 'W/ US Authors'),
])

# generate figure c
# THE FOLLOWING IS FOR THE REPLICABILITY STAMP
smallfontsize = 6


def _box_with_points(ax, x, y, palette, ylabel, data=None, order=None):
    """Draw a horizontal box plot on *ax* and overlay the individual points.

    ``x`` and ``y`` are passed straight to seaborn, either as data vectors or
    as column names when ``data`` is given. ``order`` fixes the category
    order on the y axis; ``None`` leaves the choice to seaborn. The y label
    is set to ``ylabel`` using the module-level ``smallfontsize``.

    Returns the Axes object returned by ``sns.stripplot``.
    """
    # NOTE(review): recent seaborn releases emit a FutureWarning when
    # `palette` is given without `hue` -- confirm against the pinned version.
    sns.boxplot(data=data, x=x, y=y, order=order, palette=palette, ax=ax)
    panel = sns.stripplot(data=data, x=x, y=y, order=order,
                          size=2, color="orange", linewidth=0, alpha=0.2,
                          ax=ax)
    panel.set_ylabel(ylabel, fontsize=smallfontsize)
    return panel


# Five equally tall panels stacked vertically, all sharing one x axis.
f, axs = plt.subplots(
    nrows=5,
    ncols=1,
    figsize=(3.5, 3.5),
    sharex=True,
    sharey=False,
    gridspec_kw={'height_ratios': [1, 1, 1, 1, 1]},
)

# One row per upper panel: (column, palette, y-axis label, category order).
upper_panels = [
    ('Award', "Set2", "Awards", None),
    ('Cross-type Collaboration', "Paired", "Cross-Type", None),
    ('Cross-country Collaboration', "vlag", "Cross-Country", None),
    ('With US Authors', "Paired", "US Authors", ['No', 'Yes']),
]
g0, g1, g2, g3 = (
    _box_with_points(ax, df[citenum], df[column].astype(str), palette, label,
                     order=order)
    for ax, (column, palette, label, order) in zip(axs, upper_panels)
)
# Only the bottom panel carries an x-axis label.
for upper in (g0, g1, g2, g3):
    upper.set_xlabel("")

# The bottom panel selects its columns by name from `df`.
g4 = _box_with_points(axs[4], citenum, 'PaperType', "Set2", "Paper Type",
                      data=df)
g4.set_xlabel('Number of citations', fontsize=8)
# The x axis is shared, so the log scale applies to every panel.
g4.set_xscale("log")

# Panel letter, positioned in the top panel's axes coordinates.
f.text(-0.1, 1.05, 'c', transform=g0.transAxes, size=10, weight='bold')
f.savefig('fig-7c.png', dpi=150)

# ## generate figure a, b
# f, axs = plt.subplots(1,2,
#                       figsize=(14,5),
#                       sharex=False,
#                       sharey=False,)

# ##### Percentile
# ## OpenAlex
# cits = df[citenum].sort_values(ascending=False).tolist()
# dff = pd.DataFrame(cits, columns = ['citations'])
# dff['pdf'] = dff['citations'] / sum(dff['citations'])
# dff['cdf'] = dff['pdf'].cumsum()
# dff['ccdf'] = 1 - dff['cdf']
# dff = dff.reset_index()
# dff['rank'] = (dff['index'] + 1)/dff.shape[0]
# g2 = dff.plot(x = 'rank', y = 'cdf', grid=True, label='OpenAlex', ax=axs[0])
# # g2.set_xlabel('Paper percentile by citations (from high to low)')
# g2.set_ylabel('Cumulative citation share')

# ### Google Scholar
# cits = df[gcitenum].dropna().sort_values(ascending=False).tolist()
# dff = pd.DataFrame(cits, columns = ['citations'])
# dff['pdf'] = dff['citations'] / sum(dff['citations'])
# dff['cdf'] = dff['pdf'].cumsum()
# dff['ccdf'] = 1 - dff['cdf']
# dff = dff.reset_index()
# dff['rank'] = (dff['index'] + 1)/dff.shape[0]
# g2_2 = dff.plot(x = 'rank', y = 'cdf', grid=True, ax = g2, label='Google Scholar')
# g2_2.set_xlabel('Paper rank')
# g2_2.text(-0.1, 1.05, 'a', transform=g2_2.transAxes, 
#             size=25, weight='bold')

# ###### Conference tracks
# g3 = sns.violinplot(x='Conference', y=citenum, data = df, palette="Set2", ax=axs[1])
# g3.set_yscale("log")
# g3.set_xlabel('Conference track')
# g3.text(-0.1, 1.05, 'b', transform=g3.transAxes, 
#             size=25, weight='bold')

# ##### Save fig
# f.savefig('fig7-a-b.png', dpi=150)