https://github.com/hongtaoh/32vis
Revision 9960413711b0efb1f51ff7cce3548d259be8d8cb authored by Hongtao Hao on 24 May 2025, 20:13:11 UTC, committed by GitHub on 24 May 2025, 20:13:11 UTC
1 parent 5a056b6
Tip revision: 9960413711b0efb1f51ff7cce3548d259be8d8cb authored by Hongtao Hao on 24 May 2025, 20:13:11 UTC
Update README.md
Update README.md
Tip revision: 9960413
Snakefile
from os.path import join as pjoin
import sys
###############################################################################
# Directory layout
###############################################################################
# Relative path to the root of the data tree; every other path hangs off it.
DATA_DIR = "../data/"

# First-level directories under DATA_DIR.
RAW_DATA_DIR = pjoin(DATA_DIR, "raw")
INTERIM_DATA_DIR = pjoin(DATA_DIR, "interim")
PROCESSED_DATA_DIR = pjoin(DATA_DIR, "processed")
CLASSIFIERS_DATA_DIR = pjoin(DATA_DIR, "classifiers")
PLOTS_DATA_DIR = pjoin(DATA_DIR, "plots")
HT_CLASS_DATA_DIR = pjoin(DATA_DIR, "ht_class")

# Second-level directories.
CHECKING_DIR = pjoin(INTERIM_DATA_DIR, "checking")
METHODS_REPORTING_DIR = pjoin(INTERIM_DATA_DIR, "methods_reporting")
PROCESSED_LARGE_DATA_DIR = pjoin(PROCESSED_DATA_DIR, "large")

###############################################################################
# Raw Data
###############################################################################
# Raw data file downloaded directly from VISPUBDATA as CSV.
VISPUBDATA = pjoin(RAW_DATA_DIR, "vispubdata.csv")
DOIS_2021 = pjoin(RAW_DATA_DIR, "dois_2021.csv")

###############################################################################
# Interim outputs
###############################################################################
# NOTE: the "TITEL" spelling in the identifiers below is kept on purpose; the
# rules in this file reference these exact names.

# Outputs of get_vispd_openalex_match_1.
VISPD_OPENALEX_MATCH_1 = pjoin(INTERIM_DATA_DIR, "vispd_openalex_match_1.csv")
TITEL_QUERY_EMPTY_DOI_QUERY_404_1 = pjoin(
    CHECKING_DIR, "title_query_empty_doi_query_404_1.txt"
)
TITLE_QUERY_404_1 = pjoin(CHECKING_DIR, "title_query_404_1.txt")

# Outputs of get_vispd_openalex_match_2.
VISPD_OPENALEX_MATCH_2 = pjoin(INTERIM_DATA_DIR, "vispd_openalex_match_2.csv")
TITEL_QUERY_EMPTY_DOI_QUERY_404_2 = pjoin(
    CHECKING_DIR, "title_query_empty_doi_query_404_2.txt"
)
TITLE_QUERY_404_2 = pjoin(CHECKING_DIR, "title_query_404_2.txt")
DOI_QUERY_404_2 = pjoin(CHECKING_DIR, "doi_query_404_2.txt")

# Check files written by get_openalex_dfs.
TITEL_QUERY_EMPTY_DOI_QUERY_404_DFS = pjoin(
    CHECKING_DIR, "title_query_empty_doi_query_404_dfs.txt"
)
TITLE_QUERY_404_DFS = pjoin(CHECKING_DIR, "title_query_404_dfs.txt")
DOI_QUERY_404_DFS = pjoin(CHECKING_DIR, "doi_query_404_dfs.txt")

# Author / award tables and remaining interim files.
OPENALEX_AUTHOR_DF = pjoin(INTERIM_DATA_DIR, "openalex_author_df.csv")
IEEE_AUTHOR_DF = pjoin(INTERIM_DATA_DIR, "ieee_author_df.csv")
AWARD_PAPER_DF = pjoin(INTERIM_DATA_DIR, "award_paper_df.csv")
IEEE_SCRAPING_PROBLEM_DOIS = pjoin(
    CHECKING_DIR, "ieee_scraping_problem_dois.txt"
)
WOS_ID = pjoin(METHODS_REPORTING_DIR, "wos_id.csv")

###############################################################################
# Processed data
###############################################################################
TITLES_2021 = pjoin(PROCESSED_DATA_DIR, "titles_2021.csv")
VISPUBDATA_PLUS = pjoin(PROCESSED_DATA_DIR, "vispubdata_plus.csv")
VISPD_PLUS_GOOD_PAPERS = pjoin(PROCESSED_DATA_DIR, "vispd_plus_good_papers.txt")
PAPERS_TO_STUDY = pjoin(PROCESSED_DATA_DIR, "papers_to_study.txt")

# OpenAlex tables for the studied papers.
OPENALEX_PAPER_DF = pjoin(PROCESSED_DATA_DIR, "openalex_paper_df.csv")
OPENALEX_CONCEPT_DF = pjoin(PROCESSED_DATA_DIR, "openalex_concept_df.csv")
OPENALEX_REFERENCE_DF = pjoin(PROCESSED_DATA_DIR, "openalex_reference_df.csv")

# OpenAlex citation tables (author/concept tables go to the "large" folder).
OPENALEX_CITATION_AUTHOR_DF = pjoin(
    PROCESSED_LARGE_DATA_DIR, "openalex_citation_author_df.csv"
)
OPENALEX_CITATION_CONCEPT_DF = pjoin(
    PROCESSED_LARGE_DATA_DIR, "openalex_citation_concept_df.csv"
)
OPENALEX_CITATION_PAPER_DF = pjoin(
    PROCESSED_DATA_DIR, "openalex_citation_paper_df.csv"
)

# OpenAlex reference tables (author/concept tables go to the "large" folder).
OPENALEX_REFERENCE_PAPER_DF = pjoin(
    PROCESSED_DATA_DIR, "openalex_reference_paper_df.csv"
)
OPENALEX_REFERENCE_AUTHOR_DF = pjoin(
    PROCESSED_LARGE_DATA_DIR, "openalex_reference_author_df.csv"
)
OPENALEX_REFERENCE_CONCEPT_DF = pjoin(
    PROCESSED_LARGE_DATA_DIR, "openalex_reference_concept_df.csv"
)

# "_unique" variants of the reference tables, plus the error table.
OPENALEX_REFERENCE_PAPER_DF_UNIQUE = pjoin(
    PROCESSED_DATA_DIR, "openalex_reference_paper_df_unique.csv"
)
OPENALEX_REFERENCE_AUTHOR_DF_UNIQUE = pjoin(
    PROCESSED_DATA_DIR, "openalex_reference_author_df_unique.csv"
)
OPENALEX_REFERENCE_CONCEPT_DF_UNIQUE = pjoin(
    PROCESSED_DATA_DIR, "openalex_reference_concept_df_unique.csv"
)
OPENALEX_REFERENCE_ERROR_DF = pjoin(
    PROCESSED_DATA_DIR, "openalex_reference_error_df.csv"
)

IEEE_PAPER_DF = pjoin(PROCESSED_DATA_DIR, "ieee_paper_df.csv")
# Disabled path, kept for reference:
# IEEE_REFERENCE_DF = pjoin(PROCESSED_DATA_DIR, 'ieee_reference_df.csv')
MERGED_AUTHOR_DF = pjoin(PROCESSED_DATA_DIR, "merged_author_df.csv")
GSCHOLAR_DATA = pjoin(PROCESSED_DATA_DIR, "gscholar_data.csv")

###############################################################################
# HT_CLASS Data
###############################################################################
MERGED_CNTRY_PREDICTED = pjoin(HT_CLASS_DATA_DIR, "merged_country_predicted.csv")
MERGED_AFF_TYPE_PREDICTED = pjoin(
    HT_CLASS_DATA_DIR, "merged_aff_type_predicted.csv"
)
TYPE_CLASSIFICATION_REPORT = pjoin(
    HT_CLASS_DATA_DIR, "aff_type_classification_report.txt"
)
CNTRY_CLASSIFICATION_REPORT = pjoin(
    HT_CLASS_DATA_DIR, "cntry_classification_report.txt"
)
HT_CLEANED_AUTHOR_DF = pjoin(HT_CLASS_DATA_DIR, "ht_cleaned_author_df.csv")
HT_CLEANED_PAPER_DF = pjoin(HT_CLASS_DATA_DIR, "ht_cleaned_paper_df.csv")

###############################################################################
# Plots Data
###############################################################################
# One sub-directory per plot family. The on-disk names ("co_occurrance") and
# the identifier spellings are kept exactly as the rest of the file uses them.
AUTHOR_CHORD_DATA_DIR = pjoin(PLOTS_DATA_DIR, "author_chord")
COOCCURRANCE_DATA_DIR = pjoin(PLOTS_DATA_DIR, "co_occurrance")
SANKY_DATA_DIR = pjoin(PLOTS_DATA_DIR, "sankey")

# Author chord diagram.
AUTHOR_CHORD_DF = pjoin(AUTHOR_CHORD_DATA_DIR, "author_chord_df.csv")
TS_AUTHOR_CHORD_DF = pjoin(AUTHOR_CHORD_DATA_DIR, "ts_author_chord_df.csv")

# IEEE VIS concept co-occurrence chord diagram.
AGGREGATED_COOCCURANCE_DF = pjoin(
    COOCCURRANCE_DATA_DIR, "aggregated_cooccurance_df.csv"
)
TS_AGGREGATED_COOCCURANCE_DF = pjoin(
    COOCCURRANCE_DATA_DIR, "ts_aggregated_cooccurance_df.csv"
)
TOP_CONCEPTS_TRENDS_DF = pjoin(PLOTS_DATA_DIR, "top_concepts_trends_df.csv")

# Sankey diagram for citation flow based on concepts.
SANKEY_AGGREGATED_DF = pjoin(SANKY_DATA_DIR, "sankey_aggregated_df.csv")
SANKEY_TS_DF = pjoin(SANKY_DATA_DIR, "sankey_ts_df.csv")
###############################################################################
# Workflows
###############################################################################
# Aggregate target rule: it has no command of its own and only lists the files
# the workflow is expected to produce.
rule all:
    input:
        # paper lists
        TITLES_2021,
        VISPUBDATA_PLUS,
        VISPD_PLUS_GOOD_PAPERS,
        # OpenAlex matching, pass 1
        VISPD_OPENALEX_MATCH_1,
        TITEL_QUERY_EMPTY_DOI_QUERY_404_1,
        TITLE_QUERY_404_1,
        # OpenAlex matching, pass 2 (check files only)
        TITEL_QUERY_EMPTY_DOI_QUERY_404_2,
        TITLE_QUERY_404_2,
        DOI_QUERY_404_2,
        PAPERS_TO_STUDY,
        # OpenAlex tables and their check files
        OPENALEX_PAPER_DF,
        OPENALEX_AUTHOR_DF,
        OPENALEX_CONCEPT_DF,
        OPENALEX_REFERENCE_DF,
        TITEL_QUERY_EMPTY_DOI_QUERY_404_DFS,
        TITLE_QUERY_404_DFS,
        DOI_QUERY_404_DFS,
        # citation tables
        OPENALEX_CITATION_AUTHOR_DF,
        OPENALEX_CITATION_CONCEPT_DF,
        OPENALEX_CITATION_PAPER_DF,
        # IEEE tables
        IEEE_PAPER_DF,
        IEEE_AUTHOR_DF,
        IEEE_SCRAPING_PROBLEM_DOIS,
        MERGED_AUTHOR_DF,
        # reference tables
        OPENALEX_REFERENCE_PAPER_DF,
        OPENALEX_REFERENCE_AUTHOR_DF,
        OPENALEX_REFERENCE_CONCEPT_DF,
        OPENALEX_REFERENCE_PAPER_DF_UNIQUE,
        OPENALEX_REFERENCE_AUTHOR_DF_UNIQUE,
        OPENALEX_REFERENCE_CONCEPT_DF_UNIQUE,
        GSCHOLAR_DATA,
        AWARD_PAPER_DF,
        # WOS_ID is disabled here, so this rule does not request it.
        # WOS_ID,
        # plot data
        AUTHOR_CHORD_DF,
        TS_AUTHOR_CHORD_DF,
        AGGREGATED_COOCCURANCE_DF,
        TS_AGGREGATED_COOCCURANCE_DF,
        TOP_CONCEPTS_TRENDS_DF,
        SANKEY_AGGREGATED_DF,
        SANKEY_TS_DF,
        # classification outputs and cleaned tables
        MERGED_CNTRY_PREDICTED,
        MERGED_AFF_TYPE_PREDICTED,
        TYPE_CLASSIFICATION_REPORT,
        CNTRY_CLASSIFICATION_REPORT,
        HT_CLEANED_AUTHOR_DF,
        HT_CLEANED_PAPER_DF,
# Produce TITLES_2021. The rule declares no input files; the script receives
# only the output path as its argument.
rule get_titles_2021:
    output:
        TITLES_2021,
    priority: 99
    shell:
        "python scripts/get_titles_2021.py {output}"
# Build VISPUBDATA_PLUS from the two raw CSV files.
# Paths are passed positionally (inputs first, then the output), so the
# order of the entries below must not change.
rule get_vispd_plus:
    input:
        DOIS_2021,
        VISPUBDATA,
    output:
        VISPUBDATA_PLUS,
    priority: 90
    shell:
        "python scripts/get_vispd_plus.py {input} {output}"
# Derive VISPD_PLUS_GOOD_PAPERS from VISPUBDATA_PLUS; the script is called
# with the input path followed by the output path.
rule get_vispd_plus_good_papers:
    input:
        VISPUBDATA_PLUS,
    output:
        VISPD_PLUS_GOOD_PAPERS,
    priority: 80
    shell:
        "python scripts/get_vispd_plus_good_papers.py {input} {output}"
# Write the pass-1 match table plus two check files.
# The script gets two input paths followed by three output paths, in the
# order listed here.
rule get_vispd_openalex_match_1:
    input:
        VISPD_PLUS_GOOD_PAPERS,
        VISPUBDATA_PLUS,
    output:
        VISPD_OPENALEX_MATCH_1,
        TITEL_QUERY_EMPTY_DOI_QUERY_404_1,
        TITLE_QUERY_404_1,
    priority: 70
    shell:
        "python scripts/get_vispd_openalex_match_1.py {input} {output}"
# Write the pass-2 match table plus three check files.
# Same inputs as get_vispd_openalex_match_1; the script gets two input paths
# followed by four output paths, in the order listed here.
rule get_vispd_openalex_match_2:
    input:
        VISPD_PLUS_GOOD_PAPERS,
        VISPUBDATA_PLUS,
    output:
        VISPD_OPENALEX_MATCH_2,
        TITEL_QUERY_EMPTY_DOI_QUERY_404_2,
        TITLE_QUERY_404_2,
        DOI_QUERY_404_2,
    priority: 60
    shell:
        "python scripts/get_vispd_openalex_match_2.py {input} {output}"
# Derive PAPERS_TO_STUDY from VISPD_PLUS_GOOD_PAPERS; the script is called
# with the input path followed by the output path.
rule get_papers_to_study:
    input:
        VISPD_PLUS_GOOD_PAPERS,
    output:
        PAPERS_TO_STUDY,
    priority: 50
    shell:
        "python scripts/get_papers_to_study.py {input} {output}"
# Produce the OpenAlex paper/author/concept/reference tables and three check
# files in one script run. Arguments are positional: two inputs, then seven
# outputs, in the order listed here.
rule get_openalex_dfs:
    input:
        PAPERS_TO_STUDY,
        VISPUBDATA_PLUS,
    output:
        OPENALEX_PAPER_DF,
        OPENALEX_AUTHOR_DF,
        OPENALEX_CONCEPT_DF,
        OPENALEX_REFERENCE_DF,
        TITEL_QUERY_EMPTY_DOI_QUERY_404_DFS,
        TITLE_QUERY_404_DFS,
        DOI_QUERY_404_DFS,
    priority: 40
    shell:
        "python scripts/get_openalex_dfs.py {input} {output}"
# Produce the three citation tables from OPENALEX_PAPER_DF. The script gets
# the input path followed by the outputs in author, concept, paper order.
rule get_openalex_citation_dfs:
    input:
        OPENALEX_PAPER_DF,
    output:
        OPENALEX_CITATION_AUTHOR_DF,
        OPENALEX_CITATION_CONCEPT_DF,
        OPENALEX_CITATION_PAPER_DF,
    priority: 30
    shell:
        "python scripts/get_openalex_citation_dfs.py {input} {output}"
# Produce the IEEE author and paper tables plus a file of problem DOIs.
# Arguments are positional: two inputs, then three outputs, in the order
# listed here.
rule get_ieee_author_and_paper_title:
    input:
        PAPERS_TO_STUDY,
        VISPUBDATA_PLUS,
    output:
        IEEE_AUTHOR_DF,
        IEEE_PAPER_DF,
        IEEE_SCRAPING_PROBLEM_DOIS,
    priority: 20
    shell:
        "python scripts/get_ieee_author_and_paper_title.py {input} {output}"
# Produce MERGED_AUTHOR_DF from the IEEE and OpenAlex author tables, the
# study list and the raw VISPUBDATA file (note: the raw file, not
# VISPUBDATA_PLUS). The script receives four inputs, then the output.
rule get_merged_author_df:
    input:
        IEEE_AUTHOR_DF,
        OPENALEX_AUTHOR_DF,
        PAPERS_TO_STUDY,
        VISPUBDATA,
    output:
        MERGED_AUTHOR_DF,
    priority: 19
    shell:
        "python scripts/get_merged_author_df.py {input} {output}"
# Produce the reference paper/author/concept tables (unique variants first,
# then the full ones) and the error table from OPENALEX_REFERENCE_DF.
# The script gets one input followed by seven outputs, in the order listed.
rule get_openalex_reference_dfs:
    input:
        OPENALEX_REFERENCE_DF,
    output:
        OPENALEX_REFERENCE_PAPER_DF_UNIQUE,
        OPENALEX_REFERENCE_AUTHOR_DF_UNIQUE,
        OPENALEX_REFERENCE_CONCEPT_DF_UNIQUE,
        OPENALEX_REFERENCE_PAPER_DF,
        OPENALEX_REFERENCE_AUTHOR_DF,
        OPENALEX_REFERENCE_CONCEPT_DF,
        OPENALEX_REFERENCE_ERROR_DF,
    priority: 18
    shell:
        "python scripts/get_openalex_reference_dfs.py {input} {output}"
# Produce AWARD_PAPER_DF; the script is called with IEEE_AUTHOR_DF followed
# by the output path.
rule scrape_award_papers:
    input:
        IEEE_AUTHOR_DF,
    output:
        AWARD_PAPER_DF,
    priority: 10
    shell:
        "python scripts/scrape_award_papers.py {input} {output}"
# Produce GSCHOLAR_DATA from the study list and the IEEE paper table; the
# script receives both inputs, then the output path.
rule get_gscholar_data:
    input:
        PAPERS_TO_STUDY,
        IEEE_PAPER_DF,
    output:
        GSCHOLAR_DATA,
    priority: 9
    shell:
        "python scripts/get_gscholar_data.py {input} {output}"
# Produce WOS_ID from VISPD_PLUS_GOOD_PAPERS. WOS_ID is commented out in
# rule all, so that rule does not request this output.
rule get_wos_id:
    input:
        VISPD_PLUS_GOOD_PAPERS,
    output:
        WOS_ID,
    priority: 7
    shell:
        "python scripts/get_wos_id.py {input} {output}"
# Produce the country prediction table and its classification report from the
# four author tables. The script gets four inputs, then two outputs, in the
# order listed here. No explicit priority is set for this rule.
rule CLASS_country:
    input:
        OPENALEX_CITATION_AUTHOR_DF,
        OPENALEX_REFERENCE_AUTHOR_DF_UNIQUE,
        OPENALEX_AUTHOR_DF,
        MERGED_AUTHOR_DF,
    output:
        MERGED_CNTRY_PREDICTED,
        CNTRY_CLASSIFICATION_REPORT,
    shell:
        "python scripts/CLASS_country.py {input} {output}"
# Produce the affiliation-type prediction table and its classification report.
# Takes the same four author tables as CLASS_country; the script gets four
# inputs, then two outputs, in the order listed here.
rule CLASS_type:
    input:
        OPENALEX_CITATION_AUTHOR_DF,
        OPENALEX_REFERENCE_AUTHOR_DF_UNIQUE,
        OPENALEX_AUTHOR_DF,
        MERGED_AUTHOR_DF,
    output:
        MERGED_AFF_TYPE_PREDICTED,
        TYPE_CLASSIFICATION_REPORT,
    shell:
        "python scripts/CLASS_type.py {input} {output}"
# Produce HT_CLEANED_AUTHOR_DF from the two prediction tables (country first,
# affiliation type second); the output path is the script's last argument.
rule get_HT_cleaned_author_df:
    input:
        MERGED_CNTRY_PREDICTED,
        MERGED_AFF_TYPE_PREDICTED,
    output:
        HT_CLEANED_AUTHOR_DF,
    shell:
        "python scripts/get_HT_cleaned_author_df.py {input} {output}"
# Produce HT_CLEANED_PAPER_DF. The script receives six inputs, then the
# output path, in the order listed here.
# NOTE: the rule name is lower-case "ht" while the script file name uses "HT".
rule get_ht_cleaned_paper_df:
    input:
        PAPERS_TO_STUDY,
        VISPUBDATA_PLUS,
        OPENALEX_PAPER_DF,
        HT_CLEANED_AUTHOR_DF,
        GSCHOLAR_DATA,
        AWARD_PAPER_DF,
    output:
        HT_CLEANED_PAPER_DF,
    shell:
        "python scripts/get_HT_cleaned_paper_df.py {input} {output}"
# Produce the two author chord-diagram tables from HT_CLEANED_AUTHOR_DF; the
# script gets the input, then the AUTHOR_CHORD_DF and TS_AUTHOR_CHORD_DF paths.
rule plot_data_author_chord_diagram_data:
    input:
        HT_CLEANED_AUTHOR_DF,
    output:
        AUTHOR_CHORD_DF,
        TS_AUTHOR_CHORD_DF,
    shell:
        "python scripts/plot_data_author_chord_diagram_data.py {input} {output}"
# Produce the two concept co-occurrence tables from OPENALEX_CONCEPT_DF; the
# script gets the input, then the two output paths in the order listed.
rule plot_vis_concepts_cooccurance_data:
    input:
        OPENALEX_CONCEPT_DF,
    output:
        AGGREGATED_COOCCURANCE_DF,
        TS_AGGREGATED_COOCCURANCE_DF,
    shell:
        "python scripts/plot_vis_concepts_cooccurance_data.py {input} {output}"
# Produce TOP_CONCEPTS_TRENDS_DF from the OpenAlex paper and concept tables;
# the script receives both inputs, then the output path.
rule plot_top_concepts_trends:
    input:
        OPENALEX_PAPER_DF,
        OPENALEX_CONCEPT_DF,
    output:
        TOP_CONCEPTS_TRENDS_DF,
    shell:
        "python scripts/plot_top_concepts_trends.py {input} {output}"
# Produce the two Sankey tables from VISPUBDATA_PLUS and the three concept
# tables. Uses the full OPENALEX_REFERENCE_CONCEPT_DF, not the "_unique"
# variant. The script gets four inputs, then two outputs, in the order listed.
rule plot_sankey_data:
    input:
        VISPUBDATA_PLUS,
        OPENALEX_CONCEPT_DF,
        OPENALEX_REFERENCE_CONCEPT_DF,
        OPENALEX_CITATION_CONCEPT_DF,
    output:
        SANKEY_AGGREGATED_DF,
        SANKEY_TS_DF,
    shell:
        "python scripts/plot_sankey_data.py {input} {output}"
Computing file changes ...