#!/usr/bin/env python3 """ Check the output of running Sphinx in nit-picky mode (missing references). """ from __future__ import annotations import argparse import itertools import os import re import subprocess import sys from pathlib import Path from typing import TextIO # Exclude these whether they're dirty or clean, # because they trigger a rebuild of dirty files. EXCLUDE_FILES = { "Doc/whatsnew/changelog.rst", } # Subdirectories of Doc/ to exclude. EXCLUDE_SUBDIRS = { ".env", ".venv", "env", "includes", "venv", } # Regex pattern to match the parts of a Sphinx warning WARNING_PATTERN = re.compile( r"(?P([A-Za-z]:[\\/])?[^:]+):(?P\d+): WARNING: (?P.+)" ) # Regex pattern to match the line numbers in a Git unified diff DIFF_PATTERN = re.compile( r"^@@ -(?P\d+)(?:,(?P\d+))? \+(?P\d+)(?:,(?P\d+))? @@", flags=re.MULTILINE, ) def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]: """List the files changed between two Git refs, filtered by change type.""" added_files_result = subprocess.run( [ "git", "diff", f"--diff-filter={filter_mode}", "--name-only", f"{ref_a}...{ref_b}", "--", ], stdout=subprocess.PIPE, check=True, text=True, encoding="UTF-8", ) added_files = added_files_result.stdout.strip().split("\n") return {Path(file.strip()) for file in added_files if file.strip()} def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]: """List the lines changed between two Git refs for a specific file.""" diff_output = subprocess.run( [ "git", "diff", "--unified=0", f"{ref_a}...{ref_b}", "--", str(file), ], stdout=subprocess.PIPE, check=True, text=True, encoding="UTF-8", ) # Scrape line offsets + lengths from diff and convert to line numbers line_matches = DIFF_PATTERN.finditer(diff_output.stdout) # Removed and added line counts are 1 if not printed line_match_values = [ line_match.groupdict(default=1) for line_match in line_matches ] line_ints = [ (int(match_value["lineb"]), int(match_value["added"])) for match_value in line_match_values ] line_ranges = [ range(line_b, line_b + added) for line_b, added in line_ints ] line_numbers = list(itertools.chain(*line_ranges)) return line_numbers def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]: """Get the line numbers of text in a file object, grouped by paragraph.""" paragraphs = [] prev_line = None for lineno, line in enumerate(file_obj): lineno = lineno + 1 if prev_line is None or (line.strip() and not prev_line.strip()): paragraph = [lineno - 1] paragraphs.append(paragraph) paragraph.append(lineno) prev_line = line return paragraphs def filter_and_parse_warnings( warnings: list[str], files: set[Path] ) -> list[re.Match[str]]: """Get the warnings matching passed files and parse them with regex.""" filtered_warnings = [ warning for warning in warnings if any(str(file) in warning for file in files) ] warning_matches = [ WARNING_PATTERN.fullmatch(warning.strip()) for warning in filtered_warnings ] non_null_matches = [warning for warning in warning_matches if warning] return non_null_matches def filter_warnings_by_diff( warnings: list[re.Match[str]], ref_a: str, ref_b: str, file: Path ) -> list[re.Match[str]]: """Filter the passed per-file warnings to just those on changed lines.""" diff_lines = get_diff_lines(ref_a, ref_b, file) with file.open(encoding="UTF-8") as file_obj: paragraphs = get_para_line_numbers(file_obj) touched_paras = [ para_lines for para_lines in paragraphs if set(diff_lines) & set(para_lines) ] touched_para_lines = set(itertools.chain(*touched_paras)) warnings_infile = [ warning for warning in warnings if str(file) in warning["file"] ] warnings_touched = [ warning for warning in warnings_infile if int(warning["line"]) in touched_para_lines ] return warnings_touched def process_touched_warnings( warnings: list[str], ref_a: str, ref_b: str ) -> list[re.Match[str]]: """Filter a list of Sphinx warnings to those affecting touched lines.""" added_files, modified_files = tuple( get_diff_files(ref_a, ref_b, filter_mode=mode) for mode in ("A", "M") ) warnings_added = filter_and_parse_warnings(warnings, added_files) warnings_modified = filter_and_parse_warnings(warnings, modified_files) modified_files_warned = { file for file in modified_files if any(str(file) in warning["file"] for warning in warnings_modified) } warnings_modified_touched = [ filter_warnings_by_diff(warnings_modified, ref_a, ref_b, file) for file in modified_files_warned ] warnings_touched = warnings_added + list( itertools.chain(*warnings_modified_touched) ) return warnings_touched def annotate_diff( warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD" ) -> None: """ Convert Sphinx warning messages to GitHub Actions for changed paragraphs. Converts lines like: .../Doc/library/cgi.rst:98: WARNING: reference target not found to: ::warning file=.../Doc/library/cgi.rst,line=98::reference target not found See: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message """ warnings_touched = process_touched_warnings(warnings, ref_a, ref_b) print("Emitting doc warnings matching modified lines:") for warning in warnings_touched: print("::warning file={file},line={line}::{msg}".format_map(warning)) print(warning[0]) if not warnings_touched: print("None") def fail_if_regression( warnings: list[str], files_with_expected_nits: set[str], files_with_nits: set[str] ) -> int: """ Ensure some files always pass Sphinx nit-picky mode (no missing references). These are files which are *not* in .nitignore. """ all_rst = { str(rst) for rst in Path("Doc/").rglob("*.rst") if rst.parts[1] not in EXCLUDE_SUBDIRS } should_be_clean = all_rst - files_with_expected_nits - EXCLUDE_FILES problem_files = sorted(should_be_clean & files_with_nits) if problem_files: print("\nError: must not contain warnings:\n") for filename in problem_files: print(filename) for warning in warnings: if filename in warning: if match := WARNING_PATTERN.fullmatch(warning): print(" {line}: {msg}".format_map(match)) return -1 return 0 def fail_if_improved( files_with_expected_nits: set[str], files_with_nits: set[str] ) -> int: """ We may have fixed warnings in some files so that the files are now completely clean. Good news! Let's add them to .nitignore to prevent regression. """ files_with_no_nits = files_with_expected_nits - files_with_nits if files_with_no_nits: print("\nCongratulations! You improved:\n") for filename in sorted(files_with_no_nits): print(filename) print("\nPlease remove from Doc/tools/.nitignore\n") return -1 return 0 def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser() parser.add_argument( "--annotate-diff", nargs="*", metavar=("BASE_REF", "HEAD_REF"), help="Add GitHub Actions annotations on the diff for warnings on " "lines changed between the given refs (main and HEAD, by default)", ) parser.add_argument( "--fail-if-regression", action="store_true", help="Fail if known-good files have warnings", ) parser.add_argument( "--fail-if-improved", action="store_true", help="Fail if new files with no nits are found", ) args = parser.parse_args(argv) if args.annotate_diff is not None and len(args.annotate_diff) > 2: parser.error( "--annotate-diff takes between 0 and 2 ref args, not " f"{len(args.annotate_diff)} {tuple(args.annotate_diff)}" ) exit_code = 0 wrong_directory_msg = "Must run this script from the repo root" assert Path("Doc").exists() and Path("Doc").is_dir(), wrong_directory_msg with Path("Doc/sphinx-warnings.txt").open(encoding="UTF-8") as f: warnings = f.read().splitlines() cwd = str(Path.cwd()) + os.path.sep files_with_nits = { warning.removeprefix(cwd).split(":")[0] for warning in warnings if "Doc/" in warning } with Path("Doc/tools/.nitignore").open(encoding="UTF-8") as clean_files: files_with_expected_nits = { filename.strip() for filename in clean_files if filename.strip() and not filename.startswith("#") } if args.annotate_diff is not None: annotate_diff(warnings, *args.annotate_diff) if args.fail_if_regression: exit_code += fail_if_regression( warnings, files_with_expected_nits, files_with_nits ) if args.fail_if_improved: exit_code += fail_if_improved(files_with_expected_nits, files_with_nits) return exit_code if __name__ == "__main__": sys.exit(main())