Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

  • 0f9a83f
  • /
  • perf_test
  • /
  • compare_results.py
Raw File Download

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • content
  • directory
content badge
swh:1:cnt:df2a1eee3da111ee8f61c09d4fcbcd9ed47bfe31
directory badge
swh:1:dir:b81659601070aabe959b33c2af6dd797e35f372d

This interface makes it possible to generate software citations, provided that the root directory of the browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • content
  • directory
(requires biblatex-software package)
Generating citation ...
(requires biblatex-software package)
Generating citation ...
compare_results.py
#!/usr/bin/env python
"""
Compare two PyEMD performance test results.

Usage:
    python compare_results.py <baseline> <comparison>
    python compare_results.py results/20241201_120000_full results/20241202_120000_full

Options:
    --threshold PERCENT  Highlight changes greater than this percentage (default: 5)
    --format FORMAT      Output format: text, json, markdown (default: text)
    --alpha FLOAT        Significance level for t-test (default: 0.05)
"""

import argparse
import json
import math
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple


def _beta_continued_fraction(a: float, b: float, x: float) -> float:
    """Evaluate the continued fraction of the incomplete beta function.

    Uses the modified Lentz recurrence. The caller is responsible for picking
    the argument ordering for which the fraction converges quickly.
    """
    tiny = 1e-300  # Replaces exact zeros so the recurrence never divides by 0
    eps = 1e-14  # Relative convergence tolerance
    max_iterations = 1000  # Needs O(sqrt(max(a, b))) steps; ample for real run counts

    qab = a + b
    qap = a + 1.0
    qam = a - 1.0
    c = 1.0
    d = 1.0 - qab * x / qap
    if abs(d) < tiny:
        d = tiny
    d = 1.0 / d
    h = d
    for m in range(1, max_iterations + 1):
        m2 = 2 * m

        # Even step of the recurrence
        aa = m * (b - m) * x / ((qam + m2) * (a + m2))
        d = 1.0 + aa * d
        if abs(d) < tiny:
            d = tiny
        c = 1.0 + aa / c
        if abs(c) < tiny:
            c = tiny
        d = 1.0 / d
        h *= d * c

        # Odd step of the recurrence
        aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2))
        d = 1.0 + aa * d
        if abs(d) < tiny:
            d = tiny
        c = 1.0 + aa / c
        if abs(c) < tiny:
            c = tiny
        d = 1.0 / d
        delta = d * c
        h *= delta
        if abs(delta - 1.0) < eps:
            break
    return h


def _regularized_incomplete_beta(a: float, b: float, x: float) -> float:
    """Return the regularized incomplete beta function I_x(a, b) for a, b > 0."""
    if x <= 0.0:
        return 0.0
    if x >= 1.0:
        return 1.0

    # Common prefactor x^a * (1 - x)^b / B(a, b), computed in log space to
    # avoid overflow/underflow for large a or b.
    log_front = (
        math.lgamma(a + b) - math.lgamma(a) - math.lgamma(b) + a * math.log(x) + b * math.log(1.0 - x)
    )
    front = math.exp(log_front)

    # The continued fraction converges quickly only for x below this point;
    # above it, use the symmetry I_x(a, b) = 1 - I_{1-x}(b, a).
    if x < (a + 1.0) / (a + b + 2.0):
        return front * _beta_continued_fraction(a, b, x) / a
    return 1.0 - front * _beta_continued_fraction(b, a, 1.0 - x) / b


def welch_ttest(mean1: float, std1: float, n1: int, mean2: float, std2: float, n2: int) -> Tuple[float, float]:
    """
    Perform Welch's t-test for two samples with different variances.

    Args:
        mean1, std1, n1: Mean, standard deviation and size of the first sample.
        mean2, std2, n2: Mean, standard deviation and size of the second sample.

    Returns:
        (t_statistic, p_value), where p_value is the two-tailed p-value of
        Student's t distribution with Welch-Satterthwaite degrees of freedom.
        Returns (0.0, 1.0) when the standard error of the difference is zero,
        because no test can be performed without variance.

    The p-value is computed exactly (no scipy needed) through the identity
    p = I_{df / (df + t^2)}(df / 2, 1 / 2), with I the regularized incomplete
    beta function. A normal-based approximation is not used because for small
    df it cannot produce small p-values however large the difference is.
    """
    if std1 == 0 and std2 == 0:
        # No variance - can't compute t-test
        return 0.0, 1.0

    # Squared standard error of each mean (variance of the sample mean)
    se1 = (std1 * std1) / n1 if n1 > 0 else 0
    se2 = (std2 * std2) / n2 if n2 > 0 else 0
    se_diff = math.sqrt(se1 + se2)

    if se_diff == 0:
        return 0.0, 1.0

    t_stat = (mean1 - mean2) / se_diff

    # Welch-Satterthwaite degrees of freedom. A sample with a single run
    # contributes nothing to the denominator; if neither sample has more than
    # one run, fall back to df = 1 (the heaviest-tailed, most conservative case).
    denom = (se1 * se1) / (n1 - 1) if n1 > 1 else 0
    denom += (se2 * se2) / (n2 - 1) if n2 > 1 else 0
    df = (se1 + se2) ** 2 / denom if denom > 0 else 1

    # Multiplication (not **) so an extreme t overflows to inf instead of
    # raising OverflowError; x then becomes 0 and the p-value 0.
    x = df / (df + t_stat * t_stat)
    p_value = _regularized_incomplete_beta(df / 2.0, 0.5, x)

    # Clamp against tiny floating-point excursions outside [0, 1]
    p_value = min(1.0, max(0.0, p_value))

    return t_stat, p_value


@dataclass
class ComparisonResult:
    """One benchmark compared between a baseline run and a comparison run."""

    # Identity of the benchmark
    test_name: str
    params: Dict
    # Plain means of each side
    baseline_mean: float
    comparison_mean: float
    # 10% trimmed means; these drive the reported difference
    baseline_trimmed_mean: float
    comparison_trimmed_mean: float
    # Spread and sample size of each side
    baseline_std: float
    comparison_std: float
    baseline_runs: int
    comparison_runs: int
    # Difference between the trimmed means (comparison minus baseline)
    diff_seconds: float
    diff_percent: float
    is_faster: bool
    # Verdict of the statistical test
    is_significant: bool
    # p-value of the t-test (computed from the plain mean/std)
    p_value: float
    # Coefficient of variation of each side
    baseline_cv: float
    comparison_cv: float

    def __str__(self) -> str:
        """Render a one-line summary; a trailing ``*`` marks a significant change."""
        if self.is_faster:
            sign, direction = "-", "faster"
        else:
            sign, direction = "+", "slower"
        marker = "*" if self.is_significant else ""

        timings = f"{self.baseline_trimmed_mean:.4f}s → {self.comparison_trimmed_mean:.4f}s"
        change = f"{sign}{abs(self.diff_percent):.1f}% {direction}{marker}, p={self.p_value:.3f}"
        return f"{self.test_name} ({self.params}): {timings} ({change})"


def load_results(path: Path) -> Tuple[Dict, List[Dict]]:
    """Load benchmark results from a results directory or a JSON file.

    Args:
        path: Either a JSON file, or a directory containing ``results.json``.

    Returns:
        A ``(system_info, results)`` tuple taken from the ``"system_info"`` and
        ``"results"`` keys of the JSON document; a missing key yields ``{}``
        or ``[]`` respectively.

    Raises:
        FileNotFoundError: If the resolved JSON file does not exist.
    """
    json_path = path / "results.json" if path.is_dir() else path

    if not json_path.exists():
        raise FileNotFoundError(f"Results file not found: {json_path}")

    # JSON is UTF-8 by specification; do not depend on the platform locale
    # (which is not UTF-8 on every system) to decode it.
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)

    return data.get("system_info", {}), data.get("results", [])


def make_key(result: Dict) -> str:
    """Build the lookup key that identifies a benchmark result.

    The key is ``"<name>:<params as JSON>"``; the params are serialized with
    sorted keys so that key order inside ``params`` does not matter.
    """
    canonical_params = json.dumps(result["params"], sort_keys=True)
    return "{}:{}".format(result["name"], canonical_params)


def compare_results(
    baseline: List[Dict],
    comparison: List[Dict],
    threshold_percent: float = 5.0,
    alpha: float = 0.05,
) -> List[ComparisonResult]:
    """Pair up benchmarks from two runs and test each pair for a significant change.

    Args:
        baseline: List of baseline benchmark results
        comparison: List of comparison benchmark results
        threshold_percent: Minimum percentage change to consider (in addition to p-value)
        alpha: Significance level for t-test (default 0.05 = 95% confidence)

    Returns:
        One ComparisonResult per benchmark present in both inputs, ordered by
        the benchmark key (name plus serialized params).
    """
    baseline_by_key = {make_key(entry): entry for entry in baseline}
    comparison_by_key = {make_key(entry): entry for entry in comparison}

    outcomes = []

    # Benchmarks that exist on only one side cannot be compared and are dropped.
    for key in sorted(baseline_by_key.keys() & comparison_by_key.keys()):
        old = baseline_by_key[key]
        new = comparison_by_key[key]

        old_mean = old["mean"]
        new_mean = new["mean"]
        # Older result files have no trimmed mean; the plain mean stands in.
        old_trimmed = old.get("trimmed_mean", old_mean)
        new_trimmed = new.get("trimmed_mean", new_mean)
        old_std = old.get("std", 0)
        new_std = new.get("std", 0)
        old_runs = old.get("runs", 1)
        new_runs = new.get("runs", 1)

        # The reported change is based on trimmed means (robust to outliers).
        delta_seconds = new_trimmed - old_trimmed
        if old_trimmed > 0:
            delta_percent = (delta_seconds / old_trimmed) * 100
        else:
            delta_percent = 0

        # Coefficient of variation in percent, relative to the plain mean.
        old_cv = (old_std / old_mean * 100) if old_mean > 0 else 0
        new_cv = (new_std / new_mean * 100) if new_mean > 0 else 0

        # The statistical test works on the plain mean/std, not the trimmed mean.
        _, p_value = welch_ttest(old_mean, old_std, old_runs, new_mean, new_std, new_runs)

        # A change must be both statistically and practically significant.
        statistically_significant = p_value < alpha
        practically_significant = abs(delta_percent) >= threshold_percent
        significant = statistically_significant and practically_significant

        outcome = ComparisonResult(
            test_name=old["name"],
            params=old["params"],
            baseline_mean=old_mean,
            comparison_mean=new_mean,
            baseline_trimmed_mean=old_trimmed,
            comparison_trimmed_mean=new_trimmed,
            baseline_std=old_std,
            comparison_std=new_std,
            baseline_runs=old_runs,
            comparison_runs=new_runs,
            diff_seconds=delta_seconds,
            diff_percent=delta_percent,
            is_faster=delta_seconds < 0,
            is_significant=significant,
            p_value=p_value,
            baseline_cv=old_cv,
            comparison_cv=new_cv,
        )
        outcomes.append(outcome)

    return outcomes


def format_text(
    results: List[ComparisonResult],
    baseline_info: Dict,
    comparison_info: Dict,
    threshold: float,
    alpha: float = 0.05,
) -> str:
    """Format comparison results as a plain-text report.

    Args:
        results: Per-benchmark comparison results.
        baseline_info: System info of the baseline run; the keys ``timestamp``,
            ``git_commit`` and ``pyemd_version`` are shown when present.
        comparison_info: System info of the comparison run (same keys).
        threshold: Minimum percentage change, shown in the report header.
        alpha: Significance level, shown in the report header.

    Returns:
        The report as a single newline-joined string.
    """

    def environment_section(title: str, info: Dict) -> List[str]:
        # A missing, null or non-string commit must not crash the report:
        # fall back to "unknown" and coerce to str before slicing.
        commit = str(info.get("git_commit") or "unknown")
        return [
            f"{title}:",
            f"  Timestamp: {info.get('timestamp', 'unknown')}",
            f"  Git commit: {commit[:8]}...",
            f"  PyEMD version: {info.get('pyemd_version', 'unknown')}",
            "",
        ]

    lines = []
    lines.append("=" * 70)
    lines.append(" PyEMD Performance Comparison")
    lines.append("=" * 70)
    lines.append("")

    # System info comparison
    lines.extend(environment_section("Baseline", baseline_info))
    lines.extend(environment_section("Comparison", comparison_info))

    lines.append("Statistical parameters:")
    lines.append(f"  Minimum % change threshold: {threshold}%")
    lines.append(f"  Significance level (alpha): {alpha}")
    lines.append(f"  A result is significant if: p < {alpha} AND |change| >= {threshold}%")
    lines.append("  Comparison uses 10% trimmed mean (outlier-robust)")
    lines.append("")

    # Summary
    faster = [r for r in results if r.is_faster and r.is_significant]
    slower = [r for r in results if not r.is_faster and r.is_significant]
    unchanged = [r for r in results if not r.is_significant]

    # Average and worst-case CV indicate how noisy the measurements were
    avg_base_cv = sum(r.baseline_cv for r in results) / len(results) if results else 0
    avg_comp_cv = sum(r.comparison_cv for r in results) / len(results) if results else 0
    max_cv = max(max(r.baseline_cv, r.comparison_cv) for r in results) if results else 0

    lines.append("-" * 70)
    lines.append(f" Summary: {len(faster)} faster, {len(slower)} slower, {len(unchanged)} unchanged")
    lines.append(f" Average coefficient of variation: baseline={avg_base_cv:.1f}%, comparison={avg_comp_cv:.1f}%")
    lines.append("-" * 70)

    # Warning about high variance
    if max_cv > 15:
        lines.append("")
        lines.append("WARNING: High variance detected (CV > 15%)!")
        lines.append("Results may be unreliable. Consider:")
        lines.append("  - Closing other applications")
        lines.append("  - Disabling CPU frequency scaling (performance governor)")
        lines.append("  - Running more iterations")
        lines.append("  - Using a dedicated benchmark machine")

    lines.append("")

    # Significant improvements, largest speed-up first
    if faster:
        lines.append("FASTER (statistically significant improvements):")
        lines.append("-" * 40)
        for r in sorted(faster, key=lambda x: x.diff_percent):
            lines.append(f"  ✓ {r}")
        lines.append("")

    # Significant regressions, largest slow-down first
    if slower:
        lines.append("SLOWER (statistically significant regressions):")
        lines.append("-" * 40)
        for r in sorted(slower, key=lambda x: -x.diff_percent):
            lines.append(f"  ✗ {r}")
        lines.append("")

    # Everything that failed the p-value test or the threshold test
    if unchanged:
        lines.append("NOT SIGNIFICANT (p >= alpha or change < threshold):")
        lines.append("-" * 40)
        for r in unchanged:
            sign = "-" if r.is_faster else "+"
            lines.append(
                f"  = {r.test_name} ({r.params}): "
                f"{r.baseline_trimmed_mean:.4f}s → {r.comparison_trimmed_mean:.4f}s "
                f"({sign}{abs(r.diff_percent):.1f}%, p={r.p_value:.3f}, CV={r.baseline_cv:.1f}%/{r.comparison_cv:.1f}%)"
            )
        lines.append("")

    return "\n".join(lines)


def format_markdown(
    results: List[ComparisonResult], baseline_info: Dict, comparison_info: Dict, threshold: float
) -> str:
    """Format comparison results as a markdown report.

    Args:
        results: Per-benchmark comparison results.
        baseline_info: System info of the baseline run; the keys ``timestamp``,
            ``git_commit`` and ``pyemd_version`` are shown when present.
        comparison_info: System info of the comparison run (same keys).
        threshold: Minimum percentage change, quoted in the summary.

    Returns:
        The report as a single newline-joined string.
    """

    def short_commit(info: Dict) -> str:
        # A missing, null or non-string commit must not crash the report.
        return str(info.get("git_commit") or "unknown")[:8]

    def cell(value) -> str:
        # A literal pipe would otherwise split the table cell in two.
        return str(value).replace("|", "\\|")

    lines = []
    lines.append("# PyEMD Performance Comparison")
    lines.append("")

    # System info
    lines.append("## Environment")
    lines.append("")
    lines.append("| | Baseline | Comparison |")
    lines.append("|---|---|---|")
    lines.append(
        f"| Timestamp | {baseline_info.get('timestamp', 'unknown')} | {comparison_info.get('timestamp', 'unknown')} |"
    )
    lines.append(f"| Git commit | `{short_commit(baseline_info)}` | `{short_commit(comparison_info)}` |")
    lines.append(
        f"| PyEMD version | {baseline_info.get('pyemd_version', 'unknown')} | {comparison_info.get('pyemd_version', 'unknown')} |"
    )
    lines.append("")

    # Summary
    faster = [r for r in results if r.is_faster and r.is_significant]
    slower = [r for r in results if not r.is_faster and r.is_significant]
    unchanged = [r for r in results if not r.is_significant]

    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **{len(faster)}** tests faster (improvements)")
    lines.append(f"- **{len(slower)}** tests slower (regressions)")
    # A test lands here when it fails EITHER criterion, so do not claim that
    # every such test stayed within the percentage threshold.
    lines.append(
        f"- **{len(unchanged)}** tests not significant "
        f"(change below {threshold}% threshold or not statistically significant)"
    )
    lines.append("")
    lines.append("*Note: Comparison uses 10% trimmed mean for outlier robustness*")
    lines.append("")

    # Results table, biggest speed-up first
    lines.append("## Detailed Results")
    lines.append("")
    lines.append("| Test | Params | Baseline | Comparison | Change |")
    lines.append("|------|--------|----------|------------|--------|")

    for r in sorted(results, key=lambda x: x.diff_percent):
        params_str = ", ".join(f"{k}={v}" for k, v in r.params.items())
        # Negative values already carry their own minus sign
        sign = "" if r.is_faster else "+"
        if not r.is_significant:
            status = "⚪"
        elif r.is_faster:
            status = "🟢"
        else:
            status = "🔴"
        lines.append(
            f"| {cell(r.test_name)} | {cell(params_str)} | "
            f"{r.baseline_trimmed_mean:.4f}s | "
            f"{r.comparison_trimmed_mean:.4f}s | "
            f"{status} {sign}{r.diff_percent:.1f}% |"
        )

    lines.append("")
    return "\n".join(lines)


def format_json(results: List[ComparisonResult], baseline_info: Dict, comparison_info: Dict, threshold: float) -> str:
    """Serialize the comparison to an indented JSON document.

    The document holds both system-info dicts, the threshold, summary counts
    (total / faster / slower / unchanged) and one record per compared test.
    """
    # Attributes copied into each per-test record, in output order.
    exported_fields = (
        "test_name",
        "params",
        "baseline_mean",
        "baseline_std",
        "baseline_trimmed_mean",
        "comparison_mean",
        "comparison_std",
        "comparison_trimmed_mean",
        "diff_seconds",
        "diff_percent",
        "is_faster",
        "is_significant",
        "p_value",
    )

    faster_count = sum(1 for r in results if r.is_faster and r.is_significant)
    slower_count = sum(1 for r in results if not r.is_faster and r.is_significant)
    unchanged_count = sum(1 for r in results if not r.is_significant)

    summary = {
        "total": len(results),
        "faster": faster_count,
        "slower": slower_count,
        "unchanged": unchanged_count,
    }
    records = [{field: getattr(r, field) for field in exported_fields} for r in results]

    document = {
        "baseline_info": baseline_info,
        "comparison_info": comparison_info,
        "threshold_percent": threshold,
        "summary": summary,
        "results": records,
    }
    return json.dumps(document, indent=2)


def main():
    """Command-line entry point: compare two result sets and print a report.

    Exits with status 1 when the inputs cannot be loaded, are empty, share no
    tests, or when at least one statistically significant regression is found.
    """
    parser = argparse.ArgumentParser(
        description="Compare two PyEMD performance test results",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python compare_results.py results/20241201_full results/20241202_full
  python compare_results.py baseline.json comparison.json --threshold 10
  python compare_results.py old/ new/ --format markdown > comparison.md
  python compare_results.py old/ new/ --alpha 0.01  # stricter significance
        """,
    )
    parser.add_argument("baseline", type=Path, help="Path to baseline results (directory or JSON file)")
    parser.add_argument("comparison", type=Path, help="Path to comparison results (directory or JSON file)")
    parser.add_argument(
        "--threshold", type=float, default=5.0, help="Minimum percentage change to consider significant (default: 5)"
    )
    parser.add_argument(
        "--alpha", type=float, default=0.05, help="Significance level for t-test (default: 0.05 = 95%% confidence)"
    )
    parser.add_argument(
        "--format", choices=["text", "json", "markdown"], default="text", help="Output format (default: text)"
    )

    args = parser.parse_args()

    loaded = []
    for source in (args.baseline, args.comparison):
        try:
            loaded.append(load_results(source))
        except FileNotFoundError as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)
        except (OSError, ValueError) as e:
            # Unreadable file, or malformed/undecodable JSON (json.JSONDecodeError
            # and UnicodeDecodeError are both ValueError subclasses): report it
            # like any other input error instead of dumping a traceback.
            print(f"Error: could not read results from {source}: {e}", file=sys.stderr)
            sys.exit(1)
    (baseline_info, baseline_results), (comparison_info, comparison_results) = loaded

    if not baseline_results:
        print(f"Error: No results found in baseline: {args.baseline}", file=sys.stderr)
        sys.exit(1)

    if not comparison_results:
        print(f"Error: No results found in comparison: {args.comparison}", file=sys.stderr)
        sys.exit(1)

    results = compare_results(baseline_results, comparison_results, args.threshold, args.alpha)

    if not results:
        print("Error: No matching tests found between baseline and comparison", file=sys.stderr)
        sys.exit(1)

    if args.format == "text":
        output = format_text(results, baseline_info, comparison_info, args.threshold, args.alpha)
    elif args.format == "markdown":
        output = format_markdown(results, baseline_info, comparison_info, args.threshold)
    else:
        # argparse restricts --format to the three choices, so this is "json"
        output = format_json(results, baseline_info, comparison_info, args.threshold)

    print(output)

    # Exit with non-zero status if there are significant regressions
    if any(r.is_significant and not r.is_faster for r in results):
        sys.exit(1)


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()

back to top

Software Heritage — Copyright (C) 2015–2026, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Content policy— Contact— JavaScript license information— Web API