|
| 1 | +"""Normalise an error message into a stable failure signature. |
| 2 | +
|
| 3 | +Two runs that failed the *same way* almost never have byte-identical error text — |
| 4 | +paths, line numbers, memory addresses, ids and timestamps differ every time. That |
| 5 | +defeats any attempt to ask "is this the same failure as yesterday?" or "which |
| 6 | +tests fail *together*?". ``failure_signature`` strips the variable parts of an |
| 7 | +error to a canonical form and hashes it (SHA-256), so the same *kind* of failure |
| 8 | +gets the same short signature across runs — the join key the rest of the |
| 9 | +test-robustness tools (run diffing, flake clustering) group on. |
| 10 | +
|
| 11 | +Pure standard library (``re`` + ``hashlib``); no device, no ``PySide6``. |
| 12 | +""" |
| 13 | +import hashlib |
| 14 | +import re |
| 15 | +from typing import Any, Dict, Iterable, List |
| 16 | + |
# Rules that erase the run-specific parts of an error message, applied strictly
# in the order listed. The specific shapes (paths, addresses, UUIDs, timestamps,
# "line N") come before the catch-all integer rule so the digits inside them are
# consumed as part of the larger match instead of being replaced piecemeal.
_NORMALIZERS = [
    (re.compile(source, flags), placeholder)
    for source, flags, placeholder in (
        # Windows path: drive letter, backslash, then a run of allowed characters.
        (r"[A-Za-z]:\\[^\s:*?\"<>|]+", 0, "<path>"),
        # POSIX path: at least two slash-led segments.
        (r"(?:/[\w.\-]+)+/[\w.\-]+", 0, "<path>"),
        # Hexadecimal memory address.
        (r"0x[0-9A-Fa-f]+", 0, "0x<addr>"),
        # UUID in the 8-4-4-4-12 hex layout.
        (r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}"
         r"-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b", 0, "<uuid>"),
        # Date and time, with optional fractional seconds.
        (r"\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d+)?", 0, "<ts>"),
        # "line 42", in any letter case.
        (r"\bline\s+\d+\b", re.IGNORECASE, "line <n>"),
        # Any standalone integer that is still left.
        (r"\b\d+\b", 0, "<n>"),
    )
]
_WHITESPACE = re.compile(r"\s+")


def normalize_error(message: str) -> str:
    """Return the canonical form of an error message.

    ``message`` is converted with ``str()``, then every rule in
    ``_NORMALIZERS`` is applied in order, replacing paths, hex addresses,
    UUIDs, timestamps, ``line N`` markers and standalone integers with fixed
    placeholders. Finally each run of whitespace is collapsed to a single
    space and the ends are trimmed, so messages that differ only in those
    details come out as the same string.
    """
    canonical = str(message)
    for rule in _NORMALIZERS:
        regex, placeholder = rule
        canonical = regex.sub(placeholder, canonical)
    squeezed = _WHITESPACE.sub(" ", canonical)
    return squeezed.strip()
| 43 | + |
| 44 | + |
def failure_signature(message: str, *, length: int = 12) -> str:
    """Hash the normalised form of ``message`` into a short hex signature.

    The message is passed through ``normalize_error``, encoded as UTF-8 and
    hashed with SHA-256; the first ``length`` hex characters of the digest are
    returned. ``length`` is coerced with ``int()`` and raised to 1 if it is
    smaller, so at least one character is always returned.
    """
    hasher = hashlib.sha256()
    hasher.update(normalize_error(message).encode("utf-8"))
    hex_digest = hasher.hexdigest()
    keep = int(length)
    if keep < 1:
        keep = 1
    return hex_digest[:keep]
| 49 | + |
| 50 | + |
def group_failures(
    messages: Iterable[str], *, max_examples: int = 3
) -> List[Dict[str, Any]]:
    """Group error messages by failure signature, most frequent first.

    Args:
        messages: Error messages to group. Falsy entries (``None``, ``""``)
            are skipped; every other entry is converted with ``str()``.
        max_examples: Upper bound on the distinct raw messages kept per
            group. Defaults to three.

    Returns:
        One dict per signature with the keys ``signature``, ``normalized``
        (the canonical text of the first message seen for that signature),
        ``count`` (number of messages in the group) and ``examples`` (distinct
        raw messages in first-seen order, at most ``max_examples``). Groups
        are sorted by ``count`` descending; groups with equal counts keep the
        order in which their signature first appeared.
    """
    groups: Dict[str, Dict[str, Any]] = {}
    for message in messages:
        if not message:
            continue
        raw = str(message)
        signature = failure_signature(raw)
        group = groups.get(signature)
        if group is None:
            # Build the record only for a new signature: dict.setdefault would
            # evaluate its default (and re-run the normaliser) for every
            # message, including those whose group already exists.
            group = groups[signature] = {
                "signature": signature,
                "normalized": normalize_error(raw),
                "count": 0,
                "examples": [],
            }
        group["count"] += 1
        examples = group["examples"]
        if len(examples) < max_examples and raw not in examples:
            examples.append(raw)
    # sorted() is stable, also with reverse=True, so ties keep insertion order.
    return sorted(groups.values(), key=lambda group: group["count"], reverse=True)
0 commit comments