Skip to content

Commit 49a772c

Browse files
committed
Merge branch 'main' into finish-lazy-lock
# Conflicts: # src/infer_intent.rs # src/interface.rs # src/xpath_functions.rs
2 parents 5149281 + de2ca90 commit 49a772c

32 files changed

+971
-270
lines changed

PythonScripts/audit_translations/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,7 @@ uv run python -m audit_translations de-CH
9494
# Show detailed output with English/translated snippets for rule differences
9595
uv run python -m audit_translations es --verbose
9696
```
97+
98+
### Testing
99+
100+
```bash
uv run python -m pytest
```

PythonScripts/audit_translations/auditor.py

Lines changed: 91 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from rich.table import Table
1818

1919
from .dataclasses import RuleInfo, RuleDifference, ComparisonResult
20-
from .parsers import parse_yaml_file, diff_rules
20+
from .parsers import parse_yaml_file, diff_rules, extract_structure_elements
2121
console = Console()
2222

2323

@@ -127,7 +127,7 @@ def merge_rules(base_rules: List[RuleInfo], region_rules: List[RuleInfo]) -> Lis
127127
if include_untranslated:
128128
for rule in translated_rules:
129129
if rule.has_untranslated_text and not rule.audit_ignore:
130-
untranslated_text.append((rule, rule.untranslated_keys))
130+
untranslated_text.append((rule, rule.untranslated_entries))
131131

132132
# Find fine-grained differences in rules that exist in both files (skip if audit-ignore)
133133
rule_differences = []
@@ -157,16 +157,16 @@ def rule_label(rule: RuleInfo) -> str:
157157
return f"[cyan]{escape(rule.name)}[/] [dim][{escape(tag)}][/]"
158158

159159

160-
def print_rule_item(rule: RuleInfo, context: str = ""):
161-
console.print(f" [dim]•[/] {rule_label(rule)} [dim](line {rule.line_number}{context})[/]")
160+
def print_rule_item(rule: RuleInfo, issue_line: int, context: str = ""):
161+
console.print(f" [dim]•[/] {rule_label(rule)} [dim](line {issue_line}{context})[/]")
162162

163163

164-
def print_diff_item(diff: RuleDifference, verbose: bool = False):
164+
def print_diff_item(diff: RuleDifference, line_en: int, line_tr: int, verbose: bool = False):
165165
"""Print a single rule difference"""
166166
rule = diff.english_rule
167167
console.print(
168168
f" [dim]•[/] {rule_label(rule)} "
169-
f"[dim](line {rule.line_number} en, {diff.translated_rule.line_number} tr)[/]"
169+
f"[dim](line {line_en} en, {line_tr} tr)[/]"
170170
)
171171
console.print(f" [dim]{diff.description}[/]")
172172
if verbose:
@@ -181,11 +181,43 @@ def issue_base(rule: RuleInfo, file_name: str, language: str) -> dict:
181181
"rule_name": rule.name or "",
182182
"rule_tag": rule.tag or "",
183183
"rule_key": rule.key,
184-
"line_en": None,
185-
"line_tr": None,
184+
"issue_line_en": None,
185+
"issue_line_tr": None,
186+
"rule_line_en": None,
187+
"rule_line_tr": None,
186188
}
187189

188190

191+
def first_structure_mismatch(
192+
english_tokens: List[str],
193+
translated_tokens: List[str],
194+
) -> Tuple[Optional[str], Optional[str]]:
195+
min_len = min(len(english_tokens), len(translated_tokens))
196+
for idx in range(min_len):
197+
if english_tokens[idx] != translated_tokens[idx]:
198+
return english_tokens[idx], translated_tokens[idx]
199+
if len(english_tokens) > min_len:
200+
return english_tokens[min_len], None
201+
if len(translated_tokens) > min_len:
202+
return None, translated_tokens[min_len]
203+
return None, None
204+
205+
206+
def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -> int:
207+
if kind == "match":
208+
lines = rule.line_map.get("match", [])
209+
elif kind == "condition":
210+
lines = rule.line_map.get("condition", [])
211+
elif kind == "variables":
212+
lines = rule.line_map.get("variables", [])
213+
elif kind == "structure" and token:
214+
token_key = f"structure:{token.rstrip(':')}"
215+
lines = rule.line_map.get(token_key, [])
216+
else:
217+
lines = []
218+
return lines[0] if lines else rule.line_number
219+
220+
189221
def collect_issues(
190222
result: ComparisonResult,
191223
file_name: str,
@@ -198,7 +230,8 @@ def collect_issues(
198230
issue.update(
199231
issue_type="missing_rule",
200232
diff_type="",
201-
line_en=rule.line_number,
233+
issue_line_en=rule.line_number,
234+
rule_line_en=rule.line_number,
202235
description="Rule present in English but missing in translation",
203236
english_snippet="",
204237
translated_snippet="",
@@ -211,35 +244,49 @@ def collect_issues(
211244
issue.update(
212245
issue_type="extra_rule",
213246
diff_type="",
214-
line_tr=rule.line_number,
247+
issue_line_tr=rule.line_number,
248+
rule_line_tr=rule.line_number,
215249
description="Rule present in translation but missing in English",
216250
english_snippet="",
217251
translated_snippet="",
218252
untranslated_texts=[],
219253
)
220254
issues.append(issue)
221255

222-
for rule, texts in result.untranslated_text:
223-
issue = issue_base(rule, file_name, language)
224-
issue.update(
225-
issue_type="untranslated_text",
226-
diff_type="",
227-
line_tr=rule.line_number,
228-
description="Lowercase t/ot/ct keys indicate untranslated text",
229-
english_snippet="",
230-
translated_snippet="",
231-
untranslated_texts=texts,
232-
)
233-
issues.append(issue)
256+
for rule, entries in result.untranslated_text:
257+
for key, text, line in entries:
258+
issue = issue_base(rule, file_name, language)
259+
issue.update(
260+
issue_type="untranslated_text",
261+
diff_type="",
262+
issue_line_tr=line or rule.line_number,
263+
rule_line_tr=rule.line_number,
264+
description="Lowercase t/ot/ct keys indicate untranslated text",
265+
english_snippet="",
266+
translated_snippet="",
267+
untranslated_texts=[text],
268+
)
269+
issues.append(issue)
234270

235271
for diff in result.rule_differences:
236272
rule = diff.english_rule
237273
issue = issue_base(rule, file_name, language)
274+
if diff.diff_type == "structure":
275+
en_tokens = extract_structure_elements(diff.english_rule.data)
276+
tr_tokens = extract_structure_elements(diff.translated_rule.data)
277+
en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens)
278+
issue_line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
279+
issue_line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
280+
else:
281+
issue_line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
282+
issue_line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
238283
issue.update(
239284
issue_type="rule_difference",
240285
diff_type=diff.diff_type,
241-
line_en=diff.english_rule.line_number,
242-
line_tr=diff.translated_rule.line_number,
286+
issue_line_en=issue_line_en,
287+
issue_line_tr=issue_line_tr,
288+
rule_line_en=diff.english_rule.line_number,
289+
rule_line_tr=diff.translated_rule.line_number,
243290
description=diff.description,
244291
english_snippet=diff.english_snippet,
245292
translated_snippet=diff.translated_snippet,
@@ -279,16 +326,18 @@ def print_warnings(result: ComparisonResult, file_name: str, verbose: bool = Fal
279326
if result.missing_rules:
280327
console.print(f"\n [red]✗[/] [bold]Missing Rules[/] [[red]{len(result.missing_rules)}[/]] [dim](in English but not in translation)[/]")
281328
for rule in result.missing_rules:
282-
print_rule_item(rule, context=" in English")
329+
print_rule_item(rule, issue_line=rule.line_number, context=" in English")
283330
issues += 1
284331

285332
if result.untranslated_text:
286-
console.print(f"\n [yellow]⚠[/] [bold]Untranslated Text[/] [[yellow]{len(result.untranslated_text)}[/]] [dim](lowercase t/ot/ct keys)[/]")
287-
for rule, texts in result.untranslated_text:
288-
print_rule_item(rule)
289-
for text in texts:
333+
untranslated_count = sum(len(entries) for _, entries in result.untranslated_text)
334+
console.print(f"\n [yellow]⚠[/] [bold]Untranslated Text[/] [[yellow]{untranslated_count}[/]] [dim](lowercase t/ot/ct keys)[/]")
335+
for rule, entries in result.untranslated_text:
336+
for _, text, line in entries:
337+
issue_line = line or rule.line_number
338+
print_rule_item(rule, issue_line=issue_line)
290339
console.print(f" [dim]→[/] [yellow]\"{escape(text)}\"[/]")
291-
issues += 1
340+
issues += 1
292341

293342
if result.rule_differences:
294343
total_diffs = len(result.rule_differences)
@@ -297,13 +346,22 @@ def print_warnings(result: ComparisonResult, file_name: str, verbose: bool = Fal
297346
f"[[magenta]{total_diffs}[/]] [dim](structural differences between en and translation)[/]"
298347
)
299348
for diff in result.rule_differences:
300-
print_diff_item(diff, verbose)
349+
if diff.diff_type == "structure":
350+
en_tokens = extract_structure_elements(diff.english_rule.data)
351+
tr_tokens = extract_structure_elements(diff.translated_rule.data)
352+
en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens)
353+
line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
354+
line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
355+
else:
356+
line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
357+
line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
358+
print_diff_item(diff, line_en=line_en, line_tr=line_tr, verbose=verbose)
301359
issues += 1
302360

303361
if result.extra_rules:
304362
console.print(f"\n [blue]ℹ[/] [bold]Extra Rules[/] [[blue]{len(result.extra_rules)}[/]] [dim](may be intentional)[/]")
305363
for rule in result.extra_rules:
306-
print_rule_item(rule)
364+
print_rule_item(rule, issue_line=rule.line_number)
307365
issues += 1
308366

309367
return issues
@@ -401,7 +459,7 @@ def audit_language(
401459
files_ok += 1
402460

403461
total_missing += len(result.missing_rules)
404-
total_untranslated += len(result.untranslated_text)
462+
total_untranslated += sum(len(entries) for _, entries in result.untranslated_text)
405463
total_extra += len(result.extra_rules)
406464
total_differences += len(result.rule_differences)
407465

PythonScripts/audit_translations/dataclasses.py

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,43 @@
55
"""
66

77
from dataclasses import dataclass, field
8-
from typing import Any, List, Tuple, Optional
8+
from typing import Any, List, Tuple, Optional, Dict
99

1010

1111
@dataclass
1212
class RuleInfo:
13-
"""Information about a single rule"""
13+
"""
14+
Information about a single rule parsed from a YAML file.
15+
16+
Attributes
17+
----------
18+
name : Optional[str]
19+
Rule name for standard rule files; None for unicode entries.
20+
tag : Optional[str]
21+
Rule tag (normalized string); None for unicode entries.
22+
key : str
23+
Stable identifier used for matching; for unicode entries this is the character or range key.
24+
line_number : int
25+
1-based line number where the rule starts in the source file.
26+
raw_content : str
27+
Raw YAML block for this rule (used for reporting/snippets).
28+
data : Optional[Any]
29+
Parsed YAML node for the rule; used for structural diffs.
30+
has_untranslated_text : bool
31+
True if the rule contains lowercase t/ot/ct/etc. keys, which mark untranslated text.
32+
untranslated_keys : List[str]
33+
List of untranslated text values (used for summary counts).
34+
untranslated_entries : List[Tuple[str, str, Optional[int]]]
35+
List of (key, text, line) entries extracted from lowercase translation keys.
36+
This drives per-issue JSONL output so each untranslated string can report
37+
the specific YAML line number where it appears.
38+
line_map : Dict[str, List[int]]
39+
Mapping of element type to line numbers for rule components like match,
40+
conditions, variables, and structural tokens. This is used to point
41+
structural diffs at a precise line rather than the top of the rule.
42+
audit_ignore : bool
43+
True if the raw content contains an audit-ignore marker.
44+
"""
1445
name: Optional[str] # None for unicode entries
1546
tag: Optional[str] # None for unicode entries
1647
key: str # For unicode entries, this is the character/range
@@ -19,6 +50,8 @@ class RuleInfo:
1950
data: Optional[Any] = None
2051
has_untranslated_text: bool = False
2152
untranslated_keys: List[str] = field(default_factory=list)
53+
untranslated_entries: List[Tuple[str, str, Optional[int]]] = field(default_factory=list) # (key, text, line) for JSONL output
54+
line_map: Dict[str, List[int]] = field(default_factory=dict) # Element-type -> line numbers for precise diff locations
2255
audit_ignore: bool = False
2356

2457

@@ -38,7 +71,7 @@ class ComparisonResult:
3871
"""Results from comparing English and translated files"""
3972
missing_rules: List[RuleInfo] # Rules in English but not in translation
4073
extra_rules: List[RuleInfo] # Rules in translation but not in English
41-
untranslated_text: List[Tuple[RuleInfo, List[str]]] # Rules with lowercase t/ot/ct
74+
untranslated_text: List[Tuple[RuleInfo, List[Tuple[str, str, Optional[int]]]]] # Rules with lowercase t/ot/ct
4275
file_path: str
4376
english_rule_count: int
4477
translated_rule_count: int

0 commit comments

Comments
 (0)