diff --git a/graphify/__main__.py b/graphify/__main__.py index 638df197..39465654 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -1521,6 +1521,9 @@ def main() -> None: if "--force" in argv[2:]: force = True argv = [a for a in argv if a != "--force"] + semantic = "--semantic" in argv[2:] + if semantic: + argv = [a for a in argv if a != "--semantic"] if len(argv) > 2: watch_path = Path(argv[2]) else: @@ -1534,12 +1537,17 @@ def main() -> None: print(f"error: path not found: {watch_path}", file=sys.stderr) sys.exit(1) from graphify.watch import _rebuild_code - print(f"Re-extracting code files in {watch_path} (no LLM needed)...") - ok = _rebuild_code(watch_path, force=force) + msg = f"Re-extracting code files in {watch_path}" + if semantic: + msg += " with semantic LLM..." + else: + msg += " (no LLM needed)..." + print(msg) + ok = _rebuild_code(watch_path, force=force, semantic=semantic) if ok: print("Code graph updated. For doc/paper/image changes run /graphify --update in your AI assistant.") - if not os.environ.get("MOONSHOT_API_KEY") and not os.environ.get("GRAPHIFY_NO_TIPS"): - print("Tip: set MOONSHOT_API_KEY to use Kimi K2.6 for semantic extraction — 3x cheaper, richer graphs. pip install 'graphifyy[kimi]'") + if not os.environ.get("MOONSHOT_API_KEY") and not os.environ.get("OPENAI_API_KEY") and not os.environ.get("GRAPHIFY_NO_TIPS"): + print("Tip: set OPENAI_API_KEY (gpt-5.4-mini) or MOONSHOT_API_KEY (kimi-k2.6) for semantic extraction — richer graphs. 
def _import_pascal(node, source: bytes, file_nid: str, stem: str, edges: list, str_path: str) -> None:
    """Append one ``imports`` edge per ``moduleName`` child of *node*.

    Args:
        node: AST node whose direct children are scanned (the Pascal config
            registers this handler for ``declUses`` nodes).
        source: Raw file bytes, passed to ``_read_text`` to recover each name.
        file_nid: Node id of the importing file; used as the edge source.
        stem: Unused here; kept so the signature matches the other import handlers.
        edges: Output list, extended in place.
        str_path: Path recorded as ``source_file`` on every edge.
    """
    for unit_node in node.children:
        if unit_node.type != "moduleName":
            continue
        unit_name = _read_text(unit_node, source)
        if not unit_name:
            # Nothing readable for this child: no edge.
            continue
        import_edge = dict(
            source=file_nid,
            target=_make_id(unit_name),
            relation="imports",
            context="import",
            confidence="EXTRACTED",
            source_file=str_path,
            # Location is that of the enclosing clause, not the individual name.
            source_location=f"L{node.start_point[0] + 1}",
            weight=1.0,
        )
        edges.append(import_edge)
# Extraction settings for Pascal sources, consumed by the generic AST extractor.
# The string values below are presumably node-type names from the Pascal
# tree-sitter grammar -- verify against the grammar when changing them.
_PASCAL_CONFIG = LanguageConfig(
    # _extract_generic special-cases this module name and loads the grammar with
    # tree_sitter_language_pack.get_language(ts_language_fn), so ts_language_fn
    # is a language name here rather than an attribute to look up on the module.
    ts_module="tree_sitter_language_pack",
    ts_language_fn="pascal",
    class_types=frozenset({"declClass"}),
    # Both declarations (decl*) and definitions (def*) of functions/procedures.
    function_types=frozenset({"declFunc", "defFunc", "declProc", "defProc"}),
    # Nodes of this type are handed to import_handler (_import_pascal).
    import_types=frozenset({"declUses"}),
    call_types=frozenset({"exprCall"}),
    name_fallback_child_types=("identifier", "moduleName"),
    body_fallback_child_types=("block", "interface", "implementation"),
    # Only the def* forms are listed as boundaries, not the decl* forms.
    function_boundary_types=frozenset({"defFunc", "defProc"}),
    import_handler=_import_pascal,
)
def extract_pascal(path: Path) -> dict:
    """Extract nodes and edges from a Pascal source file.

    Delegates to ``_extract_generic`` with ``_PASCAL_CONFIG``.

    Returns:
        The extractor's result dict, or ``{"nodes": [], "edges": [], "error": msg}``
        if the extractor raised instead of returning.
    """
    try:
        return _extract_generic(path, _PASCAL_CONFIG)
    except Exception as exc:  # extractor boundary: report the failure, never raise
        return {"nodes": [], "edges": [], "error": str(exc)}


def extract_lfm(path: Path) -> dict:
    """Extract UI components and event-handler references from a Lazarus form (.lfm).

    Emits one node for the file, one node plus a ``contains`` edge for every
    ``object`` / ``inherited`` / ``inline`` component header, and one
    ``triggers`` edge for every ``OnXxx = Handler`` assignment.  Handler edges
    point at ``_make_id(handler)``; no node is created for the handler here.

    Returns:
        ``{"nodes": [...], "edges": [...]}``, or an empty result with an
        ``"error"`` message if the file could not be read.
    """
    nodes: list[dict] = []
    edges: list[dict] = []
    # Compiled once per file.  Lines are stripped before matching, so no
    # leading-whitespace prefix is needed in the patterns.
    component_re = re.compile(r"(object|inherited|inline)\s+(\w+)\s*:\s*(\w+)", re.I)
    assignment_re = re.compile(r"(\w+)\s*=\s*(\w+)")
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
        str_path = str(path)
        file_nid = _make_id(str_path)
        nodes.append({
            "id": file_nid,
            "label": path.name,
            "file_type": "code",
            "source_file": str_path,
            "source_location": "L1",
        })

        for lineno, line in enumerate(content.splitlines(), start=1):
            stripped = line.strip()
            location = f"L{lineno}"

            # Component header, e.g. "object Form1: TForm1".
            component = component_re.match(stripped)
            if component:
                obj_name, obj_type = component.group(2), component.group(3)
                obj_nid = _make_id(str_path, obj_name)
                nodes.append({
                    "id": obj_nid,
                    "label": f"{obj_name}: {obj_type}",
                    "file_type": "code",
                    "source_file": str_path,
                    "source_location": location,
                })
                edges.append({
                    "source": file_nid,
                    "target": obj_nid,
                    "relation": "contains",
                    "confidence": "EXTRACTED",
                    "source_file": str_path,
                    "source_location": location,
                    "weight": 1.0,
                })
                continue

            # Event binding, e.g. "OnClick = Button1Click".
            assignment = assignment_re.match(stripped)
            if not assignment:
                continue
            prop, handler = assignment.group(1), assignment.group(2)
            # Require "On" followed by an uppercase letter so ordinary
            # properties that merely begin with "On" (e.g. "Online = True")
            # are not mistaken for events.
            if prop.startswith("On") and prop[2:3].isupper():
                edges.append({
                    "source": file_nid,
                    "target": _make_id(handler),
                    "relation": "triggers",
                    "context": prop,
                    "confidence": "EXTRACTED",
                    "source_file": str_path,
                    "source_location": location,
                    "weight": 1.0,
                })
        return {"nodes": nodes, "edges": edges}
    except Exception as exc:  # extractor boundary: report the failure, never raise
        return {"nodes": [], "edges": [], "error": str(exc)}


def extract_lpi(path: Path) -> dict:
    """Extract project units and required packages from a Lazarus project (.lpi).

    Lazarus stores each value in a child element's ``Value`` attribute
    (``<Unit0><Filename Value="unit1.pas"/></Unit0>``); older project files
    number the entries (``Unit0``, ``Item1``) while newer ones use plain
    ``Unit`` / ``Item``.  Both layouts are accepted, and a value given directly
    as an attribute on the entry is still honoured as a fallback.

    Emits a node for the project file, a ``contains`` edge to every unit file
    (target id derived from the unit path relative to the project directory)
    and a node plus ``depends_on`` edge for every required package.

    Returns:
        ``{"nodes": [...], "edges": [...]}``, or an empty result with an
        ``"error"`` message if the file is unreadable or not well-formed XML.
    """
    nodes: list[dict] = []
    edges: list[dict] = []
    try:
        import os.path
        import xml.etree.ElementTree as ET

        root = ET.parse(path).getroot()
        str_path = str(path)
        file_nid = _make_id(str_path)
        nodes.append({
            "id": file_nid,
            "label": path.name,
            "file_type": "code",
            "source_file": str_path,
            "source_location": "L1",
        })

        def _entry_values(container_tag: str, entry_prefix: str, field: str):
            """Yield *field* for each entry (``Unit``/``UnitN``...) under *container_tag*."""
            for container in root.iter(container_tag):
                for entry in container:
                    if not entry.tag.startswith(entry_prefix):
                        continue
                    field_el = entry.find(field)
                    value = field_el.get("Value") if field_el is not None else None
                    value = value or entry.get(field)
                    if value:
                        yield value

        # Units that belong to the project.
        for filename in _entry_values("Units", "Unit", "Filename"):
            # Presumably project files saved on Windows use backslashes; also
            # collapse ".." so the id lines up with how the unit file itself
            # would be identified from its own path.
            relative = filename.replace("\\", "/")
            unit_path = os.path.normpath(str(path.parent / relative))
            edges.append({
                "source": file_nid,
                "target": _make_id(unit_path),
                "relation": "contains",
                "confidence": "EXTRACTED",
                "source_file": str_path,
                "source_location": None,
                "weight": 1.0,
            })

        # Packages the project depends on.
        for pkg_name in _entry_values("RequiredPackages", "Item", "PackageName"):
            pkg_nid = _make_id(pkg_name)
            nodes.append({
                "id": pkg_nid,
                "label": pkg_name,
                "file_type": "code",
                "source_file": str_path,
                "source_location": None,
            })
            edges.append({
                "source": file_nid,
                "target": pkg_nid,
                "relation": "depends_on",
                "confidence": "EXTRACTED",
                "source_file": str_path,
                "source_location": None,
                "weight": 1.0,
            })

        return {"nodes": nodes, "edges": edges}
    except Exception as exc:  # extractor boundary: report the failure, never raise
        return {"nodes": [], "edges": [], "error": str(exc)}
9cba0ff8..d305adeb 100644 --- a/graphify/llm.py +++ b/graphify/llm.py @@ -58,6 +58,13 @@ def _get_tokenizer(): "pricing": {"input": 0.74, "output": 4.66}, # USD per 1M tokens "temperature": None, # kimi-k2.6 enforces its own fixed temperature; sending any value raises 400 }, + "openai": { + "base_url": "https://api.openai.com/v1", + "default_model": "gpt-5.4-mini", + "env_key": "OPENAI_API_KEY", + "pricing": {"input": 0.15, "output": 0.60}, # USD per 1M tokens (estimated for gpt-5.4-mini) + "temperature": 0, + }, } _EXTRACTION_SYSTEM = """\ @@ -93,18 +100,85 @@ def _read_files(paths: list[Path], root: Path) -> str: return "\n\n".join(parts) -def _parse_llm_json(raw: str) -> dict: - """Strip optional markdown fences and parse JSON. Returns empty fragment on failure.""" +def _repair_json(raw: str) -> str: + """Attempt to repair truncated JSON by closing open strings and brackets.""" + raw = raw.strip() + if not raw: + return "{}" + + # 1. Close unterminated string + # We check if the last quote is unescaped + unescaped_quotes = 0 + i = 0 + while i < len(raw): + if raw[i] == '"': + # Count backslashes preceding this quote + j = i - 1 + slashes = 0 + while j >= 0 and raw[j] == "\\": + slashes += 1 + j -= 1 + if slashes % 2 == 0: + unescaped_quotes += 1 + i += 1 + if unescaped_quotes % 2 != 0: + raw += '"' + + # 2. 
Close unterminated brackets/braces + stack = [] + # We need to skip characters inside strings to avoid false positives + in_string = False + i = 0 + while i < len(raw): + char = raw[i] + if char == '"': + j = i - 1 + slashes = 0 + while j >= 0 and raw[j] == "\\": + slashes += 1 + j -= 1 + if slashes % 2 == 0: + in_string = not in_string + elif not in_string: + if char == "{": + stack.append("}") + elif char == "[": + stack.append("]") + elif char == "}" and stack and stack[-1] == "}": + stack.pop() + elif char == "]" and stack and stack[-1] == "]": + stack.pop() + i += 1 + + # Append the needed closers in reverse order + raw += "".join(reversed(stack)) + return raw + + +def _parse_llm_json(raw: str) -> tuple[dict, str | None]: + """Strip optional markdown fences and parse JSON. + Returns (result_dict, error_message_or_none). + Attempts to repair truncated JSON before giving up. + """ if raw.startswith("```"): - raw = raw.split("```", 2)[1] - if raw.startswith("json"): - raw = raw[4:] - raw = raw.rsplit("```", 1)[0] + try: + raw = raw.split("```", 2)[1] + if raw.startswith("json"): + raw = raw[4:] + raw = raw.rsplit("```", 1)[0] + except IndexError: + pass + + raw = raw.strip() try: - return json.loads(raw.strip()) - except json.JSONDecodeError as exc: - print(f"[graphify] LLM returned invalid JSON, skipping chunk: {exc}", file=sys.stderr) - return {"nodes": [], "edges": [], "hyperedges": []} + return json.loads(raw), None + except json.JSONDecodeError: + # Try repair + repaired = _repair_json(raw) + try: + return json.loads(repaired), None + except json.JSONDecodeError as exc: + return {"nodes": [], "edges": [], "hyperedges": []}, str(exc) def _call_openai_compat( @@ -138,7 +212,7 @@ def _call_openai_compat( if "moonshot" in base_url: kwargs["extra_body"] = {"thinking": {"type": "disabled"}} resp = client.chat.completions.create(**kwargs) - result = _parse_llm_json(resp.choices[0].message.content or "{}") + result, parse_err = 
_parse_llm_json(resp.choices[0].message.content or "{}") result["input_tokens"] = resp.usage.prompt_tokens if resp.usage else 0 result["output_tokens"] = resp.usage.completion_tokens if resp.usage else 0 result["model"] = model @@ -146,6 +220,7 @@ def _call_openai_compat( # mid-generation. The JSON we got back is truncated; callers should # treat this as a signal to retry with smaller input. result["finish_reason"] = resp.choices[0].finish_reason + result["parse_error"] = parse_err return result @@ -166,7 +241,7 @@ def _call_claude(api_key: str, model: str, user_message: str) -> dict: system=_EXTRACTION_SYSTEM, messages=[{"role": "user", "content": user_message}], ) - result = _parse_llm_json(resp.content[0].text if resp.content else "{}") + result, parse_err = _parse_llm_json(resp.content[0].text if resp.content else "{}") result["input_tokens"] = resp.usage.input_tokens if resp.usage else 0 result["output_tokens"] = resp.usage.output_tokens if resp.usage else 0 result["model"] = model @@ -174,6 +249,7 @@ def _call_claude(api_key: str, model: str, user_message: str) -> dict: # vocabulary so the adaptive-retry layer doesn't have to know which # backend produced the result. result["finish_reason"] = "length" if resp.stop_reason == "max_tokens" else "stop" + result["parse_error"] = parse_err return result @@ -199,7 +275,7 @@ def extract_files_direct( f"No API key for backend '{backend}'. " f"Set {cfg['env_key']} or pass api_key=." 
) - mdl = model or cfg["default_model"] + mdl = model or os.environ.get("GRAPHIFY_MODEL") or cfg["default_model"] user_msg = _read_files(files, root) if backend == "claude": @@ -301,14 +377,27 @@ def _extract_with_adaptive_retry( result = extract_files_direct( chunk, backend=backend, api_key=api_key, model=model, root=root ) + parse_err = result.get("parse_error") + # Case 1: Success (no parse error) + if not parse_err: + return result + + # Case 2: Parse error but NOT due to truncation (hallucination or logic error) + # We don't retry these because splitting files won't fix model logic/hallucination. if result.get("finish_reason") != "length": + print( + f"[graphify] LLM returned invalid JSON, skipping chunk: {parse_err}", + file=sys.stderr, + ) return result + # Case 3: Truncation detected. Decide whether to retry or give up. if len(chunk) <= 1: print( f"[graphify] single-file chunk {chunk[0]} truncated at " - f"max_completion_tokens — partial result kept", + f"max_completion_tokens (output: {result.get('output_tokens')} tokens) " + f"— partial result kept. Error: {parse_err}", file=sys.stderr, ) return result @@ -316,11 +405,13 @@ def _extract_with_adaptive_retry( if _depth >= max_depth: print( f"[graphify] chunk of {len(chunk)} still truncated at recursion " - f"depth {_depth} (max {max_depth}) — partial result kept", + f"depth {_depth} (max {max_depth}) — partial result kept. 
" + f"Error: {parse_err}", file=sys.stderr, ) return result + # Silence the error and split/retry print( f"[graphify] chunk of {len(chunk)} truncated at depth {_depth}, " f"splitting into halves of {len(chunk) // 2} and " @@ -473,6 +564,8 @@ def detect_backend() -> str | None: """ if os.environ.get("MOONSHOT_API_KEY"): return "kimi" + if os.environ.get("OPENAI_API_KEY"): + return "openai" if os.environ.get("ANTHROPIC_API_KEY"): return "claude" return None diff --git a/graphify/watch.py b/graphify/watch.py index 12496851..f93df6a1 100644 --- a/graphify/watch.py +++ b/graphify/watch.py @@ -33,8 +33,11 @@ def _relativize_source_files(payload: dict, root: Path) -> None: continue -def _rebuild_code(watch_path: Path, *, follow_symlinks: bool = False, force: bool = False) -> bool: - """Re-run AST extraction + build + cluster + report for code files. No LLM needed. +def _rebuild_code(watch_path: Path, *, follow_symlinks: bool = False, force: bool = False, semantic: bool = False) -> bool: + """Re-run AST extraction + optional semantic extraction + build + cluster + report. + + When ``semantic`` is True, also runs LLM-based semantic extraction for code files + using the detected backend (e.g. OpenAI/Kimi). When ``force`` is True the node-count safety check in ``to_json`` is bypassed so the rebuilt graph overwrites graph.json even if it has fewer nodes. @@ -61,13 +64,31 @@ def _rebuild_code(watch_path: Path, *, follow_symlinks: bool = False, force: boo print("[graphify watch] No code files found - nothing to rebuild.") return False + # Skip cross-file resolution if no .py or .java files to save time + py_paths = [p for p in code_files if p.suffix == ".py"] + java_paths = [p for p in code_files if p.suffix == ".java"] + result = extract(code_files, cache_root=watch_root) - # Preserve semantic nodes/edges from a previous full run. - # AST-only rebuild replaces nodes for changed files; everything else is kept. 
- # Filter by node ID membership in the new AST output, not by file_type — - # INFERRED/AMBIGUOUS nodes extracted from code files also carry file_type="code" - # and would be wrongly dropped by a file_type-based filter. + if semantic: + from graphify.llm import extract_corpus_parallel, detect_backend + backend = detect_backend() + if backend: + print(f"[graphify] Running semantic extraction with {backend}...") + sem = extract_corpus_parallel(code_files, backend=backend, root=watch_root) + # Merge semantic results into AST result + seen = {n['id'] for n in result['nodes']} + for n in sem['nodes']: + if n['id'] not in seen: + result['nodes'].append(n) + seen.add(n['id']) + result['edges'].extend(sem['edges']) + result['hyperedges'] = sem.get('hyperedges', []) + result['input_tokens'] = sem.get('input_tokens', 0) + result['output_tokens'] = sem.get('output_tokens', 0) + else: + print("[graphify] No API key found for semantic extraction. Skipping.") + out = watch_path / "graphify-out" existing_graph = out / "graph.json" if existing_graph.exists(): @@ -87,8 +108,8 @@ def _rebuild_code(watch_path: Path, *, follow_symlinks: bool = False, force: boo "input_tokens": 0, "output_tokens": 0, } - except Exception: - pass # corrupt graph.json - proceed with AST-only + except Exception as e: + pass _relativize_source_files(result, project_root) diff --git a/pyproject.toml b/pyproject.toml index 22c83480..f198696e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "tree-sitter-objc", "tree-sitter-julia", "tree-sitter-verilog", + "tree-sitter-language-pack", ] [project.urls]