Skip to content

Commit 59e332b

Browse files
rustyconover and claude committed
test: add narrow_bind reproducer fixture catalog
Backs test/sql/integration/narrow_bind_mismatch.test in the vgi extension. The `mismatch` virtual table advertises columns {id, val} but its scan function narrow_scan binds to {id} only — the inconsistency that used to SIGSEGV the client at scan time. The `consistent` table (wide_scan binds {id, val}) is the positive control. Registered in the consolidated vgi-fixture-worker. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent b0d777a commit 59e332b

3 files changed

Lines changed: 264 additions & 1 deletion

File tree

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
2+
3+
"""Narrow-bind reproducer fixture.
4+
5+
Exposes a catalog whose virtual table advertises *more* columns in its
6+
listing (``catalog_schema_contents_tables`` / ``catalog_table_get``) than
7+
its scan function returns from ``on_bind``. A client that trusts the bind
8+
``output_schema`` without checking it against the planned catalog columns
9+
indexes past the end of the worker's narrower batch in
10+
``ArrowTableFunction::ArrowToDuckDB`` and SIGSEGVs. The fix makes the
11+
client fail closed at bind with a clear ``BinderException``.
12+
13+
Driven by ``test/sql/integration/narrow_bind_mismatch.test`` in the vgi
14+
extension repository (``~/Development/vgi`` in the author's local checkout).
15+
"""
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
# Copyright 2025, 2026 Query Farm LLC - https://query.farm
2+
3+
"""Narrow-bind reproducer worker.
4+
5+
Two virtual tables, each backed by a table function:
6+
7+
* ``mismatch`` — advertises columns ``{id, val}`` in its catalog listing
8+
but its scan function ``narrow_scan`` binds to ``{id}`` only. This is
9+
the inconsistency that used to segfault the client at scan time
10+
(``ArrowTableFunction::ArrowToDuckDB`` walking off the end of the
11+
worker's 1-column batch). The client must now refuse it at bind with a
12+
clear ``BinderException``.
13+
14+
* ``consistent`` — advertises ``{id, val}`` and its scan function
15+
``wide_scan`` binds to ``{id, val}``. Positive control: this must keep
16+
working unchanged.
17+
"""
18+
19+
from __future__ import annotations
20+
21+
from dataclasses import dataclass
22+
from typing import Annotated, Any
23+
24+
import pyarrow as pa
25+
from vgi_rpc import ArrowSerializableDataclass
26+
from vgi_rpc.rpc import OutputCollector
27+
28+
from vgi import Worker
29+
from vgi.arguments import Arg
30+
from vgi.catalog import Catalog, Schema
31+
from vgi.catalog.catalog_interface import (
32+
AttachOpaqueData,
33+
ReadOnlyCatalogInterface,
34+
ScanFunctionResult,
35+
SchemaInfo,
36+
SchemaObjectType,
37+
SerializedSchema,
38+
TableInfo,
39+
TransactionOpaqueData,
40+
)
41+
from vgi.function import Function
42+
from vgi.invocation import BindResponse
43+
from vgi.table_function import (
44+
BindParams,
45+
ProcessParams,
46+
TableFunctionGenerator,
47+
init_single_worker,
48+
)
49+
50+
# Catalog name shared by the Catalog object, the catalog interface and the worker.
CATALOG_NAME = "narrow_bind"

# Columns the catalog listing advertises for BOTH tables: {id, val}.
_TABLE_SCHEMA: pa.Schema = pa.schema([("id", pa.int64()), ("val", pa.int64())])

# Columns the narrow scan function reports at bind time: {id} only.
_NARROW_BIND_SCHEMA: pa.Schema = pa.schema([("id", pa.int64())])
58+
59+
60+
# Scan state used by both table functions in this module.
@dataclass(kw_only=True)
class _State(ArrowSerializableDataclass):
    # Set to True by ``process`` just before it emits its only batch; the
    # following ``process`` call sees it and finishes the output.
    done: bool = False
63+
64+
65+
# Argument record used to parameterize both table functions in this module.
@dataclass(frozen=True)
class _Args:
    # Declared via ``Arg(0, ...)`` with ``ge=0`` and doc text "rows"
    # (presumably positional index 0 -- confirm against ``vgi.arguments.Arg``).
    # Neither ``process`` implementation in this module reads it; both emit a
    # fixed three-row batch.
    count: Annotated[int, Arg(0, doc="rows", ge=0)]
68+
69+
70+
@init_single_worker
class NarrowScan(TableFunctionGenerator[_Args, _State]):
    """Binds to a NARROWER schema than the catalog advertises (the bug)."""

    class Meta:
        name = "narrow_scan"
        description = "bind reports a narrower schema than the table advertises"

    @classmethod
    def on_bind(cls, params: BindParams[_Args]) -> BindResponse:
        """Report ``_NARROW_BIND_SCHEMA`` as the output schema, whatever *params* holds."""
        # This mismatch with the advertised table columns is the whole point
        # of the fixture; do not "fix" it.
        return BindResponse(output_schema=_NARROW_BIND_SCHEMA)

    @classmethod
    def initial_state(cls, params: ProcessParams[_Args]) -> _State:
        """Return a fresh state whose ``done`` flag is still False."""
        return _State()

    @classmethod
    def process(cls, params: ProcessParams[_Args], state: _State, out: OutputCollector) -> None:
        """Emit one fixed three-row ``id`` batch, then finish on the next call."""
        if not state.done:
            # Mark completion before emitting so the next call takes the finish path.
            state.done = True
            # ``params.output_schema`` is presumably the schema returned by
            # ``on_bind`` -- confirm against the vgi table-function contract.
            batch = pa.RecordBatch.from_pydict({"id": [0, 1, 2]}, schema=params.output_schema)
            out.emit(batch)
            return
        out.finish()
93+
94+
95+
@init_single_worker
class WideScan(TableFunctionGenerator[_Args, _State]):
    """Binds to the full advertised schema (positive control — must work)."""

    class Meta:
        name = "wide_scan"
        description = "bind matches the table's advertised schema"

    @classmethod
    def on_bind(cls, params: BindParams[_Args]) -> BindResponse:
        """Report ``_TABLE_SCHEMA`` as the output schema, whatever *params* holds."""
        return BindResponse(output_schema=_TABLE_SCHEMA)

    @classmethod
    def initial_state(cls, params: ProcessParams[_Args]) -> _State:
        """Return a fresh state whose ``done`` flag is still False."""
        return _State()

    @classmethod
    def process(cls, params: ProcessParams[_Args], state: _State, out: OutputCollector) -> None:
        """Emit one fixed three-row ``{id, val}`` batch, then finish on the next call."""
        if not state.done:
            # Mark completion before emitting so the next call takes the finish path.
            state.done = True
            rows = {"id": [0, 1, 2], "val": [10, 20, 30]}
            # ``params.output_schema`` is presumably the schema returned by
            # ``on_bind`` -- confirm against the vgi table-function contract.
            batch = pa.RecordBatch.from_pydict(rows, schema=params.output_schema)
            out.emit(batch)
            return
        out.finish()
120+
121+
122+
# Every table function this fixture serves.
_FUNCTIONS: list[type[Function]] = [NarrowScan, WideScan]

# The static catalog declares the functions but no tables (``tables=[]``).
_CATALOG = Catalog(
    name=CATALOG_NAME,
    default_schema="main",
    schemas=[
        Schema(
            name="main",
            comment="narrow-bind reproducer catalog",
            # Pass a copy so the catalog does not share the module-level list.
            functions=[*_FUNCTIONS],
            tables=[],
        ),
    ],
)
136+
137+
138+
def _serialize_schema(s: pa.Schema) -> bytes:
    """Return *s* encoded as an Arrow IPC stream to which no record batch is written."""
    buffer_stream = pa.BufferOutputStream()
    # Open a stream writer for the schema and close it straight away, without
    # writing any batch.
    writer = pa.ipc.new_stream(buffer_stream, s)
    writer.close()
    return buffer_stream.getvalue().to_pybytes()
143+
144+
145+
# Maps each virtual table name to the name of the table function that scans it.
# Both tables are advertised with the two-column _TABLE_SCHEMA; only the scan
# function behind them differs.
_TABLE_FUNCTIONS: dict[str, str] = {
    "mismatch": "narrow_scan",
    "consistent": "wide_scan",
}
150+
151+
152+
# Catalog interface for the narrow-bind reproducer. ``_CATALOG`` itself
# declares no tables, so the two virtual tables in ``_TABLE_FUNCTIONS`` are
# supplied by the listing and lookup overrides below.
class NarrowBindCatalog(ReadOnlyCatalogInterface):
    catalog = _CATALOG
    catalog_name = CATALOG_NAME

    def _info(self, table_name: str) -> TableInfo:
        """Build the ``TableInfo`` advertised for *table_name*.

        The columns are always the two-column ``_TABLE_SCHEMA``, whichever
        scan function backs the table. Raises ``KeyError`` when *table_name*
        is not a key of ``_TABLE_FUNCTIONS``.
        """
        scan_function = _TABLE_FUNCTIONS[table_name]
        advertised_columns = SerializedSchema(_serialize_schema(_TABLE_SCHEMA))
        return TableInfo(
            comment=f"narrow-bind reproducer table -> {scan_function}",
            tags={},
            name=table_name,
            schema_name="main",
            columns=advertised_columns,
            not_null_constraints=[],
            unique_constraints=[],
            check_constraints=[],
        )

    def schemas(
        self, *, attach_opaque_data: AttachOpaqueData, transaction_opaque_data: TransactionOpaqueData | None
    ) -> list[SchemaInfo]:
        """Return the base-class schema listing with ``main``'s table count overridden.

        Every ``main`` entry is replaced by a copy whose
        ``estimated_object_count["table"]`` equals the number of fixture
        tables; all other entries are returned untouched.
        """
        listed = super().schemas(attach_opaque_data=attach_opaque_data, transaction_opaque_data=transaction_opaque_data)
        for position, entry in enumerate(listed):
            if entry.name != "main":
                continue
            # Keep whatever counts the base class reported and override "table".
            counts = dict(entry.estimated_object_count or {})
            counts["table"] = len(_TABLE_FUNCTIONS)
            listed[position] = SchemaInfo(
                attach_opaque_data=entry.attach_opaque_data,
                name=entry.name,
                comment=entry.comment,
                tags=entry.tags,
                estimated_object_count=counts,
            )
        return listed

    def schema_contents(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        name: str,
        type: Any,
    ) -> Any:
        """List the fixture tables for a TABLE listing of ``main``.

        The schema name is compared case-insensitively. Every other request
        is delegated to the base class.
        """
        if not (name.lower() == "main" and type == SchemaObjectType.TABLE):
            return super().schema_contents(
                attach_opaque_data=attach_opaque_data,
                transaction_opaque_data=transaction_opaque_data,
                name=name,
                type=type,
            )
        return [self._info(fixture_table) for fixture_table in _TABLE_FUNCTIONS]

    def table_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None = None,
        at_value: str | None = None,
    ) -> TableInfo | None:
        """Return the ``TableInfo`` for a fixture table, or ``None`` if there is none.

        The schema must be ``main`` (case-insensitive) and *name* must be an
        exact key of ``_TABLE_FUNCTIONS``. ``at_unit`` and ``at_value`` are
        accepted but not used.
        """
        if schema_name.lower() == "main" and name in _TABLE_FUNCTIONS:
            return self._info(name)
        return None

    def table_scan_function_get(
        self,
        *,
        attach_opaque_data: AttachOpaqueData,
        transaction_opaque_data: TransactionOpaqueData | None,
        schema_name: str,
        name: str,
        at_unit: str | None,
        at_value: str | None,
    ) -> ScanFunctionResult:
        """Name the table function that scans table *name*.

        The result always carries a single positional int64 argument ``3``
        and no named arguments. Raises ``ValueError`` when *name* is not a
        fixture table.
        """
        # NOTE(review): unlike ``table_get``, ``schema_name`` is not checked
        # here, so a known table name resolves under any schema name.
        if name not in _TABLE_FUNCTIONS:
            raise ValueError(f"unknown narrow-bind reproducer table: {name}")
        return ScanFunctionResult(
            function_name=_TABLE_FUNCTIONS[name],
            positional_arguments=[pa.scalar(3, type=pa.int64())],
            named_arguments={},
            required_extensions=[],
        )
235+
236+
237+
# Worker that bundles the narrow-bind catalog, its catalog interface and the
# two scan functions.
class NarrowBindWorker(Worker):
    catalog_interface = NarrowBindCatalog
    catalog_name = CATALOG_NAME
    catalog = _CATALOG
    # A copy, so the worker does not share the module-level list.
    functions = [*_FUNCTIONS]

vgi/_test_fixtures/worker.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1621,11 +1621,18 @@ def main() -> None:
16211621
extra is also installed.
16221622
"""
16231623
from vgi._test_fixtures.accumulate.worker import AccumulateWorker
1624+
from vgi._test_fixtures.narrow_bind.worker import NarrowBindWorker
16241625
from vgi._test_fixtures.projection_repro.worker import ProjReproWorker
16251626
from vgi._test_fixtures.schema_reconcile.worker import SchemaReconcileWorker
16261627
from vgi.meta_worker import MetaWorker
16271628

1628-
workers: list[type] = [ExampleWorker, ProjReproWorker, SchemaReconcileWorker, AccumulateWorker]
1629+
workers: list[type] = [
1630+
ExampleWorker,
1631+
ProjReproWorker,
1632+
SchemaReconcileWorker,
1633+
AccumulateWorker,
1634+
NarrowBindWorker,
1635+
]
16291636
try:
16301637
from vgi._test_fixtures.writable.worker import WritableWorker
16311638
except ImportError:

0 commit comments

Comments
 (0)