From dfc292e686cd5ebb3e5c3856cf3f4d28cae6158c Mon Sep 17 00:00:00 2001 From: Adrian Chaves Date: Thu, 11 Jun 2026 09:41:02 +0200 Subject: [PATCH 1/3] Support network capture in page objects --- docs/reference/inputs.rst | 8 ++++++ docs/usage/scrapy-poet.rst | 51 +++++++++++++++++++++++++++++++++ scrapy_zyte_api/__init__.py | 13 +++++++-- scrapy_zyte_api/_annotations.py | 17 +++++++++++ scrapy_zyte_api/_page_inputs.py | 44 ++++++++++++++++++++++++++++ scrapy_zyte_api/providers.py | 20 +++++++++++++ tests/mockserver.py | 18 ++++++++++++ tests/test_providers.py | 50 ++++++++++++++++++++++++++++++++ 8 files changed, 219 insertions(+), 2 deletions(-) diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst index be31cf16..93f8c0d3 100644 --- a/docs/reference/inputs.rst +++ b/docs/reference/inputs.rst @@ -50,9 +50,15 @@ Built-in inputs .. autoclass:: scrapy_zyte_api.Actions :members: +.. autoclass:: scrapy_zyte_api.CapturedResponse + :members: + .. autoclass:: scrapy_zyte_api.Geolocation :members: +.. autoclass:: scrapy_zyte_api.NetworkCapture + :members: + .. autoclass:: scrapy_zyte_api.Screenshot :members: @@ -66,3 +72,5 @@ Built-in input annotations .. autofunction:: scrapy_zyte_api.actions .. autofunction:: scrapy_zyte_api.custom_attrs + +.. autofunction:: scrapy_zyte_api.network_capture diff --git a/docs/usage/scrapy-poet.rst b/docs/usage/scrapy-poet.rst index 25527831..7f817b19 100644 --- a/docs/usage/scrapy-poet.rst +++ b/docs/usage/scrapy-poet.rst @@ -129,6 +129,57 @@ resulting page object: return Product(is_valid=False) return None +.. _network-capture: + +Network capture +--------------- + +You can capture network responses made during browser rendering by adding a +:class:`scrapy_zyte_api.NetworkCapture` dependency and annotating it with +filters passed to the :func:`scrapy_zyte_api.network_capture` function: + +.. 
code-block:: python + + from typing import Annotated + + from scrapy_zyte_api import NetworkCapture, network_capture + + + @attrs.define + class MyPageObject(BasePage): + response: BrowserResponse + network: Annotated[ + NetworkCapture, + network_capture( + [ + { + "filterType": "url", + "value": "/api/", + "matchType": "contains", + "httpResponseBody": True, + }, + {"filterType": "resourceType", "value": "xhr"}, + ] + ), + ] + +Each filter is a :class:`~scrapy_zyte_api.NetworkCaptureFilter` dict. A +response is captured if it matches any filter. Set ``httpResponseBody`` to +``True`` on a filter to include the decoded response body for responses matched +by that filter. + +You can access the captured responses as +:class:`~scrapy_zyte_api.CapturedResponse` objects in the +:attr:`.NetworkCapture.results` attribute: + +.. code-block:: python + + def parse_network_capture(self): + for captured in self.network.results: + if "/api/products" in captured.url: + data = json.loads(captured.body) + ... + .. _custom-attrs: Custom attribute extraction diff --git a/scrapy_zyte_api/__init__.py b/scrapy_zyte_api/__init__.py index 2ba0b71c..6b19d82e 100644 --- a/scrapy_zyte_api/__init__.py +++ b/scrapy_zyte_api/__init__.py @@ -7,13 +7,19 @@ # Register web-poet serializers from . 
import _serialization # noqa: F401 -from ._annotations import ExtractFrom, actions, custom_attrs +from ._annotations import ExtractFrom, actions, custom_attrs, network_capture from ._middlewares import ( ScrapyZyteAPIDownloaderMiddleware, ScrapyZyteAPIRefererSpiderMiddleware, ScrapyZyteAPISpiderMiddleware, ) -from ._page_inputs import Actions, Geolocation, Screenshot +from ._page_inputs import ( + Actions, + CapturedResponse, + Geolocation, + NetworkCapture, + Screenshot, +) from ._request_fingerprinter import ScrapyZyteAPIRequestFingerprinter from ._session import ( SESSION_AGGRESSIVE_RETRY_POLICY as _SESSION_AGGRESSIVE_RETRY_POLICY, @@ -54,9 +60,11 @@ "SESSION_DEFAULT_RETRY_POLICY", "Actions", "Addon", + "CapturedResponse", "ExtractFrom", "Geolocation", "LocationSessionConfig", + "NetworkCapture", "ScrapyZyteAPIDownloadHandler", "ScrapyZyteAPIDownloaderMiddleware", "ScrapyZyteAPIRefererSpiderMiddleware", @@ -69,6 +77,7 @@ "custom_attrs", "get_request_session_id", "is_session_init_request", + "network_capture", "session_config", "session_config_registry", ] diff --git a/scrapy_zyte_api/_annotations.py b/scrapy_zyte_api/_annotations.py index bd7bfede..de63d906 100644 --- a/scrapy_zyte_api/_annotations.py +++ b/scrapy_zyte_api/_annotations.py @@ -85,6 +85,23 @@ def actions(value: Iterable[Action]) -> tuple[Any, ...]: return tuple(make_hashable(action) for action in value) +class NetworkCaptureFilter(TypedDict, total=False): + """A filter for :func:`~scrapy_zyte_api.network_capture`. + + See :ref:`network-capture`. 
+ """ + + filterType: str + value: str + matchType: str + httpResponseBody: bool + + +def network_capture(filters: Iterable[NetworkCaptureFilter]) -> tuple[Any, ...]: + """Convert an iterable of :class:`~scrapy_zyte_api.NetworkCaptureFilter` dicts into a hashable value.""" + return tuple(make_hashable(f) for f in filters) + + def custom_attrs( input: dict[str, Any], # noqa: A002 options: dict[str, Any] | None = None, diff --git a/scrapy_zyte_api/_page_inputs.py b/scrapy_zyte_api/_page_inputs.py index 64e7efce..816e3002 100644 --- a/scrapy_zyte_api/_page_inputs.py +++ b/scrapy_zyte_api/_page_inputs.py @@ -1,4 +1,5 @@ from base64 import b64decode +from typing import Any import attrs @@ -36,3 +37,46 @@ class Screenshot: @classmethod def from_base64(cls, body): return cls(body=b64decode(body.encode())) + + +@attrs.define +class CapturedResponse: + """A network response captured during browser page rendering. + + Part of :class:`NetworkCapture`. + """ + + #: Response URL. + url: str + + #: HTTP status code. + status: int + + #: Response headers. + headers: dict[str, str] + + #: Response body. ``None`` if ``httpResponseBody`` was not set to ``True`` + #: on the matching filter. + body: bytes | None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "CapturedResponse": + body_b64: str | None = data.get("httpResponseBody") + return cls( + url=data["url"], + status=data["statusCode"], + headers=data.get("headers", {}), + body=b64decode(body_b64) if body_b64 is not None else None, + ) + + +@attrs.define +class NetworkCapture: + """A page input that specifies network capture filters and contains captured responses. + + The filters must be :ref:`specified with an annotation + <network-capture>` using :func:`~scrapy_zyte_api.network_capture`. + """ + + #: Captured responses. 
+ results: list[CapturedResponse] diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index b37ea4c3..9e3eb8d8 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -45,6 +45,7 @@ from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot from scrapy_zyte_api._annotations import _ActionResult, _from_hashable +from scrapy_zyte_api._page_inputs import CapturedResponse, NetworkCapture from scrapy_zyte_api.utils import _ENGINE_HAS_DOWNLOAD_ASYNC, maybe_deferred_to_future if TYPE_CHECKING: @@ -101,6 +102,7 @@ class ZyteApiProvider(PageObjectInputProvider): Geolocation, JobPosting, JobPostingNavigation, + NetworkCapture, Product, ProductList, ProductNavigation, @@ -202,6 +204,16 @@ async def __call__( for action in cls.__metadata__[0]: # type: ignore[attr-defined] zyte_api_meta["actions"].append(_from_hashable(action)) continue + if cls_stripped is NetworkCapture: + if not is_typing_annotated(cls): + raise ValueError( + "NetworkCapture dependencies must be annotated, " + "e.g. Annotated[NetworkCapture, network_capture([...list of filters...])]." 
+ ) + zyte_api_meta["networkCapture"] = [] + for f in cls.__metadata__[0]: # type: ignore[attr-defined] + zyte_api_meta["networkCapture"].append(_from_hashable(f)) + continue if cls_stripped in {CustomAttributes, CustomAttributesValues}: custom_attrs_input, custom_attrs_options = cls.__metadata__[0] # type: ignore[attr-defined] zyte_api_meta["customAttributes"] = _from_hashable(custom_attrs_input) @@ -358,6 +370,14 @@ async def __call__( result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined] results.append(result) continue + if cls_stripped is NetworkCapture and is_typing_annotated(cls): + captured = [ + CapturedResponse.from_dict(item) + for item in api_response.raw_api_response.get("networkCapture", []) + ] + result = AnnotatedInstance(NetworkCapture(captured), cls.__metadata__) # type: ignore[attr-defined] + results.append(result) + continue if cls_stripped is CustomAttributes and is_typing_annotated(cls): custom_attrs_result = api_response.raw_api_response["customAttributes"] result = AnnotatedInstance( diff --git a/tests/mockserver.py b/tests/mockserver.py index 42cd142b..acca4a2e 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -232,6 +232,24 @@ def render_POST(self, request): results.append(result) response_data["actions"] = results # type: ignore[assignment] + network_capture_filters = request_data.get("networkCapture") + if network_capture_filters: + captured = [] + for f in network_capture_filters: + entry: dict = { + "url": f"https://api.example.com/data?filter={f.get('value', '')}", + "statusCode": 200, + "headers": {"content-type": "application/json"}, + "filter": f, + "interceptionStatus": "success", + } + if f.get("httpResponseBody"): + entry["httpResponseBody"] = b64encode( + b'{"captured": true}' + ).decode() + captured.append(entry) + response_data["networkCapture"] = captured + if request_data.get("product") is True: response_data["product"] = { "url": response_data["url"], diff --git 
a/tests/test_providers.py b/tests/test_providers.py index 59d7bf5a..ada422f6 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -40,11 +40,14 @@ from scrapy_zyte_api import ( Actions, + CapturedResponse, ExtractFrom, Geolocation, + NetworkCapture, Screenshot, actions, custom_attrs, + network_capture, ) from scrapy_zyte_api._params import _EXTRACT_KEYS from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler @@ -1097,6 +1100,53 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign ) +@deferred_f_from_coro_f +async def test_provider_network_capture(mockserver): + @attrs.define + class NetworkCapturePage(BasePage): + product: Product + captured: Annotated[ + NetworkCapture, + network_capture( + [ + { + "filterType": "url", + "value": "/api/", + "matchType": "contains", + "httpResponseBody": True, + }, + {"filterType": "resourceType", "value": "xhr"}, + ] + ), + ] + + class NetworkCaptureSpider(ZyteAPISpider): + def parse_(self, response: DummyResponse, page: NetworkCapturePage): # type: ignore[override] + yield {"captured": page.captured} + + settings = deepcopy(SETTINGS) + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + + item, *_ = await _crawl_single_item(NetworkCaptureSpider, HtmlResource, settings) + nc: NetworkCapture = item["captured"] + assert isinstance(nc, NetworkCapture) + assert len(nc.results) == 2 + + first = nc.results[0] + assert isinstance(first, CapturedResponse) + assert first.url == "https://api.example.com/data?filter=/api/" + assert first.status == 200 + assert first.headers == {"content-type": "application/json"} + assert first.body == b'{"captured": true}' + + second = nc.results[1] + assert isinstance(second, CapturedResponse) + assert second.url == "https://api.example.com/data?filter=xhr" + assert second.status == 200 + assert second.body is None + + def test_item_keywords(): assert set(_EXTRACT_KEYS) == 
set(_ITEM_KEYWORDS.values()) From 88bd80491baa7624334c51cb2aaf1d58b682982e Mon Sep 17 00:00:00 2001 From: Adrian Chaves Date: Thu, 11 Jun 2026 09:49:53 +0200 Subject: [PATCH 2/3] Silence typing issue --- tests/mockserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index acca4a2e..24e06f19 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -248,7 +248,7 @@ def render_POST(self, request): b'{"captured": true}' ).decode() captured.append(entry) - response_data["networkCapture"] = captured + response_data["networkCapture"] = captured # type: ignore[assignment] if request_data.get("product") is True: response_data["product"] = { From ad5c6a67c20df4ac3582b14638bed0107b4efd98 Mon Sep 17 00:00:00 2001 From: Adrian Chaves Date: Thu, 11 Jun 2026 11:00:28 +0200 Subject: [PATCH 3/3] Extend test coverage --- tests/test_providers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_providers.py b/tests/test_providers.py index ada422f6..e8ec9090 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1147,6 +1147,26 @@ def parse_(self, response: DummyResponse, page: NetworkCapturePage): # type: ig assert second.body is None +@deferred_f_from_coro_f +async def test_provider_network_capture_unannotated(mockserver, caplog): + @attrs.define + class NetworkCapturePage(BasePage): + product: Product + captured: NetworkCapture + + class NetworkCaptureSpider(ZyteAPISpider): + def parse_(self, response: DummyResponse, page: NetworkCapturePage): # type: ignore[override] + pass + + settings = deepcopy(SETTINGS) + settings["ZYTE_API_URL"] = mockserver.urljoin("/") + settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0} + + item, *_ = await _crawl_single_item(NetworkCaptureSpider, HtmlResource, settings) + assert item is None + assert "NetworkCapture dependencies must be annotated" in caplog.text + + def test_item_keywords(): assert set(_EXTRACT_KEYS) == 
set(_ITEM_KEYWORDS.values())