Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/reference/inputs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,15 @@ Built-in inputs
.. autoclass:: scrapy_zyte_api.Actions
:members:

.. autoclass:: scrapy_zyte_api.CapturedResponse
:members:

.. autoclass:: scrapy_zyte_api.Geolocation
:members:

.. autoclass:: scrapy_zyte_api.NetworkCapture
:members:

.. autoclass:: scrapy_zyte_api.Screenshot
:members:

Expand All @@ -66,3 +72,5 @@ Built-in input annotations
.. autofunction:: scrapy_zyte_api.actions

.. autofunction:: scrapy_zyte_api.custom_attrs

.. autofunction:: scrapy_zyte_api.network_capture
51 changes: 51 additions & 0 deletions docs/usage/scrapy-poet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,57 @@ resulting page object:
return Product(is_valid=False)
return None

.. _network-capture:

Network capture
---------------

You can capture network responses received during browser rendering by adding a
:class:`scrapy_zyte_api.NetworkCapture` dependency and annotating it with
filters passed to the :func:`scrapy_zyte_api.network_capture` function:

.. code-block:: python

from typing import Annotated

from scrapy_zyte_api import NetworkCapture, network_capture


@attrs.define
class MyPageObject(BasePage):
response: BrowserResponse
network: Annotated[
NetworkCapture,
network_capture(
[
{
"filterType": "url",
"value": "/api/",
"matchType": "contains",
"httpResponseBody": True,
},
{"filterType": "resourceType", "value": "xhr"},
]
),
]

Each filter is a :class:`~scrapy_zyte_api.NetworkCaptureFilter` dict. A
response is captured if it matches any filter. Set ``httpResponseBody`` to
``True`` on a filter to include the decoded response body for responses matched
by that filter; otherwise, the ``body`` of the corresponding
:class:`~scrapy_zyte_api.CapturedResponse` object is ``None``.

You can access the captured responses as
:class:`~scrapy_zyte_api.CapturedResponse` objects in the
:attr:`.NetworkCapture.results` attribute:

.. code-block:: python

def parse_network_capture(self):
for captured in self.network.results:
if "/api/products" in captured.url:
data = json.loads(captured.body)
...

.. _custom-attrs:

Custom attribute extraction
Expand Down
13 changes: 11 additions & 2 deletions scrapy_zyte_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@

# Register web-poet serializers
from . import _serialization # noqa: F401
from ._annotations import ExtractFrom, actions, custom_attrs
from ._annotations import ExtractFrom, actions, custom_attrs, network_capture
from ._middlewares import (
ScrapyZyteAPIDownloaderMiddleware,
ScrapyZyteAPIRefererSpiderMiddleware,
ScrapyZyteAPISpiderMiddleware,
)
from ._page_inputs import Actions, Geolocation, Screenshot
from ._page_inputs import (
Actions,
CapturedResponse,
Geolocation,
NetworkCapture,
Screenshot,
)
from ._request_fingerprinter import ScrapyZyteAPIRequestFingerprinter
from ._session import (
SESSION_AGGRESSIVE_RETRY_POLICY as _SESSION_AGGRESSIVE_RETRY_POLICY,
Expand Down Expand Up @@ -54,9 +60,11 @@
"SESSION_DEFAULT_RETRY_POLICY",
"Actions",
"Addon",
"CapturedResponse",
"ExtractFrom",
"Geolocation",
"LocationSessionConfig",
"NetworkCapture",
"ScrapyZyteAPIDownloadHandler",
"ScrapyZyteAPIDownloaderMiddleware",
"ScrapyZyteAPIRefererSpiderMiddleware",
Expand All @@ -69,6 +77,7 @@
"custom_attrs",
"get_request_session_id",
"is_session_init_request",
"network_capture",
"session_config",
"session_config_registry",
]
17 changes: 17 additions & 0 deletions scrapy_zyte_api/_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,23 @@ def actions(value: Iterable[Action]) -> tuple[Any, ...]:
return tuple(make_hashable(action) for action in value)


class NetworkCaptureFilter(TypedDict, total=False):
    """A filter for :func:`~scrapy_zyte_api.network_capture`.

    Every key is optional (the class is declared with ``total=False``). A
    response is captured if it matches any of the filters passed to
    :func:`~scrapy_zyte_api.network_capture`.

    See :ref:`network-capture`.
    """

    # NOTE(review): the full set of accepted values for ``filterType`` and
    # ``matchType`` is defined by Zyte API and is not visible here; the
    # examples below are the ones used in the usage documentation.

    # What the filter matches on, e.g. ``"url"`` or ``"resourceType"``.
    filterType: str
    # Value to match, e.g. ``"/api/"`` for a ``"url"`` filter or ``"xhr"`` for
    # a ``"resourceType"`` filter.
    value: str
    # How ``value`` is compared, e.g. ``"contains"``.
    matchType: str
    # Set to ``True`` to include the response body of responses matched by
    # this filter.
    httpResponseBody: bool


def network_capture(filters: Iterable[NetworkCaptureFilter]) -> tuple[Any, ...]:
    """Return a hashable tuple built from :class:`~scrapy_zyte_api.NetworkCaptureFilter` dicts.

    Each filter in *filters* is converted with ``make_hashable``, and the
    converted filters are returned as a tuple that keeps the input order. The
    result is meant to be used as annotation metadata, e.g.
    ``Annotated[NetworkCapture, network_capture([...])]``.
    """
    hashable_filters = map(make_hashable, filters)
    return tuple(hashable_filters)


def custom_attrs(
input: dict[str, Any], # noqa: A002
options: dict[str, Any] | None = None,
Expand Down
44 changes: 44 additions & 0 deletions scrapy_zyte_api/_page_inputs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from base64 import b64decode
from typing import Any

import attrs

Expand Down Expand Up @@ -36,3 +37,46 @@ class Screenshot:
@classmethod
def from_base64(cls, body):
return cls(body=b64decode(body.encode()))


@attrs.define
class CapturedResponse:
    """A single network response recorded while the browser rendered a page.

    Instances are the items of :class:`NetworkCapture` results.
    """

    #: URL of the captured response.
    url: str

    #: HTTP status code of the captured response.
    status: int

    #: Headers of the captured response.
    headers: dict[str, str]

    #: Decoded response body, or ``None`` if the matching filter did not set
    #: ``httpResponseBody`` to ``True``.
    body: bytes | None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "CapturedResponse":
        """Build a :class:`CapturedResponse` from a captured-response dict.

        *data* must contain the ``url`` and ``statusCode`` keys (a missing
        key raises :exc:`KeyError`). ``headers`` defaults to an empty dict
        when absent, and ``httpResponseBody``, when present, is
        Base64-decoded into :attr:`body`.
        """
        encoded_body: str | None = data.get("httpResponseBody")
        url = data["url"]
        status = data["statusCode"]
        headers = data.get("headers", {})
        # Decode last, after the required keys have been read, so that a
        # missing key is reported before an undecodable body.
        if encoded_body is None:
            body = None
        else:
            body = b64decode(encoded_body)
        return cls(url=url, status=status, headers=headers, body=body)


@attrs.define
class NetworkCapture:
    """A page input that specifies network capture filters and contains captured responses.

    The filters must be :ref:`specified with an annotation
    <network-capture>` using :func:`~scrapy_zyte_api.network_capture`, e.g.
    ``Annotated[NetworkCapture, network_capture([...])]``. The provider
    raises :exc:`ValueError` for a dependency that is not annotated.
    """

    #: Captured responses, as :class:`CapturedResponse` objects, in the order
    #: in which they are listed in the API response. Empty if the API response
    #: contains no ``networkCapture`` key.
    results: list[CapturedResponse]
20 changes: 20 additions & 0 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
from scrapy_zyte_api._annotations import _ActionResult, _from_hashable
from scrapy_zyte_api._page_inputs import CapturedResponse, NetworkCapture
from scrapy_zyte_api.utils import _ENGINE_HAS_DOWNLOAD_ASYNC, maybe_deferred_to_future

if TYPE_CHECKING:
Expand Down Expand Up @@ -101,6 +102,7 @@ class ZyteApiProvider(PageObjectInputProvider):
Geolocation,
JobPosting,
JobPostingNavigation,
NetworkCapture,
Product,
ProductList,
ProductNavigation,
Expand Down Expand Up @@ -202,6 +204,16 @@ async def __call__(
for action in cls.__metadata__[0]: # type: ignore[attr-defined]
zyte_api_meta["actions"].append(_from_hashable(action))
continue
if cls_stripped is NetworkCapture:
if not is_typing_annotated(cls):
raise ValueError(
"NetworkCapture dependencies must be annotated, "
"e.g. Annotated[NetworkCapture, network_capture([...list of filters...])]."
)
zyte_api_meta["networkCapture"] = []
for f in cls.__metadata__[0]: # type: ignore[attr-defined]
zyte_api_meta["networkCapture"].append(_from_hashable(f))
continue
if cls_stripped in {CustomAttributes, CustomAttributesValues}:
custom_attrs_input, custom_attrs_options = cls.__metadata__[0] # type: ignore[attr-defined]
zyte_api_meta["customAttributes"] = _from_hashable(custom_attrs_input)
Expand Down Expand Up @@ -358,6 +370,14 @@ async def __call__(
result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined]
results.append(result)
continue
if cls_stripped is NetworkCapture and is_typing_annotated(cls):
captured = [
CapturedResponse.from_dict(item)
for item in api_response.raw_api_response.get("networkCapture", [])
]
result = AnnotatedInstance(NetworkCapture(captured), cls.__metadata__) # type: ignore[attr-defined]
results.append(result)
continue
if cls_stripped is CustomAttributes and is_typing_annotated(cls):
custom_attrs_result = api_response.raw_api_response["customAttributes"]
result = AnnotatedInstance(
Expand Down
18 changes: 18 additions & 0 deletions tests/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,24 @@ def render_POST(self, request):
results.append(result)
response_data["actions"] = results # type: ignore[assignment]

network_capture_filters = request_data.get("networkCapture")
if network_capture_filters:
captured = []
for f in network_capture_filters:
entry: dict = {
"url": f"https://api.example.com/data?filter={f.get('value', '')}",
"statusCode": 200,
"headers": {"content-type": "application/json"},
"filter": f,
"interceptionStatus": "success",
}
if f.get("httpResponseBody"):
entry["httpResponseBody"] = b64encode(
b'{"captured": true}'
).decode()
captured.append(entry)
response_data["networkCapture"] = captured # type: ignore[assignment]

if request_data.get("product") is True:
response_data["product"] = {
"url": response_data["url"],
Expand Down
70 changes: 70 additions & 0 deletions tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@

from scrapy_zyte_api import (
Actions,
CapturedResponse,
ExtractFrom,
Geolocation,
NetworkCapture,
Screenshot,
actions,
custom_attrs,
network_capture,
)
from scrapy_zyte_api._params import _EXTRACT_KEYS
from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler
Expand Down Expand Up @@ -1097,6 +1100,73 @@ def parse_(self, response: DummyResponse, page: ActionProductPage): # type: ign
)


@deferred_f_from_coro_f
async def test_provider_network_capture(mockserver):
    """An annotated ``NetworkCapture`` dependency is filled with captured responses.

    The page object requests two filters, only the first of which asks for
    the response body. The mock server answers with one captured response per
    requested filter, embedding the filter value in the response URL.
    """

    @attrs.define
    class NetworkCapturePage(BasePage):
        product: Product
        captured: Annotated[
            NetworkCapture,
            network_capture(
                [
                    {
                        "filterType": "url",
                        "value": "/api/",
                        "matchType": "contains",
                        "httpResponseBody": True,
                    },
                    {"filterType": "resourceType", "value": "xhr"},
                ]
            ),
        ]

    class NetworkCaptureSpider(ZyteAPISpider):
        def parse_(self, response: DummyResponse, page: NetworkCapturePage):  # type: ignore[override]
            yield {"captured": page.captured}

    settings = deepcopy(SETTINGS)
    settings["ZYTE_API_URL"] = mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}

    item, *_ = await _crawl_single_item(NetworkCaptureSpider, HtmlResource, settings)
    nc: NetworkCapture = item["captured"]
    assert isinstance(nc, NetworkCapture)
    # One captured response per filter sent in the request.
    assert len(nc.results) == 2

    # First filter set httpResponseBody, so the decoded body is available.
    first = nc.results[0]
    assert isinstance(first, CapturedResponse)
    assert first.url == "https://api.example.com/data?filter=/api/"
    assert first.status == 200
    assert first.headers == {"content-type": "application/json"}
    assert first.body == b'{"captured": true}'

    # Second filter did not set httpResponseBody, so there is no body.
    second = nc.results[1]
    assert isinstance(second, CapturedResponse)
    assert second.url == "https://api.example.com/data?filter=xhr"
    assert second.status == 200
    assert second.body is None


@deferred_f_from_coro_f
async def test_provider_network_capture_unannotated(mockserver, caplog):
    """A ``NetworkCapture`` dependency without an annotation is rejected.

    The provider raises an error for the unannotated dependency, so the crawl
    yields no item and the error message shows up in the captured log.
    """

    @attrs.define
    class NetworkCapturePage(BasePage):
        product: Product
        # Deliberately not wrapped in Annotated[..., network_capture(...)].
        captured: NetworkCapture

    class NetworkCaptureSpider(ZyteAPISpider):
        def parse_(self, response: DummyResponse, page: NetworkCapturePage):  # type: ignore[override]
            pass

    settings = deepcopy(SETTINGS)
    settings["ZYTE_API_URL"] = mockserver.urljoin("/")
    settings["SCRAPY_POET_PROVIDERS"] = {ZyteApiProvider: 0}

    item, *_ = await _crawl_single_item(NetworkCaptureSpider, HtmlResource, settings)
    assert item is None
    assert "NetworkCapture dependencies must be annotated" in caplog.text


def test_item_keywords():
assert set(_EXTRACT_KEYS) == set(_ITEM_KEYWORDS.values())

Expand Down
Loading