fix!: merge payload and data fields of Request #542

Open · wants to merge 1 commit into master
docs/examples/code/fill_and_submit_web_form_crawler.py (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
         request = Request.from_url(
             url='https://httpbin.org/post',
             method='POST',
-            data={
+            payload={
                 'custname': 'John Doe',
                 'custtel': '1234567890',
                 'custemail': '[email protected]',
docs/examples/code/fill_and_submit_web_form_request.py (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@
 request = Request.from_url(
     url='https://httpbin.org/post',
     method='POST',
-    data={
+    payload={
         'custname': 'John Doe',
         'custtel': '1234567890',
         'custemail': '[email protected]',
docs/examples/fill_and_submit_web_form.mdx (2 changes: 1 addition & 1 deletion)
@@ -46,7 +46,7 @@ Now, let's create a POST request with the form fields and their values using the
 {RequestExample}
 </CodeBlock>

-Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `data` parameter is generally a better approach.
+Alternatively, you can send form data as URL parameters using the `query_params` argument. It depends on the form and how it is implemented. However, sending the data as a POST request body using the `payload` parameter is generally a better approach.

 ## Implementing the crawler
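
To make the renamed argument concrete, here is a minimal sketch of both approaches mentioned above, reusing the example's form fields; the `query_params` variant and the httpbin.org/get endpoint are illustrative assumptions, not part of the original example:

```python
from crawlee import Request

# Preferred: send the form fields in the POST body via the new `payload` argument.
post_request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    payload={'custname': 'John Doe', 'custtel': '1234567890'},
)

# Alternative: send the same fields as URL query parameters. Whether the target
# form accepts this depends on how it is implemented.
get_request = Request.from_url(
    url='https://httpbin.org/get',  # illustrative endpoint, not from the original example
    query_params={'custname': 'John Doe', 'custtel': '1234567890'},
)
```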
src/crawlee/_request.py (19 changes: 14 additions & 5 deletions)
@@ -119,19 +119,18 @@ class BaseRequestData(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""

-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders())] = HttpHeaders()
+    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
     """HTTP request headers."""

     query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
     """URL query parameters."""

-    payload: HttpPayload | None = None
-
-    data: Annotated[dict[str, Any], Field(default_factory=dict)] = {}
+    payload: Annotated[HttpPayload, Field(default_factory=dict)] = {}
     """HTTP request payload."""

     user_data: Annotated[
         dict[str, JsonValue],  # Internally, the model contains `UserData`, this is just for convenience
-        Field(alias='userData', default_factory=lambda: UserData()),
+        Field(alias='userData', default_factory=UserData),
         PlainValidator(user_data_adapter.validate_python),
         PlainSerializer(
             lambda instance: user_data_adapter.dump_python(
@@ -161,6 +160,8 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
+        headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
         payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
@@ -185,6 +186,8 @@
             unique_key=unique_key,
             id=id,
             method=method,
+            headers=headers,
+            query_params=query_params,
             payload=payload,
             **kwargs,
         )
@@ -235,6 +238,8 @@ def from_url(
         url: str,
         *,
         method: HttpMethod = 'GET',
+        headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
         payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
@@ -253,6 +258,8 @@
         Args:
             url: The URL of the request.
             method: The HTTP method of the request.
+            headers: The HTTP headers of the request.
+            query_params: The query parameters of the URL.
             payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
             label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
                 used for request routing (different requests go to different handlers).
@@ -281,6 +288,8 @@ def from_url(
             unique_key=unique_key,
             id=id,
             method=method,
+            headers=headers,
+            query_params=query_params,
             payload=payload,
             **kwargs,
         )
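
A short usage sketch of the widened `from_url` signature above; it assumes `HttpHeaders` can be constructed from a plain dict, which is not shown in this diff:

```python
from crawlee import Request

# All three transport-level fields are now forwarded explicitly by `from_url`.
request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    headers={'Accept': 'application/json'},  # assumption: coerced into HttpHeaders
    query_params={'source': 'docs'},
    payload={'custname': 'John Doe'},
)
```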
src/crawlee/_types.py (2 changes: 1 addition & 1 deletion)
@@ -30,7 +30,7 @@

 HttpQueryParams: TypeAlias = dict[str, str]

-HttpPayload: TypeAlias = Union[str, bytes]
+HttpPayload: TypeAlias = dict[str, Any]
Collaborator: The dict is kinda surprising here - I'd expect `bytes` or something. How did this happen? 😄

Collaborator (author): The downstream libraries (HTTPX and CurlImpersonate) do the conversions, and both of them accept the `RequestData = Mapping[str, Any]` type.

Collaborator: Right. I did some digging, and this is what I found: `payload` is part of the `Request` schema on the API, and the API client requires it to be a string. It gets serialized and passed to the API endpoint on the platform. Of course, it also needs to be compatible with whatever the HTTP client accepts - that's where the actual processing gets done.

As a side note, I'm not sure what `data` was supposed to be, but it seems it's not needed...



 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
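
To make the thread above concrete: HTTPX form-encodes a dict passed via `data=`, which is why a dict-typed `HttpPayload` works downstream without extra conversion. A minimal standalone sketch (httpbin.org simply echoes the form back):

```python
import asyncio

import httpx


async def main() -> None:
    payload = {'custname': 'John Doe'}
    async with httpx.AsyncClient() as client:
        # httpx form-encodes a dict passed via `data=` into the POST body.
        response = await client.post('https://httpbin.org/post', data=payload)
        print(response.json()['form'])  # {'custname': 'John Doe'}


asyncio.run(main())
```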
src/crawlee/_utils/requests.py (8 changes: 1 addition & 7 deletions)
@@ -119,13 +119,7 @@ def compute_unique_key(

     # Compute and return the extended unique key if required.
     if use_extended_unique_key:
-        if payload is None:
-            payload_in_bytes = b''
-        elif isinstance(payload, str):
-            payload_in_bytes = payload.encode('utf-8')
-        else:
-            payload_in_bytes = payload
-
+        payload_in_bytes = b'' if payload is None else str(payload).encode('utf-8')
         payload_hash = compute_short_hash(payload_in_bytes)
         return f'{normalized_method}({payload_hash}):{normalized_url}'
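
For illustration, a sketch of what the simplified branch computes for a dict payload. `compute_short_hash` is a stand-in here; its exact digest and truncation length are assumptions. Note that `str(payload)` is sensitive to dict insertion order, so the same fields in a different order would yield a different extended unique key:

```python
from hashlib import sha256


def compute_short_hash(data: bytes) -> str:
    # Stand-in for crawlee's helper; the 8-character truncation is an assumption.
    return sha256(data).hexdigest()[:8]


payload = {'custname': 'John Doe'}
payload_in_bytes = b'' if payload is None else str(payload).encode('utf-8')
print(f'POST({compute_short_hash(payload_in_bytes)}):https://httpbin.org/post')
```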
src/crawlee/http_clients/_base.py (8 changes: 4 additions & 4 deletions)
@@ -2,15 +2,15 @@

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Protocol
+from typing import TYPE_CHECKING, Protocol

 from crawlee._utils.http import is_status_code_error
 from crawlee.errors import HttpStatusCodeError

 if TYPE_CHECKING:
     from collections.abc import Iterable

-    from crawlee._types import HttpHeaders, HttpMethod, HttpQueryParams
+    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -115,7 +115,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -128,7 +128,7 @@
             method: The HTTP method to use.
             headers: The headers to include in the request.
             query_params: The query parameters to include in the request.
-            data: The data to be sent as the request body.
+            payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
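
A hedged usage sketch of the renamed keyword on a concrete implementation; `HttpxHttpClient` is crawlee's HTTPX-backed client, and the keyword names follow the signature above:

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    client = HttpxHttpClient()
    # `payload` replaces the old `data` keyword across all client implementations.
    response = await client.send_request(
        'https://httpbin.org/post',
        method='POST',
        payload={'custname': 'John Doe'},
    )
    print(response.status_code)


asyncio.run(main())
```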
src/crawlee/http_clients/_httpx.py (8 changes: 4 additions & 4 deletions)
@@ -16,7 +16,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable

-    from crawlee._types import HttpMethod, HttpQueryParams
+    from crawlee._types import HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.statistics import Statistics
@@ -132,7 +132,7 @@ async def crawl(
             method=request.method,
             headers=headers,
             params=request.query_params,
-            data=request.data,
+            data=request.payload,
             cookies=session.cookies if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
@@ -167,7 +167,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -179,7 +179,7 @@
             method=method,
             headers=dict(headers) if headers else None,
             params=query_params,
-            data=data,
+            data=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
src/crawlee/http_clients/curl_impersonate.py (6 changes: 3 additions & 3 deletions)
@@ -16,7 +16,7 @@
 from curl_cffi.const import CurlHttpVersion
 from typing_extensions import override

-from crawlee._types import HttpHeaders
+from crawlee._types import HttpHeaders, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee.errors import ProxyError
 from crawlee.http_clients import BaseHttpClient, HttpCrawlingResult, HttpResponse
@@ -153,7 +153,7 @@ async def send_request(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: dict[str, Any] | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -166,7 +166,7 @@
             method=method.upper(),  # type: ignore  # curl-cffi requires uppercase method
             headers=dict(headers) if headers else None,
             params=query_params,
-            data=data,
+            data=payload,
             cookies=session.cookies if session else None,
             allow_redirects=True,
         )
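
The curl side of the same story: curl_cffi also accepts a mapping for `data=` and form-encodes it, so one dict-typed `HttpPayload` serves both backends. A minimal sketch independent of crawlee:

```python
import asyncio

from curl_cffi.requests import AsyncSession


async def main() -> None:
    async with AsyncSession() as session:
        # Like httpx, curl_cffi form-encodes a dict passed via `data=`.
        response = await session.request(
            'POST',
            'https://httpbin.org/post',
            data={'custname': 'John Doe'},
        )
        print(response.status_code)


asyncio.run(main())
```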