42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Any, Literal
|
|
|
|
from pydantic import BaseModel, field_validator
|
|
|
|
|
|
class SelectorDef(BaseModel):
    """A named selector together with its syntax and extraction options."""

    # Identifier for this selector.
    # NOTE(review): presumably used to key the extracted value in the
    # response -- confirm against the extraction code.
    name: str

    # The selector expression itself, interpreted per ``selector_type``.
    selector: str

    # Selector syntax; only "css" and "xpath" are accepted, "css" by default.
    selector_type: Literal["css", "xpath"] = "css"

    # Attribute to read from a match; ``None`` means take the text content.
    attribute: str | None = None

    # NOTE(review): presumably "collect every match" vs. "first match only"
    # -- confirm against the extraction code.
    multiple: bool = False
|
|
|
|
|
|
class ScrapeRequest(BaseModel):
    """Parameters describing a scrape of a single URL.

    Validation performed by this model:

    * ``url`` must begin with ``http://`` or ``https://``; the scheme is
      matched case-insensitively.
    * ``timeout`` must lie in the inclusive range 1000-120000 (milliseconds).
    """

    # Target address; checked by ``url_must_have_scheme``.
    url: str

    # Which fetcher to use; restricted to these three names.
    fetcher_type: Literal["http", "stealth", "dynamic"] = "http"

    # Selectors to evaluate; empty by default.
    # NOTE(review): relies on pydantic copying mutable defaults per instance,
    # so the literal ``[]`` / ``{}`` defaults below are not shared state --
    # confirm against the pydantic version in use.
    selectors: list[SelectorDef] = []

    return_html: bool = False

    # Milliseconds; range enforced by ``timeout_range``.
    timeout: int = 30000

    # Optional proxy; ``None`` when unset.
    proxy: str | None = None

    # Extra headers; empty by default.
    headers: dict[str, str] = {}

    # dynamic-fetcher specific
    wait_selector: str | None = None
    network_idle: bool = False
    headless: bool = True

    @field_validator("url")
    @classmethod
    def url_must_have_scheme(cls, v: str) -> str:
        """Reject URLs that do not start with an http(s) scheme.

        The scheme is compared case-insensitively because URI schemes are
        case-insensitive, so ``HTTPS://example.com`` is accepted just like
        ``https://example.com``. The value is returned unchanged.

        Raises:
            ValueError: if ``v`` does not start with ``http://`` or
                ``https://`` (in any letter case).
        """
        # Only the first 8 characters (len("https://")) can matter, so avoid
        # lower-casing the whole URL.
        if not v[:8].lower().startswith(("http://", "https://")):
            raise ValueError("URL must start with http:// or https://")
        return v

    @field_validator("timeout")
    @classmethod
    def timeout_range(cls, v: int) -> int:
        """Ensure the timeout is between 1000 and 120000 ms, inclusive.

        Raises:
            ValueError: if ``v`` falls outside that range.
        """
        if not (1000 <= v <= 120_000):
            raise ValueError("timeout must be between 1000 and 120000 ms")
        return v
|