69 lines
2.2 KiB
Python
69 lines
2.2 KiB
Python
from __future__ import annotations
|
|
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any
|
|
|
|
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
|
|
|
from ..models.request import ScrapeRequest, SelectorDef
|
|
from ..models.response import ScrapeResponse
|
|
|
|
|
|
def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
    """Query a page object with each selector and collect the extracted values.

    A selector whose ``selector_type`` equals ``"css"`` is resolved through
    ``page.css`` / ``page.css_first``; any other ``selector_type`` is resolved
    through ``page.xpath`` / ``page.xpath_first``.

    Args:
        page: Object exposing ``css``, ``css_first``, ``xpath`` and
            ``xpath_first``. Matched elements must expose ``attrib`` (with a
            ``get`` method) and ``text``.
        selectors: Selector definitions providing ``name``, ``selector``,
            ``selector_type``, ``multiple`` and ``attribute``.

    Returns:
        A dict keyed by selector name. With ``multiple`` set the value is a
        list containing one entry per match; otherwise it is a single value,
        or ``None`` when nothing matched. Each extracted value is the
        requested attribute when ``attribute`` is truthy, else the element's
        text. If handling a selector raises, its entry is ``None`` and the
        stringified exception is stored under ``"<name>_error"``.
    """

    def read_value(node: Any, spec: Any) -> str | None:
        # A missing match yields None rather than raising.
        if node is None:
            return None
        if spec.attribute:
            return node.attrib.get(spec.attribute)
        return node.text

    extracted: dict[str, Any] = {}
    for spec in selectors:
        try:
            use_css = spec.selector_type == "css"
            if spec.multiple:
                query = page.css if use_css else page.xpath
                matches = query(spec.selector)
                # The query may hand back None/empty; treat both as "no matches".
                extracted[spec.name] = [
                    read_value(node, spec) for node in (matches or [])
                ]
            else:
                query = page.css_first if use_css else page.xpath_first
                extracted[spec.name] = read_value(query(spec.selector), spec)
        except Exception as exc:
            # Best effort: one failing selector must not abort the others.
            extracted[spec.name] = None
            extracted[f"{spec.name}_error"] = str(exc)
    return extracted
|
|
|
|
|
|
class BaseScraper(ABC):
    """Abstract scraper: subclasses implement ``scrape``.

    Also provides ``_build_response``, which assembles a ``ScrapeResponse``
    from a request and a fetched page.
    """

    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Perform the scrape described by ``req`` and return its response."""

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        """Assemble a ``ScrapeResponse`` from a fetched page.

        Args:
            req: The originating request; ``url``, ``return_html`` and
                ``selectors`` are read from it.
            page: The fetched page object.
            fetcher_name: Reported as ``fetcher_used`` in the response.
            start: A ``time.perf_counter()`` reading taken before the fetch.

        Returns:
            The response, with ``elapsed_ms`` rounded to two decimals and
            ``status_code`` defaulting to 200 when the page has no ``status``
            attribute.
        """
        # Capture the timing first so selector extraction is not counted.
        elapsed_ms = (time.perf_counter() - start) * 1000

        # NOTE(review): relies on the page object exposing `.html`; confirm
        # against the Scrapling version in use.
        html = None
        if req.return_html:
            html = page.html

        data: dict[str, Any] = {}
        if req.selectors:
            data = apply_selectors(page, req.selectors)

        url = req.url
        status_code = getattr(page, "status", 200)
        return ScrapeResponse(
            url=url,
            status_code=status_code,
            html=html,
            data=data,
            fetcher_used=fetcher_name,
            elapsed_ms=round(elapsed_ms, 2),
        )
|