feat: initial commit

This commit is contained in:
2026-04-18 08:59:04 +02:00
commit 862c0d1703
32 changed files with 8492 additions and 0 deletions

68
service/scrapers/base.py Normal file
View File

@@ -0,0 +1,68 @@
from __future__ import annotations
import time
from abc import ABC, abstractmethod
from typing import Any
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
from ..models.request import ScrapeRequest, SelectorDef
from ..models.response import ScrapeResponse
def _extract_value(el: Any, attribute: str | None) -> str | None:
    """Return the requested attribute or the text of a single selector match.

    Args:
        el: One match produced by a selector query, or ``None`` when the
            query matched nothing.
        attribute: Name of the attribute to read; falsy means "read the text".

    Returns:
        The attribute value (``None`` if the attribute is absent), the match's
        text, or ``None`` when there was no match.
    """
    if el is None:
        return None
    if attribute:
        return el.attrib.get(attribute)
    if isinstance(el, str):
        # Value selectors (e.g. CSS ``::text`` or XPath ``text()``) yield
        # string matches rather than element objects; a string has no
        # ``.text`` attribute, so hand it back unchanged.
        return el
    return el.text


def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
    """Extract data from a Scrapling page object using CSS/XPath selectors.

    Each selector is evaluated independently so that one bad selector does not
    discard the values extracted by the others.

    Args:
        page: Object exposing ``css``, ``css_first``, ``xpath`` and
            ``xpath_first`` query methods.
        selectors: Selector definitions; each provides ``name``, ``selector``,
            ``selector_type``, ``multiple`` and ``attribute``.

    Returns:
        A dict keyed by selector name. A ``multiple`` selector maps to a list
        of values (empty when nothing matched); otherwise it maps to a single
        value or ``None``. When a selector raises, its entry is ``None`` and
        the exception message is stored under ``"<name>_error"``.
    """
    result: dict[str, Any] = {}
    for sel in selectors:
        try:
            # Any selector_type other than "css" is treated as XPath.
            use_css = sel.selector_type == "css"
            if sel.multiple:
                query = page.css if use_css else page.xpath
                elements = query(sel.selector) or []
                result[sel.name] = [
                    _extract_value(el, sel.attribute) for el in elements
                ]
            else:
                query_first = page.css_first if use_css else page.xpath_first
                result[sel.name] = _extract_value(
                    query_first(sel.selector), sel.attribute
                )
        except Exception as exc:
            # Deliberate best-effort boundary: record the failure for this
            # selector and keep processing the remaining ones.
            result[sel.name] = None
            result[f"{sel.name}_error"] = str(exc)
    return result
class BaseScraper(ABC):
    """Abstract base for scrapers that turn a ScrapeRequest into a ScrapeResponse."""

    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Perform the scrape described by ``req``; implemented by subclasses."""

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        """Assemble a ScrapeResponse from a fetched page.

        Args:
            req: The originating request; its ``url``, ``return_html`` and
                ``selectors`` fields are read here.
            page: The fetched page object.
            fetcher_name: Label reported as ``fetcher_used``.
            start: ``time.perf_counter()`` reading taken when the scrape began.

        Returns:
            The populated response. ``elapsed_ms`` is captured before selector
            extraction runs, and ``status_code`` falls back to 200 when the
            page has no ``status`` attribute.
        """
        # Take the timing first so it does not include the extraction below.
        elapsed_ms = round((time.perf_counter() - start) * 1000, 2)

        html = None
        if req.return_html:
            html = page.html

        data: dict[str, Any] = {}
        if req.selectors:
            data = apply_selectors(page, req.selectors)

        return ScrapeResponse(
            url=req.url,
            status_code=getattr(page, "status", 200),
            html=html,
            data=data,
            fetcher_used=fetcher_name,
            elapsed_ms=elapsed_ms,
        )