feat: initial commit
This commit is contained in:
68
service/scrapers/base.py
Normal file
68
service/scrapers/base.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
||||
|
||||
from ..models.request import ScrapeRequest, SelectorDef
|
||||
from ..models.response import ScrapeResponse
|
||||
|
||||
|
||||
def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
    """Extract data from a Scrapling page object using CSS/XPath selectors.

    Args:
        page: Object exposing ``css``, ``css_first``, ``xpath`` and
            ``xpath_first`` query methods.
        selectors: Selector definitions. Each one is read for its ``name``,
            ``selector``, ``selector_type``, ``multiple`` and ``attribute``
            fields. ``None`` or an empty list yields an empty result.

    Returns:
        A dict keyed by selector name. For ``multiple`` selectors the value
        is a list with one extracted value per match; otherwise it is a
        single value, or ``None`` when nothing matched. If a selector raises,
        its value is ``None`` and the message is stored under
        ``"<name>_error"``.
    """
    result: dict[str, Any] = {}
    for sel in selectors or ():
        # Best effort per selector: one failing selector must not discard the
        # values already collected, so the error is recorded and we move on.
        try:
            # Any selector_type other than "css" is handled as XPath.
            use_css = sel.selector_type == "css"
            if sel.multiple:
                query_all = page.css if use_css else page.xpath
                matches = query_all(sel.selector) or []
                result[sel.name] = [
                    _extract_value(el, sel.attribute) for el in matches
                ]
            else:
                query_first = page.css_first if use_css else page.xpath_first
                result[sel.name] = _extract_value(
                    query_first(sel.selector), sel.attribute
                )
        except Exception as exc:
            result[sel.name] = None
            result[f"{sel.name}_error"] = str(exc)
    return result


def _extract_value(el: Any, attribute: Any) -> Any:
    """Return the named attribute of *el*, its text if none is named, or None if *el* is None."""
    if el is None:
        return None
    if attribute:
        return el.attrib.get(attribute)
    return el.text
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """Abstract base class for scrapers.

    Subclasses implement :meth:`scrape`; :meth:`_build_response` is a shared
    helper that assembles a ``ScrapeResponse`` from a fetched page.
    """

    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Carry out the scrape described by *req* and return its response."""
        ...

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        """Assemble a ``ScrapeResponse`` from a fetched page.

        Args:
            req: The originating request; its ``url``, ``return_html`` and
                ``selectors`` fields are read.
            page: The fetched page object.
            fetcher_name: Label reported as ``fetcher_used``.
            start: Earlier ``time.perf_counter()`` reading from which the
                elapsed time is computed.

        Returns:
            The populated ``ScrapeResponse``; ``elapsed_ms`` is rounded to
            two decimal places.
        """
        # Take the timing first so selector extraction below is not counted.
        duration_ms = (time.perf_counter() - start) * 1000

        # The raw HTML is only included when the request asks for it.
        markup = None
        if req.return_html:
            markup = page.html

        if req.selectors:
            extracted = apply_selectors(page, req.selectors)
        else:
            extracted = {}

        # Pages that expose no ``status`` attribute are reported as 200.
        status = getattr(page, "status", 200)

        return ScrapeResponse(
            url=req.url,
            status_code=status,
            html=markup,
            data=extracted,
            fetcher_used=fetcher_name,
            elapsed_ms=round(duration_ms, 2),
        )
|
||||
Reference in New Issue
Block a user