feat: initial commit

This commit is contained in:
2026-04-18 08:59:04 +02:00
commit 862c0d1703
32 changed files with 8492 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
from .fetcher import HttpScraper
from .stealthy import StealthyScraper
from .dynamic import DynamicScraper
__all__ = ["HttpScraper", "StealthyScraper", "DynamicScraper"]

68
service/scrapers/base.py Normal file
View File

@@ -0,0 +1,68 @@
from __future__ import annotations
import time
from abc import ABC, abstractmethod
from typing import Any
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
from ..models.request import ScrapeRequest, SelectorDef
from ..models.response import ScrapeResponse
def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
"""Extract data from a Scrapling page object using CSS/XPath selectors."""
result: dict[str, Any] = {}
for sel in selectors:
try:
if sel.selector_type == "css":
if sel.multiple:
elements = page.css(sel.selector)
else:
elements = [page.css_first(sel.selector)]
else:
if sel.multiple:
elements = page.xpath(sel.selector)
else:
elements = [page.xpath_first(sel.selector)]
def extract_value(el: Any) -> str | None:
if el is None:
return None
if sel.attribute:
return el.attrib.get(sel.attribute)
return el.text
if sel.multiple:
result[sel.name] = [extract_value(el) for el in (elements or [])]
else:
result[sel.name] = extract_value(elements[0] if elements else None)
except Exception as exc:
result[sel.name] = None
result[f"{sel.name}_error"] = str(exc)
return result
class BaseScraper(ABC):
    """Common contract shared by every scraper implementation."""

    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch ``req.url`` and return the extracted response."""
        ...

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        """Assemble a ScrapeResponse from a fetched page and a start time.

        ``start`` is a ``time.perf_counter()`` timestamp taken before the
        fetch; elapsed time is reported in milliseconds.
        """
        elapsed_ms = (time.perf_counter() - start) * 1000
        data: dict[str, Any] = {}
        if req.selectors:
            data = apply_selectors(page, req.selectors)
        return ScrapeResponse(
            url=req.url,
            # Some page objects carry no HTTP status; default to 200 then.
            status_code=getattr(page, "status", 200),
            html=page.html if req.return_html else None,
            data=data,
            fetcher_used=fetcher_name,
            elapsed_ms=round(elapsed_ms, 2),
        )

View File

@@ -0,0 +1,31 @@
from __future__ import annotations
import time
from scrapling import PlayWrightFetcher
from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper
class DynamicScraper(BaseScraper):
    """Wraps Scrapling's PlayWrightFetcher — full browser via Playwright."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch ``req.url`` with a Playwright-driven browser."""
        start = time.perf_counter()
        fetch_kwargs: dict = {
            "url": req.url,
            "headless": req.headless,
            "timeout": req.timeout,
            "network_idle": req.network_idle,
        }
        # Optional settings are forwarded only when set, so the fetcher's
        # own defaults apply otherwise.
        for key, value in (("wait_selector", req.wait_selector), ("proxy", req.proxy)):
            if value:
                fetch_kwargs[key] = value
        page = await PlayWrightFetcher(auto_match=False).async_fetch(**fetch_kwargs)
        return self._build_response(req, page, "dynamic", start)

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
import asyncio
import time
from scrapling import Fetcher
from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper
class HttpScraper(BaseScraper):
    """Wraps Scrapling's Fetcher — plain HTTP, fastest option."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch ``req.url`` over plain HTTP without a browser."""
        start = time.perf_counter()
        fetch_kwargs: dict = {
            "url": req.url,
            # req.timeout is in milliseconds; this fetcher takes seconds.
            "timeout": req.timeout / 1000,
        }
        if req.headers:
            fetch_kwargs["headers"] = req.headers
        if req.proxy:
            fetch_kwargs["proxy"] = req.proxy
        client = Fetcher(auto_match=False)
        # Fetcher.get is synchronous — run it off the event loop.
        page = await asyncio.to_thread(client.get, **fetch_kwargs)
        return self._build_response(req, page, "http", start)

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
import asyncio
import time
from scrapling import StealthyFetcher
from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper
class StealthyScraper(BaseScraper):
    """Wraps Scrapling's StealthyFetcher — TLS fingerprint impersonation."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch ``req.url`` using browser-impersonating HTTP."""
        start = time.perf_counter()
        fetch_kwargs: dict = {
            "url": req.url,
            # req.timeout is in milliseconds; this fetcher takes seconds.
            "timeout": req.timeout / 1000,
        }
        if req.headers:
            # This fetcher accepts caller headers via `extra_headers`.
            fetch_kwargs["extra_headers"] = req.headers
        if req.proxy:
            fetch_kwargs["proxy"] = req.proxy
        client = StealthyFetcher(auto_match=False)
        # StealthyFetcher.fetch is synchronous — run it off the event loop.
        page = await asyncio.to_thread(client.fetch, **fetch_kwargs)
        return self._build_response(req, page, "stealth", start)