feat: initial commit
This commit is contained in:
5
service/scrapers/__init__.py
Normal file
5
service/scrapers/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .fetcher import HttpScraper
|
||||
from .stealthy import StealthyScraper
|
||||
from .dynamic import DynamicScraper
|
||||
|
||||
__all__ = ["HttpScraper", "StealthyScraper", "DynamicScraper"]
|
||||
68
service/scrapers/base.py
Normal file
68
service/scrapers/base.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
||||
|
||||
from ..models.request import ScrapeRequest, SelectorDef
|
||||
from ..models.response import ScrapeResponse
|
||||
|
||||
|
||||
def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
|
||||
"""Extract data from a Scrapling page object using CSS/XPath selectors."""
|
||||
result: dict[str, Any] = {}
|
||||
for sel in selectors:
|
||||
try:
|
||||
if sel.selector_type == "css":
|
||||
if sel.multiple:
|
||||
elements = page.css(sel.selector)
|
||||
else:
|
||||
elements = [page.css_first(sel.selector)]
|
||||
else:
|
||||
if sel.multiple:
|
||||
elements = page.xpath(sel.selector)
|
||||
else:
|
||||
elements = [page.xpath_first(sel.selector)]
|
||||
|
||||
def extract_value(el: Any) -> str | None:
|
||||
if el is None:
|
||||
return None
|
||||
if sel.attribute:
|
||||
return el.attrib.get(sel.attribute)
|
||||
return el.text
|
||||
|
||||
if sel.multiple:
|
||||
result[sel.name] = [extract_value(el) for el in (elements or [])]
|
||||
else:
|
||||
result[sel.name] = extract_value(elements[0] if elements else None)
|
||||
except Exception as exc:
|
||||
result[sel.name] = None
|
||||
result[f"{sel.name}_error"] = str(exc)
|
||||
return result
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """Common interface for the concrete scraper implementations.

    Subclasses implement :meth:`scrape`; the shared response assembly
    lives in :meth:`_build_response`.
    """

    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch *req.url* and return the extracted result."""
        ...

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        """Assemble a ScrapeResponse from a fetched Scrapling page.

        *start* is a ``time.perf_counter()`` timestamp taken just before
        the fetch began; elapsed time is reported in milliseconds.
        """
        elapsed_ms = round((time.perf_counter() - start) * 1000, 2)
        extracted = apply_selectors(page, req.selectors) if req.selectors else {}
        return ScrapeResponse(
            url=req.url,
            # HTTP-backed pages carry a .status attribute; fall back to 200.
            status_code=getattr(page, "status", 200),
            html=page.html if req.return_html else None,
            data=extracted,
            fetcher_used=fetcher_name,
            elapsed_ms=elapsed_ms,
        )
|
||||
31
service/scrapers/dynamic.py
Normal file
31
service/scrapers/dynamic.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
|
||||
from scrapling import PlayWrightFetcher
|
||||
|
||||
from ..models.request import ScrapeRequest
|
||||
from ..models.response import ScrapeResponse
|
||||
from .base import BaseScraper
|
||||
|
||||
|
||||
class DynamicScraper(BaseScraper):
    """Wraps Scrapling's PlayWrightFetcher — full browser via Playwright."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Render *req.url* in a real browser and return the scraped response.

        Note: the timeout is forwarded in milliseconds (Playwright
        convention), unlike the HTTP-based scrapers which convert to
        seconds.
        """
        start = time.perf_counter()

        kwargs: dict = {
            "url": req.url,
            "headless": req.headless,
            "timeout": req.timeout,
            "network_idle": req.network_idle,
        }
        if req.wait_selector:
            kwargs["wait_selector"] = req.wait_selector
        if req.headers:
            # Parity fix: HttpScraper/StealthyScraper forward caller-supplied
            # headers, but this scraper previously dropped them.
            # PlayWrightFetcher accepts them via extra_headers.
            kwargs["extra_headers"] = req.headers
        if req.proxy:
            kwargs["proxy"] = req.proxy

        fetcher = PlayWrightFetcher(auto_match=False)
        page = await fetcher.async_fetch(**kwargs)
        return self._build_response(req, page, "dynamic", start)
|
||||
30
service/scrapers/fetcher.py
Normal file
30
service/scrapers/fetcher.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from scrapling import Fetcher
|
||||
|
||||
from ..models.request import ScrapeRequest
|
||||
from ..models.response import ScrapeResponse
|
||||
from .base import BaseScraper
|
||||
|
||||
|
||||
class HttpScraper(BaseScraper):
    """Wraps Scrapling's Fetcher — plain HTTP, fastest option."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch *req.url* over plain HTTP and return the scraped response."""
        start = time.perf_counter()

        # Fetcher.get expects seconds; the request carries milliseconds.
        kwargs: dict = {"url": req.url, "timeout": req.timeout / 1000}
        if req.headers:
            kwargs["headers"] = req.headers
        if req.proxy:
            kwargs["proxy"] = req.proxy

        # The get() call is blocking, so run it off the event loop.
        client = Fetcher(auto_match=False)
        page = await asyncio.to_thread(client.get, **kwargs)
        return self._build_response(req, page, "http", start)
|
||||
30
service/scrapers/stealthy.py
Normal file
30
service/scrapers/stealthy.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from scrapling import StealthyFetcher
|
||||
|
||||
from ..models.request import ScrapeRequest
|
||||
from ..models.response import ScrapeResponse
|
||||
from .base import BaseScraper
|
||||
|
||||
|
||||
class StealthyScraper(BaseScraper):
    """Wraps Scrapling's StealthyFetcher — TLS fingerprint impersonation."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        """Fetch *req.url* with browser-impersonating TLS and return the response."""
        start = time.perf_counter()
        client = StealthyFetcher(auto_match=False)

        # StealthyFetcher.fetch expects seconds; the request carries milliseconds.
        kwargs: dict = {"url": req.url, "timeout": req.timeout / 1000}
        if req.headers:
            kwargs["extra_headers"] = req.headers
        if req.proxy:
            kwargs["proxy"] = req.proxy

        # fetch() blocks, so hand it to a worker thread.
        page = await asyncio.to_thread(client.fetch, **kwargs)
        return self._build_response(req, page, "stealth", start)
|
||||
Reference in New Issue
Block a user