feat: initial commit

2026-04-18 08:59:04 +02:00
commit 862c0d1703
32 changed files with 8492 additions and 0 deletions

18
service/Dockerfile Normal file

@@ -0,0 +1,18 @@
FROM python:3.12-slim

WORKDIR /app

# Minimal system packages; the browsers themselves come from `scrapling install` below
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
    && rm -rf /var/lib/apt/lists/*

# `pip install -e .` needs the package source present, not just pyproject.toml,
# so copy the full tree before installing
COPY . .
RUN pip install --no-cache-dir -e . \
    && scrapling install

EXPOSE 8765

CMD ["uvicorn", "service.main:app", "--host", "0.0.0.0", "--port", "8765"]

0
service/__init__.py Normal file

29
service/main.py Normal file

@@ -0,0 +1,29 @@
from __future__ import annotations

import os

from fastapi import Depends, FastAPI, HTTPException, Security
from fastapi.security.api_key import APIKeyHeader

from .routers import health_router, scrape_router

API_KEY = os.getenv("API_KEY", "")
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)


async def verify_api_key(key: str | None = Security(api_key_header)) -> None:
    if API_KEY and key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")


app = FastAPI(
    title="Scrapling Service",
    description="HTTP microservice exposing Scrapling web-scraping fetchers to n8n",
    version="0.1.0",
)
app.include_router(health_router)
app.include_router(
    scrape_router,
    dependencies=[Depends(verify_api_key)],  # auth applies to /scrape only; /health stays open
)
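The key check only bites when the API_KEY environment variable is set, and it is attached to the scrape router alone, so /health stays unauthenticated. A minimal client sketch, assuming the service runs locally (the key value is a placeholder):

import httpx

resp = httpx.post(
    "http://localhost:8765/scrape",
    json={"url": "https://example.com"},
    headers={"X-API-Key": "change-me"},  # placeholder; must equal the API_KEY env var
)
print(resp.status_code)  # 401 if API_KEY is set and the header doesn't match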

4
service/models/__init__.py Normal file

@@ -0,0 +1,4 @@
from .request import ScrapeRequest, SelectorDef
from .response import ScrapeResponse, HealthResponse
__all__ = ["ScrapeRequest", "SelectorDef", "ScrapeResponse", "HealthResponse"]

41
service/models/request.py Normal file

@@ -0,0 +1,41 @@
from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, field_validator


class SelectorDef(BaseModel):
    name: str
    selector: str
    selector_type: Literal["css", "xpath"] = "css"
    attribute: str | None = None  # None = get text content
    multiple: bool = False


class ScrapeRequest(BaseModel):
    url: str
    fetcher_type: Literal["http", "stealth", "dynamic"] = "http"
    selectors: list[SelectorDef] = []
    return_html: bool = False
    timeout: int = 30000  # milliseconds
    proxy: str | None = None
    headers: dict[str, str] = {}
    # dynamic-fetcher specific
    wait_selector: str | None = None
    network_idle: bool = False
    headless: bool = True

    @field_validator("url")
    @classmethod
    def url_must_have_scheme(cls, v: str) -> str:
        if not v.startswith(("http://", "https://")):
            raise ValueError("URL must start with http:// or https://")
        return v

    @field_validator("timeout")
    @classmethod
    def timeout_range(cls, v: int) -> int:
        if not (1000 <= v <= 120_000):
            raise ValueError("timeout must be between 1000 and 120000 ms")
        return v
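A minimal sketch of the two validators firing (values illustrative):

from pydantic import ValidationError

from service.models.request import ScrapeRequest

ScrapeRequest(url="https://example.com", timeout=5000)  # accepted

try:
    ScrapeRequest(url="example.com")  # rejected: no http(s) scheme
except ValidationError as exc:
    print(exc.errors()[0]["msg"])

try:
    ScrapeRequest(url="https://example.com", timeout=500)  # rejected: below 1000 ms
except ValidationError as exc:
    print(exc.errors()[0]["msg"])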

21
service/models/response.py Normal file

@@ -0,0 +1,21 @@
from __future__ import annotations

from typing import Any

from pydantic import BaseModel


class ScrapeResponse(BaseModel):
    url: str
    status_code: int
    html: str | None = None
    data: dict[str, Any] = {}
    fetcher_used: str
    elapsed_ms: float
    error: str | None = None


class HealthResponse(BaseModel):
    status: str
    version: str
    dynamic_session_ready: bool

25
service/pyproject.toml Normal file

@@ -0,0 +1,25 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "scrapling-service"
version = "0.1.0"
description = "FastAPI microservice wrapping Scrapling for n8n integration"
requires-python = ">=3.11"
dependencies = [
    "scrapling[fetchers]>=0.2",
    "fastapi>=0.115",
    "uvicorn[standard]>=0.30",
    "pydantic>=2.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.0",
    "pytest-asyncio>=0.23",
    "httpx>=0.27",
]

[tool.pytest.ini_options]
asyncio_mode = "auto"
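With asyncio_mode = "auto", pytest-asyncio collects bare async functions as tests, and httpx in the dev extras allows in-process ASGI calls. A hypothetical test module (e.g. tests/test_health.py, assuming the service package is importable):

import httpx

from service.main import app

async def test_health_ok():
    # Drive the FastAPI app in-process, without a running server.
    transport = httpx.ASGITransport(app=app)
    async with httpx.AsyncClient(transport=transport, base_url="http://test") as client:
        resp = await client.get("/health")
    assert resp.status_code == 200
    assert resp.json()["status"] == "ok"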

4
service/routers/__init__.py Normal file

@@ -0,0 +1,4 @@
from .scrape import router as scrape_router
from .health import router as health_router
__all__ = ["scrape_router", "health_router"]

15
service/routers/health.py Normal file

@@ -0,0 +1,15 @@
from fastapi import APIRouter

from ..models.response import HealthResponse

router = APIRouter()

VERSION = "0.1.0"


@router.get("/health", response_model=HealthResponse)
async def health() -> HealthResponse:
    return HealthResponse(
        status="ok",
        version=VERSION,
        dynamic_session_ready=True,  # hardcoded for now; no browser session is actually probed
    )

35
service/routers/scrape.py Normal file

@@ -0,0 +1,35 @@
from __future__ import annotations

from fastapi import APIRouter, HTTPException

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from ..scrapers import DynamicScraper, HttpScraper, StealthyScraper

router = APIRouter()


@router.post("/scrape", response_model=ScrapeResponse)
async def scrape(req: ScrapeRequest) -> ScrapeResponse:
    try:
        if req.fetcher_type == "http":
            scraper = HttpScraper()
        elif req.fetcher_type == "stealth":
            scraper = StealthyScraper()
        elif req.fetcher_type == "dynamic":
            scraper = DynamicScraper()
        else:
            # Unreachable while fetcher_type is a Literal; kept as a guard.
            raise HTTPException(status_code=400, detail=f"Unknown fetcher_type: {req.fetcher_type}")
        return await scraper.scrape(req)
    except HTTPException:
        raise
    except Exception as exc:
        # Fetch-level failures are folded into a normal response with `error` set,
        # so callers can branch on the field instead of handling HTTP errors.
        return ScrapeResponse(
            url=req.url,
            status_code=0,
            fetcher_used=req.fetcher_type,
            elapsed_ms=0,
            error=str(exc),
        )
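An illustrative end-to-end call (URL and selectors are examples only):

import httpx

payload = {
    "url": "https://example.com",
    "fetcher_type": "http",
    "selectors": [
        {"name": "title", "selector": "title"},
        {"name": "links", "selector": "a", "attribute": "href", "multiple": True},
    ],
}
resp = httpx.post("http://localhost:8765/scrape", json=payload, timeout=60)
print(resp.json()["data"])  # e.g. {"title": "Example Domain", "links": [...]}

Because fetch-level failures come back in the response body rather than as HTTP errors, an n8n workflow can branch on the error field directly.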

5
service/scrapers/__init__.py Normal file

@@ -0,0 +1,5 @@
from .fetcher import HttpScraper
from .stealthy import StealthyScraper
from .dynamic import DynamicScraper
__all__ = ["HttpScraper", "StealthyScraper", "DynamicScraper"]

68
service/scrapers/base.py Normal file

@@ -0,0 +1,68 @@
from __future__ import annotations

import time
from abc import ABC, abstractmethod
from typing import Any

from ..models.request import ScrapeRequest, SelectorDef
from ..models.response import ScrapeResponse


def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
    """Extract data from a Scrapling page object using CSS/XPath selectors."""
    result: dict[str, Any] = {}
    for sel in selectors:
        try:
            if sel.selector_type == "css":
                if sel.multiple:
                    elements = page.css(sel.selector)
                else:
                    elements = [page.css_first(sel.selector)]
            else:
                if sel.multiple:
                    elements = page.xpath(sel.selector)
                else:
                    elements = [page.xpath_first(sel.selector)]

            def extract_value(el: Any) -> str | None:
                if el is None:
                    return None
                if sel.attribute:
                    return el.attrib.get(sel.attribute)
                return el.text

            if sel.multiple:
                result[sel.name] = [extract_value(el) for el in (elements or [])]
            else:
                result[sel.name] = extract_value(elements[0] if elements else None)
        except Exception as exc:
            # A bad selector poisons only its own key, not the whole request.
            result[sel.name] = None
            result[f"{sel.name}_error"] = str(exc)
    return result


class BaseScraper(ABC):
    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        ...

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        elapsed = (time.perf_counter() - start) * 1000
        html = page.html if req.return_html else None
        data = apply_selectors(page, req.selectors) if req.selectors else {}
        return ScrapeResponse(
            url=req.url,
            status_code=page.status if hasattr(page, "status") else 200,
            html=html,
            data=data,
            fetcher_used=fetcher_name,
            elapsed_ms=round(elapsed, 2),
        )
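apply_selectors can also be exercised directly against any Scrapling page, outside the HTTP layer. A sketch using the same Fetcher call HttpScraper makes (URL illustrative):

from scrapling import Fetcher

from service.models.request import SelectorDef
from service.scrapers.base import apply_selectors

page = Fetcher(auto_match=False).get("https://example.com", timeout=10)
data = apply_selectors(page, [
    SelectorDef(name="title", selector="title"),
    SelectorDef(name="link", selector="a", attribute="href"),
])
print(data)  # e.g. {"title": "Example Domain", "link": "https://..."}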

31
service/scrapers/dynamic.py Normal file

@@ -0,0 +1,31 @@
from __future__ import annotations

import time

from scrapling import PlayWrightFetcher

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper


class DynamicScraper(BaseScraper):
    """Wraps Scrapling's PlayWrightFetcher — full browser via Playwright."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        start = time.perf_counter()
        kwargs: dict = {
            "url": req.url,
            "headless": req.headless,
            "timeout": req.timeout,  # Playwright timeouts are in ms; no conversion needed
            "network_idle": req.network_idle,
        }
        if req.wait_selector:
            kwargs["wait_selector"] = req.wait_selector
        if req.proxy:
            kwargs["proxy"] = req.proxy
        fetcher = PlayWrightFetcher(auto_match=False)
        page = await fetcher.async_fetch(**kwargs)
        return self._build_response(req, page, "dynamic", start)
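The dynamic-only request fields map one-to-one onto the fetch kwargs above; a sample request body (values illustrative):

payload = {
    "url": "https://example.com/app",
    "fetcher_type": "dynamic",
    "wait_selector": "#content",  # block until this element appears
    "network_idle": True,         # additionally wait for the network to go quiet
    "timeout": 60000,             # ms, validated to the 1000-120000 range
    "return_html": True,
}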

30
service/scrapers/fetcher.py Normal file

@@ -0,0 +1,30 @@
from __future__ import annotations

import asyncio
import time

from scrapling import Fetcher

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper


class HttpScraper(BaseScraper):
    """Wraps Scrapling's Fetcher — plain HTTP, fastest option."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        start = time.perf_counter()
        fetcher = Fetcher(auto_match=False)
        kwargs: dict = {
            "url": req.url,
            "timeout": req.timeout / 1000,  # request field is ms; Fetcher takes seconds
        }
        if req.headers:
            kwargs["headers"] = req.headers
        if req.proxy:
            kwargs["proxy"] = req.proxy
        # Fetcher.get is synchronous; run it off the event loop.
        page = await asyncio.to_thread(fetcher.get, **kwargs)
        return self._build_response(req, page, "http", start)

30
service/scrapers/stealthy.py Normal file

@@ -0,0 +1,30 @@
from __future__ import annotations

import asyncio
import time

from scrapling import StealthyFetcher

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper


class StealthyScraper(BaseScraper):
    """Wraps Scrapling's StealthyFetcher — stealthy browsing via Camoufox (modified Firefox)."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        start = time.perf_counter()
        fetcher = StealthyFetcher(auto_match=False)
        kwargs: dict = {
            "url": req.url,
            "timeout": req.timeout / 1000,
        }
        if req.headers:
            kwargs["extra_headers"] = req.headers  # StealthyFetcher's header parameter
        if req.proxy:
            kwargs["proxy"] = req.proxy
        # StealthyFetcher.fetch is synchronous; run it off the event loop.
        page = await asyncio.to_thread(fetcher.fetch, **kwargs)
        return self._build_response(req, page, "stealth", start)