feat: initial commit
service/Dockerfile (new file, +18)
@@ -0,0 +1,18 @@
FROM python:3.12-slim

WORKDIR /app

# System deps for Playwright
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY pyproject.toml .
RUN pip install --no-cache-dir -e . \
    && scrapling install

COPY . .

EXPOSE 8765

CMD ["uvicorn", "service.main:app", "--host", "0.0.0.0", "--port", "8765"]
service/__init__.py (new file, empty)
service/main.py (new file, +29)
@@ -0,0 +1,29 @@
from __future__ import annotations

import os

from fastapi import Depends, FastAPI, HTTPException, Request, Security
from fastapi.security.api_key import APIKeyHeader

from .routers import health_router, scrape_router

API_KEY = os.getenv("API_KEY", "")
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)


async def verify_api_key(key: str | None = Security(api_key_header)) -> None:
    if API_KEY and key != API_KEY:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")


app = FastAPI(
    title="Scrapling Service",
    description="HTTP microservice exposing Scrapling web-scraping fetchers to n8n",
    version="0.1.0",
)

app.include_router(health_router)
app.include_router(
    scrape_router,
    dependencies=[Depends(verify_api_key)],
)
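For quick manual verification, a call against this app could look like the sketch below. The base URL, the example key value, and the target site are illustrative assumptions; only the X-API-Key header name and the /scrape route come from the code above.

# Sketch of a client call, assuming the service runs locally on port 8765
# and API_KEY was set to "change-me" in the environment (both assumptions).
import httpx

payload = {
    "url": "https://example.com",
    "fetcher_type": "http",
    "selectors": [{"name": "title", "selector": "title"}],
}

resp = httpx.post(
    "http://localhost:8765/scrape",
    json=payload,
    headers={"X-API-Key": "change-me"},
    timeout=60,
)
print(resp.status_code)   # 401 if the key does not match API_KEY, 200 otherwise
print(resp.json())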
service/models/__init__.py (new file, +4)
@@ -0,0 +1,4 @@
from .request import ScrapeRequest, SelectorDef
from .response import ScrapeResponse, HealthResponse

__all__ = ["ScrapeRequest", "SelectorDef", "ScrapeResponse", "HealthResponse"]
service/models/request.py (new file, +41)
@@ -0,0 +1,41 @@
from __future__ import annotations

from typing import Any, Literal

from pydantic import BaseModel, field_validator


class SelectorDef(BaseModel):
    name: str
    selector: str
    selector_type: Literal["css", "xpath"] = "css"
    attribute: str | None = None  # None = get text content
    multiple: bool = False


class ScrapeRequest(BaseModel):
    url: str
    fetcher_type: Literal["http", "stealth", "dynamic"] = "http"
    selectors: list[SelectorDef] = []
    return_html: bool = False
    timeout: int = 30000
    proxy: str | None = None
    headers: dict[str, str] = {}
    # dynamic-fetcher specific
    wait_selector: str | None = None
    network_idle: bool = False
    headless: bool = True

    @field_validator("url")
    @classmethod
    def url_must_have_scheme(cls, v: str) -> str:
        if not v.startswith(("http://", "https://")):
            raise ValueError("URL must start with http:// or https://")
        return v

    @field_validator("timeout")
    @classmethod
    def timeout_range(cls, v: int) -> int:
        if not (1000 <= v <= 120_000):
            raise ValueError("timeout must be between 1000 and 120000 ms")
        return v
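A minimal sketch of how these validators behave, using made-up values (the field names are the ones defined above):

# Illustrative use of ScrapeRequest validation; all values are made up.
from pydantic import ValidationError

from service.models.request import ScrapeRequest, SelectorDef

ok = ScrapeRequest(
    url="https://example.com",
    fetcher_type="stealth",
    selectors=[SelectorDef(name="links", selector="a", attribute="href", multiple=True)],
    timeout=15_000,
)
print(ok.fetcher_type, ok.timeout)

try:
    ScrapeRequest(url="example.com")  # no scheme -> rejected by url_must_have_scheme
except ValidationError as exc:
    print(exc.errors()[0]["msg"])

try:
    ScrapeRequest(url="https://example.com", timeout=500)  # below 1000 ms -> rejected
except ValidationError as exc:
    print(exc.errors()[0]["msg"])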
service/models/response.py (new file, +21)
@@ -0,0 +1,21 @@
from __future__ import annotations

from typing import Any

from pydantic import BaseModel


class ScrapeResponse(BaseModel):
    url: str
    status_code: int
    html: str | None = None
    data: dict[str, Any] = {}
    fetcher_used: str
    elapsed_ms: float
    error: str | None = None


class HealthResponse(BaseModel):
    status: str
    version: str
    dynamic_session_ready: bool
service/pyproject.toml (new file, +25)
@@ -0,0 +1,25 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "scrapling-service"
version = "0.1.0"
description = "FastAPI microservice wrapping Scrapling for n8n integration"
requires-python = ">=3.11"
dependencies = [
    "scrapling[fetchers]>=0.2",
    "fastapi>=0.115",
    "uvicorn[standard]>=0.30",
    "pydantic>=2.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.0",
    "pytest-asyncio>=0.23",
    "httpx>=0.27",
]

[tool.pytest.ini_options]
asyncio_mode = "auto"
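With the dev extras and asyncio_mode = "auto" above, a first test could look roughly like this; the tests/test_health.py path and the in-process ASGI transport are assumptions, not part of the commit:

# tests/test_health.py — minimal async test sketch against the app in-process.
import httpx

from service.main import app


async def test_health_returns_ok():
    transport = httpx.ASGITransport(app=app)
    async with httpx.AsyncClient(transport=transport, base_url="http://test") as client:
        resp = await client.get("/health")
    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "ok"
    assert body["version"] == "0.1.0"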
service/routers/__init__.py (new file, +4)
@@ -0,0 +1,4 @@
from .scrape import router as scrape_router
from .health import router as health_router

__all__ = ["scrape_router", "health_router"]
service/routers/health.py (new file, +15)
@@ -0,0 +1,15 @@
from fastapi import APIRouter
from ..models.response import HealthResponse

router = APIRouter()

VERSION = "0.1.0"


@router.get("/health", response_model=HealthResponse)
async def health() -> HealthResponse:
    return HealthResponse(
        status="ok",
        version=VERSION,
        dynamic_session_ready=True,
    )
service/routers/scrape.py (new file, +35)
@@ -0,0 +1,35 @@
from __future__ import annotations

from fastapi import APIRouter, HTTPException

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from ..scrapers import DynamicScraper, HttpScraper, StealthyScraper

router = APIRouter()


@router.post("/scrape", response_model=ScrapeResponse)
async def scrape(req: ScrapeRequest) -> ScrapeResponse:
    try:
        if req.fetcher_type == "http":
            scraper = HttpScraper()
        elif req.fetcher_type == "stealth":
            scraper = StealthyScraper()
        elif req.fetcher_type == "dynamic":
            scraper = DynamicScraper()
        else:
            raise HTTPException(status_code=400, detail=f"Unknown fetcher_type: {req.fetcher_type}")

        return await scraper.scrape(req)

    except HTTPException:
        raise
    except Exception as exc:
        return ScrapeResponse(
            url=req.url,
            status_code=0,
            fetcher_used=req.fetcher_type,
            elapsed_ms=0,
            error=str(exc),
        )
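Note the failure contract here: fetch failures come back as HTTP 200 with status_code 0 and the message in the error field, rather than as a 5xx; only an unknown fetcher_type (or a rejected API key at the app level) yields an HTTP error. A hypothetical failing call (host, port, and key are made up) would behave like this:

# Hypothetical failing fetch: the endpoint still answers 200 and reports the
# problem in the response body, which lets an n8n node branch on "error".
import httpx

resp = httpx.post(
    "http://localhost:8765/scrape",
    json={"url": "https://no-such-host.invalid", "fetcher_type": "http"},
    headers={"X-API-Key": "change-me"},
    timeout=60,
)
body = resp.json()
print(resp.status_code)                    # 200
print(body["status_code"], body["error"])  # 0 and the underlying exception text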
service/scrapers/__init__.py (new file, +5)
@@ -0,0 +1,5 @@
from .fetcher import HttpScraper
from .stealthy import StealthyScraper
from .dynamic import DynamicScraper

__all__ = ["HttpScraper", "StealthyScraper", "DynamicScraper"]
service/scrapers/base.py (new file, +68)
@@ -0,0 +1,68 @@
from __future__ import annotations

import time
from abc import ABC, abstractmethod
from typing import Any

from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher

from ..models.request import ScrapeRequest, SelectorDef
from ..models.response import ScrapeResponse


def apply_selectors(page: Any, selectors: list[SelectorDef]) -> dict[str, Any]:
    """Extract data from a Scrapling page object using CSS/XPath selectors."""
    result: dict[str, Any] = {}
    for sel in selectors:
        try:
            if sel.selector_type == "css":
                if sel.multiple:
                    elements = page.css(sel.selector)
                else:
                    elements = [page.css_first(sel.selector)]
            else:
                if sel.multiple:
                    elements = page.xpath(sel.selector)
                else:
                    elements = [page.xpath_first(sel.selector)]

            def extract_value(el: Any) -> str | None:
                if el is None:
                    return None
                if sel.attribute:
                    return el.attrib.get(sel.attribute)
                return el.text

            if sel.multiple:
                result[sel.name] = [extract_value(el) for el in (elements or [])]
            else:
                result[sel.name] = extract_value(elements[0] if elements else None)
        except Exception as exc:
            result[sel.name] = None
            result[f"{sel.name}_error"] = str(exc)
    return result


class BaseScraper(ABC):
    @abstractmethod
    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        ...

    def _build_response(
        self,
        req: ScrapeRequest,
        page: Any,
        fetcher_name: str,
        start: float,
    ) -> ScrapeResponse:
        elapsed = (time.perf_counter() - start) * 1000
        html = page.html if req.return_html else None
        data = apply_selectors(page, req.selectors) if req.selectors else {}
        return ScrapeResponse(
            url=req.url,
            status_code=page.status if hasattr(page, "status") else 200,
            html=html,
            data=data,
            fetcher_used=fetcher_name,
            elapsed_ms=round(elapsed, 2),
        )
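Since apply_selectors only touches css/css_first/xpath/xpath_first plus .attrib and .text, its behaviour can be sketched with a stand-in page object; the stub below is purely illustrative and is not Scrapling's API:

# Stand-in page object to illustrate apply_selectors; Scrapling's real page
# classes expose the same css/css_first/attrib/text surface used above.
from service.models.request import SelectorDef
from service.scrapers.base import apply_selectors


class FakeElement:
    def __init__(self, text, attrib=None):
        self.text = text
        self.attrib = attrib or {}


class FakePage:
    def css(self, selector):        # path taken when multiple=True
        return [FakeElement("a", {"href": "/a"}), FakeElement("b", {"href": "/b"})]

    def css_first(self, selector):  # path taken when multiple=False
        return FakeElement("Hello")


data = apply_selectors(
    FakePage(),
    [
        SelectorDef(name="title", selector="h1"),
        SelectorDef(name="links", selector="a", attribute="href", multiple=True),
    ],
)
print(data)  # {'title': 'Hello', 'links': ['/a', '/b']}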
service/scrapers/dynamic.py (new file, +31)
@@ -0,0 +1,31 @@
from __future__ import annotations

import time

from scrapling import PlayWrightFetcher

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper


class DynamicScraper(BaseScraper):
    """Wraps Scrapling's PlayWrightFetcher — full browser via Playwright."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        start = time.perf_counter()

        kwargs: dict = {
            "url": req.url,
            "headless": req.headless,
            "timeout": req.timeout,
            "network_idle": req.network_idle,
        }
        if req.wait_selector:
            kwargs["wait_selector"] = req.wait_selector
        if req.proxy:
            kwargs["proxy"] = req.proxy

        fetcher = PlayWrightFetcher(auto_match=False)
        page = await fetcher.async_fetch(**kwargs)
        return self._build_response(req, page, "dynamic", start)
service/scrapers/fetcher.py (new file, +30)
@@ -0,0 +1,30 @@
from __future__ import annotations

import asyncio
import time

from scrapling import Fetcher

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper


class HttpScraper(BaseScraper):
    """Wraps Scrapling's Fetcher — plain HTTP, fastest option."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        start = time.perf_counter()
        fetcher = Fetcher(auto_match=False)

        kwargs: dict = {
            "url": req.url,
            "timeout": req.timeout / 1000,
        }
        if req.headers:
            kwargs["headers"] = req.headers
        if req.proxy:
            kwargs["proxy"] = req.proxy

        page = await asyncio.to_thread(fetcher.get, **kwargs)
        return self._build_response(req, page, "http", start)
service/scrapers/stealthy.py (new file, +30)
@@ -0,0 +1,30 @@
from __future__ import annotations

import asyncio
import time

from scrapling import StealthyFetcher

from ..models.request import ScrapeRequest
from ..models.response import ScrapeResponse
from .base import BaseScraper


class StealthyScraper(BaseScraper):
    """Wraps Scrapling's StealthyFetcher — TLS fingerprint impersonation."""

    async def scrape(self, req: ScrapeRequest) -> ScrapeResponse:
        start = time.perf_counter()
        fetcher = StealthyFetcher(auto_match=False)

        kwargs: dict = {
            "url": req.url,
            "timeout": req.timeout / 1000,
        }
        if req.headers:
            kwargs["extra_headers"] = req.headers
        if req.proxy:
            kwargs["proxy"] = req.proxy

        page = await asyncio.to_thread(fetcher.fetch, **kwargs)
        return self._build_response(req, page, "stealth", start)