feat: initial commit

This commit is contained in:
2026-04-18 08:59:04 +02:00
commit 862c0d1703
32 changed files with 8492 additions and 0 deletions

213
scripts/scrapling-run.ts Normal file
View File

@@ -0,0 +1,213 @@
#!/usr/bin/env ts-node
/**
* Standalone Scrapling CLI runner — no n8n needed.
*
* Usage:
* npx ts-node scripts/scrapling-run.ts <url> [options]
*
* Options:
* --fetcher http|stealth|dynamic (default: http)
* --selector <name>:<css-selector> (can be repeated)
* --html include raw HTML in output
* --wait <css-selector> wait for selector (dynamic only)
* --timeout <ms> (default: 30000)
* --format pretty|json (default: pretty)
* --service <url> service URL override
*
* Credentials (from .env.test or environment):
* SCRAPLING_SERVICE_URL default: http://localhost:8765
* SCRAPLING_API_KEY optional
*
* Examples:
* npx ts-node scripts/scrapling-run.ts https://example.com
* npx ts-node scripts/scrapling-run.ts https://news.ycombinator.com --fetcher stealth --selector "title:title"
* npx ts-node scripts/scrapling-run.ts https://spa.example.com --fetcher dynamic --wait "#app"
*/
import * as https from 'https';
import * as http from 'http';
import * as path from 'path';
import * as fs from 'fs';
// ── Load .env.test if present ─────────────────────────────────────────────────
/**
 * Parses dotenv-style text into `[key, value]` pairs.
 *
 * Blank lines, lines starting with `#`, lines without `=` and lines with an
 * empty key are skipped. Keys and values are trimmed. A value wrapped in a
 * matching pair of single or double quotes has that pair removed; unmatched
 * quote characters are kept because they are part of the value.
 *
 * @param text - Raw file contents.
 * @returns Entries in file order (later duplicates are not merged here).
 */
function parseEnvText(text: string): Array<[string, string]> {
  const entries: Array<[string, string]> = [];
  for (const line of text.split('\n')) {
    // trim() also removes a trailing '\r' from CRLF files.
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;
    const eq = trimmed.indexOf('=');
    if (eq === -1) continue;
    const key = trimmed.slice(0, eq).trim();
    if (!key) continue;
    let value = trimmed.slice(eq + 1).trim();
    const quote = value.charAt(0);
    const isQuoted = value.length >= 2 && (quote === '"' || quote === "'") && value.endsWith(quote);
    if (isQuoted) value = value.slice(1, -1);
    entries.push([key, value]);
  }
  return entries;
}

const envFile = path.resolve(__dirname, '..', '.env.test');
if (fs.existsSync(envFile)) {
  for (const [key, value] of parseEnvText(fs.readFileSync(envFile, 'utf-8'))) {
    // Variables already set (to a non-empty value) in the real environment win over the file.
    if (!process.env[key]) process.env[key] = value;
  }
}
// ── Parse CLI args ────────────────────────────────────────────────────────────
/** Command-line arguments after the interpreter and script path. */
const args = process.argv.slice(2);

/** Options that consume the following argument as their value. */
const valueFlags = new Set(['--fetcher', '--selector', '--wait', '--timeout', '--format', '--service']);

/**
 * Target URL: the first argument that is neither an option (`--…`) nor the
 * value of a preceding value-taking option. Checking the previous argument
 * keeps `--fetcher stealth https://example.com` from picking "stealth".
 */
const url = args.find((arg, i) => !arg.startsWith('--') && !valueFlags.has(args[i - 1] ?? ''));
if (!url) {
  console.error('Usage: scrapling-run.ts <url> [--fetcher http|stealth|dynamic] [--selector name:selector] ...');
  process.exit(1);
}
/**
 * Looks up the value that follows the first occurrence of `flag` in the
 * module-level `args` array.
 *
 * @param flag - Option name to search for, e.g. `--fetcher`.
 * @returns The argument immediately after `flag`, or `undefined` when the
 *   flag is absent or is the last argument.
 */
function getArg(flag: string): string | undefined {
  const position = args.findIndex((arg) => arg === flag);
  if (position < 0) {
    return undefined;
  }
  return args[position + 1];
}
/**
 * Reports whether a boolean switch is present in the module-level `args` array.
 *
 * @param flag - Option name to search for, e.g. `--html`.
 * @returns `true` when some argument equals `flag` exactly.
 */
function getFlag(flag: string): boolean {
  return args.some((arg) => arg === flag);
}
/**
 * Collects the values of every occurrence of a repeatable option in the
 * module-level `args` array.
 *
 * An occurrence counts only when it is followed by a non-empty argument; that
 * argument is consumed as the value and is not itself examined as a flag.
 *
 * @param flag - Option name to search for, e.g. `--selector`.
 * @returns The collected values in command-line order.
 */
function getArgs(flag: string): string[] {
  const values: string[] = [];
  let cursor = 0;
  while (cursor < args.length) {
    const following = args[cursor + 1];
    if (args[cursor] === flag && following) {
      values.push(following);
      // Step over both the flag and the value it consumed.
      cursor += 2;
    } else {
      cursor += 1;
    }
  }
  return values;
}
/** Values accepted by `--fetcher`. */
const fetcherTypes = ['http', 'stealth', 'dynamic'] as const;
/** Values accepted by `--format`. */
const outputFormats = ['pretty', 'json'] as const;

/**
 * Reads an enumerated option and validates it instead of blindly casting.
 *
 * @param flag - Option name, e.g. `--fetcher`.
 * @param allowed - The permitted values.
 * @param fallback - Value used when the option is not given.
 * @returns The validated value; exits the process with code 1 on an unknown value.
 */
function getChoice<T extends string>(flag: string, allowed: readonly T[], fallback: T): T {
  const raw = getArg(flag);
  if (raw === undefined) return fallback;
  const match = allowed.find((choice) => choice === raw);
  if (match === undefined) {
    console.error(`Invalid value for ${flag}: "${raw}". Expected one of: ${allowed.join('|')}`);
    process.exit(1);
  }
  return match;
}

/**
 * Reads `--timeout` (milliseconds, default 30000).
 *
 * `Number` is used rather than `parseInt` so that inputs such as "30s" are
 * rejected instead of being silently truncated to 30, and non-numeric input
 * does not reach the request payload as NaN (which JSON-serialises to null).
 *
 * @returns A positive integer; exits the process with code 1 otherwise.
 */
function getTimeout(): number {
  const raw = getArg('--timeout') ?? '30000';
  const ms = Number(raw);
  if (!Number.isInteger(ms) || ms <= 0) {
    console.error(`Invalid value for --timeout: "${raw}". Expected a positive integer (milliseconds)`);
    process.exit(1);
  }
  return ms;
}

const fetcherType = getChoice('--fetcher', fetcherTypes, 'http');
const returnHtml = getFlag('--html');
const waitSelector = getArg('--wait');
const timeout = getTimeout();
const outputFormat = getChoice('--format', outputFormats, 'pretty');
// --service wins over the environment; a single trailing slash is dropped so
// that `${serviceUrl}/scrape` does not contain a double slash.
const serviceUrl = (getArg('--service') ?? process.env.SCRAPLING_SERVICE_URL ?? 'http://localhost:8765').replace(/\/$/, '');
const apiKey = process.env.SCRAPLING_API_KEY ?? '';
/** Raw `--selector` values exactly as typed, each expected as `name:css-selector`. */
const rawSelectors = getArgs('--selector');

/**
 * Selector definitions sent to the service.
 *
 * Each raw value is split at its FIRST colon, so the CSS part may itself
 * contain colons (e.g. `link:a:hover`). Both parts are trimmed and must be
 * non-empty; otherwise the process exits with code 1.
 */
const selectors = rawSelectors.map((raw) => {
  const colonIdx = raw.indexOf(':');
  const name = colonIdx === -1 ? '' : raw.slice(0, colonIdx).trim();
  const selector = colonIdx === -1 ? '' : raw.slice(colonIdx + 1).trim();
  if (!name || !selector) {
    console.error(`Invalid selector format: "${raw}". Expected "name:selector"`);
    process.exit(1);
  }
  return {
    name,
    selector,
    selector_type: 'css' as const,
    multiple: false,
  };
});
// ── Minimal HTTP client ───────────────────────────────────────────────────────
/**
 * Sends `body` as a JSON POST request using Node's built-in http/https modules.
 *
 * @param reqUrl - Absolute URL; `https:` selects the https module, anything else uses http.
 * @param body - Value serialised with `JSON.stringify`.
 * @param headers - Extra request headers; they are merged last, so they
 *   override the default `Content-Type` / `Content-Length`.
 * @returns Resolves with the parsed JSON body, or with the raw text when the
 *   body is not valid JSON.
 * @throws Rejects with an `Error` carrying a `statusCode` property when the
 *   status is >= 400, and with the underlying error on request or response
 *   stream failures.
 */
function postJson(reqUrl: string, body: unknown, headers: Record<string, string>): Promise<unknown> {
  return new Promise((resolve, reject) => {
    const bodyStr = JSON.stringify(body);
    const parsed = new URL(reqUrl);
    const isHttps = parsed.protocol === 'https:';
    const transport = isHttps ? https : http;
    const req = transport.request(
      {
        // URL.hostname keeps the brackets of IPv6 literals ("[::1]"), which
        // http.request cannot resolve; strip them.
        hostname: parsed.hostname.replace(/^\[|\]$/g, ''),
        port: parsed.port || (isHttps ? 443 : 80),
        path: parsed.pathname + parsed.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(bodyStr).toString(),
          ...headers,
        },
      },
      (res) => {
        const chunks: Buffer[] = [];
        res.on('data', (c: Buffer) => chunks.push(c));
        // Without this listener a connection dropped mid-body raises an
        // unhandled 'error' event instead of rejecting the promise.
        res.on('error', reject);
        res.on('end', () => {
          const text = Buffer.concat(chunks).toString('utf-8');
          if (res.statusCode && res.statusCode >= 400) {
            reject(Object.assign(new Error(`HTTP ${res.statusCode}: ${text}`), { statusCode: res.statusCode }));
            return;
          }
          try {
            resolve(JSON.parse(text));
          } catch {
            // Not JSON: hand the raw body back and let the caller decide.
            resolve(text);
          }
        });
      },
    );
    req.on('error', reject);
    req.write(bodyStr);
    req.end();
  });
}
// ── Main ──────────────────────────────────────────────────────────────────────
/**
 * Shape that `main` assumes for the JSON body returned by POST `/scrape`.
 * NOTE(review): this mirrors the service contract but is not validated at
 * runtime here — confirm field semantics against the service.
 */
interface ScrapeResponse {
// Not read by main in pretty mode; presumably the scraped URL — TODO confirm.
url: string;
// Printed as "Status:" in pretty mode.
status_code: number;
// Optional; when present, its first 500 characters are printed as a preview.
// Presumably only returned when the request sets return_html — TODO confirm.
html?: string;
// Printed entry by entry under "Extracted data:"; array values are shown as an item count.
data: Record<string, unknown>;
// Printed as "Fetcher:" in pretty mode.
fetcher_used: string;
// Printed as "Elapsed:" with an "ms" suffix.
elapsed_ms: number;
// Optional; when truthy, main prints it and skips the data/HTML sections.
error?: string;
}
/**
 * Renders one extracted value for the pretty-printed table.
 *
 * Arrays are summarised by length, plain objects are shown as JSON (a bare
 * `String(obj)` would print "[object Object]"), everything else is stringified.
 */
function formatValue(val: unknown): string {
  if (Array.isArray(val)) return `[${val.length} items]`;
  if (typeof val === 'object' && val !== null) return JSON.stringify(val);
  return String(val);
}

/**
 * Builds the scrape request from the parsed CLI options, POSTs it to
 * `${serviceUrl}/scrape`, and prints the result.
 *
 * With `--format json` the response is dumped verbatim. Otherwise a summary is
 * printed: status, fetcher, elapsed time, then either the service-reported
 * error or the extracted data and an optional HTML preview.
 *
 * @throws Propagates `postJson` rejections, and throws when the service
 *   answers with something that is not a JSON object in pretty mode.
 */
async function main(): Promise<void> {
  console.log(`Service: ${serviceUrl}`);
  console.log(`URL: ${url}`);
  console.log(`Fetcher: ${fetcherType}`);
  if (selectors.length) console.log(`Selectors: ${selectors.map((s) => `${s.name}:${s.selector}`).join(', ')}`);
  console.log();

  const payload: Record<string, unknown> = {
    url,
    fetcher_type: fetcherType,
    return_html: returnHtml,
    timeout,
    selectors,
  };
  if (waitSelector) payload.wait_selector = waitSelector;

  const requestHeaders: Record<string, string> = {};
  if (apiKey) requestHeaders['X-API-Key'] = apiKey;

  const raw = await postJson(`${serviceUrl}/scrape`, payload, requestHeaders);
  if (outputFormat === 'json') {
    console.log(JSON.stringify(raw, null, 2));
    return;
  }

  // postJson resolves with raw text when the body is not JSON; fail with a
  // readable message instead of a TypeError further down.
  if (typeof raw !== 'object' || raw === null) {
    throw new Error(`Unexpected non-JSON response from service: ${String(raw).slice(0, 200)}`);
  }
  // Fields are treated as optional because the body is not schema-validated.
  const response = raw as Partial<ScrapeResponse>;

  // Pretty output
  console.log(`Status: ${response.status_code}`);
  console.log(`Fetcher: ${response.fetcher_used}`);
  console.log(`Elapsed: ${response.elapsed_ms}ms`);
  if (response.error) {
    console.error(`\nError: ${response.error}`);
    return;
  }

  const data = response.data ?? {};
  if (Object.keys(data).length > 0) {
    console.log('\nExtracted data:');
    console.log('─'.repeat(50));
    for (const [key, val] of Object.entries(data)) {
      console.log(` ${key.padEnd(25)} ${formatValue(val)}`);
    }
  }
  if (response.html) {
    console.log('\nHTML preview (first 500 chars):');
    console.log('─'.repeat(50));
    console.log(response.html.slice(0, 500));
  }
}
// Entry point: report any failure on stderr and exit non-zero.
main().catch((err: unknown) => {
  // Optional chaining guards against a null/undefined rejection value, where
  // a plain `.message` access would itself throw inside this handler.
  const message = (err as { message?: unknown } | null | undefined)?.message;
  console.error('\nError:', message ?? err);
  process.exit(1);
});