#!/usr/bin/env ts-node
/**
 * Standalone Scrapling CLI runner — no n8n needed.
 *
 * Usage:
 *   npx ts-node scripts/scrapling-run.ts <url> [options]
 *
 * Options:
 *   --fetcher http|stealth|dynamic      (default: http)
 *   --selector <name>:<css-selector>    (can be repeated)
 *   --html                              include raw HTML in output
 *   --wait <css-selector>               wait for selector (dynamic only)
 *   --timeout <ms>                      (default: 30000)
 *   --format pretty|json                (default: pretty)
 *   --service <url>                     service URL override
 *
 * Credentials (from .env.test or environment):
 *   SCRAPLING_SERVICE_URL               default: http://localhost:8765
 *   SCRAPLING_API_KEY                   optional
 *
 * Examples:
 *   npx ts-node scripts/scrapling-run.ts https://example.com
 *   npx ts-node scripts/scrapling-run.ts https://news.ycombinator.com --fetcher stealth --selector "title:title"
 *   npx ts-node scripts/scrapling-run.ts https://spa.example.com --fetcher dynamic --wait "#app"
 */
|
|
|
|
import * as https from 'https';
|
|
import * as http from 'http';
|
|
import * as path from 'path';
|
|
import * as fs from 'fs';
|
|
|
|
// ── Load .env.test if present ─────────────────────────────────────────────────
|
|
|
|
// Load KEY=VALUE pairs from ../.env.test (relative to this script) into
// process.env. Blank lines, `#` comment lines and lines without `=` are
// ignored. Variables that already exist in the environment always win, even
// when they are set to an empty string.
const envFile = path.resolve(__dirname, '..', '.env.test');
if (fs.existsSync(envFile)) {
  for (const line of fs.readFileSync(envFile, 'utf-8').split('\n')) {
    // trim() also removes the trailing '\r' of CRLF files.
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;

    const eq = trimmed.indexOf('=');
    if (eq === -1) continue;

    const key = trimmed.slice(0, eq).trim();
    // A line such as "=value" has no usable variable name.
    if (!key) continue;

    let value = trimmed.slice(eq + 1).trim();
    // Strip exactly one pair of *matching* surrounding quotes; a lone or
    // mismatched quote is part of the value and must be kept.
    const first = value.charAt(0);
    if (value.length >= 2 && (first === '"' || first === "'") && value.endsWith(first)) {
      value = value.slice(1, -1);
    }

    // Only fill in variables that are truly absent, so an explicit
    // `SCRAPLING_API_KEY=` in the environment is not overridden by the file.
    if (process.env[key] === undefined) process.env[key] = value;
  }
}
|
|
|
|
// ── Parse CLI args ────────────────────────────────────────────────────────────
|
|
|
|
/** Raw CLI arguments, without the node binary and the script path. */
const args = process.argv.slice(2);

/** Flags that consume the following token as their value. */
const VALUE_FLAGS: ReadonlySet<string> = new Set([
  '--fetcher',
  '--selector',
  '--wait',
  '--timeout',
  '--format',
  '--service',
]);

/**
 * Target URL: the first token that is neither a flag nor the value of a
 * value-taking flag. Checking the preceding token keeps invocations such as
 * `--fetcher stealth https://example.com` from picking `stealth` as the URL.
 */
const url = args.find((arg, i) => !arg.startsWith('--') && !VALUE_FLAGS.has(args[i - 1] ?? ''));

if (!url) {
  console.error('Usage: scrapling-run.ts <url> [--fetcher http|stealth|dynamic] [--selector name:selector] ...');
  process.exit(1);
}
|
|
|
|
/**
 * Returns the value that follows the first occurrence of `flag`.
 *
 * @param flag - Flag to look up, e.g. `--fetcher`.
 * @param argv - Argument list to search; defaults to the script's own CLI arguments.
 * @returns The token after `flag`, or `undefined` when the flag is absent, is
 *          the last token, or is directly followed by another `--flag`.
 */
function getArg(flag: string, argv: readonly string[] = args): string | undefined {
  const idx = argv.indexOf(flag);
  if (idx === -1) return undefined;

  const value = argv[idx + 1];
  // A following `--something` is the next flag, not this flag's value.
  return value !== undefined && !value.startsWith('--') ? value : undefined;
}
|
|
|
|
/**
 * Reports whether a boolean switch such as `--html` is present.
 *
 * @param flag - Exact flag token to look for.
 * @param argv - Argument list to search; defaults to the script's own CLI arguments.
 * @returns `true` when `flag` appears anywhere in `argv`.
 */
function getFlag(flag: string, argv: readonly string[] = args): boolean {
  return argv.includes(flag);
}
|
|
|
|
/**
 * Collects the values of every occurrence of a repeatable flag.
 *
 * @param flag - Flag to look up, e.g. `--selector`.
 * @param argv - Argument list to search; defaults to the script's own CLI arguments.
 * @returns The values in command-line order. An occurrence with no value
 *          (last token, empty string, or followed by another `--flag`) is skipped.
 */
function getArgs(flag: string, argv: readonly string[] = args): string[] {
  const values: string[] = [];
  for (let i = 0; i < argv.length; i++) {
    if (argv[i] !== flag) continue;

    const value = argv[i + 1];
    // Do not swallow the next flag as a value; it is examined on the next pass.
    if (!value || value.startsWith('--')) continue;

    values.push(value);
    i++; // the value token has been consumed
  }
  return values;
}
|
|
|
|
const FETCHER_TYPES = ['http', 'stealth', 'dynamic'] as const;
type FetcherType = (typeof FETCHER_TYPES)[number];

const OUTPUT_FORMATS = ['pretty', 'json'] as const;
type OutputFormat = (typeof OUTPUT_FORMATS)[number];

/**
 * Validates an enumerated option value.
 *
 * @param flag - Flag name, used only in the error message.
 * @param value - Value supplied on the command line (or the default).
 * @param allowed - Accepted values.
 * @returns `value`, narrowed to the allowed union. Prints an error and exits
 *          with status 1 when `value` is not one of `allowed`.
 */
function parseChoice<T extends string>(flag: string, value: string, allowed: readonly T[]): T {
  const match = allowed.find((candidate) => candidate === value);
  if (match === undefined) {
    console.error(`Invalid value for ${flag}: "${value}". Expected one of: ${allowed.join('|')}`);
    process.exit(1);
  }
  return match;
}

// Resolved options. Enumerated values are validated here instead of being
// asserted with `as`, so a typo fails fast with a clear message.
const fetcherType: FetcherType = parseChoice('--fetcher', getArg('--fetcher') ?? 'http', FETCHER_TYPES);
const returnHtml = getFlag('--html');
const waitSelector = getArg('--wait');

const timeout = Number.parseInt(getArg('--timeout') ?? '30000', 10);
// NaN would be serialized as `null` in the JSON payload, so reject it here.
if (Number.isNaN(timeout) || timeout < 0) {
  console.error('Invalid value for --timeout: expected a non-negative number of milliseconds');
  process.exit(1);
}

const outputFormat: OutputFormat = parseChoice('--format', getArg('--format') ?? 'pretty', OUTPUT_FORMATS);

// Precedence: --service flag, then SCRAPLING_SERVICE_URL, then the local default.
// A single trailing slash is removed so paths can be appended with a leading '/'.
const serviceUrl = (getArg('--service') ?? process.env.SCRAPLING_SERVICE_URL ?? 'http://localhost:8765').replace(/\/$/, '');
const apiKey = process.env.SCRAPLING_API_KEY ?? '';
|
|
|
|
// Convert every `--selector name:css` argument into a selector entry for the
// request payload. The split happens at the FIRST colon, so the CSS part may
// itself contain colons (e.g. `link:a:hover`).
const rawSelectors = getArgs('--selector');
const selectors = rawSelectors.map((raw) => {
  const colonIdx = raw.indexOf(':');
  // `<= 0` rejects both a missing colon and an empty name (":sel");
  // a colon in last position means the selector part is empty ("name:").
  if (colonIdx <= 0 || colonIdx === raw.length - 1) {
    console.error(`Invalid selector format: "${raw}". Expected "name:selector"`);
    process.exit(1);
  }
  return {
    name: raw.slice(0, colonIdx),
    selector: raw.slice(colonIdx + 1),
    selector_type: 'css' as const,
    multiple: false,
  };
});
|
|
|
|
// ── Minimal HTTP client ───────────────────────────────────────────────────────
|
|
|
|
/**
 * POSTs `body` as JSON and resolves with the decoded response.
 *
 * @param reqUrl - Absolute `http:` or `https:` URL; the transport is chosen from its protocol.
 * @param body - Value serialized with `JSON.stringify` and sent as the request body.
 * @param headers - Extra request headers; they are applied last and therefore
 *                  override the default `Content-Type` / `Content-Length`.
 * @param timeoutMs - Optional socket idle timeout in milliseconds. `0` (the
 *                    default) disables the timeout.
 * @returns The parsed JSON body, or the raw text when the body is not valid JSON.
 * @throws Rejects with an `Error` carrying a numeric `statusCode` property for
 *         HTTP status >= 400, and with the underlying error on connection,
 *         response-stream or timeout failures.
 */
function postJson(
  reqUrl: string,
  body: unknown,
  headers: Record<string, string>,
  timeoutMs = 0,
): Promise<unknown> {
  return new Promise((resolve, reject) => {
    const bodyStr = JSON.stringify(body);
    const parsed = new URL(reqUrl);
    const isHttps = parsed.protocol === 'https:';
    const transport = isHttps ? https : http;

    const req = transport.request(
      {
        hostname: parsed.hostname,
        port: parsed.port || (isHttps ? 443 : 80),
        path: parsed.pathname + parsed.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          // Byte length, not string length: the body may contain multi-byte characters.
          'Content-Length': Buffer.byteLength(bodyStr).toString(),
          ...headers,
        },
      },
      (res) => {
        const chunks: Buffer[] = [];
        res.on('data', (c: Buffer) => chunks.push(c));
        // Without this handler a connection dropped mid-body raises an
        // unhandled 'error' event instead of rejecting the promise.
        res.on('error', reject);
        res.on('end', () => {
          const text = Buffer.concat(chunks).toString('utf-8');
          if (res.statusCode && res.statusCode >= 400) {
            reject(Object.assign(new Error(`HTTP ${res.statusCode}: ${text}`), { statusCode: res.statusCode }));
            return;
          }
          try {
            resolve(JSON.parse(text));
          } catch {
            // Not JSON: hand the raw text back and let the caller decide.
            resolve(text);
          }
        });
      },
    );

    req.on('error', reject);
    if (timeoutMs > 0) {
      // destroy(err) emits 'error' on the request, which rejects the promise.
      req.setTimeout(timeoutMs, () => {
        req.destroy(new Error(`Request timed out after ${timeoutMs}ms`));
      });
    }
    req.write(bodyStr);
    req.end();
  });
}
|
|
|
|
// ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Shape this script expects from the service's `/scrape` endpoint.
 *
 * NOTE(review): the field list mirrors what this script reads; confirm the
 * exact contract against the service's own schema.
 */
interface ScrapeResponse {
  /** URL reported back by the service. */
  url: string;
  /** Status code reported by the service; printed as "Status". */
  status_code: number;
  /** Raw HTML, when present; only a 500-character preview is printed. */
  html?: string;
  /** Extracted values keyed by name; presumably the selector names — verify against the service. */
  data: Record<string, unknown>;
  /** Fetcher reported by the service; printed as "Fetcher". */
  fetcher_used: string;
  /** Elapsed time reported by the service, printed with an `ms` suffix. */
  elapsed_ms: number;
  /** Error message, when the service reports a failure in the response body. */
  error?: string;
}
|
|
|
|
/**
 * Sends one scrape request to the service and prints the result.
 *
 * - `--format json`: stdout receives only the JSON document (the banner goes
 *   to stderr) so the output can be piped into other tools.
 * - `--format pretty`: prints status, fetcher, elapsed time, extracted data
 *   and, when present, a 500-character HTML preview.
 *
 * The process exit code is set to 1 when the service reports an error in its
 * response body.
 *
 * @throws When the request fails or the service answers with something that
 *         is not a JSON object.
 */
async function main(): Promise<void> {
  const jsonMode = outputFormat === 'json';

  // Informational lines must not pollute stdout in JSON mode.
  const banner = (line = ''): void => {
    if (jsonMode) console.error(line);
    else console.log(line);
  };

  banner(`Service: ${serviceUrl}`);
  banner(`URL: ${url}`);
  banner(`Fetcher: ${fetcherType}`);
  if (selectors.length) banner(`Selectors: ${selectors.map((s) => `${s.name}:${s.selector}`).join(', ')}`);
  banner();

  const payload: Record<string, unknown> = {
    url,
    fetcher_type: fetcherType,
    return_html: returnHtml,
    timeout,
    selectors,
  };
  if (waitSelector) payload.wait_selector = waitSelector;

  const requestHeaders: Record<string, string> = {};
  if (apiKey) requestHeaders['X-API-Key'] = apiKey;

  const raw = await postJson(`${serviceUrl}/scrape`, payload, requestHeaders);
  // postJson falls back to plain text for non-JSON bodies; treating such a
  // value as a ScrapeResponse would only print `undefined` fields.
  if (typeof raw !== 'object' || raw === null) {
    throw new Error(`Unexpected non-JSON response from service: ${String(raw).slice(0, 200)}`);
  }
  const response = raw as ScrapeResponse;

  if (jsonMode) {
    console.log(JSON.stringify(response, null, 2));
    if (response.error) process.exitCode = 1;
    return;
  }

  // Pretty output
  console.log(`Status: ${response.status_code}`);
  console.log(`Fetcher: ${response.fetcher_used}`);
  console.log(`Elapsed: ${response.elapsed_ms}ms`);

  if (response.error) {
    console.error(`\nError: ${response.error}`);
    process.exitCode = 1;
    return;
  }

  // `data` is typed as required, but the value comes straight off the wire.
  const entries = Object.entries(response.data ?? {});
  if (entries.length > 0) {
    console.log('\nExtracted data:');
    console.log('─'.repeat(50));
    for (const [key, val] of entries) {
      let display: string;
      if (Array.isArray(val)) {
        display = `[${val.length} items]`;
      } else if (typeof val === 'object' && val !== null) {
        // String(obj) would print "[object Object]".
        display = JSON.stringify(val);
      } else {
        display = String(val);
      }
      console.log(` ${key.padEnd(25)} ${display}`);
    }
  }

  if (response.html) {
    console.log('\nHTML preview (first 500 chars):');
    console.log('─'.repeat(50));
    console.log(response.html.slice(0, 500));
  }
}
|
|
|
|
// Entry point: report any failure from main() and exit with a non-zero status.
main().catch((err: unknown) => {
  // Narrow instead of asserting: a rejection reason may be any value, and
  // reading `.message` on null/undefined would throw inside this handler.
  const detail = err instanceof Error ? err.message : err;
  console.error('\nError:', detail);
  process.exit(1);
});
|