feat: initial commit
This commit is contained in:
213
scripts/scrapling-run.ts
Normal file
213
scripts/scrapling-run.ts
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env ts-node
|
||||
/**
|
||||
* Standalone Scrapling CLI runner — no n8n needed.
|
||||
*
|
||||
* Usage:
|
||||
* npx ts-node scripts/scrapling-run.ts <url> [options]
|
||||
*
|
||||
* Options:
|
||||
* --fetcher http|stealth|dynamic (default: http)
|
||||
* --selector <name>:<css-selector> (can be repeated)
|
||||
* --html include raw HTML in output
|
||||
* --wait <css-selector> wait for selector (dynamic only)
|
||||
* --timeout <ms> (default: 30000)
|
||||
* --format pretty|json (default: pretty)
|
||||
* --service <url> service URL override
|
||||
*
|
||||
* Credentials (from .env.test or environment):
|
||||
* SCRAPLING_SERVICE_URL default: http://localhost:8765
|
||||
* SCRAPLING_API_KEY optional
|
||||
*
|
||||
* Examples:
|
||||
* npx ts-node scripts/scrapling-run.ts https://example.com
|
||||
* npx ts-node scripts/scrapling-run.ts https://news.ycombinator.com --fetcher stealth --selector "title:title"
|
||||
* npx ts-node scripts/scrapling-run.ts https://spa.example.com --fetcher dynamic --wait "#app"
|
||||
*/
|
||||
|
||||
import * as https from 'https';
|
||||
import * as http from 'http';
|
||||
import * as path from 'path';
|
||||
import * as fs from 'fs';
|
||||
|
||||
// ── Load .env.test if present ─────────────────────────────────────────────────
|
||||
|
||||
const envFile = path.resolve(__dirname, '..', '.env.test');
|
||||
if (fs.existsSync(envFile)) {
|
||||
const lines = fs.readFileSync(envFile, 'utf-8').split('\n');
|
||||
for (const line of lines) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed || trimmed.startsWith('#')) continue;
|
||||
const eq = trimmed.indexOf('=');
|
||||
if (eq === -1) continue;
|
||||
const key = trimmed.slice(0, eq).trim();
|
||||
const value = trimmed.slice(eq + 1).trim().replace(/^["']|["']$/g, '');
|
||||
if (!process.env[key]) process.env[key] = value;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Parse CLI args ────────────────────────────────────────────────────────────
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
const url = args.find((a) => !a.startsWith('--'));
|
||||
|
||||
if (!url) {
|
||||
console.error('Usage: scrapling-run.ts <url> [--fetcher http|stealth|dynamic] [--selector name:selector] ...');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
function getArg(flag: string): string | undefined {
|
||||
const idx = args.indexOf(flag);
|
||||
return idx !== -1 ? args[idx + 1] : undefined;
|
||||
}
|
||||
|
||||
function getFlag(flag: string): boolean {
|
||||
return args.includes(flag);
|
||||
}
|
||||
|
||||
function getArgs(flag: string): string[] {
|
||||
const result: string[] = [];
|
||||
for (let i = 0; i < args.length; i++) {
|
||||
if (args[i] === flag && args[i + 1]) {
|
||||
result.push(args[i + 1]);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
const fetcherType = (getArg('--fetcher') ?? 'http') as 'http' | 'stealth' | 'dynamic';
|
||||
const returnHtml = getFlag('--html');
|
||||
const waitSelector = getArg('--wait');
|
||||
const timeout = parseInt(getArg('--timeout') ?? '30000', 10);
|
||||
const outputFormat = (getArg('--format') ?? 'pretty') as 'pretty' | 'json';
|
||||
const serviceUrl = (getArg('--service') ?? process.env.SCRAPLING_SERVICE_URL ?? 'http://localhost:8765').replace(/\/$/, '');
|
||||
const apiKey = process.env.SCRAPLING_API_KEY ?? '';
|
||||
|
||||
const rawSelectors = getArgs('--selector');
|
||||
const selectors = rawSelectors.map((raw) => {
|
||||
const colonIdx = raw.indexOf(':');
|
||||
if (colonIdx === -1) {
|
||||
console.error(`Invalid selector format: "${raw}". Expected "name:selector"`);
|
||||
process.exit(1);
|
||||
}
|
||||
return {
|
||||
name: raw.slice(0, colonIdx),
|
||||
selector: raw.slice(colonIdx + 1),
|
||||
selector_type: 'css' as const,
|
||||
multiple: false,
|
||||
};
|
||||
});
|
||||
|
||||
// ── Minimal HTTP client ───────────────────────────────────────────────────────
|
||||
|
||||
function postJson(reqUrl: string, body: unknown, headers: Record<string, string>): Promise<unknown> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const bodyStr = JSON.stringify(body);
|
||||
const parsed = new URL(reqUrl);
|
||||
const isHttps = parsed.protocol === 'https:';
|
||||
const transport = isHttps ? https : http;
|
||||
|
||||
const req = transport.request(
|
||||
{
|
||||
hostname: parsed.hostname,
|
||||
port: parsed.port || (isHttps ? 443 : 80),
|
||||
path: parsed.pathname + parsed.search,
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'Content-Length': Buffer.byteLength(bodyStr).toString(),
|
||||
...headers,
|
||||
},
|
||||
},
|
||||
(res) => {
|
||||
const chunks: Buffer[] = [];
|
||||
res.on('data', (c: Buffer) => chunks.push(c));
|
||||
res.on('end', () => {
|
||||
const text = Buffer.concat(chunks).toString('utf-8');
|
||||
if (res.statusCode && res.statusCode >= 400) {
|
||||
reject(Object.assign(new Error(`HTTP ${res.statusCode}: ${text}`), { statusCode: res.statusCode }));
|
||||
} else {
|
||||
try {
|
||||
resolve(JSON.parse(text));
|
||||
} catch {
|
||||
resolve(text);
|
||||
}
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
req.on('error', reject);
|
||||
req.write(bodyStr);
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
// ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
/** Shape of the Scrapling service's POST /scrape response body. */
interface ScrapeResponse {
  // The URL that was fetched.
  url: string;
  // HTTP status of the upstream fetch (printed as "Status:" in pretty mode).
  status_code: number;
  // Raw page HTML — presumably only populated when return_html was requested; confirm against service.
  html?: string;
  // Extracted values keyed by selector name.
  data: Record<string, unknown>;
  // Which fetcher the service actually used (may differ from the one requested).
  fetcher_used: string;
  // Service-side fetch duration in milliseconds.
  elapsed_ms: number;
  // Populated when the scrape failed on the service side.
  error?: string;
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
console.log(`Service: ${serviceUrl}`);
|
||||
console.log(`URL: ${url}`);
|
||||
console.log(`Fetcher: ${fetcherType}`);
|
||||
if (selectors.length) console.log(`Selectors: ${selectors.map((s) => `${s.name}:${s.selector}`).join(', ')}`);
|
||||
console.log();
|
||||
|
||||
const payload: Record<string, unknown> = {
|
||||
url,
|
||||
fetcher_type: fetcherType,
|
||||
return_html: returnHtml,
|
||||
timeout,
|
||||
selectors,
|
||||
};
|
||||
|
||||
if (waitSelector) payload.wait_selector = waitSelector;
|
||||
|
||||
const requestHeaders: Record<string, string> = {};
|
||||
if (apiKey) requestHeaders['X-API-Key'] = apiKey;
|
||||
|
||||
const response = (await postJson(`${serviceUrl}/scrape`, payload, requestHeaders)) as ScrapeResponse;
|
||||
|
||||
if (outputFormat === 'json') {
|
||||
console.log(JSON.stringify(response, null, 2));
|
||||
return;
|
||||
}
|
||||
|
||||
// Pretty output
|
||||
console.log(`Status: ${response.status_code}`);
|
||||
console.log(`Fetcher: ${response.fetcher_used}`);
|
||||
console.log(`Elapsed: ${response.elapsed_ms}ms`);
|
||||
|
||||
if (response.error) {
|
||||
console.error(`\nError: ${response.error}`);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Object.keys(response.data).length > 0) {
|
||||
console.log('\nExtracted data:');
|
||||
console.log('─'.repeat(50));
|
||||
for (const [key, val] of Object.entries(response.data)) {
|
||||
const display = Array.isArray(val) ? `[${(val as unknown[]).length} items]` : String(val);
|
||||
console.log(` ${key.padEnd(25)} ${display}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (response.html) {
|
||||
console.log('\nHTML preview (first 500 chars):');
|
||||
console.log('─'.repeat(50));
|
||||
console.log(response.html.slice(0, 500));
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error('\nError:', (err as Error).message ?? err);
|
||||
process.exit(1);
|
||||
});
|
||||
Reference in New Issue
Block a user