#!/usr/bin/env python3 """Convert a PDF to an HTML presentation. Each page is rendered as a PNG image (via pdftoppm). Supports external assets mode for large files to avoid huge single-file HTML. Requirements: poppler-utils (pdftoppm) """ import argparse import base64 import glob import os import subprocess import sys import tempfile from pathlib import Path def get_page_count(pdf_path): """Get page count using pdfinfo if available.""" try: result = subprocess.run(["pdfinfo", pdf_path], capture_output=True, text=True) for line in result.stdout.splitlines(): if line.startswith("Pages:"): return int(line.split(":")[1].strip()) except: pass return None def convert(pdf_path: str, output_path: str | None = None, dpi: int = 150, external_assets=None): pdf_path = str(Path(pdf_path).resolve()) if not Path(pdf_path).exists(): print(f"Error: {pdf_path} not found") sys.exit(1) if subprocess.run(["which", "pdftoppm"], capture_output=True).returncode != 0: print("Error: pdftoppm not found. Install poppler-utils:") print(" apt install poppler-utils # Debian/Ubuntu") print(" brew install poppler # macOS") sys.exit(1) file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024) if file_size_mb > 150: print(f"WARNING: PDF is {file_size_mb:.0f}MB — conversion may be slow and memory-intensive.") page_count = get_page_count(pdf_path) # Auto-detect external assets mode if external_assets is None: external_assets = file_size_mb > 20 or (page_count is not None and page_count > 50) if external_assets: print(f"Auto-enabling external assets mode (file: {file_size_mb:.1f}MB, pages: {page_count or 'unknown'})") output = output_path or str(Path(pdf_path).with_suffix('.html')) output_dir = Path(output).parent if external_assets: assets_dir = output_dir / "assets" assets_dir.mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory() as tmpdir: prefix = os.path.join(tmpdir, "page") result = subprocess.run( ["pdftoppm", "-png", "-r", str(dpi), pdf_path, prefix], capture_output=True, text=True ) if result.returncode != 0: print(f"Error converting PDF: {result.stderr}") sys.exit(1) pages = sorted(glob.glob(f"{prefix}-*.png")) if not pages: print("Error: No pages rendered from PDF") sys.exit(1) slides_html = [] for i, page_path in enumerate(pages, 1): with open(page_path, "rb") as f: page_bytes = f.read() if external_assets: img_name = f"img-{i:03d}.png" (assets_dir / img_name).write_bytes(page_bytes) src = f"assets/{img_name}" else: b64 = base64.b64encode(page_bytes).decode() src = f"data:image/png;base64,{b64}" slides_html.append( f'' ) title = Path(pdf_path).stem.replace("-", " ").replace("_", " ") html = f'''