#!/usr/bin/env python3
"""Convert a PPTX file to an HTML presentation with formatting preserved.

Supports external assets mode for large files to avoid huge single-file HTML.
"""

import argparse
import base64
import html
import io
import os
import re
import sys
from pathlib import Path


def _ensure_pptx():
    """Check that python-pptx can be imported.

    Returns:
        True when the import succeeds.

    On ImportError an install hint is printed and the process exits with
    status 1.
    """
    try:
        from pptx import Presentation  # noqa: F401
        from pptx.enum.text import PP_ALIGN  # noqa: F401
        return True
    except ImportError:
        print("ERROR: python-pptx not installed. Install with: pip install python-pptx")
        sys.exit(1)


def rgb_to_hex(rgb_color):
    """Format a colour value as ``#<value>``.

    Args:
        rgb_color: Any object whose string form is the hex digits, or None.

    Returns:
        The value's string form prefixed with ``#``, or None when the input
        is None or cannot be formatted.
    """
    if rgb_color is None:
        return None
    try:
        return f"#{rgb_color}"
    except Exception:
        return None


def get_text_style(run):
    """Build an inline CSS declaration string from a run's font settings.

    Args:
        run: Object exposing ``font`` with ``bold``, ``italic``,
            ``underline``, ``size`` (with ``.pt``), ``color`` (with ``.rgb``)
            and ``name``.

    Returns:
        Declarations joined with ``;`` (no trailing semicolon); an empty
        string when nothing applies.
    """
    try:
        font = run.font
    except Exception:
        return ""

    # Each declaration is probed on its own so that a property which raises
    # when read does not discard the declarations that come after it.
    builders = (
        lambda: "font-weight:bold" if font.bold else None,
        lambda: "font-style:italic" if font.italic else None,
        lambda: "text-decoration:underline" if font.underline else None,
        lambda: f"font-size:{font.size.pt}pt" if font.size else None,
        lambda: (
            f"color:{rgb_to_hex(font.color.rgb)}"
            if font.color and font.color.rgb
            else None
        ),
        lambda: f"font-family:'{font.name}',sans-serif" if font.name else None,
    )

    styles = []
    for build in builders:
        try:
            declaration = build()
        except Exception:
            continue
        if declaration:
            styles.append(declaration)
    return ";".join(styles)


def get_alignment(paragraph):
    """Map a paragraph's alignment to a CSS ``text-align`` keyword.

    Returns:
        ``"center"``, ``"right"`` or ``"justify"`` for the matching PP_ALIGN
        members; ``"left"`` for anything else or when the lookup fails.

    Raises:
        ImportError: if python-pptx is not installed (the import is
            deliberately outside the best-effort block).
    """
    from pptx.enum.text import PP_ALIGN

    try:
        align = paragraph.alignment
        if align == PP_ALIGN.CENTER:
            return "center"
        if align == PP_ALIGN.RIGHT:
            return "right"
        if align == PP_ALIGN.JUSTIFY:
            return "justify"
    except Exception:
        pass
    return "left"


def get_shape_position(shape, slide_width, slide_height):
    """Express a shape's geometry as percentages of the slide size.

    Args:
        shape: Object with ``left``, ``top``, ``width`` and ``height``.
        slide_width: Slide width in the same unit as the shape geometry.
        slide_height: Slide height in the same unit as the shape geometry.

    Returns:
        ``(left, top, width, height)`` in percent. A falsy ``left``/``top``
        becomes 0, a falsy ``width`` becomes 50 and a falsy ``height``
        becomes 30. If anything raises (missing attribute, zero slide
        size), the fallback ``(5, 5, 90, 40)`` is returned.
    """
    try:
        left = (shape.left / slide_width) * 100 if shape.left else 0
        top = (shape.top / slide_height) * 100 if shape.top else 0
        width = (shape.width / slide_width) * 100 if shape.width else 50
        height = (shape.height / slide_height) * 100 if shape.height else 30
        return left, top, width, height
    except Exception:
        return 5, 5, 90, 40


def get_slide_background(slide, prs):
    """Return a CSS ``background-color`` declaration for a slide.

    The slide's own background is searched first, then its layout's, for
    the first solid fill carrying an sRGB colour value.

    Args:
        slide: Slide object exposing ``background`` and ``slide_layout``.
        prs: Accepted for interface compatibility; not used here.

    Returns:
        ``"background-color:#RRGGBB"`` for the first colour found, otherwise
        ``"background-color:#ffffff"``.
    """
    from pptx.oxml.ns import qn

    for source in [slide, slide.slide_layout]:
        try:
            bg_el = source.background._element
            for solid_fill in bg_el.iter(qn('a:solidFill')):
                clr = solid_fill.find(qn('a:srgbClr'))
                if clr is not None and clr.get('val'):
                    return f"background-color:#{clr.get('val')}"
        except Exception:
            # Best effort: fall through to the next source / the default.
            pass
    return "background-color:#ffffff"


def get_shape_fill(shape):
    """Return a shape's solid fill colour as ``#RRGGBB``, or None.

    Looks for the shape-properties child of ``shape._element`` under several
    tag spellings, then for a direct solid fill with an sRGB colour value.
    Any error results in None.
    """
    from pptx.oxml.ns import qn

    try:
        # Same candidates, in the same order, as the original lookups.
        candidate_tags = (
            qn('p:spPr'),
            qn('a:spPr'),
            '{http://schemas.openxmlformats.org/drawingml/2006/main}spPr',
            '{http://schemas.openxmlformats.org/presentationml/2006/main}spPr',
        )
        sp_pr = None
        for tag in candidate_tags:
            sp_pr = shape._element.find(tag)
            if sp_pr is not None:
                break

        if sp_pr is not None:
            solid_fill = sp_pr.find(qn('a:solidFill'))
            if solid_fill is not None:
                clr = solid_fill.find(qn('a:srgbClr'))
                if clr is not None and clr.get('val'):
                    return f"#{clr.get('val')}"
    except Exception:
        pass
    return None


def render_run(run):
    """Render one text run as HTML-escaped text, styled when it has styling.

    Args:
        run: Object with ``text`` and ``font`` (see ``get_text_style``).

    Returns:
        An empty string for an empty run; the escaped text wrapped in a
        ``<span style="...">`` when the run has style declarations;
        otherwise just the escaped text.
    """
    text = run.text
    if not text:
        return ""
    escaped = html.escape(text, quote=False)
    style = get_text_style(run)
    if not style:
        return escaped
    # The style may contain quotes (font-family names), so escape it for use
    # inside a double-quoted attribute.
    return f'<span style="{html.escape(style, quote=True)}">{escaped}</span>'


def render_paragraph(paragraph):
    """Render a paragraph's runs as a single aligned ``<p>`` element.

    Run text is HTML-escaped and run styling is emitted as inline CSS; the
    paragraph alignment is emitted as ``text-align`` on the ``<p>``.

    Returns:
        The paragraph markup, or an empty string when no run has text.
    """
    align = get_alignment(paragraph)
    parts = [piece for piece in map(render_run, paragraph.runs) if piece]
    if not parts:
        return ""
    content = "".join(parts)
    return f'<p style="text-align:{align}">{content}</p>'


def extract_image_data(shape):
    """Extract raw image bytes and content type from a shape.

    Returns:
        ``(blob, content_type)``, or ``(None, None)`` when the shape's image
        cannot be read.
    """
    try:
        image = shape.image
        return image.blob, image.content_type
    except Exception:
        return None, None


def count_images(prs):
    """Count total images across all slides.

    A shape is counted when reading its ``image`` attribute succeeds. Shapes
    whose attributes raise when inspected are skipped instead of aborting
    the count.
    """
    count = 0
    for slide in prs.slides:
        for shape in slide.shapes:
            try:
                _ = shape.image
            except Exception:
                continue
            count += 1
    return count


# File extension used for an extracted image, keyed by its MIME content type.
CONTENT_TYPE_TO_EXT = {
    'image/png': '.png',
    'image/jpeg': '.jpg',
    'image/jpg': '.jpg',
    'image/gif': '.gif',
    'image/bmp': '.bmp',
    'image/tiff': '.tiff',
    'image/svg+xml': '.svg',
    'image/webp': '.webp',
}


def convert(pptx_path, output_path=None, external_assets=None):
    """Convert a PPTX file to HTML.

    NOTE(review): only the opening part of this function is present in the
    reviewed text; see the note at the end of the body.

    Args:
        pptx_path: Path of the presentation to read.
        output_path: Destination HTML path; defaults to ``pptx_path`` with an
            ``.html`` suffix.
        external_assets: True to write images into an ``assets`` directory
            next to the output, False to inline them as data URIs, None to
            decide automatically (file larger than 20 MB or more than 50
            images).
    """
    _ensure_pptx()
    from pptx import Presentation

    file_size_mb = os.path.getsize(pptx_path) / (1024 * 1024)

    # Pre-flight warning for very large files
    if file_size_mb > 150:
        print(f"WARNING: File is {file_size_mb:.0f}MB — consider using PDF conversion (convert-pdf.py) for better performance.")

    prs = Presentation(pptx_path)
    slide_width = prs.slide_width
    slide_height = prs.slide_height
    aspect_ratio = slide_width / slide_height if slide_height else 16/9

    total_images = count_images(prs)

    # Auto-detect external assets mode
    if external_assets is None:
        external_assets = file_size_mb > 20 or total_images > 50
        if external_assets:
            print(f"Auto-enabling external assets mode (file: {file_size_mb:.1f}MB, images: {total_images})")

    output = output_path or str(Path(pptx_path).with_suffix('.html'))
    output_dir = Path(output).parent

    if external_assets:
        assets_dir = output_dir / "assets"
        assets_dir.mkdir(parents=True, exist_ok=True)

    img_counter = 0
    slides_html = []

    for i, slide in enumerate(prs.slides, 1):
        bg_style = get_slide_background(slide, prs)
        elements = []

        for shape in sorted(slide.shapes, key=lambda s: (s.top or 0, s.left or 0)):
            left, top, width, height = get_shape_position(shape, slide_width, slide_height)
            pos_style = f"position:absolute;left:{left:.1f}%;top:{top:.1f}%;width:{width:.1f}%;height:{height:.1f}%"

            # Image
            if shape.shape_type == 13 or hasattr(shape, "image"):
                blob, content_type = extract_image_data(shape)
                if blob:
                    img_counter += 1
                    if external_assets:
                        ext = CONTENT_TYPE_TO_EXT.get(content_type, '.png')
                        img_name = f"img-{img_counter:03d}{ext}"
                        (assets_dir / img_name).write_bytes(blob)
                        src = f"assets/{img_name}"
                    else:
                        b64 = base64.b64encode(blob).decode('utf-8')
                        src = f"data:{content_type};base64,{b64}"
                    # NOTE(review): the reviewed text is garbled and then
                    # truncated from this point on (the markup appended to
                    # `elements` for the image, the table/text handling and
                    # the output writing are missing). Nothing has been
                    # invented for the missing part; restore it from the
                    # original source before running this function.