htmlpdfs

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 17, 2025

Commit

649e096

verified ·

1 Parent(s): fe2c4d2

Upload 6 files

Browse files

Files changed (3) hide show

app.py +99 -74
puppeteer_pdf.js +129 -169
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,16 +1,18 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.responses import Response
 from fastapi.middleware.cors import CORSMiddleware
 import subprocess
 import os
 import tempfile
 import shutil
 import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-app = FastAPI(title="HTML to PDF API", version="4.0.0") # Version bump
 app.add_middleware(
     CORSMiddleware,
@@ -20,95 +22,118 @@ app.add_middleware(
     allow_headers=["*"],
 )
-def convert_to_pdf_with_puppeteer(html_content: str) -> bytes:
-    """
-    Converts an HTML string to a PDF using a sophisticated Puppeteer script.
-    This function saves the HTML to a temporary file, which is crucial for
-    allowing Puppeteer to load local assets like images that have relative paths.
-    """
-    temp_dir = tempfile.mkdtemp()
-    try:
-        html_file_path = os.path.join(temp_dir, "input.html")
-        with open(html_file_path, 'w', encoding='utf-8') as f:
-            f.write(html_content)
-        logger.info(f"HTML content saved to temporary file: {html_file_path}")
-        # The puppeteer_pdf.js script is now called with the path to the HTML file.
-        # This allows it to correctly resolve relative paths for images and other assets.
-        result = subprocess.run(
-            ['node', '/app/puppeteer_pdf.js', html_file_path],
-            capture_output=True,
-            text=True,
-            timeout=90,  # Increased timeout for potentially complex pages
-            cwd='/app' # Ensure node script runs from the correct directory
-        )
-        if result.returncode != 0:
-            logger.error(f"Puppeteer script failed. STDERR: {result.stderr}")
-            raise Exception(f"PDF conversion failed: {result.stderr}")
-        logger.info(f"Puppeteer script executed successfully. STDOUT: {result.stdout}")
-        pdf_file_path = html_file_path.replace('.html', '.pdf')
-        if not os.path.exists(pdf_file_path):
-            raise FileNotFoundError(f"PDF file was not generated at the expected path: {pdf_file_path}")
-        with open(pdf_file_path, 'rb') as f:
-            pdf_bytes = f.read()
-        logger.info(f"Successfully read {len(pdf_bytes)} bytes from PDF file.")
-        return pdf_bytes
-    finally:
-        # Clean up the temporary directory
-        shutil.rmtree(temp_dir, ignore_errors=True)
-        logger.info(f"Temporary directory {temp_dir} removed.")
-@app.post("/convert")
-async def convert(html_file: UploadFile = File(...)):
     """
-    Convert an uploaded HTML file to a PDF.
-    This endpoint now intelligently handles both single-page and multi-page
-    HTML documents, ensuring that styling, page breaks, and local images
-    are rendered correctly.
     """
     try:
-        html_content = (await html_file.read()).decode('utf-8')
-        pdf_bytes = convert_to_pdf_with_puppeteer(html_content)
-        filename = os.path.splitext(html_file.filename)[0] if html_file.filename else "output"
         return Response(
             content=pdf_bytes,
-            media_type="application/pdf",
-            headers={"Content-Disposition": f"attachment; filename={filename}.pdf"}
         )
-    except FileNotFoundError as e:
-        logger.error(f"File not found error during conversion: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"PDF file could not be generated or found: {str(e)}")
     except Exception as e:
-        logger.error(f"An unexpected error occurred during conversion: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
-@app.get("/")
 async def root():
     return {
-        "service": "HTML to PDF Converter",
-        "version": "4.0.0",
-        "usage": "POST your HTML file to /convert. The service now auto-detects multi-page vs. single-page layouts."
     }
-@app.get("/health")
 async def health():
-    return {"status": "healthy"}
-if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)

+from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+from fastapi.responses import Response, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import subprocess
 import os
 import tempfile
 import shutil
 import logging
+import aiohttp
+import base64
+# Log at INFO level and above; all messages in this module go through `logger`.
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# NOTE(review): the line this replaces declared version "4.0.0"; confirm that
+# going back to "3.1.0" is intentional.
+app = FastAPI(title="HTML to PDF API", version="3.1.0")
 app.add_middleware(
     CORSMiddleware,
     allow_headers=["*"],
 )
+async def fetch_url(url: str) -> str:
+    """Fetch HTML content from a URL asynchronously."""
+    timeout = aiohttp.ClientTimeout(total=30)
+    async with aiohttp.ClientSession(timeout=timeout) as session:
+        async with session.get(url) as resp:
+            if resp.status != 200:
+                raise Exception(f"Failed to fetch URL: {url} status={resp.status}")
+            return await resp.text()
+def run_puppeteer(html_path: str, aspect_ratio: str = 'auto', mode: str = 'auto') -> str:
+    """Run the puppeteer script to convert the given html file to pdf.
+    Returns path to generated PDF.
     """
+    # Use the local puppeteer script in the repo
+    script_path = os.path.join(os.getcwd(), 'puppeteer_pdf.js')
+    if not os.path.exists(script_path):
+        raise Exception('puppeteer_pdf.js not found in working directory')
+    # Build argv: [node puppeteer_pdf.js htmlFile aspect_ratio mode]
+    cmd = ['node', script_path, html_path, aspect_ratio or 'auto', mode or 'auto']
+    logger.info('Running command: %s', ' '.join(cmd))
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+    logger.info('stdout: %s', result.stdout)
+    logger.info('stderr: %s', result.stderr)
+    if result.returncode != 0:
+        raise Exception(f'Puppeteer failed: {result.stderr}')
+    pdf_path = html_path.replace('.html', '.pdf')
+    if not os.path.exists(pdf_path):
+        raise Exception('Expected output PDF not found')
+    return pdf_path
+@app.post('/convert')
+async def convert(
+    html_file: UploadFile | None = File(None),
+    html: str | None = Form(None),
+    url: str | None = Form(None),
+    aspect_ratio: str = Form('auto'),
+    mode: str = Form('auto'),
+    output: str = Form('pdf')  # 'pdf' or 'base64'
+):
+    """Convert HTML to PDF.
+    Input can be one of:
+    - file upload (html_file)
+    - raw html string (html)
+    - url to fetch (url)
+    Options:
+    - aspect_ratio: 16:9 | 9:16 | 1:1 | auto
+    - mode: auto | single | multi
+    - output: pdf | base64
     """
+    tmpdir = tempfile.mkdtemp()
+    html_path = os.path.join(tmpdir, 'input.html')
     try:
+        # Determine input source
+        if html_file is not None:
+            content = (await html_file.read()).decode('utf-8')
+        elif html is not None:
+            content = html
+        elif url is not None:
+            content = await fetch_url(url)
+        else:
+            raise HTTPException(status_code=400, detail='No html_file, html, or url provided')
+        # Save to temp html file
+        with open(html_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        # Call puppeteer
+        pdf_path = run_puppeteer(html_path, aspect_ratio=aspect_ratio, mode=mode)
+        with open(pdf_path, 'rb') as f:
+            pdf_bytes = f.read()
+        if output == 'base64':
+            b64 = base64.b64encode(pdf_bytes).decode('ascii')
+            return JSONResponse({'pdf_base64': b64})
         return Response(
             content=pdf_bytes,
+            media_type='application/pdf',
+            headers={'Content-Disposition': 'attachment; filename=output.pdf'}
         )
     except Exception as e:
+        logger.exception('Conversion failed')
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+@app.get('/')
 async def root():
     return {
+        'service': 'HTML to PDF Converter',
+        'version': '3.1.0',
+        'usage': 'POST to /convert with html_file or html or url; params: aspect_ratio, mode, output'
     }
+@app.get('/health')
 async def health():
+    return {'status': 'healthy'}
+# When executed as a script, serve the app with uvicorn on 0.0.0.0:7860.
+if __name__ == '__main__':
     import uvicorn
+    uvicorn.run(app, host='0.0.0.0', port=7860)

puppeteer_pdf.js CHANGED Viewed

@@ -1,205 +1,165 @@
 #!/usr/bin/env node
 const puppeteer = require('puppeteer');
 const fs = require('fs');
-const path = require('path');
-const [htmlFilePath] = process.argv.slice(2);
-if (!htmlFilePath) {
-    console.error('Usage: node puppeteer_pdf.js <path_to_html_file>');
     process.exit(1);
 }
 (async () => {
     let browser;
     try {
-        console.log('🚀 Starting Puppeteer PDF generation...');
-        console.log(`  - Input file: ${htmlFilePath}`);
         browser = await puppeteer.launch({
             headless: 'new',
-            executablePath: process.env.PUPPETEEER_EXECUTABLE_PATH,
-            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
         });
         const page = await browser.newPage();
-        const absolutePath = path.resolve(htmlFilePath);
-        const fileUrl = `file://${absolutePath}`;
-        await page.goto(fileUrl, {
-            waitUntil: 'networkidle0',
-        });
-        console.log('✓ Page and network resources loaded.');
-        // Emulate print media for proper page break handling
         await page.emulateMediaType('print');
-        console.log('✓ Print media emulated.');
-        // Wait for all fonts on the page to be loaded and ready
-        await page.evaluateHandle('document.fonts.ready');
-        console.log('✓ Fonts are loaded and ready.');
-        // --- INTELLIGENT MODE DETECTION ---
-        // Detect if the HTML is designed for multiple pages
-        const isMultiPage = await page.evaluate(() => {
-            const slideElements = document.querySelectorAll('.slide, .page');
-            if (slideElements.length > 0) return true;
-            const styles = Array.from(document.styleSheets)
-                .map(s => Array.from(s.cssRules || []).map(r => r.cssText).join(''))
-                .join('');
-            return styles.includes('page-break-after') || styles.includes('page-break-before');
         });
-        const pdfPath = absolutePath.replace('.html', '.pdf');
-        if (isMultiPage) {
-            // MULTI-PAGE MODE: For reports, presentations, and documents.
-            console.log('✓ Multi-page document detected. Generating paginated PDF...');
-            await page.pdf({
-                path: pdfPath,
-                format: 'A4', // A standard format is best for paginated documents
-                printBackground: true,
-                preferCSSPageSize: true, // IMPORTANT: Respects CSS @page rules and page breaks
-                margin: { top: 0, right: 0, bottom: 0, left: 0 } // Let CSS handle margins
-            });
-            console.log(`✅ Multi-page PDF created successfully: ${pdfPath}`);
-        } else {
-            // SINGLE-PAGE MODE: For infographics, posters, or single-view content.
-            console.log('✓ Single-page content detected. Generating PDF based on content size...');
-            const dimensions = await page.evaluate(() => {
-                return {
-                    width: document.documentElement.scrollWidth,
-                    height: document.documentElement.scrollHeight,
-                };
-            });
-            console.log(`  - Detected dimensions: ${dimensions.width}px x ${dimensions.height}px`);
-            await page.pdf({
-                path: pdfPath,
-                width: `${dimensions.width}px`,
-                height: `${dimensions.height}px`,
-                printBackground: true,
-                scale: 1
-            });
-            console.log(`✅ Single-page PDF created successfully: ${pdfPath}`);
         }
-        const stats = fs.statSync(pdfPath);
-        console.log(`  - File size: ${(stats.size / 1024).toFixed(2)} KB`);
-    } catch (error) {
-        console.error('❌ An error occurred during PDF generation:', error);
-        process.exit(1);
-    } finally {
-        if (browser) {
-            await browser.close();
-            console.log('✓ Browser closed.');
         }
-    }
-})();#!/usr/bin/env node
-const puppeteer = require('puppeteer');
-const fs = require('fs');
-const path = require('path');
-const [htmlFilePath] = process.argv.slice(2);
-if (!htmlFilePath) {
-    console.error('Usage: node puppeteer_pdf.js <path_to_html_file>');
-    process.exit(1);
-}
-(async () => {
-    let browser;
-    try {
-        console.log('🚀 Starting Puppeteer PDF generation...');
-        console.log(`  - Input file: ${htmlFilePath}`);
-        browser = await puppeteer.launch({
-            headless: 'new',
-            executablePath: process.env.PUPPETEEER_EXECUTABLE_PATH,
-            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage']
-        });
-        const page = await browser.newPage();
-        const absolutePath = path.resolve(htmlFilePath);
-        const fileUrl = `file://${absolutePath}`;
-        await page.goto(fileUrl, {
-            waitUntil: 'networkidle0',
-        });
-        console.log('✓ Page and network resources loaded.');
-        // Emulate print media for proper page break handling
-        await page.emulateMediaType('print');
-        console.log('✓ Print media emulated.');
-        // Wait for all fonts on the page to be loaded and ready
-        await page.evaluateHandle('document.fonts.ready');
-        console.log('✓ Fonts are loaded and ready.');
-        // --- INTELLIGENT MODE DETECTION ---
-        // Detect if the HTML is designed for multiple pages
-        const isMultiPage = await page.evaluate(() => {
-            const slideElements = document.querySelectorAll('.slide, .page');
-            if (slideElements.length > 0) return true;
-            const styles = Array.from(document.styleSheets)
-                .map(s => Array.from(s.cssRules || []).map(r => r.cssText).join(''))
-                .join('');
-            return styles.includes('page-break-after') || styles.includes('page-break-before');
-        });
-        const pdfPath = absolutePath.replace('.html', '.pdf');
-        if (isMultiPage) {
-            // MULTI-PAGE MODE: For reports, presentations, and documents.
-            console.log('✓ Multi-page document detected. Generating paginated PDF...');
-            await page.pdf({
-                path: pdfPath,
-                format: 'A4', // A standard format is best for paginated documents
-                printBackground: true,
-                preferCSSPageSize: true, // IMPORTANT: Respects CSS @page rules and page breaks
-                margin: { top: 0, right: 0, bottom: 0, left: 0 } // Let CSS handle margins
-            });
-            console.log(`✅ Multi-page PDF created successfully: ${pdfPath}`);
         } else {
-            // SINGLE-PAGE MODE: For infographics, posters, or single-view content.
-            console.log('✓ Single-page content detected. Generating PDF based on content size...');
-            const dimensions = await page.evaluate(() => {
-                return {
-                    width: document.documentElement.scrollWidth,
-                    height: document.documentElement.scrollHeight,
-                };
-            });
-            console.log(`  - Detected dimensions: ${dimensions.width}px x ${dimensions.height}px`);
-            await page.pdf({
-                path: pdfPath,
-                width: `${dimensions.width}px`,
-                height: `${dimensions.height}px`,
-                printBackground: true,
-                scale: 1
-            });
-            console.log(`✅ Single-page PDF created successfully: ${pdfPath}`);
         }
         const stats = fs.statSync(pdfPath);
-        console.log(`  - File size: ${(stats.size / 1024).toFixed(2)} KB`);
-    } catch (error) {
-        console.error('❌ An error occurred during PDF generation:', error);
         process.exit(1);
-    } finally {
-        if (browser) {
-            await browser.close();
-            console.log('✓ Browser closed.');
-        }
     }
 })();

 #!/usr/bin/env node
 const puppeteer = require('puppeteer');
 const fs = require('fs');
+// Positional CLI arguments: the HTML file is required; aspect ratio and mode
+// both default to 'auto' when omitted.
+const [htmlFile, aspectRatioArg = 'auto', modeArg = 'auto'] = process.argv.slice(2);
+if (!htmlFile) {
+    // No input file given: print usage to stderr and exit with a failure code.
+    console.error('Usage: node puppeteer_pdf.js <html_file> [aspect_ratio] [mode]');
+    console.error('  aspect_ratio: 16:9, 1:1, 9:16, auto');
+    console.error('  mode: auto, single, multi');
     process.exit(1);
 }
+// page.pdf() size options keyed by aspect ratio, with one entry per mode:
+//   multi  - used for paginated output
+//   single - used for one-page output when the aspect ratio is given explicitly
+// Note that the '16:9' and '9:16' multi entries use A4 (landscape / portrait),
+// so paginated output follows the A4 proportions rather than the exact ratio.
+const DEFAULT_CONFIGS = {
+    '16:9': {
+        multi: { format: 'A4', landscape: true },
+        single: { width: '1920px', height: '1080px' }
+    },
+    '1:1': {
+        multi: { width: '210mm', height: '210mm' },
+        single: { width: '2100px', height: '2100px' }
+    },
+    '9:16': {
+        multi: { format: 'A4', landscape: false },
+        single: { width: '1080px', height: '1920px' }
+    }
+};
+/**
+ * Look for a CSS @page rule in the document's stylesheets.
+ *
+ * Resolves to { hasPageRule: boolean, size: string|null }: `size` is the raw
+ * text of the rule's `size` declaration, or null when the rule has none.
+ * Only the first @page rule found is reported.
+ */
+async function detectCssPageSize(page) {
+    return await page.evaluate(() => {
+        // Search for @page rules
+        try {
+            const sheets = Array.from(document.styleSheets || []);
+            for (const sheet of sheets) {
+                try {
+                    const rules = sheet.cssRules || [];
+                    for (const r of rules) {
+                        if (r.type === CSSRule.PAGE_RULE) {
+                            const css = r.cssText || '';
+                            // Capture everything after "size:" up to the next ';' or newline.
+                            const match = css.match(/size\s*:\s*([^;\n]+)/i);
+                            if (match) {
+                                return { hasPageRule: true, size: match[1].trim() };
+                            }
+                            return { hasPageRule: true, size: null };
+                        }
+                    }
+                } catch (e) {
+                    // ignore cross-origin or invalid sheets
+                }
+            }
+        } catch (e) {}
+        // No @page rule found (or the stylesheets could not be inspected).
+        return { hasPageRule: false, size: null };
+    });
+}
 (async () => {
     let browser;
     try {
+        console.log('Starting Puppeteer PDF generation...');
+        console.log('  Aspect Ratio arg:', aspectRatioArg);
+        console.log('  Mode arg:', modeArg);
         browser = await puppeteer.launch({
             headless: 'new',
+            executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium',
+            args: [
+                '--no-sandbox',
+                '--disable-setuid-sandbox',
+                '--disable-dev-shm-usage',
+                '--disable-gpu',
+                '--disable-software-rasterizer',
+            ]
         });
         const page = await browser.newPage();
+        const html = fs.readFileSync(htmlFile, 'utf8');
+        await page.setContent(html, { waitUntil: 'networkidle0', timeout: 30000 });
+        console.log('\u2713 HTML loaded');
         await page.emulateMediaType('print');
+        console.log('\u2713 Emulated print media');
+        // Ensure fonts and images are ready
+        await page.evaluate(() => document.fonts.ready);
+        await page.evaluate(() => new Promise(resolve => {
+            if (document.readyState === 'complete') resolve();
+            else window.addEventListener('load', resolve);
+        }));
+        console.log('\u2713 Resources ready');
+        // Detect CSS @page rules
+        const pageSizeInfo = await detectCssPageSize(page);
+        console.log('  CSS @page detected:', pageSizeInfo);
+        // Detect explicit page-break classes or CSS
+        const hasPageBreaks = await page.evaluate(() => {
+            const hasPageClass = !!document.querySelector('.page, .slide');
+            const styles = Array.from(document.querySelectorAll('style')).map(s => s.textContent).join('\n');
+            const hasPageBreakCSS = /page-break|break-after|break-before|@page/.test(styles);
+            return hasPageClass || hasPageBreakCSS;
         });
+        // Auto-detect mode when requested
+        let mode = modeArg;
+        if (modeArg === 'auto') {
+            const contentHeight = await page.evaluate(() => document.documentElement.scrollHeight);
+            const viewportHeight = await page.evaluate(() => window.innerHeight);
+            mode = (hasPageBreaks || contentHeight > viewportHeight * 2) ? 'multi' : 'single';
+            console.log('  Auto-detected mode:', mode);
         }
+        // If pageSizeInfo indicates CSS @page then prefer CSS sizes for multi-page
+        const preferCSSPageSize = pageSizeInfo.hasPageRule;
+        // Determine aspect ratio
+        let aspectRatio = aspectRatioArg;
+        if (aspectRatioArg === 'auto') {
+            // try to detect a data-aspect-ratio attribute or meta tag
+            const detected = await page.evaluate(() => {
+                const meta = document.querySelector('meta[name="aspect-ratio"]') || document.querySelector('meta[name="orientation"]');
+                if (meta && meta.content) return meta.content.trim();
+                if (document.documentElement.dataset && document.documentElement.dataset.aspectRatio) return document.documentElement.dataset.aspectRatio;
+                return null;
+            });
+            aspectRatio = detected || (hasPageBreaks ? '9:16' : '9:16');
+            console.log('  Auto-detected aspectRatio:', aspectRatio);
         }
+        const configs = DEFAULT_CONFIGS;
+        const config = configs[aspectRatio] || configs['9:16'];
+        const pdfPath = htmlFile.replace('.html', '.pdf');
+        if (mode === 'single') {
+            console.log('Generating single-page PDF...');
+            let singleConfig = config.single;
+            if (aspectRatioArg === 'auto') {
+                const dims = await page.evaluate(() => ({ width: document.documentElement.scrollWidth, height: document.documentElement.scrollHeight }));
+                console.log('  Detected content size:', dims);
+                singleConfig = { width: dims.width + 'px', height: dims.height + 'px' };
+            }
+            await page.pdf({ path: pdfPath, ...singleConfig, printBackground: true, preferCSSPageSize: false, scale: 1, pageRanges: '1' });
+            console.log('\u2713 Single-page PDF created');
         } else {
+            console.log('Generating multi-page PDF...');
+            const multiOptions = Object.assign({}, config.multi, { path: pdfPath, printBackground: true, preferCSSPageSize });
+            await page.pdf(multiOptions);
+            console.log('\u2713 Multi-page PDF created');
         }
         const stats = fs.statSync(pdfPath);
+        console.log('  File size KB:', (stats.size / 1024).toFixed(2));
+        await browser.close();
+        process.exit(0);
+    } catch (err) {
+        console.error('Error generating PDF:', err && err.message ? err.message : err);
+        if (err && err.stack) console.error(err.stack);
+        try { if (browser) await browser.close(); } catch (e) {}
         process.exit(1);
     }
 })();

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 fastapi==0.115.0
 uvicorn==0.32.0
-python-multipart==0.0.12

 fastapi==0.115.0
 uvicorn==0.32.0
+python-multipart==0.0.12
+aiohttp==3.9.4