# File Upload Security

## Rule

Validate content, not just extension. Store outside webroot. Generate new filenames. Set size limits.

**Source:** [OWASP File Upload Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/File_Upload_Cheat_Sheet.html)

## Attack Vectors

| Attack | Description |
|--------|-------------|
| Web shell | Upload .php/.jsp that executes commands |
| XSS via SVG | SVG with embedded JavaScript |
| XXE via Office | DOCX/XLSX contain XML |
| Path traversal | Filename like `../../../etc/cron.d/shell` |
| DoS | Upload huge files, exhaust disk |
| Malware hosting | Use your server to distribute malware |

## Correct Pattern

```python
import os
import uuid
from pathlib import Path

import magic  # python-magic for content detection
from flask import send_file  # Flask >= 2.0 (uses the `download_name` argument)

UPLOAD_DIR = Path("/var/uploads")  # Outside webroot!
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB

ALLOWED_TYPES = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/gif": ".gif",
    "application/pdf": ".pdf",
}


def save_upload(file_storage) -> str:
    """Validate an uploaded file and store it under a server-generated name.

    Args:
        file_storage: Seekable binary file-like object holding the upload
            (e.g. a Werkzeug ``FileStorage``).

    Returns:
        The generated filename (UUID plus an extension derived from the
        detected content type), relative to ``UPLOAD_DIR``.

    Raises:
        ValueError: If the file exceeds ``MAX_FILE_SIZE``, its detected MIME
            type is not in ``ALLOWED_TYPES``, or the destination would fall
            outside ``UPLOAD_DIR``.
        FileExistsError: If the generated name already exists (the file is
            opened in exclusive-create mode and never overwritten).
    """
    # Check size first (before reading into memory)
    file_storage.seek(0, os.SEEK_END)
    size = file_storage.tell()
    file_storage.seek(0)  # Reset

    if size > MAX_FILE_SIZE:
        raise ValueError("File too large")

    # Bounded read: never pull more than the limit (+1 byte to detect
    # overflow) into memory, even if the size reported above was wrong.
    content = file_storage.read(MAX_FILE_SIZE + 1)
    file_storage.seek(0)  # Leave the stream rewound for the caller
    if len(content) > MAX_FILE_SIZE:
        raise ValueError("File too large")

    # Detect MIME type from content, not extension
    detected_type = magic.from_buffer(content, mime=True)

    if detected_type not in ALLOWED_TYPES:
        raise ValueError(f"File type not allowed: {detected_type}")

    # Generate safe filename (never use user input)
    extension = ALLOWED_TYPES[detected_type]
    safe_filename = f"{uuid.uuid4()}{extension}"

    # Store outside webroot
    dest_path = UPLOAD_DIR / safe_filename

    # Ensure we're still in upload dir (paranoid check)
    if not dest_path.resolve().is_relative_to(UPLOAD_DIR.resolve()):
        raise ValueError("Invalid path")

    # "xb" = exclusive create: fail instead of overwriting an existing file
    # or writing through a pre-planted symlink.
    with open(dest_path, "xb") as f:
        f.write(content)

    return safe_filename


def serve_upload(filename: str):
    """Serve a previously stored upload as a forced download.

    Args:
        filename: Name returned by ``save_upload`` (a bare filename, no
            directory components).

    Returns:
        The response produced by ``send_file``, served as an
        ``application/octet-stream`` attachment with MIME sniffing disabled.

    Raises:
        ValueError: If ``filename`` is empty, contains path separators,
            ``..`` or a NUL byte, or resolves outside ``UPLOAD_DIR``.
        FileNotFoundError: If the name does not refer to a regular file.
    """
    # Validate filename format: reject traversal sequences, both kinds of
    # path separator, and NUL bytes.
    if not filename or ".." in filename or any(c in filename for c in "/\\\0"):
        raise ValueError("Invalid filename")

    path = UPLOAD_DIR / filename

    # Verify path is within upload dir
    if not path.resolve().is_relative_to(UPLOAD_DIR.resolve()):
        raise ValueError("Invalid path")

    # is_file() rather than exists(): a name such as "." passes the checks
    # above but refers to the upload directory itself.
    if not path.is_file():
        raise FileNotFoundError()

    # Serve with safe content-type
    response = send_file(
        path,
        mimetype="application/octet-stream",  # Force download
        as_attachment=True,
        download_name=filename,
    )
    # Stop browsers from second-guessing the declared content type.
    response.headers["X-Content-Type-Options"] = "nosniff"
    return response
```

## Incorrect Pattern

```python
import os
import uuid

# Wrong: using user-provided filename
def bad_upload(file):
    filename = file.filename  # User controlled!
    file.save(f"/uploads/{filename}")
    # Attack: filename = "../../../var/www/shell.php"

# Wrong: checking only extension
def bad_validate(filename):
    return filename.endswith((".jpg", ".png"))
    # Attack: shell.php.jpg with PHP content

# Wrong: storing in webroot
def bad_upload_2(file):
    file.save(f"/var/www/html/uploads/{file.filename}")
    # Attacker can access directly, execute scripts

# Wrong: trusting Content-Type header
def bad_validate_2(file):
    return file.content_type.startswith("image/")
    # Header is attacker-controlled!

# Wrong: no size limit
def bad_upload_3(file):
    file.save(f"/uploads/{uuid.uuid4()}")
    # DoS: upload 100GB file
```

## Image-Specific Validation

```python
import io

from PIL import Image, ImageOps

MAX_IMAGE_PIXELS = 4096 * 4096  # Prevent decompression bomb
ALLOWED_IMAGE_FORMATS = ("JPEG", "PNG", "GIF")

# Keys of Image.info that affect how the pixels render and are therefore kept
# when metadata is stripped.
_RENDER_INFO_KEYS = ("transparency",)


def validate_image(content: bytes) -> bool:
    """Return True if ``content`` is a decodable JPEG, PNG or GIF within limits.

    Args:
        content: Raw bytes of the uploaded file.

    Returns:
        True when the image has an allowed format, at most
        ``MAX_IMAGE_PIXELS`` pixels, passes Pillow's integrity check and
        decodes fully; False otherwise (including on any decoding error).
    """
    try:
        # Process-wide Pillow setting. On its own it is not a hard limit:
        # Pillow only *warns* above this value and raises
        # DecompressionBombError above twice this value.
        Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
        img = Image.open(io.BytesIO(content))

        # Check format (from the parsed header, before decoding anything)
        if img.format not in ALLOWED_IMAGE_FORMATS:
            return False

        # Enforce the pixel budget explicitly, closing the 1x-2x gap above.
        width, height = img.size
        if width * height > MAX_IMAGE_PIXELS:
            return False

        # Integrity check of the file structure (does not decode pixel data)
        img.verify()

        # Reopen for further checks (verify() invalidates), then force a full
        # decode so truncated or corrupt pixel data is rejected too.
        img = Image.open(io.BytesIO(content))
        img.load()

        # Metadata (EXIF can contain sensitive data, XSS in some viewers) is
        # not removed here; re-encode with strip_image_metadata() for that.
        return True
    except Exception:
        # Fail closed: anything Pillow cannot parse cleanly is not accepted.
        return False


def strip_image_metadata(content: bytes) -> bytes:
    """Re-encode an image without EXIF and other ancillary metadata.

    Call ``validate_image`` first; this function opens ``content`` without
    any checks of its own.

    Only the frame Pillow saves by default is written, so an animated GIF is
    reduced to a single frame.

    Args:
        content: Raw bytes of a validated image.

    Returns:
        The re-encoded image bytes in the same format as the input.
    """
    img = Image.open(io.BytesIO(content))
    image_format = img.format  # Capture now: derived images have format=None

    # Bake the EXIF orientation into the pixels first; otherwise dropping the
    # EXIF block would leave rotated photos displayed sideways.
    cleaned = ImageOps.exif_transpose(img)

    # Some writers re-emit entries found in Image.info (e.g. GIF comments,
    # depending on the Pillow version), so drop everything except what is
    # needed to render the pixels correctly.
    cleaned.info = {
        key: value
        for key, value in cleaned.info.items()
        if key in _RENDER_INFO_KEYS
    }

    output = io.BytesIO()
    cleaned.save(output, format=image_format)
    return output.getvalue()
```

## Antivirus Scanning

```python
import logging
import os

import clamd  # ClamAV client

log = logging.getLogger(__name__)


def scan_for_malware(filepath: str) -> bool:
    """Scan a stored file with ClamAV, failing closed.

    Args:
        filepath: Path of the file to scan. Use the absolute path: the result
            is looked up under this exact string, and the clamd daemon (which
            usually runs as a different user) must be able to read the file.

    Returns:
        True only if ClamAV explicitly reports the file as clean (``OK``).
        False if malware was found (the file is then deleted), if the scan
        raised, or if the result is an error, missing or unrecognised.
    """
    try:
        cd = clamd.ClamdUnixSocket()
        result = cd.scan(filepath)
    except Exception as e:
        log.error("Antivirus scan failed: filepath=%s error=%s", filepath, e)
        return False  # Fail closed

    # result = {filepath: ('OK', None)}
    #       or {filepath: ('FOUND', 'Malware.Name')}
    #       or {filepath: ('ERROR', 'reason')}
    status, name = (result or {}).get(filepath, (None, None))

    if status == "OK":
        return True  # Clean

    if status == "FOUND":
        log.warning("Malware detected: filepath=%s malware=%s", filepath, name)
        try:
            os.remove(filepath)
        except OSError as e:
            # Still report the file as infected; the caller must not use it.
            log.error(
                "Could not delete infected file: filepath=%s error=%s",
                filepath,
                e,
            )
        return False

    # ERROR (e.g. clamd cannot read the file), no entry for this path, or an
    # unknown status: the file was not verified, so do not treat it as clean.
    log.error(
        "Antivirus scan inconclusive: filepath=%s status=%s detail=%s",
        filepath,
        status,
        name,
    )
    return False  # Fail closed
```

## Edge Cases

- Double extensions: `file.php.jpg` may execute as PHP on misconfigured servers
- Null byte: `file.php%00.jpg` truncates to `file.php` in some languages
- Case sensitivity: `.PhP` may execute on Windows
- SVG can contain JavaScript — treat as dangerous
- ZIP files need recursive scanning for zip bombs
- Office files (DOCX) are ZIPs containing XML — check for XXE
- GIF89a header with PHP code can execute on some servers