# File Upload Security

## Rule

Validate content, not just extension. Store outside webroot. Generate new filenames. Set size limits.

**Source:** [OWASP File Upload Cheat Sheet](https://cheatsheetseries.owasp.org/cheatsheets/File_Upload_Cheat_Sheet.html)

## Attack Vectors

| Attack | Description |
|--------|-------------|
| Web shell | Upload .php/.jsp that executes commands |
| XSS via SVG | SVG with embedded JavaScript |
| XXE via Office | DOCX/XLSX are ZIP archives of XML; a crafted file can trigger XXE when parsed by a vulnerable XML parser |
| Path traversal | Filename like `../../../etc/cron.d/shell` |
| DoS | Upload huge files, exhaust disk |
| Malware hosting | Use your server to distribute malware |

## Correct Pattern

```python
import os
import uuid
import magic  # python-magic for content detection
from pathlib import Path

# Destination directory for stored uploads -- must live outside the webroot!
UPLOAD_DIR = Path("/var/uploads")

# Largest upload accepted, in bytes (10 MB).
MAX_FILE_SIZE = 10 * 1024 ** 2

# Content-sniffed MIME types we accept, each mapped to the extension that is
# assigned to the stored file.
ALLOWED_TYPES = dict(
    [
        ("image/jpeg", ".jpg"),
        ("image/png", ".png"),
        ("image/gif", ".gif"),
        ("application/pdf", ".pdf"),
    ]
)

def save_upload(file_storage) -> str:
    """Validate an uploaded file and store it under a server-generated name.

    The upload is accepted only if it is no larger than ``MAX_FILE_SIZE`` and
    its MIME type, detected from the bytes themselves, is a key of
    ``ALLOWED_TYPES``. The client-supplied filename is never consulted.

    Args:
        file_storage: Seekable binary file-like object holding the upload
            (presumably a Werkzeug ``FileStorage`` -- confirm against the
            caller). Its position is rewound to 0 after the content is read.

    Returns:
        The generated filename (a UUID plus the extension mapped from the
        detected type), relative to ``UPLOAD_DIR``.

    Raises:
        ValueError: If the file is too large, its detected type is not
            allowed, or the destination resolves outside ``UPLOAD_DIR``.
        FileExistsError: If the generated name already exists on disk.
    """
    # Cheap size check via seek/tell so an oversized upload is rejected
    # before any of it is pulled into memory.
    file_storage.seek(0, 2)  # 2 == SEEK_END
    size = file_storage.tell()
    file_storage.seek(0)  # Reset

    if size > MAX_FILE_SIZE:
        raise ValueError("File too large")

    # Bounded read: never buffer more than the limit (plus one byte to detect
    # overflow), even if the stream misreported its size above.
    content = file_storage.read(MAX_FILE_SIZE + 1)
    file_storage.seek(0)

    if len(content) > MAX_FILE_SIZE:
        raise ValueError("File too large")

    # Detect MIME type from content, not from the extension or any header.
    detected_type = magic.from_buffer(content, mime=True)

    if detected_type not in ALLOWED_TYPES:
        raise ValueError(f"File type not allowed: {detected_type}")

    # Generate safe filename (never use user input); the extension comes from
    # the detected type, so it always matches the actual content.
    extension = ALLOWED_TYPES[detected_type]
    safe_filename = f"{uuid.uuid4()}{extension}"

    # Store outside webroot
    dest_path = UPLOAD_DIR / safe_filename

    # Ensure we're still in upload dir (paranoid check)
    if not dest_path.resolve().is_relative_to(UPLOAD_DIR.resolve()):
        raise ValueError("Invalid path")

    # "x" = exclusive creation: refuse to overwrite an existing file instead
    # of silently clobbering it.
    with open(dest_path, "xb") as f:
        f.write(content)

    return safe_filename

def serve_upload(filename: str):
    """Serve a previously stored upload as a forced download.

    Args:
        filename: Name of a file directly inside ``UPLOAD_DIR``. It must be a
            bare name with no directory components.

    Returns:
        Whatever ``send_file`` returns (presumably Flask's ``send_file``
        response -- the import is not visible here; confirm).

    Raises:
        ValueError: If ``filename`` is empty, contains ``..``, a path
            separator or a NUL byte, or resolves outside ``UPLOAD_DIR``.
        FileNotFoundError: If the name does not refer to a regular file.
    """
    # Validate filename format: reject traversal sequences, both POSIX and
    # Windows separators, and NUL bytes.
    if (
        not filename
        or ".." in filename
        or "/" in filename
        or "\\" in filename
        or "\x00" in filename
    ):
        raise ValueError("Invalid filename")

    path = UPLOAD_DIR / filename

    # Verify path is within upload dir (resolve() also follows symlinks).
    if not path.resolve().is_relative_to(UPLOAD_DIR.resolve()):
        raise ValueError("Invalid path")

    # is_file() rather than exists(): a name such as "." passes the checks
    # above but refers to a directory, which must never reach send_file.
    if not path.is_file():
        raise FileNotFoundError()

    # Serve with safe content-type
    return send_file(
        path,
        mimetype="application/octet-stream",  # Force download
        as_attachment=True,
        download_name=filename
    )
```

## Incorrect Pattern

```python
import os

# Wrong: using user-provided filename
def bad_upload(file):
    """ANTI-PATTERN: store the upload under the client-supplied filename."""
    destination = f"/uploads/{file.filename}"  # User controlled!
    file.save(destination)
    # Attack: filename = "../../../var/www/shell.php"

# Wrong: checking only extension
def bad_validate(filename):
    """ANTI-PATTERN: accept a file purely because of its name suffix."""
    trusted_suffixes = (".jpg", ".png")
    return filename.endswith(trusted_suffixes)
    # Attack: shell.php.jpg with PHP content

# Wrong: storing in webroot
def bad_upload_2(file):
    """ANTI-PATTERN: write the upload into a directory the web server serves."""
    webroot_target = "/var/www/html/uploads/{}".format(file.filename)
    file.save(webroot_target)
    # Attacker can access directly, execute scripts

# Wrong: trusting Content-Type header
def bad_validate_2(file):
    """ANTI-PATTERN: trust the MIME type declared by the client."""
    declared_type = file.content_type
    return declared_type.startswith("image/")
    # Header is attacker-controlled!

# Wrong: no size limit
def bad_upload_3(file):
    """ANTI-PATTERN: store the upload without enforcing any size limit."""
    random_name = uuid.uuid4()
    file.save(f"/uploads/{random_name}")
    # DoS: upload 100GB file
```

## Image-Specific Validation

```python
from PIL import Image
import io

# Pixel-count ceiling (4096 x 4096) -- prevents decompression bombs.
MAX_IMAGE_PIXELS = 4096 ** 2

def validate_image(content: bytes) -> bool:
    """Check that ``content`` is a well-formed JPEG, PNG or GIF of sane size.

    This function only validates; it does not strip metadata (see
    ``strip_image_metadata`` for that).

    Args:
        content: Raw bytes of the candidate image.

    Returns:
        True if the bytes parse as an allowed format, the pixel count does
        not exceed ``MAX_IMAGE_PIXELS``, and the image passes both Pillow's
        integrity check and a full decode. False otherwise, including on any
        error raised while parsing.
    """
    try:
        # Note: this sets Pillow's process-wide limit, not a per-call one.
        Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
        img = Image.open(io.BytesIO(content))

        # Check format (detected by Pillow from the content itself)
        if img.format not in ("JPEG", "PNG", "GIF"):
            return False

        # Pillow raises DecompressionBombError only above twice the limit and
        # merely warns between 1x and 2x, so enforce the limit explicitly.
        width, height = img.size
        if width * height > MAX_IMAGE_PIXELS:
            return False

        # Integrity check; this does not decode the pixel data.
        img.verify()

        # Reopen (verify() invalidates the image) and decode the pixel data
        # so truncated or corrupt image streams are rejected as well.
        img = Image.open(io.BytesIO(content))
        img.load()

        return True
    except Exception:
        # Any parser failure means the upload is not a valid image.
        return False

def strip_image_metadata(content: bytes) -> bytes:
    """Re-encode an image so that EXIF metadata is not carried over.

    The EXIF Orientation tag is applied to the pixels first, so the result
    still displays upright once the tag is gone.

    Args:
        content: Raw bytes of an image Pillow can open.

    Returns:
        The image re-saved in its original format.

    NOTE(review): what else gets dropped depends on the Pillow encoder for
    each format. Re-saving re-encodes the image, and ``save()`` is called
    without ``save_all``, so presumably only the first frame of an animated
    GIF survives -- confirm this is acceptable.
    """
    # Function-scope import: only ``Image`` is imported at file level.
    from PIL import ImageOps

    img = Image.open(io.BytesIO(content))

    # Capture the format now; the transposed copy below has ``format`` None.
    image_format = img.format

    # Bake the EXIF orientation into the pixel data before the tag is lost.
    upright = ImageOps.exif_transpose(img)

    # Create new image without metadata
    output = io.BytesIO()
    upright.save(output, format=image_format)
    return output.getvalue()
```

## Antivirus Scanning

```python
import clamd  # ClamAV client

def scan_for_malware(filepath: str) -> bool:
    """Scan a stored file with ClamAV and report whether it is clean.

    Fails closed: the file counts as clean only when the scanner reports
    ``"OK"`` for every entry it returns. A detection deletes the file.

    Args:
        filepath: Path of the file to scan, as seen by the clamd daemon.

    Returns:
        True if the scan reported the file clean. False if malware was
        found, the scanner reported an error or an unexpected status, or
        the scan itself failed.
    """
    try:
        cd = clamd.ClamdUnixSocket()
        result = cd.scan(filepath)

        if result is None:
            return True  # Clean

        # result = {filepath: ('FOUND', 'Malware.Name')}
        # Inspect every entry rather than looking up ``filepath`` only: the
        # daemon may report the file under a different (e.g. resolved) path.
        detections = [
            name for status, name in result.values() if status == "FOUND"
        ]
        if detections:
            name = detections[0]
            log.warning("Malware detected", filepath=filepath, malware=name)
            os.remove(filepath)
            return False

        # Only an explicit "OK" is clean. An empty result, "ERROR" or any
        # unknown status must not be mistaken for a pass.
        statuses = [status for status, _ in result.values()]
        if not statuses or any(status != "OK" for status in statuses):
            log.error("Antivirus scan failed", error=str(result))
            return False

        return True
    except Exception as e:
        log.error("Antivirus scan failed", error=str(e))
        return False  # Fail closed
```

## Edge Cases

- Double extensions: `file.php.jpg` may execute as PHP on misconfigured servers
- Null byte: `file.php%00.jpg` truncates to `file.php` in some languages
- Case sensitivity: `.PhP` slips past case-sensitive blocklists yet may still execute on case-insensitive systems such as Windows
- SVG can contain JavaScript — treat as dangerous
- ZIP files need recursive scanning for zip bombs
- Office files (DOCX) are ZIPs containing XML — check for XXE
- Polyglot files: a file that starts with a valid `GIF89a` header but also contains PHP code passes content sniffing and can still execute on some servers