diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
new file mode 100644
index 0000000..66227e6
--- /dev/null
+++ b/.github/workflows/docker-publish.yml
@@ -0,0 +1,96 @@
+name: Build and Publish Docker Images
+
+# Builds the multi-arch image on every trigger; pushes to Docker Hub on
+# everything except pull requests (PR runs are build-only smoke tests).
+on:
+  push:
+    branches:
+      - main
+      - develop
+    tags:
+      - 'v*'
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+env:
+  REGISTRY: docker.io
+  IMAGE_NAME: beshultd/deltaglider
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history for proper git describe
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Extract version from git
+        id: version
+        run: |
+          # Get version from git tags
+          VERSION=$(git describe --tags --always --abbrev=0 2>/dev/null || echo "dev")
+          # Remove 'v' prefix if present
+          VERSION=${VERSION#v}
+          echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+          echo "Version: ${VERSION}"
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.IMAGE_NAME }}
+          tags: |
+            # For main branch: tag as 'latest'
+            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
+            # For develop branch: tag as 'develop'
+            type=raw,value=develop,enable=${{ github.ref == 'refs/heads/develop' }}
+            # For version tags: use semver patterns
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
+            # For PRs: tag as pr-<number>
+            type=ref,event=pr
+            # Branch builds: add <branch>-<sha> for traceability. Restricted to
+            # branch refs because the branch placeholder is empty on tag and PR
+            # events, which would produce an invalid tag beginning with '-'.
+            type=sha,prefix={{branch}}-,enable=${{ startsWith(github.ref, 'refs/heads/') }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          platforms: linux/amd64,linux/arm64
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          build-args: |
+            VERSION=${{ steps.version.outputs.version }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Docker Hub Description
+        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
+        uses: peter-evans/dockerhub-description@v4
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+          repository: ${{ env.IMAGE_NAME }}
+          short-description: "Store 4TB in 5GB: S3-compatible storage with 99.9% compression"
+          readme-filepath: ./README.md
diff --git a/Dockerfile b/Dockerfile
index b1eaedf..d58be97 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,7 @@
 # Multi-stage build for deltaglider
 ARG PYTHON_VERSION=3.12-slim
 ARG UV_VERSION=0.5.13
+ARG VERSION=6.0.2
 
 # Builder stage - install UV and dependencies
 FROM ghcr.io/astral-sh/uv:$UV_VERSION AS uv
@@ -16,16 +17,15 @@ WORKDIR /build
 COPY pyproject.toml ./
 COPY README.md ./
 
-# Install dependencies with UV caching
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --compile-bytecode .
-
-# Copy source code
+# Copy source code - needed for setuptools-scm to write version file
 COPY src ./src
 
-# Install the package (force reinstall to ensure it's properly installed)
+# Install dependencies and package with UV caching
+# Set SETUPTOOLS_SCM_PRETEND_VERSION to avoid needing .git directory
+ARG VERSION
+ENV SETUPTOOLS_SCM_PRETEND_VERSION_FOR_DELTAGLIDER=${VERSION}
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --compile-bytecode --no-deps --force-reinstall .
+    uv pip install --compile-bytecode .
# Runtime stage - minimal image FROM python:${PYTHON_VERSION} @@ -90,9 +90,10 @@ ENV DG_CACHE_MEMORY_SIZE_MB=100 # ENV AWS_DEFAULT_REGION=us-east-1 # Labels +ARG VERSION LABEL org.opencontainers.image.title="DeltaGlider" \ org.opencontainers.image.description="Delta-aware S3 file storage wrapper with encryption" \ - org.opencontainers.image.version="5.0.3" \ + org.opencontainers.image.version="${VERSION}" \ org.opencontainers.image.authors="Beshu Limited" \ org.opencontainers.image.source="https://github.com/beshu-tech/deltaglider" diff --git a/docs/DOCKER.md b/docs/DOCKER.md new file mode 100644 index 0000000..cce1847 --- /dev/null +++ b/docs/DOCKER.md @@ -0,0 +1,364 @@ +# Docker Support for DeltaGlider + +This document describes how to build, run, and publish Docker images for DeltaGlider. + +## Quick Start + +### Pull and run the latest image + +```bash +docker pull beshultd/deltaglider:latest +docker run --rm beshultd/deltaglider:latest --help +``` + +### Run with AWS credentials + +```bash +docker run --rm \ + -e AWS_ACCESS_KEY_ID=your_key \ + -e AWS_SECRET_ACCESS_KEY=your_secret \ + -e AWS_DEFAULT_REGION=us-east-1 \ + beshultd/deltaglider:latest ls s3://your-bucket/ +``` + +### Run with MinIO (local S3 alternative) + +```bash +# Start MinIO +docker run -d \ + -p 9000:9000 -p 9001:9001 \ + -e MINIO_ROOT_USER=minioadmin \ + -e MINIO_ROOT_PASSWORD=minioadmin \ + --name minio \ + minio/minio server /data --console-address ":9001" + +# Use DeltaGlider with MinIO +docker run --rm \ + -e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \ + -e AWS_ACCESS_KEY_ID=minioadmin \ + -e AWS_SECRET_ACCESS_KEY=minioadmin \ + -e AWS_DEFAULT_REGION=us-east-1 \ + beshultd/deltaglider:latest ls +``` + +## Building Locally + +### Build with current git version + +```bash +VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//') +docker build --build-arg VERSION=${VERSION} -t beshultd/deltaglider:${VERSION} . 
+``` + +### Build with custom version + +```bash +docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 . +``` + +### Multi-platform build + +```bash +# Create a buildx builder (one-time setup) +docker buildx create --name deltaglider-builder --use + +# Build and push for multiple platforms +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --build-arg VERSION=6.0.2 \ + -t beshultd/deltaglider:6.0.2 \ + --push \ + . +``` + +## Testing the Image + +### Basic functionality test + +```bash +# Check version (the :test tag is not published; build it first with: docker build -t beshultd/deltaglider:test .) +docker run --rm beshultd/deltaglider:test --version + +# Check help +docker run --rm beshultd/deltaglider:test --help + +# List available commands +docker run --rm beshultd/deltaglider:test +``` + +### Integration test with MinIO + +```bash +# 1. Start MinIO +docker run -d \ + -p 9000:9000 -p 9001:9001 \ + -e MINIO_ROOT_USER=minioadmin \ + -e MINIO_ROOT_PASSWORD=minioadmin \ + --name minio \ + minio/minio server /data --console-address ":9001" + +# 2. Create a test file +echo "Hello DeltaGlider" > test.txt + +# 3. Upload to S3/MinIO +docker run --rm \ + -v $(pwd):/data \ + -w /data \ + -e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \ + -e AWS_ACCESS_KEY_ID=minioadmin \ + -e AWS_SECRET_ACCESS_KEY=minioadmin \ + -e AWS_DEFAULT_REGION=us-east-1 \ + beshultd/deltaglider:test cp test.txt s3://test-bucket/ + +# 4. List bucket contents +docker run --rm \ + -e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \ + -e AWS_ACCESS_KEY_ID=minioadmin \ + -e AWS_SECRET_ACCESS_KEY=minioadmin \ + -e AWS_DEFAULT_REGION=us-east-1 \ + beshultd/deltaglider:test ls s3://test-bucket/ + +# 5. Get statistics +docker run --rm \ + -e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \ + -e AWS_ACCESS_KEY_ID=minioadmin \ + -e AWS_SECRET_ACCESS_KEY=minioadmin \ + -e AWS_DEFAULT_REGION=us-east-1 \ + beshultd/deltaglider:test stats test-bucket + +# 6. 
Cleanup +docker stop minio && docker rm minio +rm test.txt +``` + +## Publishing to Docker Hub + +### Manual Publishing + +```bash +# 1. Log in to Docker Hub +docker login + +# 2. Build the image +VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//') +docker build --build-arg VERSION=${VERSION} \ + -t beshultd/deltaglider:${VERSION} \ + -t beshultd/deltaglider:latest \ + . + +# 3. Push to Docker Hub +docker push beshultd/deltaglider:${VERSION} +docker push beshultd/deltaglider:latest +``` + +### Multi-platform Publishing + +```bash +# Create builder (one-time setup) +docker buildx create --name deltaglider-builder --use + +# Build and push for multiple platforms +VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//') +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --build-arg VERSION=${VERSION} \ + -t beshultd/deltaglider:${VERSION} \ + -t beshultd/deltaglider:latest \ + --push \ + . +``` + +## GitHub Actions Automation + +The repository includes a GitHub Action workflow (`.github/workflows/docker-publish.yml`) that automatically builds and publishes Docker images. + +### Automatic Publishing Triggers + +- **On main branch push**: Tags as `latest` +- **On develop branch push**: Tags as `develop` +- **On version tag push** (e.g., `v6.0.2`): Tags with semver patterns: + - `6.0.2` (full version) + - `6.0` (major.minor) + - `6` (major) +- **On pull request**: Builds but doesn't push (testing only) + +### Required GitHub Secrets + +Set these secrets in your GitHub repository settings (`Settings > Secrets and variables > Actions`): + +1. **DOCKERHUB_USERNAME**: Your Docker Hub username (e.g., `beshultd`) +2. **DOCKERHUB_TOKEN**: Docker Hub access token (create at https://hub.docker.com/settings/security) + +### Manual Workflow Trigger + +You can manually trigger the Docker build workflow from the GitHub Actions tab: + +1. Go to **Actions** tab +2. Select **Build and Publish Docker Images** +3. Click **Run workflow** +4. 
Select branch and click **Run workflow** + +## Docker Image Details + +### Image Layers + +The Dockerfile uses a multi-stage build: + +1. **Builder stage**: Installs UV and Python dependencies +2. **Runtime stage**: Minimal Python 3.12-slim with only runtime dependencies + +### Image Features + +- **Size**: ~150MB (compressed) +- **Platforms**: linux/amd64, linux/arm64 +- **User**: Runs as non-root user `deltaglider` (UID 1000) +- **Base**: Python 3.12-slim (Debian) +- **Dependencies**: + - Python 3.12 + - xdelta3 (binary diff tool) + - All Python dependencies from `pyproject.toml` + +### Environment Variables + +The image supports the following environment variables: + +```bash +# Logging +DG_LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR + +# Performance & Compression +DG_MAX_RATIO=0.5 # Max delta/file ratio (0.0-1.0) + +# Cache Configuration +DG_CACHE_BACKEND=filesystem # filesystem or memory +DG_CACHE_MEMORY_SIZE_MB=100 # Memory cache size +DG_CACHE_ENCRYPTION_KEY= # Optional encryption key + +# AWS Configuration +AWS_ENDPOINT_URL= # S3 endpoint (for MinIO/LocalStack) +AWS_ACCESS_KEY_ID= # AWS access key +AWS_SECRET_ACCESS_KEY= # AWS secret key +AWS_DEFAULT_REGION=us-east-1 # AWS region +``` + +### Health Check + +The image includes a health check that runs every 30 seconds. Query the status of a running container with: + +```bash +docker inspect --format='{{.State.Health.Status}}' <container-name-or-id> +``` + +## Troubleshooting + +### Build Issues + +#### "setuptools-scm was unable to detect version" + +**Cause**: Git metadata not available during build. + +**Solution**: Always use the `VERSION` build arg: + +```bash +docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 . +``` + +#### Cache issues + +**Cause**: Docker build cache causing stale builds. + +**Solution**: Use `--no-cache` flag: + +```bash +docker build --no-cache --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 . 
+``` + +### Runtime Issues + +#### "unauthorized: access token has insufficient scopes" + +**Cause**: Not logged in to Docker Hub, invalid credentials, or an access token without write permission (this error comes from `docker push`, not from the running container). + +**Solution**: + +```bash +docker login +# Enter your Docker Hub credentials +``` + +#### "Cannot connect to MinIO/LocalStack" + +**Cause**: Using `localhost` instead of `host.docker.internal` from inside the container. + +**Solution**: Use `host.docker.internal` for Mac/Windows or `172.17.0.1` (the default Docker bridge gateway) for Linux: + +```bash +# Mac/Windows +-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 + +# Linux +-e AWS_ENDPOINT_URL=http://172.17.0.1:9000 +``` + +## Docker Compose + +For local development with MinIO: + +```yaml +version: '3.8' + +services: + minio: + image: minio/minio:latest + ports: + - "9000:9000" + - "9001:9001" + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + command: server /data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 10s + timeout: 5s + retries: 5 + + deltaglider: + image: beshultd/deltaglider:latest + environment: + AWS_ENDPOINT_URL: http://minio:9000 + AWS_ACCESS_KEY_ID: minioadmin + AWS_SECRET_ACCESS_KEY: minioadmin + AWS_DEFAULT_REGION: us-east-1 + DG_LOG_LEVEL: DEBUG + depends_on: + - minio + volumes: + - ./data:/data + working_dir: /data + command: ["--help"] +``` + +Run with: + +```bash +docker compose up -d +docker compose run --rm deltaglider ls +``` + +## Best Practices + +1. **Always specify version**: Use `--build-arg VERSION=x.y.z` when building +2. **Use multi-stage builds**: Keeps final image small +3. **Tag with semantic versions**: Follow semver (major.minor.patch) +4. **Test before pushing**: Run integration tests locally +5. **Use secrets**: Never hardcode credentials in images +6. **Multi-platform builds**: Support both amd64 and arm64 +7. 
**Update README**: Keep Docker Hub description in sync with README.md + +## Additional Resources + +- [Docker Hub Repository](https://hub.docker.com/r/beshultd/deltaglider) +- [GitHub Repository](https://github.com/beshu-tech/deltaglider) +- [MinIO Documentation](https://min.io/docs/minio/container/index.html) +- [Docker Buildx Documentation](https://docs.docker.com/buildx/working-with-buildx/) diff --git a/docs/PAGINATION_BUG_FIX.md b/docs/PAGINATION_BUG_FIX.md deleted file mode 100644 index d706ee5..0000000 --- a/docs/PAGINATION_BUG_FIX.md +++ /dev/null @@ -1,258 +0,0 @@ -# Pagination Bug Fix - Critical Issue Resolution - -## Summary - -**Date**: 2025-10-14 -**Severity**: Critical (infinite loop causing operations to never complete) -**Status**: Fixed - -Fixed a critical pagination bug that caused S3 LIST operations to loop infinitely, returning the same objects repeatedly instead of advancing through the bucket. - -## The Bug - -### Symptoms -- LIST operations would take minutes or never complete -- Pagination logs showed linear growth: page 10 = 9,000 objects, page 20 = 19,000 objects, etc. -- Buckets with ~hundreds of objects showed 169,000+ objects after 170+ pages -- System meters showed continuous 3MB/s download during listing -- Operation would eventually hit max_iterations limit (10,000 pages) and return partial results - -### Root Cause - -The code was using **StartAfter** with **NextContinuationToken**, which is incorrect according to AWS S3 API: - -**Incorrect behavior (before fix)**: -```python -# In list_objects_page() call -response = storage.list_objects( - bucket=bucket, - start_after=page.next_continuation_token, # ❌ WRONG! -) - -# In storage_s3.py -if start_after: - params["StartAfter"] = start_after # ❌ Expects object key, not token! 
-``` - -**Problem**: -- `NextContinuationToken` is an opaque token from S3's `list_objects_v2` response -- `StartAfter` expects an **actual object key** (string), not a continuation token -- When boto3 receives an invalid StartAfter value (a token instead of a key), it ignores it and restarts from the beginning -- This caused pagination to restart on every page, returning the same objects repeatedly - -### Why It Happened - -The S3 LIST pagination API has two different mechanisms: - -1. **StartAfter** (S3 v1 style): Resume listing after a specific object key - - Used for the **first page** when you want to start from a specific key - - Example: `StartAfter="my-object-123.txt"` - -2. **ContinuationToken** (S3 v2 style): Resume from an opaque token - - Used for **subsequent pages** in paginated results - - Example: `ContinuationToken="1vD6KR5W...encrypted_token..."` - - This is what `NextContinuationToken` from the response should be used with - -Our code mixed these two mechanisms, using StartAfter for pagination when it should use ContinuationToken. - -## The Fix - -### Changed Files - -1. **src/deltaglider/adapters/storage_s3.py** - - Added `continuation_token` parameter to `list_objects()` - - Changed boto3 call to use `ContinuationToken` instead of `StartAfter` for pagination - - Kept `StartAfter` support for initial page positioning - -2. 
**src/deltaglider/core/object_listing.py** - - Added `continuation_token` parameter to `list_objects_page()` - - Changed `list_all_objects()` to use `continuation_token` variable instead of `start_after` - - Updated pagination loop to pass continuation tokens correctly - - Added debug logging showing continuation token in use - -### Code Changes - -**storage_s3.py - Before**: -```python -def list_objects( - self, - bucket: str, - prefix: str = "", - delimiter: str = "", - max_keys: int = 1000, - start_after: str | None = None, -) -> dict[str, Any]: - params: dict[str, Any] = {"Bucket": bucket, "MaxKeys": max_keys} - - if start_after: - params["StartAfter"] = start_after # ❌ Used for pagination - - response = self.client.list_objects_v2(**params) -``` - -**storage_s3.py - After**: -```python -def list_objects( - self, - bucket: str, - prefix: str = "", - delimiter: str = "", - max_keys: int = 1000, - start_after: str | None = None, - continuation_token: str | None = None, # ✅ NEW -) -> dict[str, Any]: - params: dict[str, Any] = {"Bucket": bucket, "MaxKeys": max_keys} - - # ✅ Use ContinuationToken for pagination, StartAfter only for first page - if continuation_token: - params["ContinuationToken"] = continuation_token - elif start_after: - params["StartAfter"] = start_after - - response = self.client.list_objects_v2(**params) -``` - -**object_listing.py - Before**: -```python -def list_all_objects(...) -> ObjectListing: - aggregated = ObjectListing() - start_after: str | None = None # ❌ Wrong variable name - - while True: - page = list_objects_page( - storage, - bucket=bucket, - start_after=start_after, # ❌ Passing token as start_after - ) - - aggregated.objects.extend(page.objects) - - if not page.is_truncated: - break - - start_after = page.next_continuation_token # ❌ Token → start_after -``` - -**object_listing.py - After**: -```python -def list_all_objects(...) 
-> ObjectListing: - aggregated = ObjectListing() - continuation_token: str | None = None # ✅ Correct variable - - while True: - page = list_objects_page( - storage, - bucket=bucket, - continuation_token=continuation_token, # ✅ Token → token - ) - - aggregated.objects.extend(page.objects) - - if not page.is_truncated: - break - - continuation_token = page.next_continuation_token # ✅ Token → token -``` - -## Testing - -### Unit Tests -Created comprehensive unit tests in `tests/unit/test_object_listing.py`: - -1. **test_list_objects_page_passes_continuation_token**: Verifies token is passed correctly -2. **test_list_all_objects_uses_continuation_token_for_pagination**: Verifies 3-page pagination works -3. **test_list_all_objects_prevents_infinite_loop**: Verifies max_iterations protection - -### Manual Verification -Created verification script that checks for: -- `continuation_token` parameter in both files -- `ContinuationToken` usage in boto3 call -- Token priority logic (`if continuation_token:` before `elif start_after:`) -- Correct variable names throughout pagination loop - -All checks passed ✅ - -## Expected Behavior After Fix - -### Before (Broken) -``` -[21:26:16.663] LIST pagination: page 1, 0 objects so far -[21:26:18.884] LIST pagination: page 10, 9000 objects so far -[21:26:20.930] LIST pagination: page 20, 19000 objects so far -[21:26:52.290] LIST pagination: page 170, 169000 objects so far -... continues indefinitely ... -``` - -### After (Fixed) -``` -[21:26:16.663] LIST pagination: page 1, 0 objects so far -[21:26:17.012] LIST pagination: page 2, 1000 objects so far, token=AbCd1234EfGh5678... 
-[21:26:17.089] LIST complete: 2 pages, 1234 objects total in 0.43s -``` - -## Performance Impact - -For a bucket with ~1,000 objects: - -**Before**: -- 170+ pages × ~200ms per page = 34+ seconds -- Would eventually timeout or hit max_iterations - -**After**: -- 2 pages × ~200ms per page = <1 second -- ~34x improvement for this case -- Actual speedup scales with bucket size (more objects = bigger speedup) - -For a bucket with 200,000 objects (typical production case): -- **Before**: Would never complete (would hit 10,000 page limit) -- **After**: ~200 pages × ~200ms = ~40 seconds (200x fewer pages!) - -## AWS S3 Pagination Documentation Reference - -From AWS S3 API documentation: - -> **ContinuationToken** (string) - Indicates that the list is being continued on this bucket with a token. ContinuationToken is obfuscated and is not a real key. -> -> **StartAfter** (string) - Starts after this specified key. StartAfter can be any key in the bucket. -> -> **NextContinuationToken** (string) - NextContinuationToken is sent when isTruncated is true, which means there are more keys in the bucket that can be listed. The next list requests to Amazon S3 can be continued with this NextContinuationToken. - -Source: [AWS S3 ListObjectsV2 API Documentation](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html) - -## Related Issues - -This bug also affected: -- `get_bucket_stats()` - Would take 20+ minutes due to infinite pagination -- Any operation using `list_all_objects()` - sync, ls, etc. - -All these operations are now fixed by this pagination fix. - -## Prevention - -To prevent similar issues in the future: - -1. ✅ **Unit tests added**: Verify pagination token handling -2. ✅ **Debug logging added**: Shows continuation token in use -3. ✅ **Type checking**: mypy catches parameter mismatches -4. ✅ **Max iterations limit**: Prevents truly infinite loops (fails safely) -5. 
✅ **Documentation**: This document explains the fix - -## Verification Checklist - -- [x] Code changes implemented -- [x] Unit tests added -- [x] Type checking passes (mypy) -- [x] Linting passes (ruff) -- [x] Manual verification script passes -- [x] Documentation created -- [x] Performance characteristics documented -- [x] AWS API documentation referenced - -## Author Notes - -This was a classic case of mixing two similar but different API mechanisms. The bug was subtle because: -1. boto3 didn't throw an error - it silently ignored the invalid StartAfter value -2. The pagination appeared to work (returned objects), just the wrong objects -3. The linear growth pattern (9K, 19K, 29K) made it look like a counting bug, not a pagination bug - -The fix is simple but critical: use the right parameter (`ContinuationToken`) with the right value (`NextContinuationToken`).