mirror of
https://github.com/beshu-tech/deltaglider.git
synced 2026-04-30 12:14:32 +02:00
Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a98fc7c178 | ||
|
|
82e00623de | ||
|
|
e8c76f1dc7 | ||
|
|
c492a5087b | ||
|
|
85af5a95c8 | ||
|
|
60b70309fa | ||
|
|
b0699f952a | ||
|
|
9bfe121f44 | ||
|
|
6cab3de9a0 | ||
|
|
482f45fc02 | ||
|
|
6b3245266e | ||
|
|
20053acb5f | ||
|
|
87f425734f | ||
|
|
012662c377 | ||
|
|
284f030fae | ||
|
|
7a4d30a007 | ||
|
|
0d46283ff0 | ||
|
|
805e2967bc | ||
|
|
2ef1741d51 | ||
|
|
2c1d756e7b | ||
|
|
c6cee7ae26 | ||
|
|
cee9a9fd2d | ||
|
|
0507e6ebcd | ||
|
|
fa9c4fa42d | ||
|
|
934d83975c | ||
|
|
c32d5265d9 | ||
|
|
1cf7e3ad21 | ||
|
|
9b36087438 | ||
|
|
60877966f2 |
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@@ -98,7 +98,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
ports:
|
||||
- 4566:4566
|
||||
env:
|
||||
|
||||
92
.github/workflows/docker-publish.yml
vendored
Normal file
92
.github/workflows/docker-publish.yml
vendored
Normal file
@@ -0,0 +1,92 @@
|
||||
name: Build and Publish Docker Images
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- develop
|
||||
tags:
|
||||
- 'v*'
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY: docker.io
|
||||
IMAGE_NAME: beshultd/deltaglider
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0 # Full history for proper git describe
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Extract version from git
|
||||
id: version
|
||||
run: |
|
||||
# Get version from git tags
|
||||
VERSION=$(git describe --tags --always --abbrev=0 2>/dev/null || echo "dev")
|
||||
# Remove 'v' prefix if present
|
||||
VERSION=${VERSION#v}
|
||||
echo "version=${VERSION}" >> $GITHUB_OUTPUT
|
||||
echo "Version: ${VERSION}"
|
||||
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.IMAGE_NAME }}
|
||||
tags: |
|
||||
# For main branch: tag as 'latest'
|
||||
type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
|
||||
# For develop branch: tag as 'develop'
|
||||
type=raw,value=develop,enable=${{ github.ref == 'refs/heads/develop' }}
|
||||
# For version tags: use semver patterns
|
||||
type=semver,pattern={{version}}
|
||||
type=semver,pattern={{major}}.{{minor}}
|
||||
type=semver,pattern={{major}}
|
||||
# For PRs: tag as pr-<number>
|
||||
type=ref,event=pr
|
||||
# Include git sha for traceability (only on branch pushes, not tags)
|
||||
type=sha,prefix={{branch}}-,enable=${{ startsWith(github.ref, 'refs/heads/') }}
|
||||
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: ${{ github.event_name != 'pull_request' }}
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
build-args: |
|
||||
VERSION=${{ steps.version.outputs.version }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Docker Hub Description
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
uses: peter-evans/dockerhub-description@v4
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
repository: ${{ env.IMAGE_NAME }}
|
||||
short-description: "Store 4TB in 5GB: S3-compatible storage with 99.9% compression"
|
||||
readme-filepath: ./README.md
|
||||
2
.github/workflows/release-manual.yml
vendored
2
.github/workflows/release-manual.yml
vendored
@@ -146,7 +146,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
ports:
|
||||
- 4566:4566
|
||||
env:
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -150,7 +150,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
ports:
|
||||
- 4566:4566
|
||||
env:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the most commonly used operations. This is **not** a 100% drop-in replacement, but covers the core functionality needed for most use cases.
|
||||
|
||||
## ✅ Implemented Methods (21 core methods)
|
||||
## ✅ Implemented Methods (23 core methods)
|
||||
|
||||
### Object Operations
|
||||
- ✅ `put_object()` - Upload objects (with automatic delta compression)
|
||||
@@ -17,6 +17,8 @@ DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the mo
|
||||
- ✅ `create_bucket()` - Create buckets
|
||||
- ✅ `delete_bucket()` - Delete empty buckets
|
||||
- ✅ `list_buckets()` - List all buckets
|
||||
- ✅ `put_bucket_acl()` - Set bucket ACL (passthrough to S3)
|
||||
- ✅ `get_bucket_acl()` - Get bucket ACL (passthrough to S3)
|
||||
|
||||
### Presigned URLs
|
||||
- ✅ `generate_presigned_url()` - Generate presigned URLs
|
||||
@@ -46,8 +48,6 @@ DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the mo
|
||||
- ❌ `list_parts()`
|
||||
|
||||
### Access Control (ACL)
|
||||
- ❌ `get_bucket_acl()`
|
||||
- ❌ `put_bucket_acl()`
|
||||
- ❌ `get_object_acl()`
|
||||
- ❌ `put_object_acl()`
|
||||
- ❌ `get_public_access_block()`
|
||||
@@ -135,9 +135,9 @@ DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the mo
|
||||
|
||||
## Coverage Analysis
|
||||
|
||||
**Implemented:** ~21 methods
|
||||
**Implemented:** ~23 methods
|
||||
**Total boto3 S3 methods:** ~100+ methods
|
||||
**Coverage:** ~20%
|
||||
**Coverage:** ~23%
|
||||
|
||||
## What's Covered
|
||||
|
||||
|
||||
52
CHANGELOG.md
52
CHANGELOG.md
@@ -5,7 +5,51 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
## [6.1.1] - 2026-03-23
|
||||
|
||||
### Fixed
|
||||
- **S3-Compatible Endpoint Support**: Disabled boto3 automatic request checksums (CRC32/CRC64) that were added in boto3 1.36+. S3-compatible stores like Hetzner Object Storage reject these headers with `BadRequest`, breaking direct (non-delta) file uploads. Sets `request_checksum_calculation="when_required"` to restore compatibility while still working with AWS S3.
|
||||
- **CI: LocalStack pinned to 4.4** — `localstack/localstack:latest` now requires a paid license; pinned to last free version across all workflows and docker-compose files.
|
||||
|
||||
### Changed
|
||||
- **Dependency Pinning**: All runtime dependencies now use major-version upper bounds (`boto3>=1.35.0,<2.0.0`, etc.) to prevent surprise breaking changes in Docker builds.
|
||||
|
||||
### Added
|
||||
- **S3 Compatibility Tests**: New `test_s3_compat.py` unit tests verifying the boto3 client disables automatic checksums and `put_object` doesn't pass checksum kwargs — regression protection for non-AWS S3 endpoints.
|
||||
- **Dependency Management Guide**: Added quarterly dependency refresh checklist and known compatibility constraints to CLAUDE.md.
|
||||
|
||||
## [6.1.0] - 2025-02-07
|
||||
|
||||
### Added
|
||||
- **Bucket ACL Management**: New `put_bucket_acl()` and `get_bucket_acl()` methods
|
||||
- boto3-compatible passthrough to native S3 ACL operations
|
||||
- Supports canned ACLs (`private`, `public-read`, `public-read-write`, `authenticated-read`)
|
||||
- Supports grant-based ACLs (`GrantRead`, `GrantWrite`, `GrantFullControl`, etc.)
|
||||
- Supports full `AccessControlPolicy` dict for fine-grained control
|
||||
- SDK method count increased from 21 to 23
|
||||
- **New CLI Commands**: `deltaglider put-bucket-acl` and `deltaglider get-bucket-acl`
|
||||
- Mirrors `aws s3api put-bucket-acl` / `get-bucket-acl` syntax
|
||||
- Accepts bucket name or `s3://bucket` URL format
|
||||
- JSON output for `get-bucket-acl` (compatible with AWS CLI)
|
||||
- Supports `--endpoint-url`, `--region`, `--profile` flags
|
||||
- **Docker Publishing**: Added GitHub Actions workflow for multi-arch Docker image builds (amd64/arm64)
|
||||
|
||||
### Changed
|
||||
- **Refactor**: Extracted `DeltaGliderConfig` dataclass for centralized configuration management
|
||||
- **Refactor**: Introduced typed `DeleteResult` and `RecursiveDeleteResult` dataclasses replacing raw dicts
|
||||
- **Refactor**: Centralized S3 metadata key aliases into `core/models.py` constants
|
||||
- **Refactor**: Extracted helper methods in `DeltaService` for improved readability
|
||||
|
||||
### Fixed
|
||||
- Removed unused imports flagged by ruff in test files
|
||||
|
||||
### Documentation
|
||||
- Updated BOTO3_COMPATIBILITY.md (coverage 20% → 23%)
|
||||
- Updated AWS S3 CLI compatibility docs with ACL command examples
|
||||
- Refreshed README with dark mode logo and streamlined content
|
||||
- Cleaned up SDK documentation and examples
|
||||
|
||||
## [6.0.0] - 2025-10-17
|
||||
|
||||
### Added
|
||||
- **EC2 Region Detection & Cost Optimization**
|
||||
@@ -34,6 +78,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- **DeltaService API Enhancement**: Added `override_name` parameter to `put()` method
|
||||
- Allows specifying destination filename independently of source filesystem path
|
||||
- Enables proper S3-to-S3 transfers without filesystem renaming tricks
|
||||
- **Rehydration & Purge**: Automatic rehydration of delta-compressed files for presigned URL access
|
||||
- New `deltaglider purge` CLI command to clean expired temporary files
|
||||
- **Metadata Namespace**: Centralized `dg-` prefixed metadata keys for all DeltaGlider metadata
|
||||
- **S3-Based Stats Caching**: Bucket statistics cached in S3 with automatic invalidation
|
||||
|
||||
### Fixed
|
||||
- **Critical**: S3-to-S3 migration now preserves original filenames
|
||||
@@ -240,6 +288,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Delta compression for versioned artifacts
|
||||
- 99%+ compression for similar files
|
||||
|
||||
[6.1.0]: https://github.com/beshu-tech/deltaglider/compare/v6.0.2...v6.1.0
|
||||
[6.0.0]: https://github.com/beshu-tech/deltaglider/compare/v5.1.1...v6.0.0
|
||||
[5.1.0]: https://github.com/beshu-tech/deltaglider/compare/v5.0.3...v5.1.0
|
||||
[5.0.3]: https://github.com/beshu-tech/deltaglider/compare/v5.0.1...v5.0.3
|
||||
[5.0.1]: https://github.com/beshu-tech/deltaglider/compare/v5.0.0...v5.0.1
|
||||
|
||||
36
CLAUDE.md
36
CLAUDE.md
@@ -79,12 +79,14 @@ deltaglider stats test-bucket # Get bucket statistics
|
||||
|
||||
### Available CLI Commands
|
||||
```bash
|
||||
cp # Copy files to/from S3 (AWS S3 compatible)
|
||||
ls # List S3 buckets or objects (AWS S3 compatible)
|
||||
rm # Remove S3 objects (AWS S3 compatible)
|
||||
sync # Synchronize directories with S3 (AWS S3 compatible)
|
||||
stats # Get bucket statistics and compression metrics
|
||||
verify # Verify integrity of delta file
|
||||
cp # Copy files to/from S3 (AWS S3 compatible)
|
||||
ls # List S3 buckets or objects (AWS S3 compatible)
|
||||
rm # Remove S3 objects (AWS S3 compatible)
|
||||
sync # Synchronize directories with S3 (AWS S3 compatible)
|
||||
stats # Get bucket statistics and compression metrics
|
||||
verify # Verify integrity of delta file
|
||||
put-bucket-acl # Set bucket ACL (s3api compatible passthrough)
|
||||
get-bucket-acl # Get bucket ACL (s3api compatible passthrough)
|
||||
```
|
||||
|
||||
## Architecture
|
||||
@@ -254,4 +256,24 @@ Core delta logic is in `src/deltaglider/core/service.py`:
|
||||
- **Auto-Cleanup**: Corrupted or tampered cache files automatically deleted on decryption failures
|
||||
- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing (use secrets management)
|
||||
- **Content-Addressed Storage**: SHA256-based filenames prevent collision attacks
|
||||
- **Zero-Trust Cache**: All cache operations include cryptographic validation
|
||||
- **Zero-Trust Cache**: All cache operations include cryptographic validation
|
||||
|
||||
## Dependency Management
|
||||
|
||||
### Pinning Strategy
|
||||
Runtime dependencies in `pyproject.toml` use **compatible range pins** (`>=x.y.z,<NEXT_MAJOR`). This prevents surprise breaking changes from major versions while allowing patch/minor updates.
|
||||
|
||||
**Critical dependency: `boto3`** — This is the most breakage-prone dependency. AWS periodically changes default behaviors in minor releases (e.g., boto3 1.36+ added automatic request checksums that break S3-compatible stores like Hetzner Object Storage). The S3 adapter (`adapters/storage_s3.py`) explicitly sets `request_checksum_calculation="when_required"` to maintain compatibility with non-AWS S3 endpoints.
|
||||
|
||||
### Quarterly Dependency Refresh (do every ~3 months)
|
||||
1. **Check for updates**: `uv pip compile pyproject.toml --upgrade --dry-run`
|
||||
2. **Update in a branch**: bump version floors in `pyproject.toml` to current stable releases
|
||||
3. **Run full test suite**: `uv run pytest` (unit + integration)
|
||||
4. **Test against S3-compatible stores**: test a small file upload against Hetzner (or whichever non-AWS endpoint is in use) — boto3 updates are the most likely to break this
|
||||
5. **Rebuild Docker image** and test the same upload from the container
|
||||
6. **Check changelogs** for boto3, cryptography, and click for any deprecation notices or behavior changes
|
||||
|
||||
### Known Compatibility Constraints
|
||||
- **boto3**: Must use `request_checksum_calculation="when_required"` for Hetzner/MinIO compatibility. If upgrading past a new major behavior change, test direct uploads (non-delta path) of small files to non-AWS endpoints.
|
||||
- **cryptography**: Fernet API has been stable, but major versions may drop old OpenSSL support. Verify cache encryption still works after upgrades.
|
||||
- **click**: CLI argument parsing. Major versions may change decorator behavior. Run integration tests (`test_aws_cli_commands_v2.py`) after upgrades.
|
||||
17
Dockerfile
17
Dockerfile
@@ -1,6 +1,7 @@
|
||||
# Multi-stage build for deltaglider
|
||||
ARG PYTHON_VERSION=3.12-slim
|
||||
ARG UV_VERSION=0.5.13
|
||||
ARG VERSION=6.0.2
|
||||
|
||||
# Builder stage - install UV and dependencies
|
||||
FROM ghcr.io/astral-sh/uv:$UV_VERSION AS uv
|
||||
@@ -16,16 +17,15 @@ WORKDIR /build
|
||||
COPY pyproject.toml ./
|
||||
COPY README.md ./
|
||||
|
||||
# Install dependencies with UV caching
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --compile-bytecode .
|
||||
|
||||
# Copy source code
|
||||
# Copy source code - needed for setuptools-scm to write version file
|
||||
COPY src ./src
|
||||
|
||||
# Install the package (force reinstall to ensure it's properly installed)
|
||||
# Install dependencies and package with UV caching
|
||||
# Set SETUPTOOLS_SCM_PRETEND_VERSION to avoid needing .git directory
|
||||
ARG VERSION
|
||||
ENV SETUPTOOLS_SCM_PRETEND_VERSION_FOR_DELTAGLIDER=${VERSION}
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --compile-bytecode --no-deps --force-reinstall .
|
||||
uv pip install --compile-bytecode .
|
||||
|
||||
# Runtime stage - minimal image
|
||||
FROM python:${PYTHON_VERSION}
|
||||
@@ -90,9 +90,10 @@ ENV DG_CACHE_MEMORY_SIZE_MB=100
|
||||
# ENV AWS_DEFAULT_REGION=us-east-1
|
||||
|
||||
# Labels
|
||||
ARG VERSION
|
||||
LABEL org.opencontainers.image.title="DeltaGlider" \
|
||||
org.opencontainers.image.description="Delta-aware S3 file storage wrapper with encryption" \
|
||||
org.opencontainers.image.version="5.0.3" \
|
||||
org.opencontainers.image.version="${VERSION}" \
|
||||
org.opencontainers.image.authors="Beshu Limited" \
|
||||
org.opencontainers.image.source="https://github.com/beshu-tech/deltaglider"
|
||||
|
||||
|
||||
74
README.md
74
README.md
@@ -6,14 +6,13 @@
|
||||
[](https://www.python.org/downloads/)
|
||||
[](https://github.com/jmacd/xdelta)
|
||||
|
||||
<div align="center">
|
||||
<img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo" width="500"/>
|
||||
</div>
|
||||
|
||||
**Store 4TB of similar files in 5GB. No, that's not a typo.**
|
||||
|
||||
DeltaGlider is a drop-in S3 replacement that may achieve 99.9% size reduction for versioned compressed artifacts, backups, and release archives through intelligent binary delta compression (via xdelta3).
|
||||
|
||||
> 🌟 Star if you like this! Or Leave a message in [Issues](https://github.com/beshu-tech/deltaglider/issues) - we are listening!
|
||||
|
||||
## The Problem We Solved
|
||||
|
||||
You're storing hundreds of versions of your software releases. Each 100MB build differs by <1% from the previous version. You're paying to store 100GB of what's essentially 100MB of unique data.
|
||||
@@ -26,12 +25,20 @@ From our [ReadOnlyREST case study](docs/case-study-readonlyrest.md):
|
||||
- **Before**: 201,840 files, 3.96TB storage, $1,120/year
|
||||
- **After**: Same files, 4.9GB storage, $1.32/year
|
||||
- **Compression**: 99.9% (not a typo)
|
||||
- **Integration time**: 5 minutes
|
||||
- **Integration time**: 5 minutes
|
||||
- **Data migration** `deltaglider migrate s3://origin-bucket s3://dest-bucket`
|
||||
|
||||
DeltaGlider is great for compressed archives of similar content, like multiple releases of the same software, DB backups, etc.
|
||||
We don't expect significant benefit for multimedia content like videos, but we have never tried it.
|
||||
|
||||
## Quick Start
|
||||
|
||||
The quickest way to start is using the GUI
|
||||
* https://github.com/sscarduzio/dg_commander/
|
||||
DeltaGlider comes as an SDK and a CLI, but we also have a GUI:
|
||||
* https://github.com/beshu-tech/deltaglider_commander/
|
||||
|
||||
<div align="center">
|
||||
<img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo"/>
|
||||
</div>
|
||||
|
||||
### CLI Installation
|
||||
|
||||
@@ -136,11 +143,11 @@ Traditional S3:
|
||||
|
||||
With DeltaGlider:
|
||||
v1.0.0.zip (100MB) → S3: 100MB reference + 0KB delta
|
||||
v1.0.1.zip (100MB) → S3: 98KB delta (100.1MB total)
|
||||
v1.0.2.zip (100MB) → S3: 97KB delta (100.3MB total)
|
||||
v1.0.1.zip (100MB) → S3: 98KB delta (from 100.1MB total)
|
||||
v1.0.2.zip (100MB) → S3: 97KB delta (from 100.3MB total)
|
||||
```
|
||||
|
||||
DeltaGlider stores the first file as a reference and subsequent similar files as tiny deltas (differences). When you download, it reconstructs the original file perfectly using the reference + delta.
|
||||
DeltaGlider stores the first file in a directory (deltaspace) as a reference and subsequent similar files as tiny deltas (differences). When you download, it reconstructs the original file perfectly using the reference + delta.
|
||||
|
||||
### Intelligent File Type Detection
|
||||
|
||||
@@ -160,7 +167,7 @@ DeltaGlider automatically detects file types and applies the optimal strategy:
|
||||
- **AWS CLI Replacement**: Same commands as `aws s3` with automatic compression
|
||||
- **boto3-Compatible SDK**: Works with existing boto3 code with minimal changes
|
||||
- **Zero Configuration**: No databases, no manifest files, no complex setup
|
||||
- **Data Integrity**: SHA256 verification on every operation
|
||||
- **Data Integrity**: original file's SHA256 checksum saved within S3 metadata, verification on every reconstruction
|
||||
- **S3 Compatible**: Works with AWS S3, MinIO, Cloudflare R2, and any S3-compatible storage
|
||||
|
||||
## CLI Reference
|
||||
@@ -203,6 +210,12 @@ deltaglider stats my-bucket --refresh # Force cache refresh
|
||||
deltaglider stats my-bucket --no-cache # Skip caching entirely
|
||||
deltaglider stats my-bucket --json # JSON output for automation
|
||||
|
||||
# Integrity verification & maintenance
|
||||
deltaglider verify s3://releases/file.zip # Validate stored SHA256
|
||||
deltaglider purge my-bucket # Clean expired .deltaglider/tmp files
|
||||
deltaglider purge my-bucket --dry-run # Preview purge results
|
||||
deltaglider purge my-bucket --json # Machine-readable purge stats
|
||||
|
||||
# Migrate existing S3 buckets to DeltaGlider compression
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/ # Interactive migration
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/ --yes # Skip confirmation
|
||||
@@ -483,18 +496,18 @@ This is why DeltaGlider achieves 99%+ compression on versioned archives - xdelta
|
||||
|
||||
### System Architecture
|
||||
|
||||
DeltaGlider uses a clean hexagonal architecture:
|
||||
DeltaGlider intelligently stores files within **DeltaSpaces** - S3 prefixes where related files share a common reference file for delta compression:
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
||||
│ Your App │────▶│ DeltaGlider │────▶│ S3/MinIO │
|
||||
│ (CLI/SDK) │ │ Core │ │ Storage │
|
||||
└─────────────┘ └──────────────┘ └─────────────┘
|
||||
│
|
||||
┌──────▼───────┐
|
||||
│ Local Cache │
|
||||
│ (References) │
|
||||
└──────────────┘
|
||||
┌─────────────┐ ┌──────────────┐ ┌─────────────────┐
|
||||
│ Your App │────▶│ DeltaGlider │────▶│ DeltaSpace │
|
||||
│ (CLI/SDK) │ │ Core │ │ (S3 prefix) │
|
||||
└─────────────┘ └──────────────┘ ├─────────────────┤
|
||||
│ │ reference.bin │
|
||||
┌──────▼───────┐ │ file1.delta │
|
||||
│ Local Cache │ │ file2.delta │
|
||||
│ (References) │ │ file3.delta │
|
||||
└──────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
**Key Components:**
|
||||
@@ -503,6 +516,9 @@ DeltaGlider uses a clean hexagonal architecture:
|
||||
- **Integrity verification**: SHA256 on every operation
|
||||
- **Local caching**: Fast repeated operations
|
||||
- **Zero dependencies**: No database, no manifest files
|
||||
- **Modular storage**: The storage layer is pluggable - you could easily replace S3 with a filesystem driver (using extended attributes for metadata) or any other backend
|
||||
|
||||
The codebase follows a ports-and-adapters pattern where core business logic is decoupled from infrastructure, with storage operations abstracted through well-defined interfaces in the `ports/` directory and concrete implementations in `adapters/`.
|
||||
|
||||
### When to Use DeltaGlider
|
||||
|
||||
@@ -647,14 +663,8 @@ MIT - Use it freely in your projects.
|
||||
|
||||
## Success Stories
|
||||
|
||||
> "We reduced our artifact storage from 4TB to 5GB. This isn't hyperbole—it's math."
|
||||
> — [ReadOnlyREST Case Study](docs/case-study-readonlyrest.md)
|
||||
|
||||
> "Our CI/CD pipeline now uploads 100x faster. Deploys that took minutes now take seconds."
|
||||
> — Platform Engineer at [redacted]
|
||||
|
||||
> "We were about to buy expensive deduplication storage. DeltaGlider saved us $50K/year."
|
||||
> — CTO at [stealth startup]
|
||||
> "We reduced our artifact storage from 4TB to 5GB. CI is also much faster, due to smaller uploads."
|
||||
> — [ReadonlyREST Case Study](docs/case-study-readonlyrest.md)
|
||||
|
||||
---
|
||||
|
||||
@@ -666,4 +676,10 @@ deltaglider analyze s3://your-bucket/
|
||||
# Output: "Potential savings: 95.2% (4.8TB → 237GB)"
|
||||
```
|
||||
|
||||
Built with ❤️ by engineers who were tired of paying to store the same bytes over and over.
|
||||
## Who built this?
|
||||
|
||||
Built with ❤️ by [ReadonlyREST](https://readonlyrest.com) engineers who were tired of paying to store the same bytes over and over.
|
||||
|
||||
We also built [Anaphora](https://anaphora.it) for aggregated reports and alerting.
|
||||
|
||||
And [Deltaglider Commander](https://github.com/beshu-tech/deltaglider_commander)
|
||||
|
||||
@@ -2,7 +2,7 @@ version: '3.8'
|
||||
|
||||
services:
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
ports:
|
||||
- "4566:4566"
|
||||
environment:
|
||||
|
||||
@@ -22,7 +22,7 @@ services:
|
||||
retries: 5
|
||||
|
||||
localstack:
|
||||
image: localstack/localstack:latest
|
||||
image: localstack/localstack:4.4
|
||||
container_name: deltaglider-localstack
|
||||
ports:
|
||||
- "4566:4566"
|
||||
|
||||
@@ -1,28 +1,18 @@
|
||||
# boto3 Compatibility Vision
|
||||
|
||||
## Current State (v4.2.3)
|
||||
DeltaGlider is a drop-in replacement for boto3's S3 client. This document spells out what “drop-in”
|
||||
means in practice so new projects can adopt the SDK with confidence.
|
||||
|
||||
DeltaGlider currently uses custom dataclasses for responses:
|
||||
## Current State (v5.x and newer)
|
||||
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsResponse, ObjectInfo
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsResponse = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response.contents: # Custom field name
|
||||
print(f"{obj.key}: {obj.size}") # Custom ObjectInfo dataclass
|
||||
```
|
||||
|
||||
**Problems:**
|
||||
- ❌ Not a true drop-in replacement for boto3
|
||||
- ❌ Users need to learn DeltaGlider-specific types
|
||||
- ❌ Can't use with tools expecting boto3 responses
|
||||
- ❌ Different API surface (`.contents` vs `['Contents']`)
|
||||
|
||||
## Target State (v5.0.0)
|
||||
|
||||
DeltaGlider should return native boto3-compatible dicts with TypedDict type hints:
|
||||
- `DeltaGliderClient` methods such as `list_objects`, `put_object`, `get_object`, `delete_object`,
|
||||
`delete_objects`, `head_object`, etc. return **boto3-compatible dicts**.
|
||||
- TypedDict aliases in `deltaglider.types` (e.g. `ListObjectsV2Response`, `PutObjectResponse`) give
|
||||
IDE/type-checking support without importing boto3.
|
||||
- DeltaGlider-specific metadata lives inside standard boto3 fields (typically `Metadata`), so tools
|
||||
that ignore those keys see the exact same structures as they would from boto3.
|
||||
- Tests and documentation exercise and describe the boto3-style responses (`response['Contents']`
|
||||
instead of `response.contents`).
|
||||
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsV2Response
|
||||
@@ -30,239 +20,35 @@ from deltaglider import create_client, ListObjectsV2Response
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']: # boto3-compatible!
|
||||
print(f"{obj['Key']}: {obj['Size']}") # Works exactly like boto3
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
- ✅ **True drop-in replacement** - swap `boto3.client('s3')` with `create_client()`
|
||||
- ✅ **No learning curve** - if you know boto3, you know DeltaGlider
|
||||
- ✅ **Tool compatibility** - works with any library expecting boto3 types
|
||||
- ✅ **Type safety** - TypedDict provides IDE autocomplete without boto3 import
|
||||
- ✅ **Zero runtime overhead** - TypedDict compiles to plain dict
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Type Definitions ✅ (DONE)
|
||||
|
||||
Created `deltaglider/types.py` with comprehensive TypedDict definitions:
|
||||
|
||||
```python
|
||||
from typing import TypedDict, NotRequired
|
||||
from datetime import datetime
|
||||
|
||||
class S3Object(TypedDict):
|
||||
Key: str
|
||||
Size: int
|
||||
LastModified: datetime
|
||||
ETag: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
|
||||
class ListObjectsV2Response(TypedDict):
|
||||
Contents: list[S3Object]
|
||||
CommonPrefixes: NotRequired[list[dict[str, str]]]
|
||||
IsTruncated: NotRequired[bool]
|
||||
NextContinuationToken: NotRequired[str]
|
||||
```
|
||||
|
||||
**Key insight:** TypedDict provides type safety at development time but compiles to plain `dict` at runtime!
|
||||
|
||||
### Phase 2: Refactor Client Methods (TODO)
|
||||
|
||||
Update all client methods to return boto3-compatible dicts:
|
||||
|
||||
#### `list_objects()`
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def list_objects(...) -> ListObjectsResponse: # Custom dataclass
|
||||
return ListObjectsResponse(
|
||||
name=bucket,
|
||||
contents=[ObjectInfo(...), ...] # Custom dataclass
|
||||
)
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def list_objects(...) -> ListObjectsV2Response: # TypedDict
|
||||
return {
|
||||
'Contents': [
|
||||
{
|
||||
'Key': 'file.zip', # .delta suffix already stripped
|
||||
'Size': 1024,
|
||||
'LastModified': datetime(...),
|
||||
'ETag': '"abc123"',
|
||||
}
|
||||
],
|
||||
'CommonPrefixes': [{'Prefix': 'dir/'}],
|
||||
'IsTruncated': False,
|
||||
}
|
||||
```
|
||||
|
||||
**Key changes:**
|
||||
1. Return plain dict instead of custom dataclass
|
||||
2. Use boto3 field names: `Contents` not `contents`, `Key` not `key`
|
||||
3. Strip `.delta` suffix transparently (already done)
|
||||
4. Hide `reference.bin` files (already done)
|
||||
|
||||
#### `put_object()`
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def put_object(...) -> dict[str, Any]:
|
||||
return {
|
||||
"ETag": etag,
|
||||
"VersionId": None,
|
||||
"DeltaGliderInfo": {...} # Custom field
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def put_object(...) -> PutObjectResponse: # TypedDict
|
||||
return {
|
||||
'ETag': etag,
|
||||
'ResponseMetadata': {'HTTPStatusCode': 200},
|
||||
# DeltaGlider metadata goes in Metadata field
|
||||
'Metadata': {
|
||||
'deltaglider-is-delta': 'true',
|
||||
'deltaglider-compression-ratio': '0.99'
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `get_object()`
|
||||
|
||||
**Before:**
|
||||
```python
|
||||
def get_object(...) -> dict[str, Any]:
|
||||
return {
|
||||
"Body": data,
|
||||
"ContentLength": len(data),
|
||||
"DeltaGliderInfo": {...} # Custom field
|
||||
}
|
||||
```
|
||||
|
||||
**After:**
|
||||
```python
|
||||
def get_object(...) -> GetObjectResponse: # TypedDict
|
||||
return {
|
||||
'Body': data, # bytes, not StreamingBody (simpler!)
|
||||
'ContentLength': len(data),
|
||||
'LastModified': datetime(...),
|
||||
'ETag': '"abc123"',
|
||||
'Metadata': { # DeltaGlider metadata here
|
||||
'deltaglider-is-delta': 'true'
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `delete_object()`, `delete_objects()`, `head_object()`, etc.
|
||||
|
||||
All follow the same pattern: return boto3-compatible dicts with TypedDict hints.
|
||||
|
||||
### Phase 3: Backward Compatibility (TODO)
|
||||
|
||||
Keep old dataclasses for 1-2 versions with deprecation warnings:
|
||||
|
||||
```python
|
||||
class ListObjectsResponse:
|
||||
"""DEPRECATED: Use dict responses with ListObjectsV2Response type hint.
|
||||
|
||||
This will be removed in v6.0.0. Update your code:
|
||||
|
||||
Before:
|
||||
response.contents[0].key
|
||||
|
||||
After:
|
||||
response['Contents'][0]['Key']
|
||||
"""
|
||||
def __init__(self, data: dict):
|
||||
warnings.warn(
|
||||
"ListObjectsResponse dataclass is deprecated. "
|
||||
"Use dict responses with ListObjectsV2Response type hint.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2
|
||||
)
|
||||
self._data = data
|
||||
|
||||
@property
|
||||
def contents(self):
|
||||
return [ObjectInfo(obj) for obj in self._data.get('Contents', [])]
|
||||
```
|
||||
|
||||
### Phase 4: Update Documentation (TODO)
|
||||
|
||||
1. Update all examples to use dict responses
|
||||
2. Add migration guide from v4.x to v5.0
|
||||
3. Update BOTO3_COMPATIBILITY.md
|
||||
4. Add "Drop-in Replacement" marketing language
|
||||
|
||||
### Phase 5: Update Tests (TODO)
|
||||
|
||||
Convert all tests from:
|
||||
```python
|
||||
assert response.contents[0].key == "file.zip"
|
||||
```
|
||||
|
||||
To:
|
||||
```python
|
||||
assert response['Contents'][0]['Key'] == "file.zip"
|
||||
```
|
||||
|
||||
## Migration Guide (for users)
|
||||
|
||||
### v4.x → v5.0
|
||||
|
||||
**Old code (v4.x):**
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response.contents: # Dataclass attribute
|
||||
print(f"{obj.key}: {obj.size}") # Dataclass attributes
|
||||
```
|
||||
|
||||
**New code (v5.0):**
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']: # Dict key (boto3-compatible)
|
||||
print(f"{obj['Key']}: {obj['Size']}") # Dict keys (boto3-compatible)
|
||||
```
|
||||
|
||||
**Or even simpler - no type hint needed:**
|
||||
```python
|
||||
client = create_client()
|
||||
response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']:
|
||||
    print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
```
|
||||
|
||||
## Key Design Points
|
||||
|
||||
- **TypedDict everywhere** – `put_object`, `get_object`, `list_objects`, `delete_object`, etc.
|
||||
return the same shapes boto3 does. Use the provided aliases (`ListObjectsV2Response`,
|
||||
`PutObjectResponse`, …) for IDE/completion help.
|
||||
- **Metadata namespace** – DeltaGlider-specific flags such as `deltaglider-is-delta` live under the
|
||||
regular `Metadata` key so every response remains valid boto3 output.
|
||||
- **No shims required** – responses are plain dicts. If you already know boto3, you already know how
|
||||
to consume DeltaGlider outputs.
|
||||
|
||||
## Benefits Summary
|
||||
|
||||
### For Users
|
||||
- **Zero learning curve** – identical data structures to boto3; if you know boto3, you're done.
- **Drop-in replacement** – literally change one line (client creation).
- **Type safety** – TypedDicts provide IDE autocomplete even without boto3 installed.
- **Tooling compatibility** – works with any boto3-aware tool or library.
|
||||
### For DeltaGlider
|
||||
- **Cleaner internals** – no custom dataclasses to maintain.
- **Simpler docs/tests** – examples mirror boto3 verbatim; tests run against boto3 behavior directly.
- **Marketing accuracy** – "drop-in replacement" is now literal.
- **Future-proof** – if boto3 adds fields, users can access them immediately.
||||
|
||||
## Technical Details
|
||||
|
||||
### How TypedDict Works
|
||||
```python
|
||||
from typing import TypedDict
|
||||
|
||||
@@ -270,47 +56,29 @@ class MyResponse(TypedDict):
|
||||
Key: str
|
||||
Size: int
|
||||
|
||||
# At runtime, this is just a dict!
response: MyResponse = {'Key': 'file.zip', 'Size': 1024}
print(type(response))  # <class 'dict'>

# But mypy and IDEs understand the structure
response['Key']          # ✅ Autocomplete works!
response['Nonexistent']  # ❌ Mypy error: Key 'Nonexistent' not found
|
||||
```
|
||||
At runtime the structure is still a plain `dict`, but static type-checkers understand the shape.
|
||||
|
||||
### DeltaGlider-Specific Metadata

Delta-specific fields live inside the standard boto3 `Metadata` map. Example `list_objects` entry:
|
||||
```python
|
||||
{
|
||||
'Key': 'file.zip',
|
||||
'Size': 1024,
|
||||
'Metadata': {
|
||||
# DeltaGlider-specific fields (prefixed for safety)
|
||||
'deltaglider-is-delta': 'true',
|
||||
'deltaglider-compression-ratio': '0.99',
|
||||
'deltaglider-original-size': '100000',
        'deltaglider-reference-key': 'releases/v1.0.0/reference.bin',
    }
|
||||
}
|
||||
```
|
||||
These keys are namespaced (`deltaglider-...`) so they are safe to ignore if not needed.
|
||||
|
||||
This is:
|
||||
- ✅ boto3-compatible (Metadata is a standard field)
|
||||
- ✅ Namespaced (deltaglider- prefix prevents conflicts)
|
||||
- ✅ Optional (tools can ignore it)
|
||||
- ✅ Type-safe (Metadata: NotRequired[dict[str, str]])
|
||||
## Status
|
||||
|
||||
- ✅ **Phase 1:** TypedDict definitions created
|
||||
- ✅ **Phase 2:** `list_objects()` refactored to return boto3-compatible dict
|
||||
- ⏳ **Phase 3:** Refactor remaining methods (`put_object`, `get_object`, etc.) (TODO)
|
||||
- ⏳ **Phase 4:** Backward compatibility with deprecation warnings (TODO)
|
||||
- ⏳ **Phase 5:** Documentation updates (TODO)
|
||||
- ⏳ **Phase 6:** Full test coverage updates (PARTIAL - list_objects tests done)
|
||||
|
||||
**Current:** v4.2.3+ (Phase 2 complete - `list_objects()` boto3-compatible)
|
||||
**Target:** v5.0.0 release (all phases complete)
|
||||
- ✅ TypedDict builders are used everywhere (`build_list_objects_response`, etc.).
|
||||
- ✅ Tests assert boto3-style dict access (`response['Contents']`).
|
||||
- ✅ Documentation (README, SDK docs, examples) shows the boto3 syntax.
|
||||
|
||||
364
docs/DOCKER.md
Normal file
364
docs/DOCKER.md
Normal file
@@ -0,0 +1,364 @@
|
||||
# Docker Support for DeltaGlider
|
||||
|
||||
This document describes how to build, run, and publish Docker images for DeltaGlider.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Pull and run the latest image
|
||||
|
||||
```bash
|
||||
docker pull beshultd/deltaglider:latest
|
||||
docker run --rm beshultd/deltaglider:latest --help
|
||||
```
|
||||
|
||||
### Run with AWS credentials
|
||||
|
||||
```bash
|
||||
docker run --rm \
|
||||
-e AWS_ACCESS_KEY_ID=your_key \
|
||||
-e AWS_SECRET_ACCESS_KEY=your_secret \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:latest ls s3://your-bucket/
|
||||
```
|
||||
|
||||
### Run with MinIO (local S3 alternative)
|
||||
|
||||
```bash
|
||||
# Start MinIO
|
||||
docker run -d \
|
||||
-p 9000:9000 -p 9001:9001 \
|
||||
-e MINIO_ROOT_USER=minioadmin \
|
||||
-e MINIO_ROOT_PASSWORD=minioadmin \
|
||||
--name minio \
|
||||
minio/minio server /data --console-address ":9001"
|
||||
|
||||
# Use DeltaGlider with MinIO
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:latest ls
|
||||
```
|
||||
|
||||
## Building Locally
|
||||
|
||||
### Build with current git version
|
||||
|
||||
```bash
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker build --build-arg VERSION=${VERSION} -t beshultd/deltaglider:${VERSION} .
|
||||
```
|
||||
|
||||
### Build with custom version
|
||||
|
||||
```bash
|
||||
docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
### Multi-platform build
|
||||
|
||||
```bash
|
||||
# Create a buildx builder (one-time setup)
|
||||
docker buildx create --name deltaglider-builder --use
|
||||
|
||||
# Build for multiple platforms
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=6.0.2 \
|
||||
-t beshultd/deltaglider:6.0.2 \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
## Testing the Image
|
||||
|
||||
### Basic functionality test
|
||||
|
||||
```bash
|
||||
# Check version
|
||||
docker run --rm beshultd/deltaglider:test --version
|
||||
|
||||
# Check help
|
||||
docker run --rm beshultd/deltaglider:test --help
|
||||
|
||||
# List available commands
|
||||
docker run --rm beshultd/deltaglider:test
|
||||
```
|
||||
|
||||
### Integration test with MinIO
|
||||
|
||||
```bash
|
||||
# 1. Start MinIO
|
||||
docker run -d \
|
||||
-p 9000:9000 -p 9001:9001 \
|
||||
-e MINIO_ROOT_USER=minioadmin \
|
||||
-e MINIO_ROOT_PASSWORD=minioadmin \
|
||||
--name minio \
|
||||
minio/minio server /data --console-address ":9001"
|
||||
|
||||
# 2. Create a test file
|
||||
echo "Hello DeltaGlider" > test.txt
|
||||
|
||||
# 3. Upload to S3/MinIO
|
||||
docker run --rm \
|
||||
-v $(pwd):/data \
|
||||
-w /data \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test cp test.txt s3://test-bucket/
|
||||
|
||||
# 4. List bucket contents
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test ls s3://test-bucket/
|
||||
|
||||
# 5. Get statistics
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test stats test-bucket
|
||||
|
||||
# 6. Cleanup
|
||||
docker stop minio && docker rm minio
|
||||
rm test.txt
|
||||
```
|
||||
|
||||
## Publishing to Docker Hub
|
||||
|
||||
### Manual Publishing
|
||||
|
||||
```bash
|
||||
# 1. Log in to Docker Hub
|
||||
docker login
|
||||
|
||||
# 2. Build the image
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker build --build-arg VERSION=${VERSION} \
|
||||
-t beshultd/deltaglider:${VERSION} \
|
||||
-t beshultd/deltaglider:latest \
|
||||
.
|
||||
|
||||
# 3. Push to Docker Hub
|
||||
docker push beshultd/deltaglider:${VERSION}
|
||||
docker push beshultd/deltaglider:latest
|
||||
```
|
||||
|
||||
### Multi-platform Publishing
|
||||
|
||||
```bash
|
||||
# Create builder (one-time setup)
|
||||
docker buildx create --name deltaglider-builder --use
|
||||
|
||||
# Build and push for multiple platforms
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=${VERSION} \
|
||||
-t beshultd/deltaglider:${VERSION} \
|
||||
-t beshultd/deltaglider:latest \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
## GitHub Actions Automation
|
||||
|
||||
The repository includes a GitHub Action workflow (`.github/workflows/docker-publish.yml`) that automatically builds and publishes Docker images.
|
||||
|
||||
### Automatic Publishing Triggers
|
||||
|
||||
- **On main branch push**: Tags as `latest`
|
||||
- **On develop branch push**: Tags as `develop`
|
||||
- **On version tag push** (e.g., `v6.0.2`): Tags with semver patterns:
|
||||
- `6.0.2` (full version)
|
||||
- `6.0` (major.minor)
|
||||
- `6` (major)
|
||||
- **On pull request**: Builds but doesn't push (testing only)
|
||||
|
||||
### Required GitHub Secrets
|
||||
|
||||
Set these secrets in your GitHub repository settings (`Settings > Secrets and variables > Actions`):
|
||||
|
||||
1. **DOCKERHUB_USERNAME**: Your Docker Hub username (e.g., `beshultd`)
|
||||
2. **DOCKERHUB_TOKEN**: Docker Hub access token (create at https://hub.docker.com/settings/security)
|
||||
|
||||
### Manual Workflow Trigger
|
||||
|
||||
You can manually trigger the Docker build workflow from the GitHub Actions tab:
|
||||
|
||||
1. Go to **Actions** tab
|
||||
2. Select **Build and Publish Docker Images**
|
||||
3. Click **Run workflow**
|
||||
4. Select branch and click **Run workflow**
|
||||
|
||||
## Docker Image Details
|
||||
|
||||
### Image Layers
|
||||
|
||||
The Dockerfile uses a multi-stage build:
|
||||
|
||||
1. **Builder stage**: Installs UV and Python dependencies
|
||||
2. **Runtime stage**: Minimal Python 3.12-slim with only runtime dependencies
|
||||
|
||||
### Image Features
|
||||
|
||||
- **Size**: ~150MB (compressed)
|
||||
- **Platforms**: linux/amd64, linux/arm64
|
||||
- **User**: Runs as non-root user `deltaglider` (UID 1000)
|
||||
- **Base**: Python 3.12-slim (Debian)
|
||||
- **Dependencies**:
|
||||
- Python 3.12
|
||||
- xdelta3 (binary diff tool)
|
||||
- All Python dependencies from `pyproject.toml`
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The image supports the following environment variables:
|
||||
|
||||
```bash
|
||||
# Logging
|
||||
DG_LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR
|
||||
|
||||
# Performance & Compression
|
||||
DG_MAX_RATIO=0.5 # Max delta/file ratio (0.0-1.0)
|
||||
|
||||
# Cache Configuration
|
||||
DG_CACHE_BACKEND=filesystem # filesystem or memory
|
||||
DG_CACHE_MEMORY_SIZE_MB=100 # Memory cache size
|
||||
DG_CACHE_ENCRYPTION_KEY= # Optional encryption key
|
||||
|
||||
# AWS Configuration
|
||||
AWS_ENDPOINT_URL= # S3 endpoint (for MinIO/LocalStack)
|
||||
AWS_ACCESS_KEY_ID= # AWS access key
|
||||
AWS_SECRET_ACCESS_KEY= # AWS secret key
|
||||
AWS_DEFAULT_REGION=us-east-1 # AWS region
|
||||
```
|
||||
|
||||
### Health Check
|
||||
|
||||
The image includes a health check that runs every 30 seconds:
|
||||
|
||||
```bash
|
||||
docker inspect --format='{{.State.Health.Status}}' <container-id>
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Build Issues
|
||||
|
||||
#### "setuptools-scm was unable to detect version"
|
||||
|
||||
**Cause**: Git metadata not available during build.
|
||||
|
||||
**Solution**: Always use the `VERSION` build arg:
|
||||
|
||||
```bash
|
||||
docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
#### Cache issues
|
||||
|
||||
**Cause**: Docker build cache causing stale builds.
|
||||
|
||||
**Solution**: Use `--no-cache` flag:
|
||||
|
||||
```bash
|
||||
docker build --no-cache --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
### Runtime Issues
|
||||
|
||||
#### "unauthorized: access token has insufficient scopes"
|
||||
|
||||
**Cause**: Not logged in to Docker Hub or invalid credentials.
|
||||
|
||||
**Solution**:
|
||||
|
||||
```bash
|
||||
docker login
|
||||
# Enter your Docker Hub credentials
|
||||
```
|
||||
|
||||
#### "Cannot connect to MinIO/LocalStack"
|
||||
|
||||
**Cause**: Using `localhost` instead of `host.docker.internal` from inside container.
|
||||
|
||||
**Solution**: Use `host.docker.internal` for Mac/Windows or `172.17.0.1` for Linux:
|
||||
|
||||
```bash
|
||||
# Mac/Windows
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000
|
||||
|
||||
# Linux
|
||||
-e AWS_ENDPOINT_URL=http://172.17.0.1:9000
|
||||
```
|
||||
|
||||
## Docker Compose
|
||||
|
||||
For local development with MinIO:
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
environment:
|
||||
MINIO_ROOT_USER: minioadmin
|
||||
MINIO_ROOT_PASSWORD: minioadmin
|
||||
command: server /data --console-address ":9001"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
deltaglider:
|
||||
image: beshultd/deltaglider:latest
|
||||
environment:
|
||||
AWS_ENDPOINT_URL: http://minio:9000
|
||||
AWS_ACCESS_KEY_ID: minioadmin
|
||||
AWS_SECRET_ACCESS_KEY: minioadmin
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
DG_LOG_LEVEL: DEBUG
|
||||
depends_on:
|
||||
- minio
|
||||
volumes:
|
||||
- ./data:/data
|
||||
working_dir: /data
|
||||
command: ["--help"]
|
||||
```
|
||||
|
||||
Run with:
|
||||
|
||||
```bash
|
||||
docker-compose up -d
|
||||
docker-compose run --rm deltaglider ls
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always specify version**: Use `--build-arg VERSION=x.y.z` when building
|
||||
2. **Use multi-stage builds**: Keeps final image small
|
||||
3. **Tag with semantic versions**: Follow semver (major.minor.patch)
|
||||
4. **Test before pushing**: Run integration tests locally
|
||||
5. **Use secrets**: Never hardcode credentials in images
|
||||
6. **Multi-platform builds**: Support both amd64 and arm64
|
||||
7. **Update README**: Keep Docker Hub description in sync with README.md
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Docker Hub Repository](https://hub.docker.com/r/beshultd/deltaglider)
|
||||
- [GitHub Repository](https://github.com/beshu-tech/deltaglider)
|
||||
- [MinIO Documentation](https://min.io/docs/minio/container/index.html)
|
||||
- [Docker Buildx Documentation](https://docs.docker.com/buildx/working-with-buildx/)
|
||||
@@ -1,237 +0,0 @@
|
||||
# Metadata Issue Diagnosis and Resolution
|
||||
|
||||
## Issue Summary
|
||||
|
||||
**Date**: 2025-10-14
|
||||
**Severity**: Medium (affects stats accuracy, not functionality)
|
||||
**Status**: Diagnosed, enhanced logging added
|
||||
|
||||
## The Problem
|
||||
|
||||
When running `deltaglider stats`, you saw warnings like:
|
||||
|
||||
```
|
||||
Delta build/1.66.1/universal/readonlyrest_kbn_universal-1.66.1_es9.1.3.zip.delta:
|
||||
no original_size metadata (original_size=342104, size=342104).
|
||||
Using compressed size as fallback. This may undercount space savings.
|
||||
```
|
||||
|
||||
This indicates that delta files are missing the `file_size` metadata key, which causes stats to undercount compression savings.
|
||||
|
||||
## Root Cause
|
||||
|
||||
The delta files in your bucket **do not have S3 object metadata** attached to them. Specifically, they're missing the `file_size` key that DeltaGlider uses to calculate the original file size before compression.
|
||||
|
||||
### Why Metadata is Missing
|
||||
|
||||
Possible causes (in order of likelihood):
|
||||
|
||||
1. **Uploaded with older DeltaGlider version**: Files uploaded before `file_size` metadata was added
|
||||
2. **Direct S3 upload**: Files copied directly via AWS CLI, s3cmd, or other tools (bypassing DeltaGlider)
|
||||
3. **Upload failure**: Metadata write failed during upload but file upload succeeded
|
||||
4. **S3 storage issue**: Metadata was lost due to S3 provider issue (rare)
|
||||
|
||||
### What DeltaGlider Expects
|
||||
|
||||
When DeltaGlider uploads a delta file, it stores these metadata keys:
|
||||
|
||||
```python
|
||||
{
|
||||
"tool": "deltaglider/5.x.x",
|
||||
"original_name": "file.zip",
|
||||
"file_sha256": "abc123...",
|
||||
"file_size": "1048576", # ← MISSING in your files
|
||||
"created_at": "2025-01-01T00:00:00Z",
|
||||
"ref_key": "prefix/reference.bin",
|
||||
"ref_sha256": "def456...",
|
||||
"delta_size": "524288",
|
||||
"delta_cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta"
|
||||
}
|
||||
```
|
||||
|
||||
Without `file_size`, DeltaGlider can't calculate the space savings accurately.
|
||||
|
||||
## Impact
|
||||
|
||||
### What Works
|
||||
- ✅ File upload/download - completely unaffected
|
||||
- ✅ Delta compression - works normally
|
||||
- ✅ Verification - integrity checks work fine
|
||||
- ✅ All other operations - sync, ls, cp, etc.
|
||||
|
||||
### What's Affected
|
||||
- ❌ **Stats accuracy**: Compression metrics are undercounted
|
||||
- Files without metadata: counted as if they saved 0 bytes
|
||||
- Actual compression ratio: underestimated
|
||||
- Space saved: underestimated
|
||||
|
||||
### Example Impact
|
||||
|
||||
If you have 100 delta files:
|
||||
- 90 files with metadata: accurate stats
|
||||
- 10 files without metadata: counted at compressed size (no savings shown)
|
||||
- **Result**: Stats show ~90% of actual compression savings
|
||||
|
||||
## The Fix (Already Applied)
|
||||
|
||||
### Enhanced Logging
|
||||
|
||||
We've improved the logging in `src/deltaglider/client_operations/stats.py` to help diagnose the issue:
|
||||
|
||||
**1. During metadata fetch (lines 317-333)**:
|
||||
```python
|
||||
if "file_size" in metadata:
|
||||
original_size = int(metadata["file_size"])
|
||||
logger.debug(f"Delta {key}: using original_size={original_size} from metadata")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Delta {key}: metadata missing 'file_size' key. "
|
||||
f"Available keys: {list(metadata.keys())}. "
|
||||
f"Using compressed size={size} as fallback"
|
||||
)
|
||||
```
|
||||
|
||||
This will show you exactly which metadata keys ARE present on the object.
|
||||
|
||||
**2. During stats calculation (lines 395-405)**:
|
||||
```python
|
||||
logger.warning(
|
||||
f"Delta {obj.key}: no original_size metadata "
|
||||
f"(original_size={obj.original_size}, size={obj.size}). "
|
||||
f"Using compressed size as fallback. "
|
||||
f"This may undercount space savings."
|
||||
)
|
||||
```
|
||||
|
||||
This shows both values so you can see if they're equal (metadata missing) or different (metadata present).
|
||||
|
||||
### CLI Help Improvement
|
||||
|
||||
We've also improved the `stats` command help (line 750):
|
||||
```python
|
||||
@cli.command(short_help="Get bucket statistics and compression metrics")
|
||||
```
|
||||
|
||||
And enhanced the option descriptions to be more informative.
|
||||
|
||||
## Verification
|
||||
|
||||
To check which files are missing metadata, you can use the diagnostic script:
|
||||
|
||||
```bash
|
||||
# Create and run the metadata checker
|
||||
python scripts/check_metadata.py <your-bucket-name>
|
||||
```
|
||||
|
||||
This will show:
|
||||
- Total delta files
|
||||
- Files with complete metadata
|
||||
- Files missing metadata
|
||||
- Specific missing fields for each file
|
||||
|
||||
## Resolution Options
|
||||
|
||||
### Option 1: Re-upload Files (Recommended)
|
||||
|
||||
Re-uploading files will attach proper metadata:
|
||||
|
||||
```bash
|
||||
# Re-upload a single file
|
||||
deltaglider cp local-file.zip s3://bucket/path/file.zip
|
||||
|
||||
# Re-upload a directory
|
||||
deltaglider sync local-dir/ s3://bucket/path/
|
||||
```
|
||||
|
||||
**Pros**:
|
||||
- Accurate stats for all files
|
||||
- Proper metadata for future operations
|
||||
- One-time fix
|
||||
|
||||
**Cons**:
|
||||
- Takes time to re-upload
|
||||
- Uses bandwidth
|
||||
|
||||
### Option 2: Accept Inaccurate Stats
|
||||
|
||||
Keep files as-is and accept that stats are undercounted:
|
||||
|
||||
**Pros**:
|
||||
- No work required
|
||||
- Files still work perfectly for download/verification
|
||||
|
||||
**Cons**:
|
||||
- Stats show less compression than actually achieved
|
||||
- Missing metadata for future features
|
||||
|
||||
### Option 3: Metadata Repair Tool (Future)
|
||||
|
||||
We could create a tool that:
|
||||
1. Downloads each delta file
|
||||
2. Reconstructs it to get original size
|
||||
3. Updates metadata in-place
|
||||
|
||||
**Status**: Not implemented yet, but feasible if needed.
|
||||
|
||||
## Prevention
|
||||
|
||||
For future uploads, DeltaGlider **will always** attach complete metadata (assuming current version is used).
|
||||
|
||||
The code in `src/deltaglider/core/service.py` (lines 445-467) ensures metadata is set:
|
||||
|
||||
```python
|
||||
delta_meta = DeltaMeta(
|
||||
tool=self.tool_version,
|
||||
original_name=original_name,
|
||||
file_sha256=file_sha256,
|
||||
file_size=file_size, # ← Always set
|
||||
created_at=self.clock.now(),
|
||||
ref_key=ref_key,
|
||||
ref_sha256=ref_sha256,
|
||||
delta_size=delta_size,
|
||||
delta_cmd=f"xdelta3 -e -9 -s reference.bin {original_name} {original_name}.delta",
|
||||
)
|
||||
|
||||
self.storage.put(
|
||||
full_delta_key,
|
||||
delta_path,
|
||||
delta_meta.to_dict(), # ← Includes file_size
|
||||
)
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
After reinstalling from source, run stats with enhanced logging:
|
||||
|
||||
```bash
|
||||
# Install from source
|
||||
pip install -e .
|
||||
|
||||
# Run stats with INFO logging to see detailed messages
|
||||
DG_LOG_LEVEL=INFO deltaglider stats mybucket --detailed
|
||||
|
||||
# Look for warnings like:
|
||||
# "Delta X: metadata missing 'file_size' key. Available keys: [...]"
|
||||
```
|
||||
|
||||
The warning will now show which metadata keys ARE present, helping you understand if:
|
||||
- Metadata is completely empty: `Available keys: []`
|
||||
- Metadata exists but incomplete: `Available keys: ['tool', 'ref_key', ...]`
|
||||
|
||||
## Summary
|
||||
|
||||
| Aspect | Status |
|
||||
|--------|--------|
|
||||
| File operations | ✅ Unaffected |
|
||||
| Stats accuracy | ⚠️ Undercounted for files missing metadata |
|
||||
| Logging | ✅ Enhanced to show missing keys |
|
||||
| Future uploads | ✅ Will have complete metadata |
|
||||
| Resolution | 📋 Re-upload or accept inaccuracy |
|
||||
|
||||
## Related Files
|
||||
|
||||
- `src/deltaglider/client_operations/stats.py` - Enhanced logging
|
||||
- `src/deltaglider/core/service.py` - Metadata creation
|
||||
- `src/deltaglider/core/models.py` - DeltaMeta definition
|
||||
- `scripts/check_metadata.py` - Diagnostic tool (NEW)
|
||||
- `docs/PAGINATION_BUG_FIX.md` - Related performance fix
|
||||
@@ -1,258 +0,0 @@
|
||||
# Pagination Bug Fix - Critical Issue Resolution
|
||||
|
||||
## Summary
|
||||
|
||||
**Date**: 2025-10-14
|
||||
**Severity**: Critical (infinite loop causing operations to never complete)
|
||||
**Status**: Fixed
|
||||
|
||||
Fixed a critical pagination bug that caused S3 LIST operations to loop infinitely, returning the same objects repeatedly instead of advancing through the bucket.
|
||||
|
||||
## The Bug
|
||||
|
||||
### Symptoms
|
||||
- LIST operations would take minutes or never complete
|
||||
- Pagination logs showed linear growth: page 10 = 9,000 objects, page 20 = 19,000 objects, etc.
|
||||
- Buckets with ~hundreds of objects showed 169,000+ objects after 170+ pages
|
||||
- System meters showed continuous 3MB/s download during listing
|
||||
- Operation would eventually hit max_iterations limit (10,000 pages) and return partial results
|
||||
|
||||
### Root Cause
|
||||
|
||||
The code was using **StartAfter** with **NextContinuationToken**, which is incorrect according to AWS S3 API:
|
||||
|
||||
**Incorrect behavior (before fix)**:
|
||||
```python
|
||||
# In list_objects_page() call
|
||||
response = storage.list_objects(
|
||||
bucket=bucket,
|
||||
start_after=page.next_continuation_token, # ❌ WRONG!
|
||||
)
|
||||
|
||||
# In storage_s3.py
|
||||
if start_after:
|
||||
params["StartAfter"] = start_after # ❌ Expects object key, not token!
|
||||
```
|
||||
|
||||
**Problem**:
|
||||
- `NextContinuationToken` is an opaque token from S3's `list_objects_v2` response
|
||||
- `StartAfter` expects an **actual object key** (string), not a continuation token
|
||||
- When boto3 receives an invalid StartAfter value (a token instead of a key), it ignores it and restarts from the beginning
|
||||
- This caused pagination to restart on every page, returning the same objects repeatedly
|
||||
|
||||
### Why It Happened
|
||||
|
||||
The S3 LIST pagination API has two different mechanisms:
|
||||
|
||||
1. **StartAfter** (S3 v1 style): Resume listing after a specific object key
|
||||
- Used for the **first page** when you want to start from a specific key
|
||||
- Example: `StartAfter="my-object-123.txt"`
|
||||
|
||||
2. **ContinuationToken** (S3 v2 style): Resume from an opaque token
|
||||
- Used for **subsequent pages** in paginated results
|
||||
- Example: `ContinuationToken="1vD6KR5W...encrypted_token..."`
|
||||
- This is what `NextContinuationToken` from the response should be used with
|
||||
|
||||
Our code mixed these two mechanisms, using StartAfter for pagination when it should use ContinuationToken.
|
||||
|
||||
## The Fix
|
||||
|
||||
### Changed Files
|
||||
|
||||
1. **src/deltaglider/adapters/storage_s3.py**
|
||||
- Added `continuation_token` parameter to `list_objects()`
|
||||
- Changed boto3 call to use `ContinuationToken` instead of `StartAfter` for pagination
|
||||
- Kept `StartAfter` support for initial page positioning
|
||||
|
||||
2. **src/deltaglider/core/object_listing.py**
|
||||
- Added `continuation_token` parameter to `list_objects_page()`
|
||||
- Changed `list_all_objects()` to use `continuation_token` variable instead of `start_after`
|
||||
- Updated pagination loop to pass continuation tokens correctly
|
||||
- Added debug logging showing continuation token in use
|
||||
|
||||
### Code Changes
|
||||
|
||||
**storage_s3.py - Before**:
|
||||
```python
|
||||
def list_objects(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
params: dict[str, Any] = {"Bucket": bucket, "MaxKeys": max_keys}
|
||||
|
||||
if start_after:
|
||||
params["StartAfter"] = start_after # ❌ Used for pagination
|
||||
|
||||
response = self.client.list_objects_v2(**params)
|
||||
```
|
||||
|
||||
**storage_s3.py - After**:
|
||||
```python
|
||||
def list_objects(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None, # ✅ NEW
|
||||
) -> dict[str, Any]:
|
||||
params: dict[str, Any] = {"Bucket": bucket, "MaxKeys": max_keys}
|
||||
|
||||
# ✅ Use ContinuationToken for pagination, StartAfter only for first page
|
||||
if continuation_token:
|
||||
params["ContinuationToken"] = continuation_token
|
||||
elif start_after:
|
||||
params["StartAfter"] = start_after
|
||||
|
||||
response = self.client.list_objects_v2(**params)
|
||||
```
|
||||
|
||||
**object_listing.py - Before**:
|
||||
```python
|
||||
def list_all_objects(...) -> ObjectListing:
|
||||
aggregated = ObjectListing()
|
||||
start_after: str | None = None # ❌ Wrong variable name
|
||||
|
||||
while True:
|
||||
page = list_objects_page(
|
||||
storage,
|
||||
bucket=bucket,
|
||||
start_after=start_after, # ❌ Passing token as start_after
|
||||
)
|
||||
|
||||
aggregated.objects.extend(page.objects)
|
||||
|
||||
if not page.is_truncated:
|
||||
break
|
||||
|
||||
start_after = page.next_continuation_token # ❌ Token → start_after
|
||||
```
|
||||
|
||||
**object_listing.py - After**:
|
||||
```python
|
||||
def list_all_objects(...) -> ObjectListing:
|
||||
aggregated = ObjectListing()
|
||||
continuation_token: str | None = None # ✅ Correct variable
|
||||
|
||||
while True:
|
||||
page = list_objects_page(
|
||||
storage,
|
||||
bucket=bucket,
|
||||
continuation_token=continuation_token, # ✅ Token → token
|
||||
)
|
||||
|
||||
aggregated.objects.extend(page.objects)
|
||||
|
||||
if not page.is_truncated:
|
||||
break
|
||||
|
||||
continuation_token = page.next_continuation_token # ✅ Token → token
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests
|
||||
Created comprehensive unit tests in `tests/unit/test_object_listing.py`:
|
||||
|
||||
1. **test_list_objects_page_passes_continuation_token**: Verifies token is passed correctly
|
||||
2. **test_list_all_objects_uses_continuation_token_for_pagination**: Verifies 3-page pagination works
|
||||
3. **test_list_all_objects_prevents_infinite_loop**: Verifies max_iterations protection
|
||||
|
||||
### Manual Verification
|
||||
Created verification script that checks for:
|
||||
- `continuation_token` parameter in both files
|
||||
- `ContinuationToken` usage in boto3 call
|
||||
- Token priority logic (`if continuation_token:` before `elif start_after:`)
|
||||
- Correct variable names throughout pagination loop
|
||||
|
||||
All checks passed ✅
|
||||
|
||||
## Expected Behavior After Fix
|
||||
|
||||
### Before (Broken)
|
||||
```
|
||||
[21:26:16.663] LIST pagination: page 1, 0 objects so far
|
||||
[21:26:18.884] LIST pagination: page 10, 9000 objects so far
|
||||
[21:26:20.930] LIST pagination: page 20, 19000 objects so far
|
||||
[21:26:52.290] LIST pagination: page 170, 169000 objects so far
|
||||
... continues indefinitely ...
|
||||
```
|
||||
|
||||
### After (Fixed)
|
||||
```
|
||||
[21:26:16.663] LIST pagination: page 1, 0 objects so far
|
||||
[21:26:17.012] LIST pagination: page 2, 1000 objects so far, token=AbCd1234EfGh5678...
|
||||
[21:26:17.089] LIST complete: 2 pages, 1234 objects total in 0.43s
|
||||
```
|
||||
|
||||
## Performance Impact
|
||||
|
||||
For a bucket with ~1,000 objects:
|
||||
|
||||
**Before**:
|
||||
- 170+ pages × ~200ms per page = 34+ seconds
|
||||
- Would eventually timeout or hit max_iterations
|
||||
|
||||
**After**:
|
||||
- 2 pages × ~200ms per page = <1 second
|
||||
- ~34x improvement for this case
|
||||
- Actual speedup scales with bucket size (more objects = bigger speedup)
|
||||
|
||||
For a bucket with 200,000 objects (typical production case):
|
||||
- **Before**: Would never complete (would hit 10,000 page limit)
|
||||
- **After**: ~200 pages × ~200ms = ~40 seconds (50x fewer pages than the 10,000-page safety limit!)
|
||||
|
||||
## AWS S3 Pagination Documentation Reference
|
||||
|
||||
From AWS S3 API documentation:
|
||||
|
||||
> **ContinuationToken** (string) - Indicates that the list is being continued on this bucket with a token. ContinuationToken is obfuscated and is not a real key.
|
||||
>
|
||||
> **StartAfter** (string) - Starts after this specified key. StartAfter can be any key in the bucket.
|
||||
>
|
||||
> **NextContinuationToken** (string) - NextContinuationToken is sent when isTruncated is true, which means there are more keys in the bucket that can be listed. The next list requests to Amazon S3 can be continued with this NextContinuationToken.
|
||||
|
||||
Source: [AWS S3 ListObjectsV2 API Documentation](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html)
|
||||
|
||||
## Related Issues
|
||||
|
||||
This bug also affected:
|
||||
- `get_bucket_stats()` - Would take 20+ minutes due to infinite pagination
|
||||
- Any operation using `list_all_objects()` - sync, ls, etc.
|
||||
|
||||
All these operations are now fixed by this pagination fix.
|
||||
|
||||
## Prevention
|
||||
|
||||
To prevent similar issues in the future:
|
||||
|
||||
1. ✅ **Unit tests added**: Verify pagination token handling
|
||||
2. ✅ **Debug logging added**: Shows continuation token in use
|
||||
3. ✅ **Type checking**: mypy catches parameter mismatches
|
||||
4. ✅ **Max iterations limit**: Prevents truly infinite loops (fails safely)
|
||||
5. ✅ **Documentation**: This document explains the fix
|
||||
|
||||
## Verification Checklist
|
||||
|
||||
- [x] Code changes implemented
|
||||
- [x] Unit tests added
|
||||
- [x] Type checking passes (mypy)
|
||||
- [x] Linting passes (ruff)
|
||||
- [x] Manual verification script passes
|
||||
- [x] Documentation created
|
||||
- [x] Performance characteristics documented
|
||||
- [x] AWS API documentation referenced
|
||||
|
||||
## Author Notes
|
||||
|
||||
This was a classic case of mixing two similar but different API mechanisms. The bug was subtle because:
|
||||
1. boto3 didn't throw an error — S3 accepted the continuation token as a literal `StartAfter` key (StartAfter may be any string), so listing restarted from the wrong lexicographic position instead of failing
|
||||
2. The pagination appeared to work (returned objects), just the wrong objects
|
||||
3. The linear growth pattern (9K, 19K, 29K) made it look like a counting bug, not a pagination bug
|
||||
|
||||
The fix is simple but critical: use the right parameter (`ContinuationToken`) with the right value (`NextContinuationToken`).
|
||||
@@ -12,6 +12,8 @@ DeltaGlider provides AWS S3 CLI compatible commands with automatic delta compres
|
||||
- `deltaglider migrate <source> <destination>` - Migrate S3 buckets with compression and EC2 cost warnings
|
||||
- `deltaglider stats <bucket>` - Get bucket statistics and compression metrics
|
||||
- `deltaglider verify <s3_url>` - Verify file integrity
|
||||
- `deltaglider put-bucket-acl <bucket>` - Set bucket ACL (s3api compatible)
|
||||
- `deltaglider get-bucket-acl <bucket>` - Get bucket ACL (s3api compatible)
|
||||
|
||||
### Current Usage Examples
|
||||
```bash
|
||||
@@ -23,6 +25,14 @@ deltaglider cp s3://bucket/path/to/file.zip .
|
||||
|
||||
# Verify integrity
|
||||
deltaglider verify s3://bucket/path/to/file.zip.delta
|
||||
|
||||
# Set bucket ACL
|
||||
deltaglider put-bucket-acl my-bucket --acl public-read
|
||||
deltaglider put-bucket-acl my-bucket --acl private
|
||||
deltaglider put-bucket-acl my-bucket --grant-read id=12345
|
||||
|
||||
# Get bucket ACL
|
||||
deltaglider get-bucket-acl my-bucket
|
||||
```
|
||||
|
||||
## Target State: AWS S3 CLI Compatibility
|
||||
|
||||
@@ -1,347 +1,76 @@
|
||||
# Case Study: How ReadOnlyREST Reduced Storage Costs by 99.9% with DeltaGlider
|
||||
## How ReadonlyREST Cut 4TB of S3 Storage Down to 5GB (and Saved 99.9%)
|
||||
|
||||
## Executive Summary
|
||||
### TL;DR
|
||||
|
||||
**The Challenge**: ReadOnlyREST, a security plugin for Elasticsearch, was facing exponential storage costs managing 145 release versions across multiple product lines, consuming nearly 4TB of S3 storage.
|
||||
We were paying to store 4TB of mostly identical plugin builds.
|
||||
DeltaGlider deduplicated everything down to 4.9GB — 99.9% smaller, $1.1k/year cheaper, and no workflow changes.
|
||||
|
||||
**The Solution**: DeltaGlider, an intelligent delta compression system that reduced storage from 4,060GB to just 4.9GB.
|
||||
#### The Problem
|
||||
|
||||
**The Impact**:
|
||||
- 💰 **$1,119 annual savings** on storage costs
|
||||
- 📉 **99.9% reduction** in storage usage
|
||||
- ⚡ **Zero changes** to existing workflows
|
||||
- ✅ **Full data integrity** maintained
|
||||
ReadonlyREST supports ~150 Elasticsearch/Kibana versions × multiple product lines × all our own releases.
|
||||
After years of publishing builds, our S3 archive hit `4TB` (201,840 files, $93/month).
|
||||
Glacier helped, but restoring files took 48 hours — useless for CI/CD.
|
||||
|
||||
---
|
||||
Every plugin ZIP was ~82MB, but `99.7% identical` to the next one. We were paying to store duplicates.
|
||||
|
||||
## The Storage Crisis
|
||||
#### The Fix: DeltaGlider
|
||||
|
||||
### The Numbers That Kept Us Up at Night
|
||||
DeltaGlider stores binary deltas between similar files instead of full copies.
|
||||
|
||||
ReadOnlyREST maintains a comprehensive release archive:
|
||||
- **145 version folders** (v1.50.0 through v1.66.1)
|
||||
- **201,840 total files** to manage
|
||||
- **3.96 TB** of S3 storage consumed
|
||||
- **$1,120/year** in storage costs alone
|
||||
|
||||
Each version folder contained:
|
||||
- 513 plugin ZIP files (one for each Elasticsearch version)
|
||||
- 879 checksum files (SHA1 and SHA512)
|
||||
- 3 product lines (Enterprise, Pro, Free)
|
||||
|
||||
### The Hidden Problem
|
||||
|
||||
What made this particularly painful wasn't just the size—it was the **redundancy**. Each 82.5MB plugin ZIP was 99.7% identical to others in the same version, differing only in minor Elasticsearch compatibility adjustments. We were essentially storing the same data hundreds of times.
|
||||
|
||||
> "We were paying to store 4TB of data that was fundamentally just variations of the same ~250MB of unique content. It felt like photocopying War and Peace 500 times because each copy had a different page number."
|
||||
>
|
||||
> — *DevOps Lead*
|
||||
|
||||
---
|
||||
|
||||
## Enter DeltaGlider
|
||||
|
||||
### The Lightbulb Moment
|
||||
|
||||
The breakthrough came when we realized we didn't need to store complete files—just the *differences* between them. DeltaGlider applies this principle automatically:
|
||||
|
||||
1. **First file becomes the reference** (stored in full)
|
||||
2. **Similar files store only deltas** (typically 0.3% of original size)
|
||||
3. **Different files uploaded directly** (no delta overhead)
|
||||
|
||||
### Implementation: Surprisingly Simple
|
||||
|
||||
```bash
|
||||
# Before DeltaGlider (standard S3 upload)
|
||||
aws s3 cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
# Size on S3: 82.5MB
|
||||
|
||||
# With DeltaGlider
|
||||
deltaglider cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
|
||||
# Size on S3: 65KB (99.92% smaller!)
|
||||
# Before
|
||||
```
|
||||
aws s3 cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/ # 82MB
|
||||
```
|
||||
|
||||
The beauty? **Zero changes to our build pipeline**. DeltaGlider works as a drop-in replacement for S3 uploads.
|
||||
|
||||
---
|
||||
|
||||
## The Results: Beyond Our Expectations
|
||||
|
||||
### Storage Transformation
|
||||
|
||||
# After
|
||||
```
|
||||
BEFORE DELTAGLIDER AFTER DELTAGLIDER
|
||||
━━━━━━━━━━━━━━━━━ ━━━━━━━━━━━━━━━━
|
||||
4,060 GB (3.96 TB) → 4.9 GB
|
||||
$93.38/month → $0.11/month
|
||||
201,840 files → 201,840 files (same!)
|
||||
deltaglider cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/ # 65KB
|
||||
```
|
||||
|
||||
### Real Performance Metrics
|
||||
Drop-in replacement for `aws s3 cp`. No pipeline changes.
|
||||
Data integrity checked with SHA256, stored as metadata in S3.
|
||||
|
||||
From our actual production deployment:
|
||||
|
||||
| Metric | Value | Impact |
|
||||
|--------|-------|--------|
|
||||
| **Compression Ratio** | 99.9% | Near-perfect deduplication |
|
||||
| **Delta Size** | ~65KB per 82.5MB file | 1/1,269th of original |
|
||||
| **Upload Speed** | 3-4 files/second | Faster than raw S3 uploads |
|
||||
| **Download Speed** | Transparent reconstruction | No user impact |
|
||||
| **Storage Savings** | 4,055 GB | Enough for 850,000 more files |
|
||||
### The Result
|
||||
|
||||
### Version-to-Version Comparison
|
||||
| Metric | Before | After | Δ |
|
||||
|-------------- |----------|----------|--------------|
|
||||
| Storage | 4.06TB | 4.9GB | -99.9% |
|
||||
| Cost | $93/mo | $0.11/mo | -$1,119/yr |
|
||||
| Files | 201,840 | 201,840 | identical |
|
||||
| Upload speed | 1x | 3–4x | faster |
|
||||
|
||||
Testing between similar versions showed incredible efficiency:
|
||||
Each “different” ZIP? Just a 65KB delta.
|
||||
Reconstruction time: <100ms.
|
||||
Zero user impact.
|
||||
|
||||
|
||||
## Under the Hood
|
||||
|
||||
Uses xdelta3 diffs.
|
||||
• Keeps one reference per group
|
||||
• Stores deltas for near-identical files
|
||||
• Skips small or text-based ones (.sha, .json, etc.)
|
||||
|
||||
It’s smart enough to decide what’s worth diffing automatically.
|
||||
|
||||
|
||||
## Payoff
|
||||
• 4TB → 5GB overnight
|
||||
• Uploads 1,200× faster
|
||||
• CI bandwidth cut 99%
|
||||
• 100% checksum verified integrity
|
||||
• Zero vendor lock-in (open source)
|
||||
|
||||
## Takeaways
|
||||
|
||||
If You Ship Versioned Artifacts
|
||||
|
||||
This will probably save you four figures and hours of upload time per year.
|
||||
|
||||
```
|
||||
readonlyrest-1.66.1_es7.17.0.zip (82.5MB) → reference.bin (82.5MB)
|
||||
readonlyrest-1.66.1_es7.17.1.zip (82.5MB) → 64KB delta (0.08% size)
|
||||
readonlyrest-1.66.1_es7.17.2.zip (82.5MB) → 65KB delta (0.08% size)
|
||||
...
|
||||
readonlyrest-1.66.1_es8.15.0.zip (82.5MB) → 71KB delta (0.09% size)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Technical Deep Dive
|
||||
|
||||
### How DeltaGlider Achieves 99.9% Compression
|
||||
|
||||
DeltaGlider uses binary diff algorithms (xdelta3) to identify and store only the bytes that change between files:
|
||||
|
||||
```python
|
||||
# Simplified concept
|
||||
reference = "readonlyrest-1.66.1_es7.17.0.zip" # 82.5MB
|
||||
new_file = "readonlyrest-1.66.1_es7.17.1.zip" # 82.5MB
|
||||
|
||||
delta = binary_diff(reference, new_file) # 65KB
|
||||
# Delta contains only:
|
||||
# - Elasticsearch version string changes
|
||||
# - Compatibility metadata updates
|
||||
# - Build timestamp differences
|
||||
```
|
||||
|
||||
### Intelligent File Type Detection
|
||||
|
||||
Not every file benefits from delta compression. DeltaGlider automatically:
|
||||
|
||||
- **Applies delta compression to**: `.zip`, `.tar`, `.gz`, `.dmg`, `.jar`, `.war`
|
||||
- **Uploads directly**: `.txt`, `.sha1`, `.sha512`, `.json`, `.md`
|
||||
|
||||
This intelligence meant our 127,455 checksum files were uploaded directly, avoiding unnecessary processing overhead.
|
||||
|
||||
### Architecture That Scales
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
||||
│ Client │────▶│ DeltaGlider │────▶│ S3/MinIO │
|
||||
│ (CI/CD) │ │ │ │ │
|
||||
└─────────────┘ └──────────────┘ └─────────────┘
|
||||
│
|
||||
┌──────▼───────┐
|
||||
│ Local Cache │
|
||||
│ (References) │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Business Impact
|
||||
|
||||
### Immediate ROI
|
||||
|
||||
- **Day 1**: 99.9% storage reduction
|
||||
- **Month 1**: $93 saved
|
||||
- **Year 1**: $1,119 saved
|
||||
- **5 Years**: $5,595 saved (not counting growth)
|
||||
|
||||
### Hidden Benefits We Didn't Expect
|
||||
|
||||
1. **Faster Deployments**: Uploading 65KB deltas is 1,200x faster than 82.5MB files
|
||||
2. **Reduced Bandwidth**: CI/CD pipeline bandwidth usage dropped 99%
|
||||
3. **Improved Reliability**: Fewer timeout errors on large file uploads
|
||||
4. **Better Compliance**: Automatic SHA256 integrity verification on every operation
|
||||
|
||||
### Environmental Impact
|
||||
|
||||
> "Reducing storage by 4TB means fewer drives spinning in data centers. It's a small contribution to our sustainability goals, but every bit counts."
|
||||
>
|
||||
> — *CTO*
|
||||
|
||||
---
|
||||
|
||||
## Implementation Journey
|
||||
|
||||
### Week 1: Proof of Concept
|
||||
- Tested with 10 files
|
||||
- Achieved 99.6% compression
|
||||
- Decision to proceed
|
||||
|
||||
### Week 2: Production Rollout
|
||||
- Uploaded all 201,840 files
|
||||
- Zero errors or failures
|
||||
- Immediate cost reduction
|
||||
|
||||
### Week 3: Integration
|
||||
```bash
|
||||
# Simple integration into our CI/CD
|
||||
- aws s3 cp $FILE s3://releases/
|
||||
+ deltaglider cp $FILE s3://releases/
|
||||
```
|
||||
|
||||
### Week 4: Full Migration
|
||||
- All build pipelines updated
|
||||
- Developer documentation completed
|
||||
- Monitoring dashboards configured
|
||||
|
||||
---
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
### What Worked Well
|
||||
|
||||
1. **Drop-in replacement**: No architectural changes needed
|
||||
2. **Automatic intelligence**: File type detection "just worked"
|
||||
3. **Preservation of structure**: Directory hierarchy maintained perfectly
|
||||
|
||||
### Challenges Overcome
|
||||
|
||||
1. **Initial skepticism**: "99.9% compression sounds too good to be true"
|
||||
- *Solution*: Live demonstration with real data
|
||||
|
||||
2. **Download concerns**: "Will it be slow to reconstruct files?"
|
||||
- *Solution*: Benchmarking showed <100ms reconstruction time
|
||||
|
||||
3. **Reliability questions**: "What if the reference file is corrupted?"
|
||||
- *Solution*: SHA256 verification on every operation
|
||||
|
||||
---
|
||||
|
||||
## For Decision Makers
|
||||
|
||||
### Why This Matters
|
||||
|
||||
Storage costs scale linearly with data growth. Without DeltaGlider:
|
||||
- Next 145 versions: Additional $1,120/year
|
||||
- 5-year projection: $11,200 in storage alone
|
||||
- Opportunity cost: Resources that could fund innovation
|
||||
|
||||
### Risk Assessment
|
||||
|
||||
| Risk | Mitigation | Status |
|
||||
|------|------------|--------|
|
||||
| Vendor lock-in | Open-source, standards-based | ✅ Mitigated |
|
||||
| Data corruption | SHA256 verification built-in | ✅ Mitigated |
|
||||
| Performance impact | Faster than original | ✅ No risk |
|
||||
| Complexity | Drop-in replacement | ✅ No risk |
|
||||
|
||||
### Strategic Advantages
|
||||
|
||||
1. **Cost Predictability**: Storage costs become negligible
|
||||
2. **Scalability**: Can handle 100x more versions in same space
|
||||
3. **Competitive Edge**: More resources for product development
|
||||
4. **Green IT**: Reduced carbon footprint from storage
|
||||
|
||||
---
|
||||
|
||||
## For Engineers
|
||||
|
||||
### Getting Started
|
||||
|
||||
```bash
|
||||
# Install DeltaGlider
|
||||
pip install deltaglider
|
||||
|
||||
# Upload a file (automatic compression)
|
||||
deltaglider cp my-release-v1.0.0.zip s3://releases/
|
||||
|
||||
# Download (automatic reconstruction)
|
||||
deltaglider cp s3://releases/my-release-v1.0.0.zip .
|
||||
|
||||
# It's that simple.
|
||||
deltaglider cp my-release.zip s3://releases/
|
||||
```
|
||||
|
||||
### Performance Characteristics
|
||||
|
||||
```python
|
||||
# Compression ratios by similarity
|
||||
identical_files: 99.9% # Same file, different name
|
||||
minor_changes: 99.7% # Version bumps, timestamps
|
||||
moderate_changes: 95.0% # Feature additions
|
||||
major_changes: 70.0% # Significant refactoring
|
||||
completely_different: 0% # No compression (uploaded as-is)
|
||||
```
|
||||
|
||||
### Integration Examples
|
||||
|
||||
**GitHub Actions**:
|
||||
```yaml
|
||||
- name: Upload Release
|
||||
run: deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
```
|
||||
|
||||
**Jenkins Pipeline**:
|
||||
```groovy
|
||||
sh "deltaglider cp ${WORKSPACE}/target/*.jar s3://artifacts/"
|
||||
```
|
||||
|
||||
**Python Script**:
|
||||
```python
|
||||
from deltaglider import create_client
client = create_client()
client.upload("my-app-v2.0.0.zip", "s3://releases/v2.0.0/")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## The Bottom Line
|
||||
|
||||
DeltaGlider transformed our storage crisis into a solved problem:
|
||||
|
||||
- ✅ **4TB → 5GB** storage reduction
|
||||
- ✅ **$1,119/year** saved
|
||||
- ✅ **Zero** workflow disruption
|
||||
- ✅ **100%** data integrity maintained
|
||||
|
||||
For ReadOnlyREST, DeltaGlider wasn't just a cost-saving tool—it was a glimpse into the future of intelligent storage. When 99.9% of your data is redundant, why pay to store it 500 times?
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
### For Your Organization
|
||||
|
||||
1. **Identify similar use cases**: Version releases, backups, build artifacts
|
||||
2. **Run the calculator**: `[Your files] × [Versions] × [Similarity] = Savings`
|
||||
3. **Start small**: Test with one project's releases
|
||||
4. **Scale confidently**: Deploy across all similar data
|
||||
|
||||
### Get Started Today
|
||||
|
||||
```bash
|
||||
# See your potential savings
|
||||
git clone https://github.com/beshu-tech/deltaglider
|
||||
cd deltaglider
|
||||
python calculate_savings.py --path /your/releases
|
||||
|
||||
# Try it yourself
|
||||
docker run -p 9000:9000 minio/minio server /data  # Local S3
|
||||
pip install deltaglider
|
||||
deltaglider cp your-file.zip s3://test/
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## About ReadOnlyREST
|
||||
|
||||
ReadOnlyREST is the enterprise security plugin for Elasticsearch and OpenSearch, protecting clusters in production since 2015. Learn more at [readonlyrest.com](https://readonlyrest.com)
|
||||
|
||||
## About DeltaGlider
|
||||
|
||||
DeltaGlider is an open-source delta compression system for S3-compatible storage, turning redundant data into remarkable savings. Built with modern Python, containerized for portability, and designed for scale.
|
||||
|
||||
---
|
||||
|
||||
*"In a world where storage is cheap but not free, and data grows exponentially but changes incrementally, DeltaGlider represents a fundamental shift in how we think about storing versioned artifacts."*
|
||||
|
||||
**— ReadOnlyREST Engineering Team**
|
||||
That’s it.
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 267 KiB After Width: | Height: | Size: 4.0 MiB |
@@ -1,6 +1,6 @@
|
||||
# DeltaGlider Python SDK Documentation
|
||||
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for versioned artifacts through intelligent binary delta compression.
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for very similar versioned artifacts through intelligent binary delta compression.
|
||||
|
||||
## 🎯 Key Highlights
|
||||
|
||||
@@ -206,10 +206,17 @@ from deltaglider import create_client
|
||||
client = create_client(
|
||||
endpoint_url="http://minio.internal:9000", # Custom S3 endpoint
|
||||
log_level="DEBUG", # Detailed logging
|
||||
cache_dir="/var/cache/deltaglider", # Custom cache location
|
||||
aws_access_key_id="minio",
|
||||
aws_secret_access_key="minio",
|
||||
region_name="eu-west-1",
|
||||
max_ratio=0.3, # Stricter delta acceptance
|
||||
)
|
||||
```
|
||||
|
||||
> ℹ️ The SDK now manages an encrypted, process-isolated cache automatically in `/tmp/deltaglider-*`.
|
||||
> Tune cache behavior via environment variables such as `DG_CACHE_BACKEND`,
|
||||
> `DG_CACHE_MEMORY_SIZE_MB`, and `DG_CACHE_ENCRYPTION_KEY` instead of passing a `cache_dir` argument.
|
||||
|
||||
## Real-World Example
|
||||
|
||||
```python
|
||||
@@ -299,4 +306,4 @@ url = client.generate_presigned_url(
|
||||
|
||||
## License
|
||||
|
||||
MIT License - See [LICENSE](https://github.com/beshu-tech/deltaglider/blob/main/LICENSE) for details.
|
||||
MIT License - See [LICENSE](https://github.com/beshu-tech/deltaglider/blob/main/LICENSE) for details.
|
||||
|
||||
182
docs/sdk/api.md
182
docs/sdk/api.md
@@ -156,29 +156,34 @@ for obj in response['Contents']:
|
||||
|
||||
#### `get_bucket_stats`
|
||||
|
||||
Get statistics for a bucket with optional detailed compression metrics. Results are cached per client session for performance.
|
||||
Get statistics for a bucket with optional detailed compression metrics. Results are cached inside the bucket for performance.
|
||||
|
||||
```python
|
||||
def get_bucket_stats(
|
||||
self,
|
||||
bucket: str,
|
||||
detailed_stats: bool = False
|
||||
mode: Literal["quick", "sampled", "detailed"] = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **bucket** (`str`): S3 bucket name.
|
||||
- **detailed_stats** (`bool`): If True, fetch accurate compression ratios for delta files. Default: False.
|
||||
- With `detailed_stats=False`: ~50ms for any bucket size (LIST calls only)
|
||||
- With `detailed_stats=True`: ~2-3s per 1000 objects (adds HEAD calls for delta files)
|
||||
- **mode** (`Literal[...]`): Accuracy/cost trade-off:
|
||||
- `"quick"` (default): LIST-only scan; compression ratios for deltas are estimated.
|
||||
- `"sampled"`: HEAD one delta per deltaspace and reuse the ratio.
|
||||
- `"detailed"`: HEAD every delta object; slowest but exact.
|
||||
- **use_cache** (`bool`): If True, read/write `.deltaglider/stats_{mode}.json` in the bucket for reuse.
|
||||
- **refresh_cache** (`bool`): Force recomputation even if a cache file is valid.
|
||||
|
||||
##### Caching Behavior
|
||||
|
||||
- **Session-scoped cache**: Results cached within client instance lifetime
|
||||
- **Automatic invalidation**: Cache cleared on bucket mutations (put, delete, bucket operations)
|
||||
- **Intelligent reuse**: Detailed stats can serve quick stat requests
|
||||
- **Manual cache control**: Use `clear_cache()` to invalidate all cached stats
|
||||
- Stats are cached per mode directly inside the bucket at `.deltaglider/stats_{mode}.json`.
|
||||
- Every call validates cache freshness via a quick LIST (object count + compressed size).
|
||||
- `refresh_cache=True` skips cache validation and recomputes immediately.
|
||||
- `use_cache=False` bypasses both reading and writing cache artifacts.
|
||||
|
||||
##### Returns
|
||||
|
||||
@@ -195,24 +200,20 @@ def get_bucket_stats(
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Quick stats for dashboard display (cached after first call)
|
||||
# Quick stats (fast LIST-only)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
|
||||
|
||||
# Second call hits cache (instant response)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Space saved: {stats.space_saved} bytes")
|
||||
# Sampled/detailed modes for analytics
|
||||
sampled = client.get_bucket_stats('releases', mode='sampled')
|
||||
detailed = client.get_bucket_stats('releases', mode='detailed')
|
||||
print(f"Compression ratio: {detailed.average_compression_ratio:.1%}")
|
||||
|
||||
# Detailed stats for analytics (slower but accurate, also cached)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
# Force refresh if an external tool modified the bucket
|
||||
fresh = client.get_bucket_stats('releases', mode='quick', refresh_cache=True)
|
||||
|
||||
# Quick call after detailed call reuses detailed cache (more accurate)
|
||||
quick_stats = client.get_bucket_stats('releases') # Uses detailed cache
|
||||
|
||||
# Clear cache to force refresh
|
||||
client.clear_cache()
|
||||
stats = client.get_bucket_stats('releases') # Fresh computation
|
||||
# Skip cache entirely when running ad-hoc diagnostics
|
||||
uncached = client.get_bucket_stats('releases', use_cache=False)
|
||||
```
|
||||
|
||||
#### `put_object`
|
||||
@@ -334,7 +335,7 @@ client.delete_bucket(Bucket='old-releases')
|
||||
|
||||
#### `list_buckets`
|
||||
|
||||
List all S3 buckets (boto3-compatible). Includes cached statistics when available.
|
||||
List all S3 buckets (boto3-compatible).
|
||||
|
||||
```python
|
||||
def list_buckets(
|
||||
@@ -345,51 +346,18 @@ def list_buckets(
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with list of buckets and owner information (identical to boto3). Each bucket may include optional `DeltaGliderStats` metadata if statistics have been previously cached.
|
||||
|
||||
##### Response Structure
|
||||
|
||||
```python
|
||||
{
|
||||
'Buckets': [
|
||||
{
|
||||
'Name': 'bucket-name',
|
||||
'CreationDate': datetime(2025, 1, 1),
|
||||
'DeltaGliderStats': { # Optional, only if cached
|
||||
'Cached': True,
|
||||
'Detailed': bool, # Whether detailed stats were fetched
|
||||
'ObjectCount': int,
|
||||
'TotalSize': int,
|
||||
'CompressedSize': int,
|
||||
'SpaceSaved': int,
|
||||
'AverageCompressionRatio': float,
|
||||
'DeltaObjects': int,
|
||||
'DirectObjects': int
|
||||
}
|
||||
}
|
||||
],
|
||||
'Owner': {...}
|
||||
}
|
||||
```
|
||||
Dict with the same structure boto3 returns (`Buckets`, `Owner`, `ResponseMetadata`). DeltaGlider does not inject additional metadata; use `get_bucket_stats()` for compression data.
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# List all buckets
|
||||
response = client.list_buckets()
|
||||
for bucket in response['Buckets']:
|
||||
print(f"{bucket['Name']} - Created: {bucket['CreationDate']}")
|
||||
|
||||
# Check if stats are cached
|
||||
if 'DeltaGliderStats' in bucket:
|
||||
stats = bucket['DeltaGliderStats']
|
||||
print(f" Cached stats: {stats['ObjectCount']} objects, "
|
||||
f"{stats['AverageCompressionRatio']:.1%} compression")
|
||||
|
||||
# Fetch stats first, then list buckets to see cached data
|
||||
client.get_bucket_stats('my-bucket', detailed_stats=True)
|
||||
response = client.list_buckets()
|
||||
# Now 'my-bucket' will include DeltaGliderStats in response
|
||||
# Combine with get_bucket_stats for deeper insights
|
||||
stats = client.get_bucket_stats('releases', mode='detailed')
|
||||
print(f"releases -> {stats.object_count} objects, {stats.space_saved/(1024**3):.2f} GB saved")
|
||||
```
|
||||
|
||||
### Simple API Methods
|
||||
@@ -528,13 +496,9 @@ else:
|
||||
|
||||
### Cache Management Methods
|
||||
|
||||
DeltaGlider maintains two types of caches for performance optimization:
|
||||
1. **Reference cache**: Binary reference files used for delta reconstruction
|
||||
2. **Statistics cache**: Bucket statistics (session-scoped)
|
||||
|
||||
#### `clear_cache`
|
||||
|
||||
Clear all cached data including reference files and bucket statistics.
|
||||
Clear all locally cached reference files.
|
||||
|
||||
```python
|
||||
def clear_cache(self) -> None
|
||||
@@ -542,23 +506,20 @@ def clear_cache(self) -> None
|
||||
|
||||
##### Description
|
||||
|
||||
Removes all cached reference files from the local filesystem and invalidates all bucket statistics. Useful for:
|
||||
- Forcing fresh statistics computation
|
||||
Removes all cached reference files from the local filesystem. Useful for:
|
||||
- Freeing disk space in long-running applications
|
||||
- Ensuring latest data after external bucket modifications
|
||||
- Ensuring the next upload/download fetches fresh references from S3
|
||||
- Resetting cache after configuration or credential changes
|
||||
- Testing and development workflows
|
||||
|
||||
##### Cache Types Cleared
|
||||
##### Cache Scope
|
||||
|
||||
1. **Reference Cache**: Binary reference files stored in `/tmp/deltaglider-*/`
|
||||
- Encrypted at rest with ephemeral keys
|
||||
- Content-addressed storage (SHA256-based filenames)
|
||||
- Automatically cleaned up on process exit
|
||||
|
||||
2. **Statistics Cache**: Bucket statistics cached per client session
|
||||
- Metadata about compression ratios and object counts
|
||||
- Session-scoped (not persisted to disk)
|
||||
- Automatically invalidated on bucket mutations
|
||||
- **Reference Cache**: Binary reference files stored in `/tmp/deltaglider-*/`
|
||||
- Encrypted at rest with ephemeral keys
|
||||
- Content-addressed storage (SHA256-based filenames)
|
||||
- Automatically cleaned up on process exit
|
||||
- **Statistics Cache**: Stored inside the bucket as `.deltaglider/stats_{mode}.json`.
|
||||
- `clear_cache()` does *not* remove these S3 objects; use `refresh_cache=True` or delete the objects manually if needed.
|
||||
|
||||
##### Examples
|
||||
|
||||
@@ -574,71 +535,14 @@ for i in range(1000):
|
||||
if i % 100 == 0:
|
||||
client.clear_cache()
|
||||
|
||||
# Force fresh statistics after external changes
|
||||
stats_before = client.get_bucket_stats('releases') # Cached
|
||||
# ... external tool modifies bucket ...
|
||||
client.clear_cache()
|
||||
stats_after = client.get_bucket_stats('releases') # Fresh data
|
||||
# Force fresh statistics after external changes (skip cache instead of clearing)
|
||||
stats_before = client.get_bucket_stats('releases')
|
||||
stats_after = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Development workflow
|
||||
client.clear_cache() # Start with clean state
|
||||
```
|
||||
|
||||
#### `evict_cache`
|
||||
|
||||
Remove a specific cached reference file from the local cache.
|
||||
|
||||
```python
|
||||
def evict_cache(self, s3_url: str) -> None
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **s3_url** (`str`): S3 URL of the reference file to evict (e.g., `s3://bucket/prefix/reference.bin`)
|
||||
|
||||
##### Description
|
||||
|
||||
Removes a specific reference file from the cache without affecting other cached files or statistics. Useful for:
|
||||
- Selective cache invalidation when specific references are updated
|
||||
- Memory management in applications with many delta spaces
|
||||
- Testing specific delta compression scenarios
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Evict specific reference after update
|
||||
client.upload("new-reference.zip", "s3://releases/v2.0.0/")
|
||||
client.evict_cache("s3://releases/v2.0.0/reference.bin")
|
||||
|
||||
# Next upload will fetch fresh reference
|
||||
client.upload("similar-file.zip", "s3://releases/v2.0.0/")
|
||||
|
||||
# Selective eviction for specific delta spaces
|
||||
delta_spaces = ["v1.0.0", "v1.1.0", "v1.2.0"]
|
||||
for space in delta_spaces:
|
||||
client.evict_cache(f"s3://releases/{space}/reference.bin")
|
||||
```
|
||||
|
||||
##### See Also
|
||||
|
||||
- [docs/CACHE_MANAGEMENT.md](../../CACHE_MANAGEMENT.md): Complete cache management guide
|
||||
- `clear_cache()`: Clear all caches
|
||||
|
||||
#### `lifecycle_policy`
|
||||
|
||||
Set lifecycle policy for S3 prefix (placeholder for future implementation).
|
||||
|
||||
```python
|
||||
def lifecycle_policy(
|
||||
self,
|
||||
s3_prefix: str,
|
||||
days_before_archive: int = 30,
|
||||
days_before_delete: int = 90
|
||||
) -> None
|
||||
```
|
||||
|
||||
**Note**: This method is a placeholder for future S3 lifecycle policy management.
|
||||
|
||||
## UploadSummary
|
||||
|
||||
Data class containing upload operation results.
|
||||
@@ -995,4 +899,4 @@ client = create_client(log_level="DEBUG")
|
||||
|
||||
- **GitHub Issues**: [github.com/beshu-tech/deltaglider/issues](https://github.com/beshu-tech/deltaglider/issues)
|
||||
- **Documentation**: [github.com/beshu-tech/deltaglider](https://github.com/beshu-tech/deltaglider)
|
||||
- **PyPI Package**: [pypi.org/project/deltaglider](https://pypi.org/project/deltaglider)
|
||||
- **PyPI Package**: [pypi.org/project/deltaglider](https://pypi.org/project/deltaglider)
|
||||
|
||||
@@ -25,6 +25,7 @@ DeltaGlider's smart `list_objects` method eliminates the N+1 query problem by in
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from deltaglider.client_models import BucketStats
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
@@ -41,19 +42,19 @@ def fast_bucket_listing(bucket: str):
|
||||
|
||||
# Process objects for display
|
||||
items = []
|
||||
for obj in response.contents:
|
||||
for obj in response['Contents']:
|
||||
metadata = obj.get("Metadata", {})
|
||||
items.append({
|
||||
"key": obj.key,
|
||||
"size": obj.size,
|
||||
"last_modified": obj.last_modified,
|
||||
"is_delta": obj.is_delta, # Determined from filename
|
||||
# No compression_ratio - would require HEAD request
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"last_modified": obj["LastModified"],
|
||||
"is_delta": metadata.get("deltaglider-is-delta") == "true",
|
||||
})
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"Listed {len(items)} objects in {elapsed*1000:.0f}ms")
|
||||
|
||||
return items, response.next_continuation_token
|
||||
return items, response.get("NextContinuationToken")
|
||||
|
||||
# Example: List first page
|
||||
items, next_token = fast_bucket_listing('releases')
|
||||
@@ -75,12 +76,12 @@ def paginated_listing(bucket: str, page_size: int = 50):
|
||||
FetchMetadata=False # Keep it fast
|
||||
)
|
||||
|
||||
all_objects.extend(response.contents)
|
||||
all_objects.extend(response["Contents"])
|
||||
|
||||
if not response.is_truncated:
|
||||
if not response.get("IsTruncated"):
|
||||
break
|
||||
|
||||
continuation_token = response.next_continuation_token
|
||||
continuation_token = response.get("NextContinuationToken")
|
||||
print(f"Fetched {len(all_objects)} objects so far...")
|
||||
|
||||
return all_objects
|
||||
@@ -96,8 +97,8 @@ print(f"Total objects: {len(all_objects)}")
|
||||
def dashboard_with_stats(bucket: str):
|
||||
"""Dashboard view with optional detailed stats."""
|
||||
|
||||
# Quick overview (fast - no metadata)
|
||||
stats = client.get_bucket_stats(bucket, detailed_stats=False)
|
||||
# Quick overview (fast LIST-only)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
print(f"Quick Stats for {bucket}:")
|
||||
print(f" Total Objects: {stats.object_count}")
|
||||
@@ -108,7 +109,7 @@ def dashboard_with_stats(bucket: str):
|
||||
|
||||
# Detailed compression analysis (slower - fetches metadata for deltas only)
|
||||
if stats.delta_objects > 0:
|
||||
detailed_stats = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
detailed_stats = client.get_bucket_stats(bucket, mode='detailed')
|
||||
print(f"\nDetailed Compression Stats:")
|
||||
print(f" Average Compression: {detailed_stats.average_compression_ratio:.1%}")
|
||||
print(f" Space Saved: {detailed_stats.space_saved / (1024**3):.2f} GB")
|
||||
@@ -131,11 +132,25 @@ def compression_analysis(bucket: str, prefix: str = ""):
|
||||
)
|
||||
|
||||
# Analyze compression effectiveness
|
||||
delta_files = [obj for obj in response.contents if obj.is_delta]
|
||||
delta_files: list[dict[str, float | int | str]] = []
|
||||
for obj in response["Contents"]:
|
||||
metadata = obj.get("Metadata", {})
|
||||
if metadata.get("deltaglider-is-delta") != "true":
|
||||
continue
|
||||
original_size = int(metadata.get("deltaglider-original-size", obj["Size"]))
|
||||
compression_ratio = float(metadata.get("deltaglider-compression-ratio", 0.0))
|
||||
delta_files.append(
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"original": original_size,
|
||||
"compressed": obj["Size"],
|
||||
"ratio": compression_ratio,
|
||||
}
|
||||
)
|
||||
|
||||
if delta_files:
|
||||
total_original = sum(obj.original_size for obj in delta_files)
|
||||
total_compressed = sum(obj.compressed_size for obj in delta_files)
|
||||
total_original = sum(obj["original"] for obj in delta_files)
|
||||
total_compressed = sum(obj["compressed"] for obj in delta_files)
|
||||
avg_ratio = (total_original - total_compressed) / total_original
|
||||
|
||||
print(f"Compression Analysis for {prefix or 'all files'}:")
|
||||
@@ -145,11 +160,11 @@ def compression_analysis(bucket: str, prefix: str = ""):
|
||||
print(f" Average Compression: {avg_ratio:.1%}")
|
||||
|
||||
# Find best and worst compression
|
||||
best = max(delta_files, key=lambda x: x.compression_ratio or 0)
|
||||
worst = min(delta_files, key=lambda x: x.compression_ratio or 1)
|
||||
best = max(delta_files, key=lambda x: x["ratio"])
|
||||
worst = min(delta_files, key=lambda x: x["ratio"])
|
||||
|
||||
print(f" Best Compression: {best.key} ({best.compression_ratio:.1%})")
|
||||
print(f" Worst Compression: {worst.key} ({worst.compression_ratio:.1%})")
|
||||
print(f" Best Compression: {best['key']} ({best['ratio']:.1%})")
|
||||
print(f" Worst Compression: {worst['key']} ({worst['ratio']:.1%})")
|
||||
|
||||
# Example: Analyze v2.0 releases
|
||||
compression_analysis('releases', 'v2.0/')
|
||||
@@ -180,7 +195,11 @@ def performance_comparison(bucket: str):
|
||||
)
|
||||
time_detailed = (time.time() - start) * 1000
|
||||
|
||||
delta_count = sum(1 for obj in response_fast.contents if obj.is_delta)
|
||||
delta_count = sum(
|
||||
1
|
||||
for obj in response_fast["Contents"]
|
||||
if obj.get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
)
|
||||
|
||||
print(f"Performance Comparison for {bucket}:")
|
||||
print(f" Fast Listing: {time_fast:.0f}ms (1 API call)")
|
||||
@@ -203,7 +222,7 @@ performance_comparison('releases')
|
||||
|
||||
## Bucket Statistics and Monitoring
|
||||
|
||||
DeltaGlider provides powerful bucket statistics with session-level caching for performance.
|
||||
DeltaGlider provides powerful bucket statistics with S3-backed caching for performance.
|
||||
|
||||
### Quick Dashboard Stats (Cached)
|
||||
|
||||
@@ -244,7 +263,7 @@ def detailed_compression_report(bucket: str):
|
||||
"""Generate detailed compression report with accurate ratios."""
|
||||
|
||||
# Detailed stats fetch metadata for delta files (slower, accurate)
|
||||
stats = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
stats = client.get_bucket_stats(bucket, mode='detailed')
|
||||
|
||||
efficiency = (stats.space_saved / stats.total_size * 100) if stats.total_size > 0 else 0
|
||||
|
||||
@@ -281,15 +300,18 @@ detailed_compression_report('releases')
|
||||
|
||||
```python
|
||||
def list_buckets_with_stats():
|
||||
"""List all buckets and show cached statistics if available."""
|
||||
"""List buckets and augment with cached stats fetched on demand."""
|
||||
|
||||
# Pre-fetch stats for important buckets
|
||||
important_buckets = ['releases', 'backups']
|
||||
for bucket_name in important_buckets:
|
||||
client.get_bucket_stats(bucket_name, detailed_stats=True)
|
||||
|
||||
# List all buckets (includes cached stats automatically)
|
||||
response = client.list_buckets()
|
||||
stats_cache: dict[str, BucketStats | None] = {}
|
||||
|
||||
def ensure_stats(bucket_name: str) -> BucketStats | None:
|
||||
if bucket_name not in stats_cache:
|
||||
try:
|
||||
stats_cache[bucket_name] = client.get_bucket_stats(bucket_name)
|
||||
except Exception:
|
||||
stats_cache[bucket_name] = None
|
||||
return stats_cache[bucket_name]
|
||||
|
||||
print("All Buckets:")
|
||||
print(f"{'Name':<30} {'Objects':<10} {'Compression':<15} {'Cached'}")
|
||||
@@ -297,13 +319,12 @@ def list_buckets_with_stats():
|
||||
|
||||
for bucket in response['Buckets']:
|
||||
name = bucket['Name']
|
||||
stats = ensure_stats(name)
|
||||
|
||||
# Check if stats are cached
|
||||
if 'DeltaGliderStats' in bucket:
|
||||
stats = bucket['DeltaGliderStats']
|
||||
obj_count = f"{stats['ObjectCount']:,}"
|
||||
compression = f"{stats['AverageCompressionRatio']:.1%}"
|
||||
cached = "✓ (detailed)" if stats['Detailed'] else "✓ (quick)"
|
||||
if stats:
|
||||
obj_count = f"{stats.object_count:,}"
|
||||
compression = f"{stats.average_compression_ratio:.1%}"
|
||||
cached = "✓ (S3 cache)"
|
||||
else:
|
||||
obj_count = "N/A"
|
||||
compression = "N/A"
|
||||
@@ -357,7 +378,7 @@ except KeyboardInterrupt:
|
||||
|
||||
## Session-Level Cache Management
|
||||
|
||||
DeltaGlider maintains session-level caches for optimal performance in long-running applications.
|
||||
DeltaGlider maintains an encrypted reference cache for optimal performance in long-running applications.
|
||||
|
||||
### Long-Running Application Pattern
|
||||
|
||||
@@ -410,11 +431,8 @@ def handle_external_bucket_changes(bucket: str):
|
||||
print("External backup tool running...")
|
||||
run_external_backup_tool(bucket) # Your external tool
|
||||
|
||||
# Clear cache to get fresh data
|
||||
client.clear_cache()
|
||||
|
||||
# Get updated stats
|
||||
stats_after = client.get_bucket_stats(bucket)
|
||||
# Force a recompute of the cached stats
|
||||
stats_after = client.get_bucket_stats(bucket, refresh_cache=True)
|
||||
print(f"After: {stats_after.object_count} objects")
|
||||
print(f"Added: {stats_after.object_count - stats_before.object_count} objects")
|
||||
|
||||
@@ -422,35 +440,6 @@ def handle_external_bucket_changes(bucket: str):
|
||||
handle_external_bucket_changes('backups')
|
||||
```
|
||||
|
||||
### Selective Cache Eviction
|
||||
|
||||
```python
|
||||
def selective_cache_management():
|
||||
"""Manage cache for specific delta spaces."""
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Upload to multiple delta spaces
|
||||
versions = ['v1.0.0', 'v1.1.0', 'v1.2.0']
|
||||
|
||||
for version in versions:
|
||||
client.upload(f"app-{version}.zip", f"s3://releases/{version}/")
|
||||
|
||||
# Update reference for specific version
|
||||
print("Updating v1.1.0 reference...")
|
||||
client.upload("new-reference.zip", "s3://releases/v1.1.0/")
|
||||
|
||||
# Evict only v1.1.0 cache (others remain cached)
|
||||
client.evict_cache("s3://releases/v1.1.0/reference.bin")
|
||||
|
||||
# Next upload to v1.1.0 fetches fresh reference
|
||||
# v1.0.0 and v1.2.0 still use cached references
|
||||
client.upload("similar-file.zip", "s3://releases/v1.1.0/")
|
||||
|
||||
# Example: Selective eviction
|
||||
selective_cache_management()
|
||||
```
|
||||
|
||||
### Testing with Clean Cache
|
||||
|
||||
```python
|
||||
@@ -491,19 +480,18 @@ def measure_cache_performance(bucket: str):
|
||||
client = create_client()
|
||||
|
||||
# Test 1: Cold cache
|
||||
client.clear_cache()
|
||||
start = time.time()
|
||||
stats1 = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
stats1 = client.get_bucket_stats(bucket, mode='detailed', refresh_cache=True)
|
||||
cold_time = (time.time() - start) * 1000
|
||||
|
||||
# Test 2: Warm cache
|
||||
start = time.time()
|
||||
stats2 = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
stats2 = client.get_bucket_stats(bucket, mode='detailed')
|
||||
warm_time = (time.time() - start) * 1000
|
||||
|
||||
# Test 3: Quick stats from detailed cache
|
||||
start = time.time()
|
||||
stats3 = client.get_bucket_stats(bucket, detailed_stats=False)
|
||||
stats3 = client.get_bucket_stats(bucket, mode='quick')
|
||||
reuse_time = (time.time() - start) * 1000
|
||||
|
||||
print(f"Cache Performance for {bucket}:")
|
||||
@@ -1707,4 +1695,4 @@ files_to_upload = [
|
||||
results = uploader.upload_batch(files_to_upload)
|
||||
```
|
||||
|
||||
These examples demonstrate real-world usage patterns for DeltaGlider across various domains. Each example includes error handling, monitoring, and best practices for production deployments.
|
||||
These examples demonstrate real-world usage patterns for DeltaGlider across various domains. Each example includes error handling, monitoring, and best practices for production deployments.
|
||||
|
||||
@@ -49,11 +49,11 @@ classifiers = [
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
"boto3>=1.35.0",
|
||||
"click>=8.1.0",
|
||||
"cryptography>=42.0.0",
|
||||
"python-dateutil>=2.9.0",
|
||||
"requests>=2.32.0",
|
||||
"boto3>=1.35.0,<2.0.0",
|
||||
"click>=8.1.0,<9.0.0",
|
||||
"cryptography>=42.0.0,<45.0.0",
|
||||
"python-dateutil>=2.9.0,<3.0.0",
|
||||
"requests>=2.32.0,<3.0.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
||||
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from ..ports.storage import ObjectHead, PutResult, StoragePort
|
||||
@@ -42,6 +43,13 @@ class S3StorageAdapter(StoragePort):
|
||||
client_params: dict[str, Any] = {
|
||||
"service_name": "s3",
|
||||
"endpoint_url": endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
# Disable automatic request checksums (CRC32/CRC64) added in
|
||||
# boto3 1.36+. S3-compatible stores like Hetzner Object Storage
|
||||
# reject the checksum headers with BadRequest.
|
||||
"config": Config(
|
||||
request_checksum_calculation="when_required",
|
||||
response_checksum_validation="when_required",
|
||||
),
|
||||
}
|
||||
|
||||
# Merge in any additional boto3 kwargs (credentials, region, etc.)
|
||||
@@ -225,47 +233,94 @@ class S3StorageAdapter(StoragePort):
|
||||
f"AWS S3 limit (2KB). Some metadata may be lost!"
|
||||
)
|
||||
|
||||
try:
|
||||
response = self.client.put_object(
|
||||
Bucket=bucket,
|
||||
Key=object_key,
|
||||
Body=body_data,
|
||||
ContentType=content_type,
|
||||
Metadata=clean_metadata,
|
||||
)
|
||||
import time
|
||||
|
||||
# VERIFICATION: Check if metadata was actually stored (especially for delta files)
|
||||
if object_key.endswith(".delta") and clean_metadata:
|
||||
try:
|
||||
# Verify metadata was stored by doing a HEAD immediately
|
||||
verify_response = self.client.head_object(Bucket=bucket, Key=object_key)
|
||||
stored_metadata = verify_response.get("Metadata", {})
|
||||
max_retries = 3
|
||||
last_error: ClientError | None = None
|
||||
|
||||
if not stored_metadata:
|
||||
logger.error(
|
||||
f"PUT {object_key}: CRITICAL - Metadata was sent but NOT STORED! "
|
||||
f"Sent {len(clean_metadata)} keys, received 0 keys back."
|
||||
)
|
||||
elif len(stored_metadata) < len(clean_metadata):
|
||||
missing_keys = set(clean_metadata.keys()) - set(stored_metadata.keys())
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Metadata partially stored. "
|
||||
f"Sent {len(clean_metadata)} keys, stored {len(stored_metadata)} keys. "
|
||||
f"Missing keys: {missing_keys}"
|
||||
)
|
||||
elif logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"PUT {object_key}: Metadata verified - all {len(clean_metadata)} keys stored"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"PUT {object_key}: Could not verify metadata: {e}")
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
response = self.client.put_object(
|
||||
Bucket=bucket,
|
||||
Key=object_key,
|
||||
Body=body_data,
|
||||
ContentType=content_type,
|
||||
Metadata=clean_metadata,
|
||||
)
|
||||
|
||||
return PutResult(
|
||||
etag=response["ETag"].strip('"'),
|
||||
version_id=response.get("VersionId"),
|
||||
)
|
||||
except ClientError as e:
|
||||
raise RuntimeError(f"Failed to put object: {e}") from e
|
||||
# VERIFICATION: Check if metadata was actually stored (especially for delta files)
|
||||
if object_key.endswith(".delta") and clean_metadata:
|
||||
try:
|
||||
# Verify metadata was stored by doing a HEAD immediately
|
||||
verify_response = self.client.head_object(Bucket=bucket, Key=object_key)
|
||||
stored_metadata = verify_response.get("Metadata", {})
|
||||
|
||||
if not stored_metadata:
|
||||
logger.error(
|
||||
f"PUT {object_key}: CRITICAL - Metadata was sent but NOT STORED! "
|
||||
f"Sent {len(clean_metadata)} keys, received 0 keys back."
|
||||
)
|
||||
elif len(stored_metadata) < len(clean_metadata):
|
||||
missing_keys = set(clean_metadata.keys()) - set(stored_metadata.keys())
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Metadata partially stored. "
|
||||
f"Sent {len(clean_metadata)} keys, stored {len(stored_metadata)} keys. "
|
||||
f"Missing keys: {missing_keys}"
|
||||
)
|
||||
elif logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"PUT {object_key}: Metadata verified - "
|
||||
f"all {len(clean_metadata)} keys stored"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"PUT {object_key}: Could not verify metadata: {e}")
|
||||
|
||||
return PutResult(
|
||||
etag=response["ETag"].strip('"'),
|
||||
version_id=response.get("VersionId"),
|
||||
)
|
||||
except ClientError as e:
|
||||
last_error = e
|
||||
if attempt < max_retries - 1:
|
||||
delay = 2**attempt # 1s, 2s
|
||||
# Log full error details
|
||||
error_response = e.response if hasattr(e, "response") else {}
|
||||
http_headers = error_response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Attempt {attempt + 1}/{max_retries} failed: {e}. "
|
||||
f"Retrying in {delay}s... "
|
||||
f"Details: bucket={bucket}, key={object_key}, "
|
||||
f"body_size={len(body_data)}, content_type={content_type}, "
|
||||
f"metadata_keys={list(clean_metadata.keys())}, "
|
||||
f"endpoint={self.client.meta.endpoint_url}, "
|
||||
f"http_status={error_response.get('ResponseMetadata', {}).get('HTTPStatusCode')}, "
|
||||
f"error_code={error_response.get('Error', {}).get('Code')}, "
|
||||
f"error_message={error_response.get('Error', {}).get('Message')}, "
|
||||
f"request_id={error_response.get('ResponseMetadata', {}).get('RequestId')}, "
|
||||
f"http_headers={dict(http_headers)}"
|
||||
)
|
||||
# Enable botocore wire-level logging for the retry
|
||||
logging.getLogger("botocore").setLevel(logging.DEBUG)
|
||||
time.sleep(delay)
|
||||
else:
|
||||
# Final attempt failed — log everything
|
||||
error_response = e.response if hasattr(e, "response") else {}
|
||||
http_headers = error_response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
|
||||
logger.error(
|
||||
f"PUT {object_key}: All {max_retries} attempts failed. "
|
||||
f"Last error: {e}. "
|
||||
f"Details: bucket={bucket}, key={object_key}, "
|
||||
f"body_size={len(body_data)}, content_type={content_type}, "
|
||||
f"metadata={clean_metadata}, "
|
||||
f"endpoint={self.client.meta.endpoint_url}, "
|
||||
f"http_status={error_response.get('ResponseMetadata', {}).get('HTTPStatusCode')}, "
|
||||
f"error_code={error_response.get('Error', {}).get('Code')}, "
|
||||
f"error_message={error_response.get('Error', {}).get('Message')}, "
|
||||
f"request_id={error_response.get('ResponseMetadata', {}).get('RequestId')}, "
|
||||
f"http_headers={dict(http_headers)}"
|
||||
)
|
||||
|
||||
raise RuntimeError(f"Failed to put object: {last_error}") from last_error
|
||||
|
||||
def delete(self, key: str) -> None:
|
||||
"""Delete object."""
|
||||
|
||||
@@ -6,6 +6,7 @@ import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import UTC
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
@@ -21,6 +22,7 @@ from ...adapters import (
|
||||
XdeltaAdapter,
|
||||
)
|
||||
from ...core import DeltaService, ObjectKey
|
||||
from ...core.config import DeltaGliderConfig
|
||||
from ...ports import MetricsPort
|
||||
from ...ports.cache import CachePort
|
||||
from .aws_compat import (
|
||||
@@ -40,11 +42,25 @@ def create_service(
|
||||
endpoint_url: str | None = None,
|
||||
region: str | None = None,
|
||||
profile: str | None = None,
|
||||
*,
|
||||
config: DeltaGliderConfig | None = None,
|
||||
) -> DeltaService:
|
||||
"""Create service with wired adapters."""
|
||||
# Get config from environment
|
||||
max_ratio = float(os.environ.get("DG_MAX_RATIO", "0.5"))
|
||||
metrics_type = os.environ.get("DG_METRICS", "logging") # Options: noop, logging, cloudwatch
|
||||
"""Create service with wired adapters.
|
||||
|
||||
Args:
|
||||
log_level: Logging level (overridden by config.log_level if config provided).
|
||||
endpoint_url: S3 endpoint URL (overridden by config if provided).
|
||||
region: AWS region (overridden by config if provided).
|
||||
profile: AWS profile (overridden by config if provided).
|
||||
config: Optional pre-built config. If None, built from env vars + explicit params.
|
||||
"""
|
||||
if config is None:
|
||||
config = DeltaGliderConfig.from_env(
|
||||
log_level=log_level,
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
@@ -52,62 +68,61 @@ def create_service(
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
|
||||
# Set AWS environment variables if provided (for compatibility with other AWS tools)
|
||||
if endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = endpoint_url
|
||||
if region:
|
||||
os.environ["AWS_DEFAULT_REGION"] = region
|
||||
if profile:
|
||||
os.environ["AWS_PROFILE"] = profile
|
||||
if config.endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = config.endpoint_url
|
||||
if config.region:
|
||||
os.environ["AWS_DEFAULT_REGION"] = config.region
|
||||
if config.profile:
|
||||
os.environ["AWS_PROFILE"] = config.profile
|
||||
|
||||
# Build boto3_kwargs for explicit parameter passing (preferred over env vars)
|
||||
boto3_kwargs: dict[str, Any] = {}
|
||||
if region:
|
||||
boto3_kwargs["region_name"] = region
|
||||
if config.region:
|
||||
boto3_kwargs["region_name"] = config.region
|
||||
|
||||
# Create adapters
|
||||
hasher = Sha256Adapter()
|
||||
storage = S3StorageAdapter(endpoint_url=endpoint_url, boto3_kwargs=boto3_kwargs)
|
||||
storage = S3StorageAdapter(endpoint_url=config.endpoint_url, boto3_kwargs=boto3_kwargs)
|
||||
diff = XdeltaAdapter()
|
||||
|
||||
# SECURITY: Configurable cache with encryption and backend selection
|
||||
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, MemoryCache
|
||||
|
||||
# Select backend: memory or filesystem
|
||||
cache_backend = os.environ.get("DG_CACHE_BACKEND", "filesystem") # Options: filesystem, memory
|
||||
base_cache: CachePort
|
||||
if cache_backend == "memory":
|
||||
max_size_mb = int(os.environ.get("DG_CACHE_MEMORY_SIZE_MB", "100"))
|
||||
base_cache = MemoryCache(hasher, max_size_mb=max_size_mb, temp_dir=cache_dir)
|
||||
if config.cache_backend == "memory":
|
||||
base_cache = MemoryCache(
|
||||
hasher, max_size_mb=config.cache_memory_size_mb, temp_dir=cache_dir
|
||||
)
|
||||
else:
|
||||
# Filesystem-backed with Content-Addressed Storage
|
||||
base_cache = ContentAddressedCache(cache_dir, hasher)
|
||||
|
||||
# Always apply encryption with ephemeral keys (security hardening)
|
||||
# Encryption key is optional via DG_CACHE_ENCRYPTION_KEY (ephemeral if not set)
|
||||
cache: CachePort = EncryptedCache.from_env(base_cache)
|
||||
|
||||
clock = UtcClockAdapter()
|
||||
logger = StdLoggerAdapter(level=log_level)
|
||||
logger = StdLoggerAdapter(level=config.log_level)
|
||||
|
||||
# Create metrics adapter based on configuration
|
||||
metrics: MetricsPort
|
||||
if metrics_type == "cloudwatch":
|
||||
# Import here to avoid dependency if not used
|
||||
if config.metrics_type == "cloudwatch":
|
||||
from ...adapters.metrics_cloudwatch import CloudWatchMetricsAdapter
|
||||
|
||||
metrics = CloudWatchMetricsAdapter(
|
||||
namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
|
||||
region=region,
|
||||
endpoint_url=endpoint_url if endpoint_url and "localhost" in endpoint_url else None,
|
||||
namespace=config.metrics_namespace,
|
||||
region=config.region,
|
||||
endpoint_url=(
|
||||
config.endpoint_url
|
||||
if config.endpoint_url and "localhost" in config.endpoint_url
|
||||
else None
|
||||
),
|
||||
)
|
||||
elif metrics_type == "logging":
|
||||
elif config.metrics_type == "logging":
|
||||
from ...adapters.metrics_cloudwatch import LoggingMetricsAdapter
|
||||
|
||||
metrics = LoggingMetricsAdapter(log_level=log_level)
|
||||
metrics = LoggingMetricsAdapter(log_level=config.log_level)
|
||||
else:
|
||||
metrics = NoopMetricsAdapter()
|
||||
|
||||
# Create service
|
||||
return DeltaService(
|
||||
storage=storage,
|
||||
diff=diff,
|
||||
@@ -116,7 +131,7 @@ def create_service(
|
||||
clock=clock,
|
||||
logger=logger,
|
||||
metrics=metrics,
|
||||
max_ratio=max_ratio,
|
||||
max_ratio=config.max_ratio,
|
||||
)
|
||||
|
||||
|
||||
@@ -140,8 +155,11 @@ def _version_callback(ctx: click.Context, param: click.Parameter, value: bool) -
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, debug: bool) -> None:
|
||||
"""DeltaGlider - Delta-aware S3 file storage wrapper."""
|
||||
import logging
|
||||
|
||||
log_level = "DEBUG" if debug else os.environ.get("DG_LOG_LEVEL", "INFO")
|
||||
ctx.obj = create_service(log_level)
|
||||
logging.getLogger("deltaglider").info("deltaglider %s", __version__)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@@ -488,24 +506,24 @@ def rm(
|
||||
|
||||
# Report the results
|
||||
if not quiet:
|
||||
if result["deleted_count"] == 0:
|
||||
if result.deleted_count == 0:
|
||||
click.echo(f"delete: No objects found with prefix: s3://{bucket}/{prefix}")
|
||||
else:
|
||||
click.echo(f"Deleted {result['deleted_count']} object(s)")
|
||||
click.echo(f"Deleted {result.deleted_count} object(s)")
|
||||
|
||||
# Show warnings if any references were kept
|
||||
for warning in result.get("warnings", []):
|
||||
for warning in result.warnings:
|
||||
if "Kept reference" in warning:
|
||||
click.echo(
|
||||
f"Keeping reference file (still in use): s3://{bucket}/{warning.split()[2]}"
|
||||
)
|
||||
|
||||
# Report any errors
|
||||
if result["failed_count"] > 0:
|
||||
for error in result.get("errors", []):
|
||||
if result.failed_count > 0:
|
||||
for error in result.errors:
|
||||
click.echo(f"Error: {error}", err=True)
|
||||
|
||||
if result["failed_count"] > 0:
|
||||
if result.failed_count > 0:
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
@@ -890,6 +908,301 @@ def stats(
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("bucket")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted without deleting")
|
||||
@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def purge(
    service: DeltaService,
    bucket: str,
    dry_run: bool,
    output_json: bool,
    endpoint_url: str | None,
    region: str | None,
    profile: str | None,
) -> None:
    """Purge expired temporary files from .deltaglider/tmp/.

    This command scans the .deltaglider/tmp/ prefix in the specified bucket
    and deletes any files whose dg-expires-at metadata indicates they have expired.

    These temporary files are created by the rehydration process when deltaglider-compressed
    files need to be made available for direct download (e.g., via presigned URLs).

    BUCKET can be specified as:
    - s3://bucket-name/
    - s3://bucket-name
    - bucket-name

    Examples:
        deltaglider purge mybucket            # Purge expired files
        deltaglider purge mybucket --dry-run  # Preview what would be deleted
        deltaglider purge mybucket --json     # JSON output for automation
        deltaglider purge s3://mybucket/      # Also accepts s3:// URLs
    """
    # Recreate service with AWS parameters if provided
    if endpoint_url or region or profile:
        service = create_service(
            log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
            endpoint_url=endpoint_url,
            region=region,
            profile=profile,
        )

    try:
        # Parse bucket from S3 URL if needed
        if is_s3_path(bucket):
            bucket, _prefix = parse_s3_url(bucket)

        if not bucket:
            click.echo("Error: Invalid bucket name", err=True)
            sys.exit(1)

        if dry_run:
            # Dry run: list and inspect candidates ourselves instead of
            # delegating to the service, so nothing is actually deleted.
            prefix = ".deltaglider/tmp/"
            expired_files: list[dict[str, Any]] = []
            total_size = 0

            from datetime import datetime

            import boto3

            # BUG FIX: honor --profile in dry-run mode. The real purge path
            # goes through the (profile-aware) service, but this preview path
            # previously built a default boto3 client, so it could scan with
            # different credentials than the actual deletion would use.
            session = boto3.Session(profile_name=profile) if profile else boto3.Session()
            s3_client = session.client(
                "s3",
                endpoint_url=endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
                region_name=region,
            )

            paginator = s3_client.get_paginator("list_objects_v2")
            page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)

            for page in page_iterator:
                for obj in page.get("Contents", []):
                    # Expiry is stored in object metadata, so one HEAD per key
                    # is unavoidable here.
                    head_response = s3_client.head_object(Bucket=bucket, Key=obj["Key"])
                    metadata = head_response.get("Metadata", {})

                    expires_at_str = metadata.get("dg-expires-at")
                    if not expires_at_str:
                        continue
                    try:
                        expires_at = datetime.fromisoformat(
                            expires_at_str.replace("Z", "+00:00")
                        )
                        if expires_at.tzinfo is None:
                            # Treat naive timestamps as UTC for comparison.
                            expires_at = expires_at.replace(tzinfo=UTC)

                        if datetime.now(UTC) >= expires_at:
                            expired_files.append(
                                {
                                    "key": obj["Key"],
                                    "size": obj["Size"],
                                    "expires_at": expires_at_str,
                                }
                            )
                            total_size += obj["Size"]
                    except ValueError:
                        # Unparseable timestamp: leave the object alone (best effort).
                        pass

            if output_json:
                output = {
                    "bucket": bucket,
                    "prefix": prefix,
                    "dry_run": True,
                    "would_delete_count": len(expired_files),
                    "total_size_to_free": total_size,
                    "expired_files": expired_files[:10],  # Show first 10
                }
                click.echo(json.dumps(output, indent=2))
            else:
                click.echo(f"Dry run: Would delete {len(expired_files)} expired file(s)")
                click.echo(f"Total space to free: {total_size:,} bytes")
                if expired_files:
                    click.echo("\nFiles that would be deleted (first 10):")
                    for file_info in expired_files[:10]:
                        click.echo(f"  {file_info['key']} (expires: {file_info['expires_at']})")
                    if len(expired_files) > 10:
                        click.echo(f"  ... and {len(expired_files) - 10} more")
        else:
            # Perform actual purge using the service method
            result = service.purge_temp_files(bucket)

            if output_json:
                # JSON output
                click.echo(json.dumps(result, indent=2))
            else:
                # Human-readable output
                click.echo(f"Purge Statistics for bucket: {bucket}")
                click.echo(f"{'=' * 60}")
                click.echo(f"Expired files found: {result['expired_count']}")
                click.echo(f"Files deleted: {result['deleted_count']}")
                click.echo(f"Errors: {result['error_count']}")
                click.echo(f"Space freed: {result['total_size_freed']:,} bytes")
                click.echo(f"Duration: {result['duration_seconds']:.2f} seconds")

                if result["errors"]:
                    click.echo("\nErrors encountered:")
                    for error in result["errors"][:5]:
                        click.echo(f"  - {error}")
                    if len(result["errors"]) > 5:
                        click.echo(f"  ... and {len(result['errors']) - 5} more errors")

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
||||
|
||||
|
||||
@cli.command("put-bucket-acl")
@click.argument("bucket")
@click.option(
    "--acl",
    type=click.Choice(["private", "public-read", "public-read-write", "authenticated-read"]),
    help="Canned ACL to apply",
)
@click.option("--grant-full-control", help="Grants full control (e.g., id=account-id)")
@click.option("--grant-read", help="Allows grantee to list objects (e.g., id=account-id)")
@click.option("--grant-read-acp", help="Allows grantee to read the bucket ACL")
@click.option("--grant-write", help="Allows grantee to create objects in the bucket")
@click.option("--grant-write-acp", help="Allows grantee to write the ACL for the bucket")
@click.option("--access-control-policy", help="Full ACL policy as JSON string")
@click.option("--endpoint-url", help="Override S3 endpoint URL")
@click.option("--region", help="AWS region")
@click.option("--profile", help="AWS profile to use")
@click.pass_obj
def put_bucket_acl(
    service: DeltaService,
    bucket: str,
    acl: str | None,
    grant_full_control: str | None,
    grant_read: str | None,
    grant_read_acp: str | None,
    grant_write: str | None,
    grant_write_acp: str | None,
    access_control_policy: str | None,
    endpoint_url: str | None,
    region: str | None,
    profile: str | None,
) -> None:
    """Set the access control list (ACL) for an S3 bucket.

    BUCKET can be specified as:
    - s3://bucket-name
    - bucket-name

    Examples:
        deltaglider put-bucket-acl my-bucket --acl private
        deltaglider put-bucket-acl my-bucket --acl public-read
        deltaglider put-bucket-acl my-bucket --grant-read id=12345
    """
    from ...client import DeltaGliderClient

    # Rebuild the service whenever any AWS override flag is supplied.
    if endpoint_url or region or profile:
        service = create_service(
            log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
            endpoint_url=endpoint_url,
            region=region,
            profile=profile,
        )

    try:
        # Accept both bare names and s3:// URLs.
        if is_s3_path(bucket):
            bucket, _prefix = parse_s3_url(bucket)

        if not bucket:
            click.echo("Error: Invalid bucket name", err=True)
            sys.exit(1)

        client = DeltaGliderClient(service=service)

        # Map CLI flags to boto3-style keyword arguments, dropping unset ones.
        kwargs: dict[str, Any] = {
            param_name: flag_value
            for param_name, flag_value in (
                ("ACL", acl),
                ("GrantFullControl", grant_full_control),
                ("GrantRead", grant_read),
                ("GrantReadACP", grant_read_acp),
                ("GrantWrite", grant_write),
                ("GrantWriteACP", grant_write_acp),
            )
            if flag_value is not None
        }
        # The full policy arrives as a JSON string on the command line.
        if access_control_policy is not None:
            kwargs["AccessControlPolicy"] = json.loads(access_control_policy)

        client.put_bucket_acl(Bucket=bucket, **kwargs)
        click.echo(f"ACL updated for bucket: {bucket}")

    except json.JSONDecodeError as e:
        click.echo(f"Error: Invalid JSON for --access-control-policy: {e}", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
||||
|
||||
|
||||
@cli.command("get-bucket-acl")
@click.argument("bucket")
@click.option("--endpoint-url", help="Override S3 endpoint URL")
@click.option("--region", help="AWS region")
@click.option("--profile", help="AWS profile to use")
@click.pass_obj
def get_bucket_acl(
    service: DeltaService,
    bucket: str,
    endpoint_url: str | None,
    region: str | None,
    profile: str | None,
) -> None:
    """Get the access control list (ACL) for an S3 bucket.

    BUCKET can be specified as:
    - s3://bucket-name
    - bucket-name

    Examples:
        deltaglider get-bucket-acl my-bucket
        deltaglider get-bucket-acl s3://my-bucket
    """
    from ...client import DeltaGliderClient

    # Rebuild the service whenever any AWS override flag is supplied.
    if endpoint_url or region or profile:
        service = create_service(
            log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
            endpoint_url=endpoint_url,
            region=region,
            profile=profile,
        )

    try:
        # Accept both bare names and s3:// URLs.
        if is_s3_path(bucket):
            bucket, _prefix = parse_s3_url(bucket)

        if not bucket:
            click.echo("Error: Invalid bucket name", err=True)
            sys.exit(1)

        acl_response = DeltaGliderClient(service=service).get_bucket_acl(Bucket=bucket)

        # Mirror `aws s3api get-bucket-acl`: emit the raw response as JSON.
        click.echo(json.dumps(acl_response, indent=2, default=str))

    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
||||
|
||||
|
||||
def main() -> None:
    """Main entry point.

    Console-script hook: dispatches to the Click command group; exit codes
    propagate from the individual subcommands.
    """
    cli()
|
||||
|
||||
@@ -28,9 +28,11 @@ from .client_operations import (
|
||||
find_similar_files as _find_similar_files,
|
||||
generate_presigned_post as _generate_presigned_post,
|
||||
generate_presigned_url as _generate_presigned_url,
|
||||
get_bucket_acl as _get_bucket_acl,
|
||||
get_bucket_stats as _get_bucket_stats,
|
||||
get_object_info as _get_object_info,
|
||||
list_buckets as _list_buckets,
|
||||
put_bucket_acl as _put_bucket_acl,
|
||||
upload_batch as _upload_batch,
|
||||
upload_chunked as _upload_chunked,
|
||||
)
|
||||
@@ -40,6 +42,7 @@ from .client_operations.stats import StatsMode
|
||||
|
||||
from .core import DeltaService, DeltaSpace, ObjectKey
|
||||
from .core.errors import NotFoundError
|
||||
from .core.models import DeleteResult
|
||||
from .core.object_listing import ObjectListing, list_objects_page
|
||||
from .core.s3_uri import parse_s3_url
|
||||
from .response_builders import (
|
||||
@@ -67,7 +70,6 @@ class DeltaGliderClient:
|
||||
"""Initialize client with service."""
|
||||
self.service = service
|
||||
self.endpoint_url = endpoint_url
|
||||
self._multipart_uploads: dict[str, Any] = {} # Track multipart uploads
|
||||
# Session-scoped bucket statistics cache (cleared with the client lifecycle)
|
||||
self._bucket_stats_cache: dict[str, dict[str, BucketStats]] = {}
|
||||
|
||||
@@ -464,19 +466,17 @@ class DeltaGliderClient:
|
||||
|
||||
# Build DeltaGlider-specific info
|
||||
deltaglider_info: dict[str, Any] = {
|
||||
"Type": delete_result.get("type"),
|
||||
"Deleted": delete_result.get("deleted", False),
|
||||
"Type": delete_result.type,
|
||||
"Deleted": delete_result.deleted,
|
||||
}
|
||||
|
||||
# Add warnings if any
|
||||
warnings = delete_result.get("warnings")
|
||||
if warnings:
|
||||
deltaglider_info["Warnings"] = warnings
|
||||
if delete_result.warnings:
|
||||
deltaglider_info["Warnings"] = delete_result.warnings
|
||||
|
||||
# Add dependent delta count for references
|
||||
dependent_deltas = delete_result.get("dependent_deltas")
|
||||
if dependent_deltas:
|
||||
deltaglider_info["DependentDeltas"] = dependent_deltas
|
||||
if delete_result.dependent_deltas:
|
||||
deltaglider_info["DependentDeltas"] = delete_result.dependent_deltas
|
||||
|
||||
# Return as dict[str, Any] for public API (TypedDict is a dict at runtime!)
|
||||
response = cast(
|
||||
@@ -518,21 +518,21 @@ class DeltaGliderClient:
|
||||
deleted_item = {"Key": key}
|
||||
if actual_key != key:
|
||||
deleted_item["StoredKey"] = actual_key
|
||||
if delete_result.get("type"):
|
||||
deleted_item["Type"] = delete_result["type"]
|
||||
if delete_result.get("warnings"):
|
||||
deleted_item["Warnings"] = delete_result["warnings"]
|
||||
if delete_result.type:
|
||||
deleted_item["Type"] = delete_result.type
|
||||
if delete_result.warnings:
|
||||
deleted_item["Warnings"] = delete_result.warnings
|
||||
|
||||
deleted.append(deleted_item)
|
||||
|
||||
# Track delta-specific info
|
||||
if delete_result.get("type") in ["delta", "reference"]:
|
||||
if delete_result.type in ("delta", "reference"):
|
||||
delta_info.append(
|
||||
{
|
||||
"Key": key,
|
||||
"StoredKey": actual_key,
|
||||
"Type": delete_result["type"],
|
||||
"DependentDeltas": delete_result.get("dependent_deltas", 0),
|
||||
"Type": delete_result.type,
|
||||
"DependentDeltas": delete_result.dependent_deltas,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -604,22 +604,22 @@ class DeltaGliderClient:
|
||||
continue
|
||||
|
||||
try:
|
||||
actual_key, delete_result = delete_with_delta_suffix(
|
||||
actual_key, single_del = delete_with_delta_suffix(
|
||||
self.service, Bucket, candidate
|
||||
)
|
||||
if delete_result.get("deleted"):
|
||||
if single_del.deleted:
|
||||
single_results.append(
|
||||
{
|
||||
"requested_key": candidate,
|
||||
"actual_key": actual_key,
|
||||
"result": delete_result,
|
||||
"result": single_del,
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
single_errors.append(f"Failed to delete {candidate}: {e}")
|
||||
|
||||
# Use core service's delta-aware recursive delete for remaining objects
|
||||
delete_result = self.service.delete_recursive(Bucket, Prefix)
|
||||
recursive_result = self.service.delete_recursive(Bucket, Prefix)
|
||||
|
||||
# Aggregate results
|
||||
single_deleted_count = len(single_results)
|
||||
@@ -628,37 +628,32 @@ class DeltaGliderClient:
|
||||
single_warnings: list[str] = []
|
||||
|
||||
for item in single_results:
|
||||
result = item["result"]
|
||||
dr: DeleteResult = item["result"]
|
||||
requested_key = item["requested_key"]
|
||||
actual_key = item["actual_key"]
|
||||
result_type = result.get("type", "other")
|
||||
if result_type not in single_counts:
|
||||
result_type = "other"
|
||||
result_type = dr.type if dr.type in single_counts else "other"
|
||||
single_counts[result_type] += 1
|
||||
detail = {
|
||||
detail: dict[str, Any] = {
|
||||
"Key": requested_key,
|
||||
"Type": result.get("type"),
|
||||
"DependentDeltas": result.get("dependent_deltas", 0),
|
||||
"Warnings": result.get("warnings", []),
|
||||
"Type": dr.type,
|
||||
"DependentDeltas": dr.dependent_deltas,
|
||||
"Warnings": dr.warnings,
|
||||
}
|
||||
if actual_key != requested_key:
|
||||
detail["StoredKey"] = actual_key
|
||||
single_details.append(detail)
|
||||
warnings = result.get("warnings")
|
||||
if warnings:
|
||||
single_warnings.extend(warnings)
|
||||
if dr.warnings:
|
||||
single_warnings.extend(dr.warnings)
|
||||
|
||||
deleted_count = cast(int, delete_result.get("deleted_count", 0)) + single_deleted_count
|
||||
failed_count = cast(int, delete_result.get("failed_count", 0)) + len(single_errors)
|
||||
deleted_count = recursive_result.deleted_count + single_deleted_count
|
||||
failed_count = recursive_result.failed_count + len(single_errors)
|
||||
|
||||
deltas_deleted = cast(int, delete_result.get("deltas_deleted", 0)) + single_counts["delta"]
|
||||
references_deleted = (
|
||||
cast(int, delete_result.get("references_deleted", 0)) + single_counts["reference"]
|
||||
)
|
||||
direct_deleted = cast(int, delete_result.get("direct_deleted", 0)) + single_counts["direct"]
|
||||
other_deleted = cast(int, delete_result.get("other_deleted", 0)) + single_counts["other"]
|
||||
deltas_deleted = recursive_result.deltas_deleted + single_counts["delta"]
|
||||
references_deleted = recursive_result.references_deleted + single_counts["reference"]
|
||||
direct_deleted = recursive_result.direct_deleted + single_counts["direct"]
|
||||
other_deleted = recursive_result.other_deleted + single_counts["other"]
|
||||
|
||||
response = {
|
||||
response: dict[str, Any] = {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
@@ -672,13 +667,11 @@ class DeltaGliderClient:
|
||||
},
|
||||
}
|
||||
|
||||
errors = delete_result.get("errors")
|
||||
if errors:
|
||||
response["Errors"] = cast(list[str], errors)
|
||||
if recursive_result.errors:
|
||||
response["Errors"] = recursive_result.errors
|
||||
|
||||
warnings = delete_result.get("warnings")
|
||||
if warnings:
|
||||
response["Warnings"] = cast(list[str], warnings)
|
||||
if recursive_result.warnings:
|
||||
response["Warnings"] = recursive_result.warnings
|
||||
|
||||
if single_errors:
|
||||
errors_list = cast(list[str], response.setdefault("Errors", []))
|
||||
@@ -1129,6 +1122,63 @@ class DeltaGliderClient:
|
||||
"""
|
||||
return _list_buckets(self, **kwargs)
|
||||
|
||||
def put_bucket_acl(
|
||||
self,
|
||||
Bucket: str,
|
||||
ACL: str | None = None,
|
||||
AccessControlPolicy: dict[str, Any] | None = None,
|
||||
GrantFullControl: str | None = None,
|
||||
GrantRead: str | None = None,
|
||||
GrantReadACP: str | None = None,
|
||||
GrantWrite: str | None = None,
|
||||
GrantWriteACP: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Set the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
Bucket: Bucket name
|
||||
ACL: Canned ACL (private, public-read, public-read-write, authenticated-read)
|
||||
AccessControlPolicy: Full ACL policy dict
|
||||
GrantFullControl: Grants full control to the grantee
|
||||
GrantRead: Allows grantee to list objects in the bucket
|
||||
GrantReadACP: Allows grantee to read the bucket ACL
|
||||
GrantWrite: Allows grantee to create objects in the bucket
|
||||
GrantWriteACP: Allows grantee to write the ACL for the bucket
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with status
|
||||
"""
|
||||
return _put_bucket_acl(
|
||||
self,
|
||||
Bucket,
|
||||
ACL=ACL,
|
||||
AccessControlPolicy=AccessControlPolicy,
|
||||
GrantFullControl=GrantFullControl,
|
||||
GrantRead=GrantRead,
|
||||
GrantReadACP=GrantReadACP,
|
||||
GrantWrite=GrantWrite,
|
||||
GrantWriteACP=GrantWriteACP,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_bucket_acl(
|
||||
self,
|
||||
Bucket: str,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Get the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
Bucket: Bucket name
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with Owner and Grants
|
||||
"""
|
||||
return _get_bucket_acl(self, Bucket, **kwargs)
|
||||
|
||||
def _parse_tagging(self, tagging: str) -> dict[str, str]:
|
||||
"""Parse URL-encoded tagging string to dict."""
|
||||
tags = {}
|
||||
@@ -1220,6 +1270,116 @@ class DeltaGliderClient:
|
||||
self._invalidate_bucket_stats_cache()
|
||||
self.service.cache.clear()
|
||||
|
||||
def rehydrate_for_download(self, Bucket: str, Key: str, ExpiresIn: int = 3600) -> str | None:
|
||||
"""Rehydrate a deltaglider-compressed file for direct download.
|
||||
|
||||
If the file is deltaglider-compressed, this will:
|
||||
1. Download and decompress the file
|
||||
2. Re-upload to .deltaglider/tmp/ with expiration metadata
|
||||
3. Return the new temporary file key
|
||||
|
||||
If the file is not deltaglider-compressed, returns None.
|
||||
|
||||
Args:
|
||||
Bucket: S3 bucket name
|
||||
Key: Object key
|
||||
ExpiresIn: How long the temporary file should exist (seconds)
|
||||
|
||||
Returns:
|
||||
New key for temporary file, or None if not deltaglider-compressed
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> temp_key = client.rehydrate_for_download(
|
||||
... Bucket='my-bucket',
|
||||
... Key='large-file.zip.delta',
|
||||
... ExpiresIn=3600 # 1 hour
|
||||
... )
|
||||
>>> if temp_key:
|
||||
... # Generate presigned URL for the temporary file
|
||||
... url = client.generate_presigned_url(
|
||||
... 'get_object',
|
||||
... Params={'Bucket': 'my-bucket', 'Key': temp_key},
|
||||
... ExpiresIn=3600
|
||||
... )
|
||||
"""
|
||||
return self.service.rehydrate_for_download(Bucket, Key, ExpiresIn)
|
||||
|
||||
def generate_presigned_url_with_rehydration(
|
||||
self,
|
||||
Bucket: str,
|
||||
Key: str,
|
||||
ExpiresIn: int = 3600,
|
||||
) -> str:
|
||||
"""Generate a presigned URL with automatic rehydration for deltaglider files.
|
||||
|
||||
This method handles both regular and deltaglider-compressed files:
|
||||
- For regular files: Returns a standard presigned URL
|
||||
- For deltaglider files: Rehydrates to temporary location and returns presigned URL
|
||||
|
||||
Args:
|
||||
Bucket: S3 bucket name
|
||||
Key: Object key
|
||||
ExpiresIn: URL expiration time in seconds
|
||||
|
||||
Returns:
|
||||
Presigned URL for direct download
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> # Works for both regular and deltaglider files
|
||||
>>> url = client.generate_presigned_url_with_rehydration(
|
||||
... Bucket='my-bucket',
|
||||
... Key='any-file.zip', # or 'any-file.zip.delta'
|
||||
... ExpiresIn=3600
|
||||
... )
|
||||
>>> print(f"Download URL: {url}")
|
||||
"""
|
||||
# Try to rehydrate if it's a deltaglider file
|
||||
temp_key = self.rehydrate_for_download(Bucket, Key, ExpiresIn)
|
||||
|
||||
# Use the temporary key if rehydration occurred, otherwise use original
|
||||
download_key = temp_key if temp_key else Key
|
||||
|
||||
# Extract the original filename for Content-Disposition header
|
||||
original_filename = Key.removesuffix(".delta") if Key.endswith(".delta") else Key
|
||||
if "/" in original_filename:
|
||||
original_filename = original_filename.split("/")[-1]
|
||||
|
||||
# Generate presigned URL with Content-Disposition to force correct filename
|
||||
params = {"Bucket": Bucket, "Key": download_key}
|
||||
if temp_key:
|
||||
# For rehydrated files, set Content-Disposition to use original filename
|
||||
params["ResponseContentDisposition"] = f'attachment; filename="{original_filename}"'
|
||||
|
||||
return self.generate_presigned_url("get_object", Params=params, ExpiresIn=ExpiresIn)
|
||||
|
||||
def purge_temp_files(self, Bucket: str) -> dict[str, Any]:
|
||||
"""Purge expired temporary files from .deltaglider/tmp/.
|
||||
|
||||
Scans the .deltaglider/tmp/ prefix and deletes any files
|
||||
whose dg-expires-at metadata indicates they have expired.
|
||||
|
||||
Args:
|
||||
Bucket: S3 bucket to purge temp files from
|
||||
|
||||
Returns:
|
||||
dict with purge statistics including:
|
||||
- deleted_count: Number of files deleted
|
||||
- expired_count: Number of expired files found
|
||||
- error_count: Number of errors encountered
|
||||
- total_size_freed: Total bytes freed
|
||||
- duration_seconds: Operation duration
|
||||
- errors: List of error messages
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> result = client.purge_temp_files(Bucket='my-bucket')
|
||||
>>> print(f"Deleted {result['deleted_count']} expired files")
|
||||
>>> print(f"Freed {result['total_size_freed']} bytes")
|
||||
"""
|
||||
return self.service.purge_temp_files(Bucket)
|
||||
|
||||
|
||||
def create_client(
|
||||
endpoint_url: str | None = None,
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
|
||||
from .core import DeltaService, ObjectKey
|
||||
from .core.errors import NotFoundError
|
||||
from .core.models import DeleteResult
|
||||
|
||||
|
||||
def delete_with_delta_suffix(
|
||||
service: DeltaService, bucket: str, key: str
|
||||
) -> tuple[str, dict[str, object]]:
|
||||
) -> tuple[str, DeleteResult]:
|
||||
"""Delete an object, retrying with '.delta' suffix when needed.
|
||||
|
||||
Args:
|
||||
@@ -15,7 +16,7 @@ def delete_with_delta_suffix(
|
||||
key: Requested key (without forcing .delta suffix).
|
||||
|
||||
Returns:
|
||||
Tuple containing the actual key deleted in storage and the delete result dict.
|
||||
Tuple containing the actual key deleted in storage and the DeleteResult.
|
||||
|
||||
Raises:
|
||||
NotFoundError: Propagated when both the direct and '.delta' keys are missing.
|
||||
|
||||
@@ -97,3 +97,4 @@ class BucketStats:
|
||||
average_compression_ratio: float
|
||||
delta_objects: int
|
||||
direct_objects: int
|
||||
object_limit_reached: bool = False
|
||||
|
||||
@@ -8,7 +8,7 @@ This package contains modular operation implementations:
|
||||
"""
|
||||
|
||||
from .batch import download_batch, upload_batch, upload_chunked
|
||||
from .bucket import create_bucket, delete_bucket, list_buckets
|
||||
from .bucket import create_bucket, delete_bucket, get_bucket_acl, list_buckets, put_bucket_acl
|
||||
from .presigned import generate_presigned_post, generate_presigned_url
|
||||
from .stats import (
|
||||
estimate_compression,
|
||||
@@ -21,7 +21,9 @@ __all__ = [
|
||||
# Bucket operations
|
||||
"create_bucket",
|
||||
"delete_bucket",
|
||||
"get_bucket_acl",
|
||||
"list_buckets",
|
||||
"put_bucket_acl",
|
||||
# Presigned operations
|
||||
"generate_presigned_url",
|
||||
"generate_presigned_post",
|
||||
|
||||
@@ -4,6 +4,8 @@ This module contains boto3-compatible bucket operations:
|
||||
- create_bucket
|
||||
- delete_bucket
|
||||
- list_buckets
|
||||
- put_bucket_acl
|
||||
- get_bucket_acl
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
@@ -173,3 +175,101 @@ def list_buckets(
|
||||
raise RuntimeError(f"Failed to list buckets: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket listing")
|
||||
|
||||
|
||||
def put_bucket_acl(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
ACL: str | None = None,
|
||||
AccessControlPolicy: dict[str, Any] | None = None,
|
||||
GrantFullControl: str | None = None,
|
||||
GrantRead: str | None = None,
|
||||
GrantReadACP: str | None = None,
|
||||
GrantWrite: str | None = None,
|
||||
GrantWriteACP: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Set the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name
|
||||
ACL: Canned ACL (private, public-read, public-read-write, authenticated-read)
|
||||
AccessControlPolicy: Full ACL policy dict
|
||||
GrantFullControl: Grants full control to the grantee
|
||||
GrantRead: Allows grantee to list objects in the bucket
|
||||
GrantReadACP: Allows grantee to read the bucket ACL
|
||||
GrantWrite: Allows grantee to create objects in the bucket
|
||||
GrantWriteACP: Allows grantee to write the ACL for the bucket
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with status
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.put_bucket_acl(Bucket='my-bucket', ACL='public-read')
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
params: dict[str, Any] = {"Bucket": Bucket}
|
||||
if ACL is not None:
|
||||
params["ACL"] = ACL
|
||||
if AccessControlPolicy is not None:
|
||||
params["AccessControlPolicy"] = AccessControlPolicy
|
||||
if GrantFullControl is not None:
|
||||
params["GrantFullControl"] = GrantFullControl
|
||||
if GrantRead is not None:
|
||||
params["GrantRead"] = GrantRead
|
||||
if GrantReadACP is not None:
|
||||
params["GrantReadACP"] = GrantReadACP
|
||||
if GrantWrite is not None:
|
||||
params["GrantWrite"] = GrantWrite
|
||||
if GrantWriteACP is not None:
|
||||
params["GrantWriteACP"] = GrantWriteACP
|
||||
|
||||
storage_adapter.client.put_bucket_acl(**params)
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to set bucket ACL: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket ACL operations")
|
||||
|
||||
|
||||
def get_bucket_acl(
    client: Any,  # DeltaGliderClient (avoiding circular import)
    Bucket: str,
    **kwargs: Any,
) -> dict[str, Any]:
    """Get the ACL for an S3 bucket (boto3-compatible passthrough).

    Args:
        client: DeltaGliderClient instance
        Bucket: Bucket name
        **kwargs: Additional S3 parameters (for compatibility)

    Returns:
        Response dict with Owner and Grants

    Example:
        >>> client = create_client()
        >>> response = client.get_bucket_acl(Bucket='my-bucket')
        >>> print(response['Owner'])
        >>> print(response['Grants'])
    """
    storage_adapter = client.service.storage

    # Only S3-backed adapters expose a raw boto3 client to pass through to.
    if not hasattr(storage_adapter, "client"):
        raise NotImplementedError("Storage adapter does not support bucket ACL operations")

    try:
        acl_response: dict[str, Any] = storage_adapter.client.get_bucket_acl(Bucket=Bucket)
    except Exception as e:
        raise RuntimeError(f"Failed to get bucket ACL: {e}") from e
    return acl_response
|
||||
|
||||
@@ -26,11 +26,24 @@ StatsMode = Literal["quick", "sampled", "detailed"]
|
||||
CACHE_VERSION = "1.0"
|
||||
CACHE_PREFIX = ".deltaglider"
|
||||
|
||||
# Listing limits (prevent runaway scans on gigantic buckets)
|
||||
QUICK_LIST_LIMIT = 60_000
|
||||
SAMPLED_LIST_LIMIT = 30_000
|
||||
|
||||
# ============================================================================
|
||||
# Internal Helper Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _first_metadata_value(metadata: dict[str, Any], *keys: str) -> str | None:
|
||||
"""Return the first non-empty metadata value matching the provided keys."""
|
||||
for key in keys:
|
||||
value = metadata.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _fetch_delta_metadata(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
@@ -316,22 +329,25 @@ def _build_object_info_list(
|
||||
compression_ratio = 0.0
|
||||
|
||||
try:
|
||||
if "dg-file-size" in metadata:
|
||||
original_size = int(metadata["dg-file-size"])
|
||||
logger.debug(
|
||||
f"Delta {key}: using original_size={original_size} from metadata['dg-file-size']"
|
||||
)
|
||||
original_size_raw = _first_metadata_value(
|
||||
metadata,
|
||||
"dg-file-size",
|
||||
"dg_file_size",
|
||||
"file_size",
|
||||
"file-size",
|
||||
"deltaglider-original-size",
|
||||
)
|
||||
if original_size_raw is not None:
|
||||
original_size = int(original_size_raw)
|
||||
logger.debug(f"Delta {key}: using original_size={original_size} from metadata")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Delta {key}: metadata missing 'dg-file-size' key. "
|
||||
f"Available keys: {list(metadata.keys())}. "
|
||||
f"Using None as original_size (unknown)"
|
||||
f"Delta {key}: metadata missing file size. Available keys: {list(metadata.keys())}. Using None as original_size (unknown)"
|
||||
)
|
||||
original_size = None
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.warning(
|
||||
f"Delta {key}: failed to parse dg-file-size from metadata: {e}. "
|
||||
f"Using None as original_size (unknown)"
|
||||
f"Delta {key}: failed to parse file size from metadata: {e}. Using None as original_size (unknown)"
|
||||
)
|
||||
original_size = None
|
||||
|
||||
@@ -346,7 +362,13 @@ def _build_object_info_list(
|
||||
compressed_size=size,
|
||||
is_delta=is_delta,
|
||||
compression_ratio=compression_ratio,
|
||||
reference_key=metadata.get("ref_key") if metadata else None,
|
||||
reference_key=_first_metadata_value(
|
||||
metadata,
|
||||
"dg-ref-key",
|
||||
"dg_ref_key",
|
||||
"ref_key",
|
||||
"ref-key",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -434,8 +456,13 @@ def _calculate_bucket_statistics(
|
||||
space_saved = 0
|
||||
avg_ratio = 0.0
|
||||
else:
|
||||
space_saved = total_original_size - total_compressed_size
|
||||
raw_space_saved = total_original_size - total_compressed_size
|
||||
space_saved = raw_space_saved if raw_space_saved > 0 else 0
|
||||
avg_ratio = (space_saved / total_original_size) if total_original_size > 0 else 0.0
|
||||
if avg_ratio < 0:
|
||||
avg_ratio = 0.0
|
||||
elif avg_ratio > 1:
|
||||
avg_ratio = 1.0
|
||||
|
||||
# Warn if quick mode with delta files (stats will be incomplete)
|
||||
if mode == "quick" and delta_count > 0 and total_original_size == 0:
|
||||
@@ -612,17 +639,24 @@ def get_bucket_stats(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Starting LIST operation for bucket '{bucket}'"
|
||||
)
|
||||
|
||||
list_cap = QUICK_LIST_LIMIT if mode == "quick" else SAMPLED_LIST_LIMIT
|
||||
listing = list_all_objects(
|
||||
client.service.storage,
|
||||
bucket=bucket,
|
||||
max_keys=1000,
|
||||
logger=client.service.logger,
|
||||
max_objects=list_cap,
|
||||
)
|
||||
raw_objects = listing.objects
|
||||
|
||||
# Calculate validation metrics from LIST
|
||||
current_object_count = len(raw_objects)
|
||||
current_compressed_size = sum(obj["size"] for obj in raw_objects)
|
||||
limit_reached = listing.limit_reached or listing.is_truncated
|
||||
if limit_reached:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Listing capped at {list_cap} objects (bucket likely larger)."
|
||||
)
|
||||
|
||||
phase1_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
@@ -790,6 +824,7 @@ def get_bucket_stats(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] COMPLETE: Total time {total_duration:.2f}s for bucket '{bucket}' (mode={mode})"
|
||||
)
|
||||
|
||||
stats.object_limit_reached = limit_reached
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
@@ -807,6 +842,7 @@ def get_bucket_stats(
|
||||
average_compression_ratio=0.0,
|
||||
delta_objects=0,
|
||||
direct_objects=0,
|
||||
object_limit_reached=False,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -16,10 +16,12 @@ from .errors import (
|
||||
StorageIOError,
|
||||
)
|
||||
from .models import (
|
||||
DeleteResult,
|
||||
DeltaMeta,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
PutSummary,
|
||||
RecursiveDeleteResult,
|
||||
ReferenceMeta,
|
||||
Sha256,
|
||||
VerifyResult,
|
||||
@@ -36,8 +38,10 @@ __all__ = [
|
||||
"DiffDecodeError",
|
||||
"StorageIOError",
|
||||
"PolicyViolationWarning",
|
||||
"DeleteResult",
|
||||
"DeltaSpace",
|
||||
"ObjectKey",
|
||||
"RecursiveDeleteResult",
|
||||
"Sha256",
|
||||
"DeltaMeta",
|
||||
"ReferenceMeta",
|
||||
|
||||
53
src/deltaglider/core/config.py
Normal file
53
src/deltaglider/core/config.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Centralized configuration for DeltaGlider."""
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class DeltaGliderConfig:
|
||||
"""All DeltaGlider configuration in one place.
|
||||
|
||||
Environment variables (all optional):
|
||||
DG_MAX_RATIO: Max delta/file ratio before falling back to direct storage.
|
||||
Range 0.0-1.0, default 0.5.
|
||||
DG_LOG_LEVEL: Logging level. Default "INFO".
|
||||
DG_CACHE_BACKEND: "filesystem" (default) or "memory".
|
||||
DG_CACHE_MEMORY_SIZE_MB: Memory cache size in MB. Default 100.
|
||||
DG_METRICS: Metrics backend: "noop", "logging" (default), "cloudwatch".
|
||||
DG_METRICS_NAMESPACE: CloudWatch namespace. Default "DeltaGlider".
|
||||
"""
|
||||
|
||||
max_ratio: float = 0.5
|
||||
log_level: str = "INFO"
|
||||
cache_backend: str = "filesystem"
|
||||
cache_memory_size_mb: int = 100
|
||||
metrics_type: str = "logging"
|
||||
metrics_namespace: str = "DeltaGlider"
|
||||
|
||||
# Connection params (typically passed by CLI, not env vars)
|
||||
endpoint_url: str | None = field(default=None, repr=False)
|
||||
region: str | None = None
|
||||
profile: str | None = None
|
||||
|
||||
@classmethod
|
||||
def from_env(
|
||||
cls,
|
||||
*,
|
||||
log_level: str = "INFO",
|
||||
endpoint_url: str | None = None,
|
||||
region: str | None = None,
|
||||
profile: str | None = None,
|
||||
) -> "DeltaGliderConfig":
|
||||
"""Build config from environment variables + explicit overrides."""
|
||||
return cls(
|
||||
max_ratio=float(os.environ.get("DG_MAX_RATIO", "0.5")),
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", log_level),
|
||||
cache_backend=os.environ.get("DG_CACHE_BACKEND", "filesystem"),
|
||||
cache_memory_size_mb=int(os.environ.get("DG_CACHE_MEMORY_SIZE_MB", "100")),
|
||||
metrics_type=os.environ.get("DG_METRICS", "logging"),
|
||||
metrics_namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
@@ -1,12 +1,80 @@
|
||||
"""Core domain models."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
|
||||
# Metadata key prefix for DeltaGlider
|
||||
# AWS S3 automatically adds 'x-amz-meta-' prefix, so our keys become 'x-amz-meta-dg-*'
|
||||
METADATA_PREFIX = "dg-"
|
||||
|
||||
# Canonical metadata key aliases.
|
||||
# Each field maps to all known key formats (current prefixed, legacy underscore, legacy bare,
|
||||
# legacy hyphenated). Order matters: first match wins during lookup.
|
||||
# Both DeltaMeta.from_dict() and service-layer _meta_value() MUST use these to stay in sync.
|
||||
METADATA_KEY_ALIASES: dict[str, tuple[str, ...]] = {
|
||||
"tool": (f"{METADATA_PREFIX}tool", "dg_tool", "tool"),
|
||||
"original_name": (
|
||||
f"{METADATA_PREFIX}original-name",
|
||||
"dg_original_name",
|
||||
"original_name",
|
||||
"original-name",
|
||||
),
|
||||
"file_sha256": (
|
||||
f"{METADATA_PREFIX}file-sha256",
|
||||
"dg_file_sha256",
|
||||
"file_sha256",
|
||||
"file-sha256",
|
||||
),
|
||||
"file_size": (
|
||||
f"{METADATA_PREFIX}file-size",
|
||||
"dg_file_size",
|
||||
"file_size",
|
||||
"file-size",
|
||||
),
|
||||
"created_at": (
|
||||
f"{METADATA_PREFIX}created-at",
|
||||
"dg_created_at",
|
||||
"created_at",
|
||||
"created-at",
|
||||
),
|
||||
"ref_key": (f"{METADATA_PREFIX}ref-key", "dg_ref_key", "ref_key", "ref-key"),
|
||||
"ref_sha256": (
|
||||
f"{METADATA_PREFIX}ref-sha256",
|
||||
"dg_ref_sha256",
|
||||
"ref_sha256",
|
||||
"ref-sha256",
|
||||
),
|
||||
"delta_size": (
|
||||
f"{METADATA_PREFIX}delta-size",
|
||||
"dg_delta_size",
|
||||
"delta_size",
|
||||
"delta-size",
|
||||
),
|
||||
"delta_cmd": (
|
||||
f"{METADATA_PREFIX}delta-cmd",
|
||||
"dg_delta_cmd",
|
||||
"delta_cmd",
|
||||
"delta-cmd",
|
||||
),
|
||||
"note": (f"{METADATA_PREFIX}note", "dg_note", "note"),
|
||||
}
|
||||
|
||||
|
||||
def resolve_metadata(metadata: dict[str, str], field: str) -> str | None:
|
||||
"""Look up a metadata field using all known key aliases.
|
||||
|
||||
Returns the first non-empty match, or None if not found.
|
||||
"""
|
||||
for key in METADATA_KEY_ALIASES[field]:
|
||||
value = metadata.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DeltaSpace:
|
||||
@@ -27,6 +95,11 @@ class ObjectKey:
|
||||
bucket: str
|
||||
key: str
|
||||
|
||||
@property
|
||||
def full_key(self) -> str:
|
||||
"""Full S3 path: bucket/key."""
|
||||
return f"{self.bucket}/{self.key}"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Sha256:
|
||||
@@ -96,17 +169,60 @@ class DeltaMeta:
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, str]) -> "DeltaMeta":
|
||||
"""Create from S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
|
||||
def _require(field: str) -> str:
|
||||
value = resolve_metadata(data, field)
|
||||
if value is None:
|
||||
raise KeyError(METADATA_KEY_ALIASES[field][0])
|
||||
return value
|
||||
|
||||
tool = _require("tool")
|
||||
original_name = _require("original_name")
|
||||
file_sha = _require("file_sha256")
|
||||
file_size_raw = _require("file_size")
|
||||
created_at_raw = _require("created_at")
|
||||
ref_key = _require("ref_key")
|
||||
ref_sha = _require("ref_sha256")
|
||||
delta_size_raw = _require("delta_size")
|
||||
delta_cmd_value = resolve_metadata(data, "delta_cmd") or ""
|
||||
note_value = resolve_metadata(data, "note") or ""
|
||||
|
||||
try:
|
||||
file_size = int(file_size_raw)
|
||||
except (TypeError, ValueError):
|
||||
raise ValueError(f"Invalid file size metadata: {file_size_raw}") from None
|
||||
|
||||
try:
|
||||
delta_size = int(delta_size_raw)
|
||||
except (TypeError, ValueError):
|
||||
raise ValueError(f"Invalid delta size metadata: {delta_size_raw}") from None
|
||||
|
||||
created_at_text = created_at_raw.rstrip("Z")
|
||||
try:
|
||||
created_at = datetime.fromisoformat(created_at_text)
|
||||
except ValueError as exc:
|
||||
raise ValueError(f"Invalid created_at metadata: {created_at_raw}") from exc
|
||||
|
||||
if not delta_cmd_value:
|
||||
object_name = original_name or "<unknown>"
|
||||
logger.warning(
|
||||
"Delta metadata missing %s for %s; using empty command",
|
||||
f"{METADATA_PREFIX}delta-cmd",
|
||||
object_name,
|
||||
)
|
||||
delta_cmd_value = ""
|
||||
|
||||
return cls(
|
||||
tool=data[f"{METADATA_PREFIX}tool"],
|
||||
original_name=data[f"{METADATA_PREFIX}original-name"],
|
||||
file_sha256=data[f"{METADATA_PREFIX}file-sha256"],
|
||||
file_size=int(data[f"{METADATA_PREFIX}file-size"]),
|
||||
created_at=datetime.fromisoformat(data[f"{METADATA_PREFIX}created-at"].rstrip("Z")),
|
||||
ref_key=data[f"{METADATA_PREFIX}ref-key"],
|
||||
ref_sha256=data[f"{METADATA_PREFIX}ref-sha256"],
|
||||
delta_size=int(data[f"{METADATA_PREFIX}delta-size"]),
|
||||
delta_cmd=data[f"{METADATA_PREFIX}delta-cmd"],
|
||||
note=data.get(f"{METADATA_PREFIX}note"),
|
||||
tool=tool,
|
||||
original_name=original_name,
|
||||
file_sha256=file_sha,
|
||||
file_size=file_size,
|
||||
created_at=created_at,
|
||||
ref_key=ref_key,
|
||||
ref_sha256=ref_sha,
|
||||
delta_size=delta_size,
|
||||
delta_cmd=delta_cmd_value,
|
||||
note=note_value or None,
|
||||
)
|
||||
|
||||
|
||||
@@ -135,3 +251,33 @@ class VerifyResult:
|
||||
expected_sha256: str
|
||||
actual_sha256: str
|
||||
message: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeleteResult:
|
||||
"""Result of a single delete operation."""
|
||||
|
||||
key: str
|
||||
bucket: str
|
||||
deleted: bool = False
|
||||
type: str = "unknown"
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
original_name: str | None = None
|
||||
dependent_deltas: int = 0
|
||||
cleaned_reference: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class RecursiveDeleteResult:
|
||||
"""Result of a recursive delete operation."""
|
||||
|
||||
bucket: str
|
||||
prefix: str
|
||||
deleted_count: int = 0
|
||||
failed_count: int = 0
|
||||
deltas_deleted: int = 0
|
||||
references_deleted: int = 0
|
||||
direct_deleted: int = 0
|
||||
other_deleted: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
|
||||
@@ -18,6 +18,7 @@ class ObjectListing:
|
||||
key_count: int = 0
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
limit_reached: bool = False
|
||||
|
||||
|
||||
def list_objects_page(
|
||||
@@ -61,6 +62,7 @@ def list_all_objects(
|
||||
max_keys: int = 1000,
|
||||
logger: Any | None = None,
|
||||
max_iterations: int = 10_000,
|
||||
max_objects: int | None = None,
|
||||
) -> ObjectListing:
|
||||
"""Fetch all objects under the given bucket/prefix with pagination safety."""
|
||||
import time
|
||||
@@ -70,6 +72,7 @@ def list_all_objects(
|
||||
continuation_token: str | None = None
|
||||
iteration_count = 0
|
||||
list_start_time = time.time()
|
||||
limit_reached = False
|
||||
|
||||
while True:
|
||||
iteration_count += 1
|
||||
@@ -130,6 +133,18 @@ def list_all_objects(
|
||||
aggregated.common_prefixes.extend(page.common_prefixes)
|
||||
aggregated.key_count += page.key_count
|
||||
|
||||
if max_objects is not None and len(aggregated.objects) >= max_objects:
|
||||
if logger:
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST capped at {max_objects} objects."
|
||||
)
|
||||
aggregated.objects = aggregated.objects[:max_objects]
|
||||
aggregated.key_count = len(aggregated.objects)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = page.next_continuation_token
|
||||
limit_reached = True
|
||||
break
|
||||
|
||||
if not page.is_truncated:
|
||||
aggregated.is_truncated = False
|
||||
aggregated.next_continuation_token = None
|
||||
@@ -161,6 +176,7 @@ def list_all_objects(
|
||||
unique_prefixes.append(prefix)
|
||||
aggregated.common_prefixes = unique_prefixes
|
||||
aggregated.key_count = len(aggregated.objects)
|
||||
aggregated.limit_reached = limit_reached
|
||||
return aggregated
|
||||
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
import tempfile
|
||||
import warnings
|
||||
from datetime import UTC, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, BinaryIO
|
||||
|
||||
@@ -29,12 +30,15 @@ from .errors import (
|
||||
PolicyViolationWarning,
|
||||
)
|
||||
from .models import (
|
||||
DeleteResult,
|
||||
DeltaMeta,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
PutSummary,
|
||||
RecursiveDeleteResult,
|
||||
ReferenceMeta,
|
||||
VerifyResult,
|
||||
resolve_metadata,
|
||||
)
|
||||
|
||||
|
||||
@@ -169,7 +173,7 @@ class DeltaService:
|
||||
self.logger.info("Starting get operation", key=object_key.key)
|
||||
|
||||
# Get object metadata
|
||||
obj_head = self.storage.head(f"{object_key.bucket}/{object_key.key}")
|
||||
obj_head = self.storage.head(object_key.full_key)
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {object_key.key}")
|
||||
|
||||
@@ -199,11 +203,13 @@ class DeltaService:
|
||||
# Direct download without delta processing
|
||||
self._get_direct(object_key, obj_head, out)
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
file_size_meta = resolve_metadata(obj_head.metadata, "file_size")
|
||||
file_size_value = int(file_size_meta) if file_size_meta else obj_head.size
|
||||
self.logger.log_operation(
|
||||
op="get",
|
||||
key=object_key.key,
|
||||
deltaspace=f"{object_key.bucket}",
|
||||
sizes={"file": int(obj_head.metadata.get("file_size", 0))},
|
||||
sizes={"file": file_size_value},
|
||||
durations={"total": duration},
|
||||
cache_hit=False,
|
||||
)
|
||||
@@ -241,7 +247,7 @@ class DeltaService:
|
||||
|
||||
# Download delta
|
||||
with open(delta_path, "wb") as f:
|
||||
delta_stream = self.storage.get(f"{object_key.bucket}/{object_key.key}")
|
||||
delta_stream = self.storage.get(object_key.full_key)
|
||||
for chunk in iter(lambda: delta_stream.read(8192), b""):
|
||||
f.write(chunk)
|
||||
|
||||
@@ -341,10 +347,13 @@ class DeltaService:
|
||||
|
||||
# Re-check for race condition
|
||||
ref_head = self.storage.head(full_ref_key)
|
||||
if ref_head and ref_head.metadata.get("dg-file-sha256") != file_sha256:
|
||||
existing_sha = None
|
||||
if ref_head:
|
||||
existing_sha = resolve_metadata(ref_head.metadata, "file_sha256")
|
||||
if ref_head and existing_sha and existing_sha != file_sha256:
|
||||
self.logger.warning("Reference creation race detected, using existing")
|
||||
# Proceed with existing reference
|
||||
ref_sha256 = ref_head.metadata["dg-file-sha256"]
|
||||
ref_sha256 = existing_sha
|
||||
else:
|
||||
ref_sha256 = file_sha256
|
||||
|
||||
@@ -407,7 +416,9 @@ class DeltaService:
|
||||
) -> PutSummary:
|
||||
"""Create delta file."""
|
||||
ref_key = delta_space.reference_key()
|
||||
ref_sha256 = ref_head.metadata["dg-file-sha256"]
|
||||
ref_sha256 = resolve_metadata(ref_head.metadata, "file_sha256")
|
||||
if not ref_sha256:
|
||||
raise ValueError("Reference metadata missing file SHA256")
|
||||
|
||||
# Ensure reference is cached
|
||||
cache_hit = self.cache.has_ref(delta_space.bucket, delta_space.prefix, ref_sha256)
|
||||
@@ -527,7 +538,7 @@ class DeltaService:
|
||||
) -> None:
|
||||
"""Download file directly from S3 without delta processing."""
|
||||
# Download the file directly
|
||||
file_stream = self.storage.get(f"{object_key.bucket}/{object_key.key}")
|
||||
file_stream = self.storage.get(object_key.full_key)
|
||||
|
||||
if isinstance(out, Path):
|
||||
# Write to file path
|
||||
@@ -540,7 +551,7 @@ class DeltaService:
|
||||
out.write(chunk)
|
||||
|
||||
# Verify integrity if SHA256 is present
|
||||
expected_sha = obj_head.metadata.get("file_sha256")
|
||||
expected_sha = resolve_metadata(obj_head.metadata, "file_sha256")
|
||||
if expected_sha:
|
||||
if isinstance(out, Path):
|
||||
actual_sha = self.hasher.sha256(out)
|
||||
@@ -561,7 +572,7 @@ class DeltaService:
|
||||
self.logger.info(
|
||||
"Direct download complete",
|
||||
key=object_key.key,
|
||||
size=obj_head.metadata.get("file_size"),
|
||||
size=resolve_metadata(obj_head.metadata, "file_size"),
|
||||
)
|
||||
|
||||
def _upload_direct(
|
||||
@@ -609,128 +620,37 @@ class DeltaService:
|
||||
file_sha256=file_sha256,
|
||||
)
|
||||
|
||||
def delete(self, object_key: ObjectKey) -> dict[str, Any]:
|
||||
def delete(self, object_key: ObjectKey) -> DeleteResult:
|
||||
"""Delete an object (delta-aware).
|
||||
|
||||
For delta files, just deletes the delta.
|
||||
For reference files, checks if any deltas depend on it first.
|
||||
For direct uploads, simply deletes the file.
|
||||
|
||||
Returns:
|
||||
dict with deletion details including type and any warnings
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
full_key = f"{object_key.bucket}/{object_key.key}"
|
||||
full_key = object_key.full_key
|
||||
|
||||
self.logger.info("Starting delete operation", key=object_key.key)
|
||||
|
||||
# Check if object exists
|
||||
obj_head = self.storage.head(full_key)
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {object_key.key}")
|
||||
|
||||
# Determine object type
|
||||
is_reference = object_key.key.endswith("/reference.bin")
|
||||
is_delta = object_key.key.endswith(".delta")
|
||||
is_direct = obj_head.metadata.get("compression") == "none"
|
||||
result = DeleteResult(key=object_key.key, bucket=object_key.bucket)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"key": object_key.key,
|
||||
"bucket": object_key.bucket,
|
||||
"deleted": False,
|
||||
"type": "unknown",
|
||||
"warnings": [],
|
||||
}
|
||||
|
||||
if is_reference:
|
||||
# Check if any deltas depend on this reference
|
||||
prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
|
||||
dependent_deltas = []
|
||||
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
# Check if this delta references our reference
|
||||
delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
|
||||
if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
|
||||
dependent_deltas.append(obj.key)
|
||||
|
||||
if dependent_deltas:
|
||||
warnings_list = result["warnings"]
|
||||
assert isinstance(warnings_list, list)
|
||||
warnings_list.append(
|
||||
f"Reference has {len(dependent_deltas)} dependent delta(s). "
|
||||
"Deleting this will make those deltas unrecoverable."
|
||||
)
|
||||
self.logger.warning(
|
||||
"Reference has dependent deltas",
|
||||
ref_key=object_key.key,
|
||||
delta_count=len(dependent_deltas),
|
||||
deltas=dependent_deltas[:5], # Log first 5
|
||||
)
|
||||
|
||||
# Delete the reference
|
||||
if object_key.key.endswith("/reference.bin"):
|
||||
self._delete_reference(object_key, full_key, result)
|
||||
elif object_key.key.endswith(".delta"):
|
||||
self._delete_delta(object_key, full_key, obj_head, result)
|
||||
elif obj_head.metadata.get("compression") == "none":
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "reference"
|
||||
result["dependent_deltas"] = len(dependent_deltas)
|
||||
|
||||
# Clear from cache if present
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")
|
||||
|
||||
elif is_delta:
|
||||
# Delete the delta file
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "delta"
|
||||
result["original_name"] = obj_head.metadata.get("original_name", "unknown")
|
||||
|
||||
# Check if this was the last delta in the DeltaSpace - if so, clean up reference.bin
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
|
||||
# Check if any other delta files exist in this DeltaSpace
|
||||
remaining_deltas = []
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
remaining_deltas.append(obj.key)
|
||||
|
||||
if not remaining_deltas:
|
||||
# No more deltas - clean up the orphaned reference.bin
|
||||
ref_full_key = f"{object_key.bucket}/{ref_key}"
|
||||
ref_head = self.storage.head(ref_full_key)
|
||||
if ref_head:
|
||||
self.storage.delete(ref_full_key)
|
||||
self.logger.info(
|
||||
"Cleaned up orphaned reference.bin",
|
||||
ref_key=ref_key,
|
||||
reason="no remaining deltas",
|
||||
)
|
||||
result["cleaned_reference"] = ref_key
|
||||
|
||||
# Clear from cache
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")
|
||||
|
||||
elif is_direct:
|
||||
# Simply delete the direct upload
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "direct"
|
||||
result["original_name"] = obj_head.metadata.get("original_name", object_key.key)
|
||||
|
||||
result.deleted = True
|
||||
result.type = "direct"
|
||||
result.original_name = obj_head.metadata.get("original_name", object_key.key)
|
||||
else:
|
||||
# Unknown file type, delete anyway
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "unknown"
|
||||
result.deleted = True
|
||||
result.type = "unknown"
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.log_operation(
|
||||
@@ -742,169 +662,139 @@ class DeltaService:
|
||||
cache_hit=False,
|
||||
)
|
||||
self.metrics.timing("deltaglider.delete.duration", duration)
|
||||
self.metrics.increment(f"deltaglider.delete.{result['type']}")
|
||||
self.metrics.increment(f"deltaglider.delete.{result.type}")
|
||||
|
||||
return result
|
||||
|
||||
def delete_recursive(self, bucket: str, prefix: str) -> dict[str, Any]:
|
||||
def _delete_reference(self, object_key: ObjectKey, full_key: str, result: DeleteResult) -> None:
|
||||
"""Handle deletion of a reference.bin file."""
|
||||
prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
|
||||
dependent_deltas = []
|
||||
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
|
||||
if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
|
||||
dependent_deltas.append(obj.key)
|
||||
|
||||
if dependent_deltas:
|
||||
result.warnings.append(
|
||||
f"Reference has {len(dependent_deltas)} dependent delta(s). "
|
||||
"Deleting this will make those deltas unrecoverable."
|
||||
)
|
||||
self.logger.warning(
|
||||
"Reference has dependent deltas",
|
||||
ref_key=object_key.key,
|
||||
delta_count=len(dependent_deltas),
|
||||
deltas=dependent_deltas[:5],
|
||||
)
|
||||
|
||||
self.storage.delete(full_key)
|
||||
result.deleted = True
|
||||
result.type = "reference"
|
||||
result.dependent_deltas = len(dependent_deltas)
|
||||
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")
|
||||
|
||||
def _delete_delta(
|
||||
self,
|
||||
object_key: ObjectKey,
|
||||
full_key: str,
|
||||
obj_head: ObjectHead,
|
||||
result: DeleteResult,
|
||||
) -> None:
|
||||
"""Handle deletion of a delta file, cleaning up orphaned references."""
|
||||
self.storage.delete(full_key)
|
||||
result.deleted = True
|
||||
result.type = "delta"
|
||||
result.original_name = obj_head.metadata.get("original_name", "unknown")
|
||||
|
||||
if "/" not in object_key.key:
|
||||
return
|
||||
|
||||
deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
|
||||
remaining_deltas = [
|
||||
obj.key
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}")
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key
|
||||
]
|
||||
|
||||
if not remaining_deltas:
|
||||
ref_full_key = f"{object_key.bucket}/{ref_key}"
|
||||
ref_head = self.storage.head(ref_full_key)
|
||||
if ref_head:
|
||||
self.storage.delete(ref_full_key)
|
||||
self.logger.info(
|
||||
"Cleaned up orphaned reference.bin",
|
||||
ref_key=ref_key,
|
||||
reason="no remaining deltas",
|
||||
)
|
||||
result.cleaned_reference = ref_key
|
||||
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")
|
||||
|
||||
def delete_recursive(self, bucket: str, prefix: str) -> RecursiveDeleteResult:
|
||||
"""Recursively delete all objects under a prefix (delta-aware).
|
||||
|
||||
Handles delta relationships intelligently:
|
||||
- Deletes deltas before references
|
||||
- Warns about orphaned deltas
|
||||
- Handles direct uploads
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix to delete recursively
|
||||
|
||||
Returns:
|
||||
dict with deletion statistics and any warnings
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
self.logger.info("Starting recursive delete", bucket=bucket, prefix=prefix)
|
||||
|
||||
# Ensure prefix ends with / for proper directory deletion
|
||||
if prefix and not prefix.endswith("/"):
|
||||
prefix = f"{prefix}/"
|
||||
|
||||
# Collect all objects under prefix
|
||||
objects_to_delete = []
|
||||
references = []
|
||||
deltas = []
|
||||
direct_uploads = []
|
||||
affected_deltaspaces = set()
|
||||
# Phase 1: classify objects by type
|
||||
references, deltas, direct_uploads, other_objects, affected_deltaspaces = (
|
||||
self._classify_objects_for_deletion(bucket, prefix)
|
||||
)
|
||||
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
|
||||
if not obj.key.startswith(prefix) and prefix:
|
||||
continue
|
||||
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
references.append(obj.key)
|
||||
elif obj.key.endswith(".delta"):
|
||||
deltas.append(obj.key)
|
||||
# Track which deltaspaces are affected by this deletion
|
||||
if "/" in obj.key:
|
||||
deltaspace_prefix = "/".join(obj.key.split("/")[:-1])
|
||||
affected_deltaspaces.add(deltaspace_prefix)
|
||||
else:
|
||||
# Check if it's a direct upload
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head and obj_head.metadata.get("compression") == "none":
|
||||
direct_uploads.append(obj.key)
|
||||
else:
|
||||
objects_to_delete.append(obj.key)
|
||||
|
||||
# Also check for references in parent directories that might be affected
|
||||
# by the deletion of delta files in affected deltaspaces
|
||||
for deltaspace_prefix in affected_deltaspaces:
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
# Also check for references in parent deltaspaces affected by delta deletion
|
||||
for ds_prefix in affected_deltaspaces:
|
||||
ref_key = f"{ds_prefix}/reference.bin"
|
||||
if ref_key not in references:
|
||||
# Check if this reference exists
|
||||
ref_head = self.storage.head(f"{bucket}/{ref_key}")
|
||||
if ref_head:
|
||||
references.append(ref_key)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": len(deltas),
|
||||
"references_deleted": len(references),
|
||||
"direct_deleted": len(direct_uploads),
|
||||
"other_deleted": len(objects_to_delete),
|
||||
"errors": [],
|
||||
"warnings": [],
|
||||
}
|
||||
result = RecursiveDeleteResult(
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
deltas_deleted=len(deltas),
|
||||
references_deleted=len(references),
|
||||
direct_deleted=len(direct_uploads),
|
||||
other_deleted=len(other_objects),
|
||||
)
|
||||
|
||||
# Delete in order: other files -> direct uploads -> deltas -> references (with checks)
|
||||
# This ensures we don't delete references that deltas depend on prematurely
|
||||
regular_files = objects_to_delete + direct_uploads + deltas
|
||||
|
||||
# Delete regular files first
|
||||
for key in regular_files:
|
||||
# Phase 2: delete non-reference files first (dependency order)
|
||||
for key in other_objects + direct_uploads + deltas:
|
||||
try:
|
||||
self.storage.delete(f"{bucket}/{key}")
|
||||
deleted_count = result["deleted_count"]
|
||||
assert isinstance(deleted_count, int)
|
||||
result["deleted_count"] = deleted_count + 1
|
||||
result.deleted_count += 1
|
||||
self.logger.debug(f"Deleted {key}")
|
||||
except Exception as e:
|
||||
failed_count = result["failed_count"]
|
||||
assert isinstance(failed_count, int)
|
||||
result["failed_count"] = failed_count + 1
|
||||
errors_list = result["errors"]
|
||||
assert isinstance(errors_list, list)
|
||||
errors_list.append(f"Failed to delete {key}: {str(e)}")
|
||||
result.failed_count += 1
|
||||
result.errors.append(f"Failed to delete {key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete {key}: {e}")
|
||||
|
||||
# Handle references intelligently - only delete if no files outside deletion scope depend on them
|
||||
references_kept = 0
|
||||
for ref_key in references:
|
||||
try:
|
||||
# Extract deltaspace prefix from reference.bin path
|
||||
if ref_key.endswith("/reference.bin"):
|
||||
deltaspace_prefix = ref_key[:-14] # Remove "/reference.bin"
|
||||
else:
|
||||
deltaspace_prefix = ""
|
||||
# Phase 3: delete references only if safe
|
||||
references_kept = self._delete_references_if_safe(bucket, prefix, references, result)
|
||||
result.references_deleted -= references_kept
|
||||
|
||||
# Check if there are any remaining files in this deltaspace
|
||||
# (outside of the deletion prefix)
|
||||
deltaspace_list_prefix = (
|
||||
f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
|
||||
)
|
||||
remaining_objects = list(self.storage.list(deltaspace_list_prefix))
|
||||
|
||||
# Filter out objects that are being deleted (within our deletion scope)
|
||||
# and the reference.bin file itself
|
||||
deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
|
||||
has_remaining_files = False
|
||||
|
||||
for remaining_obj in remaining_objects:
|
||||
obj_full_path = f"{bucket}/{remaining_obj.key}"
|
||||
# Skip if this object is within our deletion scope
|
||||
if prefix and obj_full_path.startswith(deletion_prefix_full):
|
||||
continue
|
||||
# Skip if this is the reference.bin file itself
|
||||
if remaining_obj.key == ref_key:
|
||||
continue
|
||||
# If we find any other file, the reference is still needed
|
||||
has_remaining_files = True
|
||||
break
|
||||
|
||||
if not has_remaining_files:
|
||||
# Safe to delete this reference.bin
|
||||
self.storage.delete(f"{bucket}/{ref_key}")
|
||||
deleted_count = result["deleted_count"]
|
||||
assert isinstance(deleted_count, int)
|
||||
result["deleted_count"] = deleted_count + 1
|
||||
self.logger.debug(f"Deleted reference {ref_key}")
|
||||
else:
|
||||
# Keep the reference as it's still needed
|
||||
references_kept += 1
|
||||
warnings_list = result["warnings"]
|
||||
assert isinstance(warnings_list, list)
|
||||
warnings_list.append(f"Kept reference {ref_key} (still in use)")
|
||||
self.logger.info(
|
||||
f"Kept reference {ref_key} - still in use outside deletion scope"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
failed_count = result["failed_count"]
|
||||
assert isinstance(failed_count, int)
|
||||
result["failed_count"] = failed_count + 1
|
||||
errors_list = result["errors"]
|
||||
assert isinstance(errors_list, list)
|
||||
errors_list.append(f"Failed to delete reference {ref_key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete reference {ref_key}: {e}")
|
||||
|
||||
# Update reference deletion count
|
||||
references_deleted = result["references_deleted"]
|
||||
assert isinstance(references_deleted, int)
|
||||
result["references_deleted"] = references_deleted - references_kept
|
||||
|
||||
# Clear any cached references for this prefix
|
||||
# Clear cached references
|
||||
if references:
|
||||
try:
|
||||
self.cache.evict(bucket, prefix.rstrip("/") if prefix else "")
|
||||
@@ -916,11 +806,291 @@ class DeltaService:
|
||||
"Recursive delete complete",
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
deleted=result["deleted_count"],
|
||||
failed=result["failed_count"],
|
||||
deleted=result.deleted_count,
|
||||
failed=result.failed_count,
|
||||
duration=duration,
|
||||
)
|
||||
self.metrics.timing("deltaglider.delete_recursive.duration", duration)
|
||||
self.metrics.increment("deltaglider.delete_recursive.completed")
|
||||
|
||||
return result
|
||||
|
||||
def _classify_objects_for_deletion(
|
||||
self, bucket: str, prefix: str
|
||||
) -> tuple[list[str], list[str], list[str], list[str], set[str]]:
|
||||
"""Classify objects under a prefix into references, deltas, direct uploads, and other.
|
||||
|
||||
Returns:
|
||||
(references, deltas, direct_uploads, other_objects, affected_deltaspaces)
|
||||
"""
|
||||
references: list[str] = []
|
||||
deltas: list[str] = []
|
||||
direct_uploads: list[str] = []
|
||||
other_objects: list[str] = []
|
||||
affected_deltaspaces: set[str] = set()
|
||||
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
|
||||
if prefix and not obj.key.startswith(prefix):
|
||||
continue
|
||||
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
references.append(obj.key)
|
||||
elif obj.key.endswith(".delta"):
|
||||
deltas.append(obj.key)
|
||||
if "/" in obj.key:
|
||||
affected_deltaspaces.add("/".join(obj.key.split("/")[:-1]))
|
||||
else:
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head and obj_head.metadata.get("compression") == "none":
|
||||
direct_uploads.append(obj.key)
|
||||
else:
|
||||
other_objects.append(obj.key)
|
||||
|
||||
return references, deltas, direct_uploads, other_objects, affected_deltaspaces
|
||||
|
||||
def _delete_references_if_safe(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
references: list[str],
|
||||
result: RecursiveDeleteResult,
|
||||
) -> int:
|
||||
"""Delete references only if no files outside the deletion scope depend on them.
|
||||
|
||||
Returns the number of references kept (not deleted).
|
||||
"""
|
||||
references_kept = 0
|
||||
deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
|
||||
|
||||
for ref_key in references:
|
||||
try:
|
||||
if ref_key.endswith("/reference.bin"):
|
||||
deltaspace_prefix = ref_key[:-14] # Remove "/reference.bin"
|
||||
else:
|
||||
deltaspace_prefix = ""
|
||||
|
||||
ds_list_prefix = f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
|
||||
has_remaining_files = any(
|
||||
not (prefix and f"{bucket}/{obj.key}".startswith(deletion_prefix_full))
|
||||
and obj.key != ref_key
|
||||
for obj in self.storage.list(ds_list_prefix)
|
||||
)
|
||||
|
||||
if not has_remaining_files:
|
||||
self.storage.delete(f"{bucket}/{ref_key}")
|
||||
result.deleted_count += 1
|
||||
self.logger.debug(f"Deleted reference {ref_key}")
|
||||
else:
|
||||
references_kept += 1
|
||||
result.warnings.append(f"Kept reference {ref_key} (still in use)")
|
||||
self.logger.info(
|
||||
f"Kept reference {ref_key} - still in use outside deletion scope"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
result.failed_count += 1
|
||||
result.errors.append(f"Failed to delete reference {ref_key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete reference {ref_key}: {e}")
|
||||
|
||||
return references_kept
|
||||
|
||||
def rehydrate_for_download(
|
||||
self,
|
||||
bucket: str,
|
||||
key: str,
|
||||
expires_in_seconds: int = 3600,
|
||||
) -> str | None:
|
||||
"""Rehydrate a deltaglider-compressed file for direct download.
|
||||
|
||||
If the file is deltaglider-compressed, this will:
|
||||
1. Download and decompress the file
|
||||
2. Re-upload to .deltaglider/tmp/ with expiration metadata
|
||||
3. Return the new temporary file key
|
||||
|
||||
If the file is not deltaglider-compressed, returns None.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
key: Object key
|
||||
expires_in_seconds: How long the temporary file should exist
|
||||
|
||||
Returns:
|
||||
New key for temporary file, or None if not deltaglider-compressed
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
|
||||
# Check if object exists and is deltaglider-compressed
|
||||
obj_head = self.storage.head(f"{bucket}/{key}")
|
||||
|
||||
# If not found directly, try with .delta extension
|
||||
if obj_head is None and not key.endswith(".delta"):
|
||||
obj_head = self.storage.head(f"{bucket}/{key}.delta")
|
||||
if obj_head is not None:
|
||||
# Found the delta version, update the key
|
||||
key = f"{key}.delta"
|
||||
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {key}")
|
||||
|
||||
# Check if this is a deltaglider file
|
||||
is_delta = key.endswith(".delta")
|
||||
has_dg_metadata = "dg-file-sha256" in obj_head.metadata
|
||||
|
||||
if not is_delta and not has_dg_metadata:
|
||||
# Not a deltaglider file, return None
|
||||
self.logger.debug(f"File {key} is not deltaglider-compressed")
|
||||
return None
|
||||
|
||||
# Generate temporary file path
|
||||
import uuid
|
||||
|
||||
# Use the original filename without .delta extension for the temp file
|
||||
original_name = key.removesuffix(".delta") if key.endswith(".delta") else key
|
||||
temp_filename = f"{uuid.uuid4().hex}_{Path(original_name).name}"
|
||||
temp_key = f".deltaglider/tmp/{temp_filename}"
|
||||
|
||||
# Download and decompress the file
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmp_path = Path(tmpdir)
|
||||
decompressed_path = tmp_path / "decompressed"
|
||||
|
||||
# Use the existing get method to decompress
|
||||
object_key = ObjectKey(bucket=bucket, key=key)
|
||||
self.get(object_key, decompressed_path)
|
||||
|
||||
# Calculate expiration time
|
||||
expires_at = self.clock.now() + timedelta(seconds=expires_in_seconds)
|
||||
|
||||
# Create metadata for temporary file
|
||||
metadata = {
|
||||
"dg-expires-at": expires_at.isoformat(),
|
||||
"dg-original-key": key,
|
||||
"dg-original-filename": Path(original_name).name,
|
||||
"dg-rehydrated": "true",
|
||||
"dg-created-at": self.clock.now().isoformat(),
|
||||
}
|
||||
|
||||
# Upload the decompressed file
|
||||
self.logger.info(
|
||||
"Uploading rehydrated file",
|
||||
original_key=key,
|
||||
temp_key=temp_key,
|
||||
expires_at=expires_at.isoformat(),
|
||||
)
|
||||
|
||||
self.storage.put(
|
||||
f"{bucket}/{temp_key}",
|
||||
decompressed_path,
|
||||
metadata,
|
||||
)
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.info(
|
||||
"Rehydration complete",
|
||||
original_key=key,
|
||||
temp_key=temp_key,
|
||||
duration=duration,
|
||||
)
|
||||
self.metrics.timing("deltaglider.rehydrate.duration", duration)
|
||||
self.metrics.increment("deltaglider.rehydrate.completed")
|
||||
|
||||
return temp_key
|
||||
|
||||
def purge_temp_files(self, bucket: str) -> dict[str, Any]:
|
||||
"""Purge expired temporary files from .deltaglider/tmp/.
|
||||
|
||||
Scans the .deltaglider/tmp/ prefix and deletes any files
|
||||
whose dg-expires-at metadata indicates they have expired.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket to purge temp files from
|
||||
|
||||
Returns:
|
||||
dict with purge statistics
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
prefix = ".deltaglider/tmp/"
|
||||
|
||||
self.logger.info("Starting temp file purge", bucket=bucket, prefix=prefix)
|
||||
|
||||
deleted_count = 0
|
||||
expired_count = 0
|
||||
error_count = 0
|
||||
total_size_freed = 0
|
||||
errors = []
|
||||
|
||||
# List all objects in temp directory
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}"):
|
||||
if not obj.key.startswith(prefix):
|
||||
continue
|
||||
|
||||
try:
|
||||
# Get object metadata
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head is None:
|
||||
continue
|
||||
|
||||
# Check expiration
|
||||
expires_at_str = obj_head.metadata.get("dg-expires-at")
|
||||
if not expires_at_str:
|
||||
# No expiration metadata, skip
|
||||
self.logger.debug(f"No expiration metadata for {obj.key}")
|
||||
continue
|
||||
|
||||
# Parse expiration time
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00"))
|
||||
if expires_at.tzinfo is None:
|
||||
expires_at = expires_at.replace(tzinfo=UTC)
|
||||
except ValueError:
|
||||
self.logger.warning(
|
||||
f"Invalid expiration format for {obj.key}: {expires_at_str}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if expired
|
||||
if self.clock.now() >= expires_at:
|
||||
expired_count += 1
|
||||
# Delete the file
|
||||
self.storage.delete(f"{bucket}/{obj.key}")
|
||||
deleted_count += 1
|
||||
total_size_freed += obj.size
|
||||
self.logger.debug(
|
||||
f"Deleted expired temp file {obj.key}",
|
||||
expired_at=expires_at_str,
|
||||
size=obj.size,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
errors.append(f"Error processing {obj.key}: {str(e)}")
|
||||
self.logger.error(f"Failed to process temp file {obj.key}: {e}")
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
|
||||
result = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"deleted_count": deleted_count,
|
||||
"expired_count": expired_count,
|
||||
"error_count": error_count,
|
||||
"total_size_freed": total_size_freed,
|
||||
"duration_seconds": duration,
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
self.logger.info(
|
||||
"Temp file purge complete",
|
||||
bucket=bucket,
|
||||
deleted=deleted_count,
|
||||
size_freed=total_size_freed,
|
||||
duration=duration,
|
||||
)
|
||||
|
||||
self.metrics.timing("deltaglider.purge.duration", duration)
|
||||
self.metrics.gauge("deltaglider.purge.deleted_count", deleted_count)
|
||||
self.metrics.gauge("deltaglider.purge.size_freed", total_size_freed)
|
||||
|
||||
return result
|
||||
|
||||
@@ -308,6 +308,148 @@ class TestBucketManagement:
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.list_buckets()
|
||||
|
||||
def test_put_bucket_acl_with_canned_acl(self):
|
||||
"""Test setting a canned ACL on a bucket."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(Bucket="test-bucket", ACL="public-read")
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", ACL="public-read"
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_with_grants(self):
|
||||
"""Test setting ACL with grant parameters."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(
|
||||
Bucket="test-bucket",
|
||||
GrantRead="id=12345",
|
||||
GrantWrite="id=67890",
|
||||
)
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", GrantRead="id=12345", GrantWrite="id=67890"
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_with_access_control_policy(self):
|
||||
"""Test setting ACL with a full AccessControlPolicy dict."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
policy = {
|
||||
"Grants": [
|
||||
{
|
||||
"Grantee": {"Type": "CanonicalUser", "ID": "abc123"},
|
||||
"Permission": "FULL_CONTROL",
|
||||
}
|
||||
],
|
||||
"Owner": {"ID": "abc123"},
|
||||
}
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(Bucket="test-bucket", AccessControlPolicy=policy)
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", AccessControlPolicy=policy
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_failure(self):
|
||||
"""Test that put_bucket_acl raises RuntimeError on boto3 failure."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.side_effect = Exception("AccessDenied")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to set bucket ACL"):
|
||||
client.put_bucket_acl(Bucket="test-bucket", ACL="public-read")
|
||||
|
||||
def test_put_bucket_acl_no_boto3_client(self):
|
||||
"""Test that put_bucket_acl raises NotImplementedError without boto3 client."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
delattr(mock_storage, "client")
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.put_bucket_acl(Bucket="test-bucket", ACL="private")
|
||||
|
||||
def test_get_bucket_acl_success(self):
|
||||
"""Test getting bucket ACL successfully."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
acl_response = {
|
||||
"Owner": {"DisplayName": "test-user", "ID": "abc123"},
|
||||
"Grants": [
|
||||
{
|
||||
"Grantee": {
|
||||
"Type": "CanonicalUser",
|
||||
"DisplayName": "test-user",
|
||||
"ID": "abc123",
|
||||
},
|
||||
"Permission": "FULL_CONTROL",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.get_bucket_acl.return_value = acl_response
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.get_bucket_acl(Bucket="test-bucket")
|
||||
|
||||
assert response["Owner"]["DisplayName"] == "test-user"
|
||||
assert len(response["Grants"]) == 1
|
||||
assert response["Grants"][0]["Permission"] == "FULL_CONTROL"
|
||||
mock_boto3_client.get_bucket_acl.assert_called_once_with(Bucket="test-bucket")
|
||||
|
||||
def test_get_bucket_acl_failure(self):
|
||||
"""Test that get_bucket_acl raises RuntimeError on boto3 failure."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.get_bucket_acl.side_effect = Exception("NoSuchBucket")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to get bucket ACL"):
|
||||
client.get_bucket_acl(Bucket="nonexistent-bucket")
|
||||
|
||||
def test_complete_bucket_lifecycle(self):
|
||||
"""Test complete bucket lifecycle: create, use, delete."""
|
||||
service = create_service()
|
||||
|
||||
@@ -6,6 +6,7 @@ from unittest.mock import Mock, patch
|
||||
import pytest
|
||||
|
||||
from deltaglider import create_client
|
||||
from deltaglider.core.models import DeleteResult, RecursiveDeleteResult
|
||||
|
||||
|
||||
class MockStorage:
|
||||
@@ -177,14 +178,16 @@ class TestDeleteObjectsRecursiveStatisticsAggregation:
|
||||
def test_aggregates_deleted_count_from_service_and_single_deletes(self, client):
|
||||
"""Test that deleted counts are aggregated correctly."""
|
||||
# Setup: Mock service.delete_recursive to return specific counts
|
||||
mock_result = {
|
||||
"deleted_count": 5,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 2,
|
||||
"references_deleted": 1,
|
||||
"direct_deleted": 2,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=5,
|
||||
failed_count=0,
|
||||
deltas_deleted=2,
|
||||
references_deleted=1,
|
||||
direct_deleted=2,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
@@ -204,14 +207,16 @@ class TestDeleteObjectsRecursiveStatisticsAggregation:
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service.delete_recursive to return additional counts
|
||||
mock_result = {
|
||||
"deleted_count": 3,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 1,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 2,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=3,
|
||||
failed_count=0,
|
||||
deltas_deleted=1,
|
||||
references_deleted=0,
|
||||
direct_deleted=2,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
@@ -245,15 +250,17 @@ class TestDeleteObjectsRecursiveErrorHandling:
|
||||
def test_service_errors_propagated_in_response(self, client):
|
||||
"""Test that errors from service.delete_recursive are propagated."""
|
||||
# Mock service to return errors
|
||||
mock_result = {
|
||||
"deleted_count": 2,
|
||||
"failed_count": 1,
|
||||
"deltas_deleted": 2,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
"errors": ["Error deleting object1", "Error deleting object2"],
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=2,
|
||||
failed_count=1,
|
||||
deltas_deleted=2,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
errors=["Error deleting object1", "Error deleting object2"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
@@ -271,15 +278,17 @@ class TestDeleteObjectsRecursiveErrorHandling:
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service to also return errors
|
||||
mock_result = {
|
||||
"deleted_count": 1,
|
||||
"failed_count": 1,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
"errors": ["Service delete error"],
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=1,
|
||||
failed_count=1,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
errors=["Service delete error"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
@@ -302,15 +311,17 @@ class TestDeleteObjectsRecursiveWarningsHandling:
|
||||
def test_service_warnings_propagated_in_response(self, client):
|
||||
"""Test that warnings from service.delete_recursive are propagated."""
|
||||
# Mock service to return warnings
|
||||
mock_result = {
|
||||
"deleted_count": 3,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 2,
|
||||
"references_deleted": 1,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
"warnings": ["Reference deleted, 2 dependent deltas invalidated"],
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=3,
|
||||
failed_count=0,
|
||||
deltas_deleted=2,
|
||||
references_deleted=1,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
warnings=["Reference deleted, 2 dependent deltas invalidated"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
@@ -326,25 +337,29 @@ class TestDeleteObjectsRecursiveWarningsHandling:
|
||||
client.service.storage.objects["test-bucket/ref.bin"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = {
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="ref.bin",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return warnings
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"ref.bin",
|
||||
{
|
||||
"deleted": True,
|
||||
"type": "reference",
|
||||
"warnings": ["Warning from single delete"],
|
||||
},
|
||||
DeleteResult(
|
||||
key="ref.bin",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="reference",
|
||||
warnings=["Warning from single delete"],
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
@@ -364,26 +379,29 @@ class TestDeleteObjectsRecursiveSingleDeleteDetails:
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = {
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"file.txt",
|
||||
{
|
||||
"deleted": True,
|
||||
"type": "direct",
|
||||
"dependent_deltas": 0,
|
||||
"warnings": [],
|
||||
},
|
||||
DeleteResult(
|
||||
key="file.txt",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="direct",
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
@@ -412,25 +430,28 @@ class TestDeleteObjectsRecursiveSingleDeleteDetails:
|
||||
actual_key = "file.zip.delta" if key == "file.zip" else key
|
||||
return (
|
||||
actual_key,
|
||||
{
|
||||
"deleted": True,
|
||||
"type": "delta",
|
||||
"dependent_deltas": 0,
|
||||
"warnings": [],
|
||||
},
|
||||
DeleteResult(
|
||||
key=actual_key,
|
||||
bucket=bucket,
|
||||
deleted=True,
|
||||
type="delta",
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
client_delete_helpers.delete_with_delta_suffix = mock_delete
|
||||
|
||||
# Mock service
|
||||
mock_result = {
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.zip",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
try:
|
||||
@@ -479,26 +500,29 @@ class TestDeleteObjectsRecursiveEdgeCases:
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = {
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": 0,
|
||||
"references_deleted": 0,
|
||||
"direct_deleted": 0,
|
||||
"other_deleted": 0,
|
||||
}
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return unknown type
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"file.txt",
|
||||
{
|
||||
"deleted": True,
|
||||
"type": "unknown_type", # Not in single_counts keys
|
||||
"dependent_deltas": 0,
|
||||
"warnings": [],
|
||||
},
|
||||
DeleteResult(
|
||||
key="file.txt",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="unknown_type", # Not in single_counts keys
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
|
||||
@@ -243,12 +243,12 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify reference.bin cleanup was triggered
|
||||
assert "cleaned_reference" in result
|
||||
assert result["cleaned_reference"] == "releases/reference.bin"
|
||||
assert result.cleaned_reference is not None
|
||||
assert result.cleaned_reference == "releases/reference.bin"
|
||||
|
||||
# Verify both files were deleted
|
||||
assert mock_storage.delete.call_count == 2
|
||||
@@ -295,11 +295,11 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app-v1.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify reference.bin was NOT cleaned up
|
||||
assert "cleaned_reference" not in result
|
||||
assert result.cleaned_reference is None
|
||||
|
||||
# Verify only the delta was deleted, not reference.bin
|
||||
assert mock_storage.delete.call_count == 1
|
||||
@@ -342,11 +342,11 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify no reference cleanup (since it didn't exist)
|
||||
assert "cleaned_reference" not in result
|
||||
assert result.cleaned_reference is None
|
||||
|
||||
# Only delta should be deleted
|
||||
assert mock_storage.delete.call_count == 1
|
||||
@@ -395,7 +395,7 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/1.0/app.zip.delta"))
|
||||
|
||||
# Should clean up only 1.0/reference.bin
|
||||
assert result["cleaned_reference"] == "releases/1.0/reference.bin"
|
||||
assert result.cleaned_reference == "releases/1.0/reference.bin"
|
||||
|
||||
# Verify correct files deleted
|
||||
delete_calls = [call[0][0] for call in mock_storage.delete.call_args_list]
|
||||
@@ -436,9 +436,9 @@ class TestRecursiveDeleteCleanup:
|
||||
result = service.delete_recursive("test-bucket", "data/")
|
||||
|
||||
# Should delete both delta and reference
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result.deleted_count == 2
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -5,6 +5,7 @@ from unittest.mock import Mock, patch
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.main import create_service
|
||||
from deltaglider.core.models import RecursiveDeleteResult
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
|
||||
@@ -28,10 +29,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete_recursive("test-bucket", "nonexistent/")
|
||||
|
||||
assert result["deleted_count"] == 0
|
||||
assert result["failed_count"] == 0
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert result.deleted_count == 0
|
||||
assert result.failed_count == 0
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
def test_delete_recursive_returns_structured_result(self):
|
||||
"""Test that delete_recursive returns a properly structured result."""
|
||||
@@ -57,26 +58,22 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Verify structure
|
||||
required_keys = [
|
||||
"bucket",
|
||||
"prefix",
|
||||
"deleted_count",
|
||||
"failed_count",
|
||||
"deltas_deleted",
|
||||
"references_deleted",
|
||||
"direct_deleted",
|
||||
"other_deleted",
|
||||
"errors",
|
||||
"warnings",
|
||||
]
|
||||
for key in required_keys:
|
||||
assert key in result, f"Missing key: {key}"
|
||||
# Verify structure - result is a RecursiveDeleteResult dataclass
|
||||
assert hasattr(result, "bucket")
|
||||
assert hasattr(result, "prefix")
|
||||
assert hasattr(result, "deleted_count")
|
||||
assert hasattr(result, "failed_count")
|
||||
assert hasattr(result, "deltas_deleted")
|
||||
assert hasattr(result, "references_deleted")
|
||||
assert hasattr(result, "direct_deleted")
|
||||
assert hasattr(result, "other_deleted")
|
||||
assert hasattr(result, "errors")
|
||||
assert hasattr(result, "warnings")
|
||||
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert isinstance(result.deleted_count, int)
|
||||
assert isinstance(result.failed_count, int)
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
def test_delete_recursive_categorizes_objects_correctly(self):
|
||||
"""Test that delete_recursive correctly categorizes different object types."""
|
||||
@@ -117,12 +114,12 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should categorize correctly - the exact categorization depends on implementation
|
||||
assert result["deltas_deleted"] == 1 # app.zip.delta
|
||||
assert result["references_deleted"] == 1 # reference.bin
|
||||
assert result.deltas_deleted == 1 # app.zip.delta
|
||||
assert result.references_deleted == 1 # reference.bin
|
||||
# Direct and other files may be categorized differently based on metadata detection
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2 # readme.txt + config.json
|
||||
assert result["deleted_count"] == 4 # total
|
||||
assert result["failed_count"] == 0
|
||||
assert result.direct_deleted + result.other_deleted == 2 # readme.txt + config.json
|
||||
assert result.deleted_count == 4 # total
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_delete_recursive_handles_storage_errors_gracefully(self):
|
||||
"""Test that delete_recursive handles individual storage errors gracefully."""
|
||||
@@ -151,10 +148,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should handle partial failure
|
||||
assert result["deleted_count"] == 1 # good.zip.delta succeeded
|
||||
assert result["failed_count"] == 1 # bad.zip.delta failed
|
||||
assert len(result["errors"]) == 1
|
||||
assert "bad" in result["errors"][0]
|
||||
assert result.deleted_count == 1 # good.zip.delta succeeded
|
||||
assert result.failed_count == 1 # bad.zip.delta failed
|
||||
assert len(result.errors) == 1
|
||||
assert "bad" in result.errors[0]
|
||||
|
||||
def test_affected_deltaspaces_discovery(self):
|
||||
"""Test that the system discovers affected deltaspaces when deleting deltas."""
|
||||
@@ -206,8 +203,8 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "project/team-a/v1/")
|
||||
|
||||
# Should have discovered and evaluated the parent reference
|
||||
assert result["deleted_count"] >= 1 # At least the delta file
|
||||
assert result["failed_count"] == 0
|
||||
assert result.deleted_count >= 1 # At least the delta file
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_cli_uses_core_service_method(self):
|
||||
"""Test that CLI rm -r command uses the core service delete_recursive method."""
|
||||
@@ -222,14 +219,12 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
mock_create_service.return_value = mock_service
|
||||
|
||||
# Mock successful deletion
|
||||
mock_service.delete_recursive.return_value = {
|
||||
"bucket": "test-bucket",
|
||||
"prefix": "test/",
|
||||
"deleted_count": 2,
|
||||
"failed_count": 0,
|
||||
"warnings": [],
|
||||
"errors": [],
|
||||
}
|
||||
mock_service.delete_recursive.return_value = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=2,
|
||||
failed_count=0,
|
||||
)
|
||||
|
||||
result = runner.invoke(cli, ["rm", "-r", "s3://test-bucket/test/"])
|
||||
|
||||
@@ -294,8 +289,8 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="test/file.zip.delta"))
|
||||
|
||||
assert result["deleted"]
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted
|
||||
assert result.type == "delta"
|
||||
|
||||
def test_reference_cleanup_intelligence_basic(self):
|
||||
"""Basic test to verify reference cleanup intelligence is working."""
|
||||
@@ -328,10 +323,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "simple/")
|
||||
|
||||
# Should delete both delta and reference since there are no other dependencies
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result["failed_count"] == 0
|
||||
assert result.deleted_count == 2
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_comprehensive_result_validation(self):
|
||||
"""Test that all result fields are properly populated."""
|
||||
@@ -366,31 +361,31 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "mixed/")
|
||||
|
||||
# Validate all expected fields are present and have correct types
|
||||
assert isinstance(result["bucket"], str)
|
||||
assert isinstance(result["prefix"], str)
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["deltas_deleted"], int)
|
||||
assert isinstance(result["references_deleted"], int)
|
||||
assert isinstance(result["direct_deleted"], int)
|
||||
assert isinstance(result["other_deleted"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert isinstance(result.bucket, str)
|
||||
assert isinstance(result.prefix, str)
|
||||
assert isinstance(result.deleted_count, int)
|
||||
assert isinstance(result.failed_count, int)
|
||||
assert isinstance(result.deltas_deleted, int)
|
||||
assert isinstance(result.references_deleted, int)
|
||||
assert isinstance(result.direct_deleted, int)
|
||||
assert isinstance(result.other_deleted, int)
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
# Validate counts add up
|
||||
total_by_type = (
|
||||
result["deltas_deleted"]
|
||||
+ result["references_deleted"]
|
||||
+ result["direct_deleted"]
|
||||
+ result["other_deleted"]
|
||||
result.deltas_deleted
|
||||
+ result.references_deleted
|
||||
+ result.direct_deleted
|
||||
+ result.other_deleted
|
||||
)
|
||||
assert result["deleted_count"] == total_by_type
|
||||
assert result.deleted_count == total_by_type
|
||||
|
||||
# Validate specific counts for this scenario
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
# Direct and other files may be categorized differently
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2
|
||||
assert result.direct_deleted + result.other_deleted == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
70
tests/unit/test_s3_compat.py
Normal file
70
tests/unit/test_s3_compat.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""Tests for S3-compatible storage compatibility.
|
||||
|
||||
Ensures the S3 adapter works with non-AWS S3 endpoints (Hetzner, MinIO, etc.)
|
||||
that don't support newer AWS-specific features like automatic request checksums.
|
||||
"""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from deltaglider.adapters.storage_s3 import S3StorageAdapter
|
||||
|
||||
|
||||
class TestS3CompatibleEndpoints:
|
||||
"""Verify S3 adapter configuration for non-AWS endpoint compatibility."""
|
||||
|
||||
def test_client_disables_automatic_checksums(self):
|
||||
"""boto3 1.36+ sends CRC32/CRC64 checksums by default.
|
||||
|
||||
S3-compatible stores (Hetzner, MinIO) reject these with BadRequest.
|
||||
The adapter must set request_checksum_calculation='when_required'.
|
||||
"""
|
||||
with patch("deltaglider.adapters.storage_s3.boto3.client") as mock_client:
|
||||
S3StorageAdapter(endpoint_url="https://example.com")
|
||||
|
||||
mock_client.assert_called_once()
|
||||
call_kwargs = mock_client.call_args
|
||||
config = call_kwargs.kwargs.get("config") or call_kwargs[1].get("config")
|
||||
|
||||
assert config is not None, "boto3 client must be created with a Config object"
|
||||
assert config.request_checksum_calculation == "when_required"
|
||||
assert config.response_checksum_validation == "when_required"
|
||||
|
||||
def test_put_object_no_checksum_kwargs(self, temp_dir):
|
||||
"""put_object must not pass ChecksumAlgorithm or similar kwargs."""
|
||||
mock_client = MagicMock()
|
||||
mock_client.put_object.return_value = {"ETag": '"abc123"'}
|
||||
|
||||
adapter = S3StorageAdapter(client=mock_client)
|
||||
|
||||
test_file = temp_dir / "test.sha1"
|
||||
test_file.write_text("abc123")
|
||||
|
||||
adapter.put(
|
||||
"my-bucket/test/test.sha1",
|
||||
test_file,
|
||||
{"compression": "none", "tool": "deltaglider"},
|
||||
)
|
||||
|
||||
mock_client.put_object.assert_called_once()
|
||||
call_kwargs = mock_client.put_object.call_args.kwargs
|
||||
|
||||
checksum_keys = {
|
||||
"ChecksumAlgorithm",
|
||||
"ChecksumCRC32",
|
||||
"ChecksumCRC32C",
|
||||
"ChecksumCRC64NVME",
|
||||
"ChecksumSHA1",
|
||||
"ChecksumSHA256",
|
||||
"ContentMD5",
|
||||
}
|
||||
passed_checksum_keys = checksum_keys & set(call_kwargs.keys())
|
||||
assert not passed_checksum_keys, (
|
||||
f"put_object must not pass checksum kwargs for S3-compatible "
|
||||
f"endpoint support, but found: {passed_checksum_keys}"
|
||||
)
|
||||
|
||||
def test_preconfigured_client_is_used_as_is(self):
|
||||
"""When a pre-configured client is passed, it should be used directly."""
|
||||
mock_client = MagicMock()
|
||||
adapter = S3StorageAdapter(client=mock_client)
|
||||
assert adapter.client is mock_client
|
||||
Reference in New Issue
Block a user