Mirror of https://github.com/beshu-tech/deltaglider.git (synced 2026-04-30 12:14:32 +02:00)

Compare commits (88 commits):

a98fc7c178, 82e00623de, e8c76f1dc7, c492a5087b, 85af5a95c8, 60b70309fa, b0699f952a, 9bfe121f44,
6cab3de9a0, 482f45fc02, 6b3245266e, 20053acb5f, 87f425734f, 012662c377, 284f030fae, 7a4d30a007,
0d46283ff0, 805e2967bc, 2ef1741d51, 2c1d756e7b, c6cee7ae26, cee9a9fd2d, 0507e6ebcd, fa9c4fa42d,
934d83975c, c32d5265d9, 1cf7e3ad21, 9b36087438, 60877966f2, fbd44ea3c3, 3f689fc601, 3753212f96,
db7d14f8a8, e1259b7ea8, ff05e77c24, c3d385bf18, aea5cb5d9a, b2ca59490b, 4f56c4b600, 14c6af0f35,
67792b2031, a9a1396e6e, 52eb5bba21, f75db142e8, 35d34d4862, 9230cbd762, 2eba6e8d38, 656726b57b,
85dd315424, dbd2632cae, 3d04a407c0, 47f022fffe, 7a2ed16ee7, 5e333254ba, 04cc984d4a, ac7d4e067f,
e8fb926fd6, 626e28eaf6, 90a342dc33, f9f2b036e3, 778d7f0148, 37ea2f138c, 5e3b76791e, fb2877bfd3,
88fd1f51cd, 0857e02edd, 689cf00d02, 743d52e783, 8bc0a0eaf3, 4cf25e4681, 69ed9056d2, 38134f28f5,
fa1f8b85a9, a06cc2939c, 5b8477ed61, e706ddebdd, 50db9bbb27, c25568e315, ca1186a3f6, 4217535e8c,
0064d7e74b, 9c1659a1f1, 34c871b0d7, db0662c175, 2efa760785, 74207f4ee4, 4668b10c3f, 8cea5a3527
.github/workflows/ci.yml (vendored, 2 changed lines)

@@ -98,7 +98,7 @@ jobs:
    runs-on: ubuntu-latest
    services:
      localstack:
-        image: localstack/localstack:latest
+        image: localstack/localstack:4.4
        ports:
          - 4566:4566
        env:
.github/workflows/docker-publish.yml (vendored, new file, 92 lines)

@@ -0,0 +1,92 @@
name: Build and Publish Docker Images

on:
  push:
    branches:
      - main
      - develop
    tags:
      - 'v*'
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  REGISTRY: docker.io
  IMAGE_NAME: beshultd/deltaglider

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Full history for proper git describe

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Extract version from git
        id: version
        run: |
          # Get version from git tags
          VERSION=$(git describe --tags --always --abbrev=0 2>/dev/null || echo "dev")
          # Remove 'v' prefix if present
          VERSION=${VERSION#v}
          echo "version=${VERSION}" >> $GITHUB_OUTPUT
          echo "Version: ${VERSION}"

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.IMAGE_NAME }}
          tags: |
            # For main branch: tag as 'latest'
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
            # For develop branch: tag as 'develop'
            type=raw,value=develop,enable=${{ github.ref == 'refs/heads/develop' }}
            # For version tags: use semver patterns
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}
            # For PRs: tag as pr-<number>
            type=ref,event=pr
            # Include git sha for traceability (only on branch pushes, not tags)
            type=sha,prefix={{branch}}-,enable=${{ startsWith(github.ref, 'refs/heads/') }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          build-args: |
            VERSION=${{ steps.version.outputs.version }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Docker Hub Description
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
        uses: peter-evans/dockerhub-description@v4
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
          repository: ${{ env.IMAGE_NAME }}
          short-description: "Store 4TB in 5GB: S3-compatible storage with 99.9% compression"
          readme-filepath: ./README.md
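The "Extract version from git" step above normalizes the git tag into a plain version string. For clarity, here is a minimal Python sketch of the same normalization (the workflow itself runs the shell commands shown above; the `normalize_version` name is only an illustration):

```python
import subprocess

def normalize_version() -> str:
    """Illustrative equivalent of the workflow's version-extraction step."""
    try:
        # Nearest tag, or short SHA if no tag exists (git describe --tags --always --abbrev=0)
        raw = subprocess.check_output(
            ["git", "describe", "--tags", "--always", "--abbrev=0"], text=True
        ).strip()
    except subprocess.CalledProcessError:
        raw = "dev"
    # Strip a leading 'v' so 'v6.1.1' becomes '6.1.1'
    return raw[1:] if raw.startswith("v") else raw

print(normalize_version())
```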
.github/workflows/release-manual.yml (vendored, 3 changed lines)

@@ -146,7 +146,7 @@ jobs:
    runs-on: ubuntu-latest
    services:
      localstack:
-        image: localstack/localstack:latest
+        image: localstack/localstack:4.4
        ports:
          - 4566:4566
        env:
@@ -231,6 +231,7 @@ jobs:

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
+        continue-on-error: true  # Don't fail if GitHub release creation fails
        with:
          tag_name: ${{ needs.validate.outputs.tag_name }}
          name: Release v${{ github.event.inputs.version }}
.github/workflows/release.yml (vendored, 3 changed lines)

@@ -150,7 +150,7 @@ jobs:
    runs-on: ubuntu-latest
    services:
      localstack:
-        image: localstack/localstack:latest
+        image: localstack/localstack:4.4
        ports:
          - 4566:4566
        env:
@@ -235,6 +235,7 @@ jobs:

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v1
+        continue-on-error: true  # Don't fail if GitHub release creation fails
        with:
          tag_name: ${{ needs.validate-and-tag.outputs.tag_name }}
          name: Release v${{ github.event.inputs.version }}
BOTO3_COMPATIBILITY.md

@@ -2,7 +2,7 @@

DeltaGlider implements a **subset** of boto3's S3 client API, focusing on the most commonly used operations. This is **not** a 100% drop-in replacement, but covers the core functionality needed for most use cases.

-## ✅ Implemented Methods (21 core methods)
+## ✅ Implemented Methods (23 core methods)

### Object Operations
- ✅ `put_object()` - Upload objects (with automatic delta compression)
@@ -17,6 +17,8 @@
- ✅ `create_bucket()` - Create buckets
- ✅ `delete_bucket()` - Delete empty buckets
- ✅ `list_buckets()` - List all buckets
+- ✅ `put_bucket_acl()` - Set bucket ACL (passthrough to S3)
+- ✅ `get_bucket_acl()` - Get bucket ACL (passthrough to S3)

### Presigned URLs
- ✅ `generate_presigned_url()` - Generate presigned URLs
@@ -46,8 +48,6 @@
- ❌ `list_parts()`

### Access Control (ACL)
-- ❌ `get_bucket_acl()`
-- ❌ `put_bucket_acl()`
- ❌ `get_object_acl()`
- ❌ `put_object_acl()`
- ❌ `get_public_access_block()`
@@ -135,9 +135,9 @@

## Coverage Analysis

-**Implemented:** ~21 methods
+**Implemented:** ~23 methods
**Total boto3 S3 methods:** ~100+ methods
-**Coverage:** ~20%
+**Coverage:** ~23%

## What's Covered
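The newly implemented ACL methods are passthroughs that keep the boto3 calling convention, and presigned URLs follow boto3 as well. A brief hedged sketch (bucket name and key are placeholders; the response shape follows S3's standard `AccessControlPolicy` schema):

```python
from deltaglider import create_client

client = create_client()  # same construction as in the README examples

# Set a canned ACL on a bucket (passed through to the underlying S3 store)
client.put_bucket_acl(Bucket="releases", ACL="private")

# Read the ACL back; Owner/Grants mirror S3's AccessControlPolicy fields
acl = client.get_bucket_acl(Bucket="releases")
print(acl["Owner"], acl["Grants"])

# Presigned URLs work as in boto3 (delta-compressed objects are rehydrated first)
url = client.generate_presigned_url(
    "get_object",
    Params={"Bucket": "releases", "Key": "v2.0.0/my-app.zip"},
    ExpiresIn=3600,
)
print(url)
```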
CHANGELOG.md (new file, 303 lines)

@@ -0,0 +1,303 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [6.1.1] - 2026-03-23

### Fixed
- **S3-Compatible Endpoint Support**: Disabled boto3 automatic request checksums (CRC32/CRC64) that were added in boto3 1.36+. S3-compatible stores like Hetzner Object Storage reject these headers with `BadRequest`, breaking direct (non-delta) file uploads. Sets `request_checksum_calculation="when_required"` to restore compatibility while still working with AWS S3.
- **CI: LocalStack pinned to 4.4**: `localstack/localstack:latest` now requires a paid license; pinned to the last free version across all workflows and docker-compose files.

### Changed
- **Dependency Pinning**: All runtime dependencies now use major-version upper bounds (`boto3>=1.35.0,<2.0.0`, etc.) to prevent surprise breaking changes in Docker builds.

### Added
- **S3 Compatibility Tests**: New `test_s3_compat.py` unit tests verifying that the boto3 client disables automatic checksums and that `put_object` doesn't pass checksum kwargs, as regression protection for non-AWS S3 endpoints.
- **Dependency Management Guide**: Added quarterly dependency refresh checklist and known compatibility constraints to CLAUDE.md.
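For reference, the boto3-level setting described in the 6.1.1 fix looks like the following minimal sketch (a plain botocore `Config`; DeltaGlider's own adapter wiring in `adapters/storage_s3.py` may differ in detail, and the endpoint URL is a placeholder):

```python
import boto3
from botocore.config import Config

# Disable boto3 1.36+ automatic request checksums so S3-compatible stores
# (Hetzner, MinIO, ...) don't reject uploads with BadRequest.
s3 = boto3.client(
    "s3",
    endpoint_url="https://objects.example-s3-compatible.com",  # placeholder endpoint
    config=Config(request_checksum_calculation="when_required"),
)

# Direct (non-delta) uploads now succeed against non-AWS endpoints,
# while AWS S3 keeps working because checksums are still sent when required.
s3.put_object(Bucket="releases", Key="file.zip", Body=b"...")
```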
## [6.1.0] - 2025-02-07

### Added
- **Bucket ACL Management**: New `put_bucket_acl()` and `get_bucket_acl()` methods
  - boto3-compatible passthrough to native S3 ACL operations
  - Supports canned ACLs (`private`, `public-read`, `public-read-write`, `authenticated-read`)
  - Supports grant-based ACLs (`GrantRead`, `GrantWrite`, `GrantFullControl`, etc.)
  - Supports full `AccessControlPolicy` dict for fine-grained control
  - SDK method count increased from 21 to 23
- **New CLI Commands**: `deltaglider put-bucket-acl` and `deltaglider get-bucket-acl`
  - Mirrors `aws s3api put-bucket-acl` / `get-bucket-acl` syntax
  - Accepts bucket name or `s3://bucket` URL format
  - JSON output for `get-bucket-acl` (compatible with AWS CLI)
  - Supports `--endpoint-url`, `--region`, `--profile` flags
- **Docker Publishing**: Added GitHub Actions workflow for multi-arch Docker image builds (amd64/arm64)

### Changed
- **Refactor**: Extracted `DeltaGliderConfig` dataclass for centralized configuration management
- **Refactor**: Introduced typed `DeleteResult` and `RecursiveDeleteResult` dataclasses replacing raw dicts
- **Refactor**: Centralized S3 metadata key aliases into `core/models.py` constants
- **Refactor**: Extracted helper methods in `DeltaService` for improved readability

### Fixed
- Removed unused imports flagged by ruff in test files

### Documentation
- Updated BOTO3_COMPATIBILITY.md (coverage 20% → 23%)
- Updated AWS S3 CLI compatibility docs with ACL command examples
- Refreshed README with dark mode logo and streamlined content
- Cleaned up SDK documentation and examples

## [6.0.0] - 2025-10-17

### Added
- **EC2 Region Detection & Cost Optimization**
  - Automatic detection of EC2 instance region using IMDSv2
  - Warns when EC2 region ≠ S3 client region (potential cross-region charges)
  - Different warnings for auto-detected vs. explicit `--region` flag mismatches
  - Green checkmark when regions are aligned (optimal configuration)
  - Can be disabled with `DG_DISABLE_EC2_DETECTION=true` environment variable
  - Helps users optimize for cost and performance before migration starts
- **New CLI Command**: `deltaglider migrate` for S3-to-S3 bucket migration with compression
  - Supports resume capability (skips already migrated files)
  - Real-time progress tracking with file count and statistics
  - Interactive confirmation prompt (use `--yes` to skip)
  - Prefix preservation by default (use `--no-preserve-prefix` to disable)
  - Dry run mode with `--dry-run` flag
  - Include/exclude pattern filtering
  - Shows compression statistics after migration
- **EC2-aware region logging**: Detects EC2 instance and warns about cross-region charges
- **FIXED**: Now correctly preserves original filenames during migration
- **S3-to-S3 Recursive Copy**: `deltaglider cp -r s3://source/ s3://dest/` now supported
  - Automatically uses migration functionality with prefix preservation
  - Applies delta compression during transfer
  - Preserves original filenames correctly
- **Version Command**: Added `--version` flag to show deltaglider version
  - Usage: `deltaglider --version`
- **DeltaService API Enhancement**: Added `override_name` parameter to `put()` method
  - Allows specifying destination filename independently of source filesystem path
  - Enables proper S3-to-S3 transfers without filesystem renaming tricks
- **Rehydration & Purge**: Automatic rehydration of delta-compressed files for presigned URL access
  - New `deltaglider purge` CLI command to clean expired temporary files
- **Metadata Namespace**: Centralized `dg-` prefixed metadata keys for all DeltaGlider metadata
- **S3-Based Stats Caching**: Bucket statistics cached in S3 with automatic invalidation

### Fixed
- **Critical**: S3-to-S3 migration now preserves original filenames
  - Previously created files with temp names like `tmp1b9cpdsn.zip`
  - Now correctly uses original filenames from source S3 keys
  - Fixed by adding `override_name` parameter to `DeltaService.put()`
- **CLI Region Support**: `--region` flag now properly passes region to boto3 client
  - Previously only set environment variable, relied on boto3 auto-detection
  - Now explicitly passes `region_name` to `boto3.client()` via `boto3_kwargs`
  - Ensures consistent behavior with `DeltaGliderClient` SDK

### Changed
- Recursive S3-to-S3 copy operations now preserve source prefix structure by default
- Migration operations show formatted output with source and destination paths

### Documentation
- Added comprehensive migration guide in README.md
- Updated CLI reference with migrate command examples
- Added prefix preservation behavior documentation
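The region detection described in the 6.0.0 entry relies on the standard EC2 Instance Metadata Service (IMDSv2). A minimal illustration of that flow, using the documented IMDS endpoint and paths (the real implementation, its timeouts, and its fallbacks may differ):

```python
import urllib.request

IMDS = "http://169.254.169.254"

def ec2_region(timeout: float = 1.0) -> str | None:
    """Return the instance's region via IMDSv2, or None when not on EC2."""
    try:
        # Step 1: request a short-lived session token
        token_req = urllib.request.Request(
            f"{IMDS}/latest/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"},
        )
        token = urllib.request.urlopen(token_req, timeout=timeout).read().decode()
        # Step 2: read the placement region using that token
        region_req = urllib.request.Request(
            f"{IMDS}/latest/meta-data/placement/region",
            headers={"X-aws-ec2-metadata-token": token},
        )
        return urllib.request.urlopen(region_req, timeout=timeout).read().decode()
    except OSError:
        return None  # not running on EC2 (or IMDS disabled)

region = ec2_region()
if region is not None:
    print(f"EC2 region: {region}; compare it with the S3 client region to avoid cross-region charges")
```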
## [5.1.1] - 2025-01-10

### Fixed
- **Stats Command**: Fixed incorrect compression ratio calculations
  - Now correctly counts ALL files including reference.bin in compressed size
  - Fixed handling of orphaned reference.bin files (reference files with no delta files)
  - Added prominent warnings for orphaned reference files with cleanup commands
  - Fixed stats for buckets with no compression (now shows 0% instead of negative)
  - SHA1 checksum files are now properly included in calculations

### Improved
- **Stats Performance**: Optimized metadata fetching with parallel requests
  - 5-10x faster for buckets with many delta files
  - Uses ThreadPoolExecutor for concurrent HEAD requests
  - Single-pass calculation algorithm for better efficiency

## [5.1.0] - 2025-10-10

### Added
- **New CLI Command**: `deltaglider stats <bucket>` for bucket statistics and compression metrics
  - Supports `--detailed` flag for comprehensive analysis
  - Supports `--json` flag for machine-readable output
  - Accepts multiple formats: `s3://bucket/`, `s3://bucket`, `bucket`
- **Session-Level Statistics Caching**: Bucket stats now cached per client instance
  - Automatic cache invalidation on mutations (put, delete, bucket operations)
  - Intelligent cache reuse (detailed stats serve quick stat requests)
  - Enhanced `list_buckets()` includes cached stats when available
- **Programmatic Cache Management**: Added cache management APIs for long-running applications
  - `clear_cache()`: Clear all cached references
  - `evict_cache()`: Remove specific cached reference
  - Session-scoped cache lifecycle management

### Changed
- Bucket statistics are now cached within client session for performance
- `list_buckets()` response includes `DeltaGliderStats` metadata when cached

### Documentation
- Added comprehensive DG_MAX_RATIO tuning guide in docs/
- Updated CLI command reference in CLAUDE.md and README.md
- Added detailed cache management documentation
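A short sketch of how the session-level stats cache and the cache-management calls listed above are meant to be used (bucket names are placeholders; `get_bucket_stats` appears in the README examples, and the argument form of `evict_cache` is illustrative only):

```python
from deltaglider import create_client

client = create_client()

# First call computes stats; later calls in the same session reuse the cache
stats = client.get_bucket_stats("releases")
stats_again = client.get_bucket_stats("releases")  # served from the session cache

# Detailed stats also satisfy later quick-stat requests
detailed = client.get_bucket_stats("releases", detailed_stats=True)

# Long-running services can manage the cache explicitly
client.evict_cache("releases")   # drop one cached entry (argument form illustrative)
client.clear_cache()             # drop everything
```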
## [5.0.3] - 2025-10-10

### Security
- **BREAKING**: Removed all legacy shared cache code for security
- **BREAKING**: Encryption is now ALWAYS ON (cannot be disabled)
- Ephemeral process-isolated cache is now the ONLY mode (no opt-out)
- **Content-Addressed Storage (CAS)**: Implemented SHA256-based cache storage
  - Zero collision risk (SHA256 namespace guarantees uniqueness)
  - Automatic deduplication (same content = same filename)
  - Tampering protection (changing content changes SHA, breaks lookup)
  - Two-level directory structure for filesystem optimization
- **Encrypted Cache**: All cache data encrypted at rest using Fernet (AES-128-CBC + HMAC)
  - Ephemeral encryption keys per process (forward secrecy)
  - Optional persistent keys via `DG_CACHE_ENCRYPTION_KEY` for shared filesystems
  - Automatic cleanup of corrupted cache files on decryption failures
- Fixed TOCTOU vulnerabilities with atomic SHA validation at use-time
- Added `get_validated_ref()` method to prevent cache poisoning
- Eliminated multi-user data exposure through mandatory cache isolation

### Removed
- **BREAKING**: Removed `DG_UNSAFE_SHARED_CACHE` environment variable
- **BREAKING**: Removed `DG_CACHE_DIR` environment variable
- **BREAKING**: Removed `DG_CACHE_ENCRYPTION` environment variable (encryption always on)
- **BREAKING**: Removed `cache_dir` parameter from `create_client()`

### Changed
- Cache is now auto-created in `/tmp/deltaglider-*` and cleaned on exit
- All cache operations use file locking (Unix) and SHA validation
- Added `CacheMissError` and `CacheCorruptionError` exceptions

### Added
- New `ContentAddressedCache` adapter in `adapters/cache_cas.py`
- New `EncryptedCache` wrapper in `adapters/cache_encrypted.py`
- New `MemoryCache` adapter in `adapters/cache_memory.py` with LRU eviction
- Self-describing cache structure with SHA256-based filenames
- Configurable cache backends via `DG_CACHE_BACKEND` (filesystem or memory)
- Memory cache size limit via `DG_CACHE_MEMORY_SIZE_MB` (default: 100MB)

### Internal
- Updated all tests to use Content-Addressed Storage and encryption
- All 119 tests passing with zero errors (99 original + 20 new cache tests)
- Type checking: 0 errors (mypy)
- Linting: All checks passed (ruff)
- Completed Phase 1, 2, and 7 of SECURITY_FIX_ROADMAP.md
- Added comprehensive test suites for encryption (13 tests) and memory cache (10 tests)
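To make the CAS-plus-encryption design in the 5.0.3 entry concrete, here is a small illustrative sketch, not the actual `ContentAddressedCache`/`EncryptedCache` code, of how a SHA256-addressed, Fernet-encrypted cache entry can be written and read (the cache directory is a placeholder):

```python
import hashlib
from pathlib import Path
from cryptography.fernet import Fernet

cache_root = Path("/tmp/deltaglider-demo")   # illustrative; real cache dirs are per-process
key = Fernet.generate_key()                  # ephemeral key: lost when the process exits
fernet = Fernet(key)

def cache_put(data: bytes) -> Path:
    """Store data under a two-level SHA256 path (ab/cd/abcdef...), encrypted at rest."""
    sha = hashlib.sha256(data).hexdigest()
    path = cache_root / sha[:2] / sha[2:4] / sha
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(fernet.encrypt(data))   # same content => same path => free deduplication
    return path

def cache_get(path: Path) -> bytes:
    """Decrypt and re-verify the SHA so a tampered file can never be served."""
    data = fernet.decrypt(path.read_bytes())
    assert hashlib.sha256(data).hexdigest() == path.name, "cache corruption detected"
    return data

p = cache_put(b"reference bytes")
assert cache_get(p) == b"reference bytes"
```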
## [5.0.1] - 2025-01-10

### Changed
- **Code Organization**: Refactored client.py from 1560 to 1154 lines (26% reduction)
  - Extracted client operations into modular `client_operations/` package:
    - `bucket.py` - S3 bucket management operations
    - `presigned.py` - Presigned URL generation
    - `batch.py` - Batch upload/download operations
    - `stats.py` - Analytics and statistics operations
  - Improved code maintainability with logical separation of concerns
  - Better developer experience with cleaner module structure

### Internal
- Full type safety maintained with mypy (0 errors)
- All 99 tests passing
- Code quality checks passing (ruff)
- No breaking changes - all public APIs remain unchanged

## [5.0.0] - 2025-01-10

### Added
- boto3-compatible TypedDict types for S3 responses (no boto3 import needed)
- Complete boto3 compatibility vision document
- Type-safe response builders using TypedDict patterns

### Changed
- **BREAKING**: `list_objects()` now returns boto3-compatible dict instead of custom dataclass
  - Use `response['Contents']` instead of `response.contents`
  - Use `response.get('IsTruncated')` instead of `response.is_truncated`
  - Use `response.get('NextContinuationToken')` instead of `response.next_continuation_token`
  - DeltaGlider metadata now in `Metadata` field of each object
- Internal response building now uses TypedDict for compile-time type safety
- All S3 responses are dicts at runtime (TypedDict is a dict!)

### Fixed
- Updated all documentation examples to use dict-based responses
- Fixed pagination examples in README and API docs
- Corrected SDK documentation with accurate method signatures
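The 5.0.0 breaking change amounts to switching from attribute access to standard dict access. A minimal before/after sketch (bucket and prefix names are placeholders):

```python
from deltaglider import create_client

client = create_client()

# Before 5.0.0 (custom dataclass, attribute access):
#   response = client.list_objects(Bucket="releases", Prefix="v2/")
#   for obj in response.contents: ...
#   if response.is_truncated:
#       token = response.next_continuation_token

# From 5.0.0 on (boto3-compatible dict):
response = client.list_objects(Bucket="releases", Prefix="v2/")
for obj in response["Contents"]:
    print(obj["Key"], obj["Size"], obj.get("Metadata", {}))
if response.get("IsTruncated"):
    token = response.get("NextContinuationToken")
```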
## [4.2.4] - 2025-01-10

### Fixed
- Show only filename in `ls` output instead of full path for cleaner display
- Correct `ls` command path handling and prefix display logic

## [4.2.3] - 2025-01-07

### Added
- Comprehensive test coverage for `delete_objects_recursive()` method with 19 thorough tests
  - Tests cover delta suffix handling, error/warning aggregation, statistics tracking, and edge cases
- Better code organization with separate `client_models.py` and `client_delete_helpers.py` modules

### Fixed
- Fixed all mypy type errors using proper `cast()` for type safety
- Improved type hints for dictionary operations in client code

### Changed
- Refactored client code into logical modules for better maintainability
- Enhanced code quality with comprehensive linting and type checking
- All 99 integration/unit tests passing with zero type errors

### Internal
- Better separation of concerns in client module
- Improved developer experience with clearer code structure

## [4.2.2] - 2024-10-06

### Fixed
- Add .delta suffix fallback for `delete_object()` method
- Handle regular S3 objects without DeltaGlider metadata
- Update mypy type ignore comment for compatibility

## [4.2.1] - 2024-10-06

### Fixed
- Make GitHub release creation non-blocking in workflows

## [4.2.0] - 2024-10-03

### Added
- AWS credential parameters to `create_client()` function
- Support for custom endpoint URLs
- Enhanced boto3 compatibility

## [4.1.0] - 2024-09-29

### Added
- boto3-compatible client API
- Bucket management methods
- Comprehensive SDK documentation

## [4.0.0] - 2024-09-21

### Added
- Initial public release
- CLI with AWS S3 compatibility
- Delta compression for versioned artifacts
- 99%+ compression for similar files

[6.1.0]: https://github.com/beshu-tech/deltaglider/compare/v6.0.2...v6.1.0
[6.0.0]: https://github.com/beshu-tech/deltaglider/compare/v5.1.1...v6.0.0
[5.1.0]: https://github.com/beshu-tech/deltaglider/compare/v5.0.3...v5.1.0
[5.0.3]: https://github.com/beshu-tech/deltaglider/compare/v5.0.1...v5.0.3
[5.0.1]: https://github.com/beshu-tech/deltaglider/compare/v5.0.0...v5.0.1
[5.0.0]: https://github.com/beshu-tech/deltaglider/compare/v4.2.4...v5.0.0
[4.2.4]: https://github.com/beshu-tech/deltaglider/compare/v4.2.3...v4.2.4
[4.2.3]: https://github.com/beshu-tech/deltaglider/compare/v4.2.2...v4.2.3
[4.2.2]: https://github.com/beshu-tech/deltaglider/compare/v4.2.1...v4.2.2
[4.2.1]: https://github.com/beshu-tech/deltaglider/compare/v4.2.0...v4.2.1
[4.2.0]: https://github.com/beshu-tech/deltaglider/compare/v4.1.0...v4.2.0
[4.1.0]: https://github.com/beshu-tech/deltaglider/compare/v4.0.0...v4.1.0
[4.0.0]: https://github.com/beshu-tech/deltaglider/releases/tag/v4.0.0
CLAUDE.md (88 changed lines)

@@ -74,6 +74,19 @@ export AWS_SECRET_ACCESS_KEY=minioadmin
```bash
# Now you can use deltaglider commands
deltaglider cp test.zip s3://test-bucket/
+deltaglider stats test-bucket  # Get bucket statistics
```

+### Available CLI Commands
```bash
+cp              # Copy files to/from S3 (AWS S3 compatible)
+ls              # List S3 buckets or objects (AWS S3 compatible)
+rm              # Remove S3 objects (AWS S3 compatible)
+sync            # Synchronize directories with S3 (AWS S3 compatible)
+stats           # Get bucket statistics and compression metrics
+verify          # Verify integrity of delta file
+put-bucket-acl  # Set bucket ACL (s3api compatible passthrough)
+get-bucket-acl  # Get bucket ACL (s3api compatible passthrough)
```

## Architecture

@@ -97,13 +110,15 @@ src/deltaglider/
│   ├── logger.py          # LoggerPort protocol for logging
│   └── metrics.py         # MetricsPort protocol for observability
├── adapters/              # Concrete implementations
-│   ├── storage_s3.py      # S3StorageAdapter using boto3
-│   ├── diff_xdelta.py     # XdeltaAdapter using xdelta3 binary
-│   ├── hash_sha256.py     # Sha256Adapter for checksums
-│   ├── cache_fs.py        # FsCacheAdapter for file system cache
-│   ├── clock_utc.py       # UtcClockAdapter for UTC timestamps
-│   ├── logger_std.py      # StdLoggerAdapter for console output
-│   └── metrics_noop.py    # NoopMetricsAdapter (placeholder)
+│   ├── storage_s3.py      # S3StorageAdapter using boto3
+│   ├── diff_xdelta.py     # XdeltaAdapter using xdelta3 binary
+│   ├── hash_sha256.py     # Sha256Adapter for checksums
+│   ├── cache_cas.py       # ContentAddressedCache (SHA256-based storage)
+│   ├── cache_encrypted.py # EncryptedCache (Fernet encryption wrapper)
+│   ├── cache_memory.py    # MemoryCache (LRU in-memory cache)
+│   ├── clock_utc.py       # UtcClockAdapter for UTC timestamps
+│   ├── logger_std.py      # StdLoggerAdapter for console output
+│   └── metrics_noop.py    # NoopMetricsAdapter (placeholder)
└── app/
    └── cli/               # Click-based CLI application
        ├── main.py        # Main CLI entry point with AWS S3 commands
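The ports/adapters split in the tree above can be illustrated with a minimal Protocol-based sketch (hypothetical names; the real port definitions live under `src/deltaglider/ports/` and will differ):

```python
from typing import Protocol

class CachePort(Protocol):
    """Hypothetical cache port: what the core service needs, nothing more."""
    def get(self, sha256: str) -> bytes | None: ...
    def put(self, sha256: str, data: bytes) -> None: ...

class InMemoryCacheAdapter:
    """Hypothetical adapter satisfying CachePort, analogous in spirit to cache_memory.py."""
    def __init__(self) -> None:
        self._store: dict[str, bytes] = {}

    def get(self, sha256: str) -> bytes | None:
        return self._store.get(sha256)

    def put(self, sha256: str, data: bytes) -> None:
        self._store[sha256] = data

def core_logic(cache: CachePort, sha256: str) -> bytes | None:
    # The core depends only on the port, so adapters can be swapped freely.
    return cache.get(sha256)

print(core_logic(InMemoryCacheAdapter(), "deadbeef"))
```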
@@ -140,7 +155,13 @@ src/deltaglider/

2. **Reference Management** (`core/service.py`):
   - Reference stored at `{deltaspace.prefix}/reference.bin`
   - SHA256 verification on every read/write
-   - Local cache in `/tmp/.deltaglider/reference_cache` for performance
+   - **Content-Addressed Storage (CAS)** cache in `/tmp/deltaglider-*` (ephemeral)
+   - Cache uses SHA256 as filename with two-level directory structure (ab/cd/abcdef...)
+   - Automatic deduplication: same content = same SHA = same cache file
+   - Zero collision risk: SHA256 namespace guarantees uniqueness
+   - **Encryption**: Fernet (AES-128-CBC + HMAC) encryption at rest (enabled by default)
+   - Ephemeral encryption keys per process for forward secrecy
+   - **Cache Backends**: Configurable filesystem or in-memory cache with LRU eviction

3. **Sync Algorithm** (`app/cli/sync.py`):
   - Compares local vs S3 using size and modification time (see the sketch below)
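A minimal sketch of that size-plus-mtime comparison (illustrative only; the real `sync.py` logic, its tolerances, and its delta-suffix handling are more involved):

```python
import os
from datetime import datetime, timezone

def needs_upload(local_path: str, s3_size: int | None, s3_modified: datetime | None) -> bool:
    """Upload when the object is missing, sizes differ, or the local file is newer."""
    if s3_size is None or s3_modified is None:
        return True  # not in S3 yet
    stat = os.stat(local_path)
    if stat.st_size != s3_size:
        return True
    local_mtime = datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc)
    return local_mtime > s3_modified
```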
@@ -181,13 +202,26 @@ Core delta logic is in `src/deltaglider/core/service.py`:

## Environment Variables

- `DG_LOG_LEVEL`: Logging level (default: "INFO")
-- `DG_CACHE_DIR`: Local reference cache directory (default: "/tmp/.deltaglider/reference_cache")
-- `DG_MAX_RATIO`: Maximum acceptable delta/file ratio (default: "0.5")
+- `DG_MAX_RATIO`: Maximum acceptable delta/file ratio (default: "0.5", range: "0.0-1.0")
+  - **See [docs/DG_MAX_RATIO.md](docs/DG_MAX_RATIO.md) for complete tuning guide**
+  - Controls when to use delta vs. direct storage
+  - Lower (0.2-0.3) = conservative, only high-quality compression
+  - Higher (0.6-0.7) = permissive, accept modest savings
+- `DG_CACHE_BACKEND`: Cache backend type - "filesystem" (default) or "memory"
+- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size limit in MB (default: "100")
+- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded Fernet key for persistent encryption (ephemeral by default)
- `AWS_ENDPOINT_URL`: Override S3 endpoint for MinIO/LocalStack
- `AWS_ACCESS_KEY_ID`: AWS credentials
- `AWS_SECRET_ACCESS_KEY`: AWS credentials
- `AWS_DEFAULT_REGION`: AWS region

+**Security Notes**:
+- **Encryption Always On**: Cache data is ALWAYS encrypted (cannot be disabled)
+- **Ephemeral Keys**: Encryption keys auto-generated per process for maximum security
+- **Auto-Cleanup**: Corrupted cache files automatically deleted on decryption failures
+- **Process Isolation**: Each process gets an isolated cache in `/tmp/deltaglider-*`, cleaned up on exit
+- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only if you need cross-process cache sharing (e.g., shared filesystems)

## Important Implementation Details

1. **xdelta3 Binary Dependency**: The system requires the xdelta3 binary installed on the system. The `XdeltaAdapter` uses subprocess to call it.

@@ -202,7 +236,11 @@ Core delta logic is in `src/deltaglider/core/service.py`:

## Performance Considerations

- Local reference caching dramatically improves performance for repeated operations
+- **Content-Addressed Storage**: SHA256-based deduplication eliminates redundant storage
+- **Cache Backends**:
+  - Filesystem cache (default): persistent across processes, good for shared workflows
+  - Memory cache: faster, zero I/O, perfect for ephemeral CI/CD pipelines
+- **Encryption Overhead**: ~10-15% performance impact, provides security at rest
- Delta compression is CPU-intensive; consider parallelization for bulk uploads
- The default max_ratio of 0.5 prevents storing inefficient deltas
- For files <1MB, delta overhead may exceed benefits

@@ -212,4 +250,30 @@ Core delta logic is in `src/deltaglider/core/service.py`:

- Never store AWS credentials in code
- Use IAM roles when possible
- All S3 operations respect bucket policies and encryption settings
-- SHA256 checksums prevent tampering and corruption
+- SHA256 checksums prevent tampering and corruption
+- **Encryption Always On**: Cache data is ALWAYS encrypted using Fernet (AES-128-CBC + HMAC) - cannot be disabled
+- **Ephemeral Keys**: Encryption keys auto-generated per process for forward secrecy and process isolation
+- **Auto-Cleanup**: Corrupted or tampered cache files automatically deleted on decryption failures
+- **Persistent Keys**: Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing (use secrets management)
+- **Content-Addressed Storage**: SHA256-based filenames prevent collision attacks
+- **Zero-Trust Cache**: All cache operations include cryptographic validation

+## Dependency Management
+
+### Pinning Strategy
+Runtime dependencies in `pyproject.toml` use **compatible range pins** (`>=x.y.z,<NEXT_MAJOR`). This prevents surprise breaking changes from major versions while allowing patch/minor updates.
+
+**Critical dependency: `boto3`**: This is the most breakage-prone dependency. AWS periodically changes default behaviors in minor releases (e.g., boto3 1.36+ added automatic request checksums that break S3-compatible stores like Hetzner Object Storage). The S3 adapter (`adapters/storage_s3.py`) explicitly sets `request_checksum_calculation="when_required"` to maintain compatibility with non-AWS S3 endpoints.
+
+### Quarterly Dependency Refresh (do every ~3 months)
+1. **Check for updates**: `uv pip compile pyproject.toml --upgrade --dry-run`
+2. **Update in a branch**: bump version floors in `pyproject.toml` to current stable releases
+3. **Run full test suite**: `uv run pytest` (unit + integration)
+4. **Test against S3-compatible stores**: test a small file upload against Hetzner (or whichever non-AWS endpoint is in use); boto3 updates are the most likely to break this
+5. **Rebuild Docker image** and test the same upload from the container
+6. **Check changelogs** for boto3, cryptography, and click for any deprecation notices or behavior changes
+
+### Known Compatibility Constraints
+- **boto3**: Must use `request_checksum_calculation="when_required"` for Hetzner/MinIO compatibility. If upgrading past a new major behavior change, test direct uploads (non-delta path) of small files to non-AWS endpoints.
+- **cryptography**: The Fernet API has been stable, but major versions may drop old OpenSSL support. Verify cache encryption still works after upgrades.
+- **click**: CLI argument parsing. Major versions may change decorator behavior. Run integration tests (`test_aws_cli_commands_v2.py`) after upgrades.
Dockerfile (53 changed lines)

@@ -1,6 +1,7 @@
# Multi-stage build for deltaglider
ARG PYTHON_VERSION=3.12-slim
ARG UV_VERSION=0.5.13
+ARG VERSION=6.0.2

# Builder stage - install UV and dependencies
FROM ghcr.io/astral-sh/uv:$UV_VERSION AS uv
@@ -16,21 +17,29 @@ WORKDIR /build
COPY pyproject.toml ./
COPY README.md ./

-# Install dependencies with UV caching
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --compile-bytecode .
-
-# Copy source code
+# Copy source code - needed for setuptools-scm to write version file
COPY src ./src

-# Install the package (force reinstall to ensure it's properly installed)
+# Install dependencies and package with UV caching
+# Set SETUPTOOLS_SCM_PRETEND_VERSION to avoid needing .git directory
+ARG VERSION
+ENV SETUPTOOLS_SCM_PRETEND_VERSION_FOR_DELTAGLIDER=${VERSION}
RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --compile-bytecode --no-deps --force-reinstall .
+    uv pip install --compile-bytecode .

# Runtime stage - minimal image
FROM python:${PYTHON_VERSION}

-# Install xdelta3
+# Skip man pages and docs to speed up builds
+RUN mkdir -p /etc/dpkg/dpkg.cfg.d && \
+    echo 'path-exclude /usr/share/doc/*' > /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/man/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/groff/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/info/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/lintian/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc && \
+    echo 'path-exclude /usr/share/linda/*' >> /etc/dpkg/dpkg.cfg.d/01_nodoc
+
+# Install xdelta3 (now much faster without man pages)
RUN apt-get update && \
    apt-get install -y --no-install-recommends xdelta3 && \
    apt-get clean && \
@@ -57,10 +66,34 @@ USER deltaglider
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD deltaglider --help || exit 1

# Environment variables (all optional, can be overridden at runtime)
+# Logging
ENV DG_LOG_LEVEL=INFO

+# Performance & Compression
+# DG_MAX_RATIO: Maximum delta/file ratio (0.0-1.0)
+# Default 0.5 means: only use delta if delta_size ≤ 50% of original_size
+# Lower (0.2-0.3) = more conservative, only high-quality compression
+# Higher (0.6-0.7) = more permissive, accept modest savings
+# See docs/DG_MAX_RATIO.md for complete tuning guide
+ENV DG_MAX_RATIO=0.5
+
+# Cache Configuration
+ENV DG_CACHE_BACKEND=filesystem
+ENV DG_CACHE_MEMORY_SIZE_MB=100
+# ENV DG_CACHE_ENCRYPTION_KEY=<base64-key>  # Optional: Set for cross-process cache sharing
+
+# AWS Configuration (override at runtime)
+# ENV AWS_ENDPOINT_URL=https://s3.amazonaws.com
+# ENV AWS_ACCESS_KEY_ID=<your-key>
+# ENV AWS_SECRET_ACCESS_KEY=<your-secret>
+# ENV AWS_DEFAULT_REGION=us-east-1

# Labels
+ARG VERSION
LABEL org.opencontainers.image.title="DeltaGlider" \
-    org.opencontainers.image.description="Delta-aware S3 file storage wrapper" \
-    org.opencontainers.image.version="0.1.0" \
+    org.opencontainers.image.description="Delta-aware S3 file storage wrapper with encryption" \
+    org.opencontainers.image.version="${VERSION}" \
    org.opencontainers.image.authors="Beshu Limited" \
    org.opencontainers.image.source="https://github.com/beshu-tech/deltaglider"
README.md (556 changed lines)

@@ -6,17 +6,16 @@
[Python](https://www.python.org/downloads/)
[xdelta3](https://github.com/jmacd/xdelta)

<div align="center">
  <img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo" width="500"/>
</div>

**Store 4TB of similar files in 5GB. No, that's not a typo.**

-DeltaGlider is a drop-in S3 replacement that achieves 99.9% compression for versioned artifacts, backups, and release archives through intelligent binary delta compression.
+DeltaGlider is a drop-in S3 replacement that may achieve 99.9% size reduction for versioned compressed artifacts, backups, and release archives through intelligent binary delta compression (via xdelta3).

> 🌟 Star if you like this! Or leave a message in [Issues](https://github.com/beshu-tech/deltaglider/issues) - we are listening!

## The Problem We Solved

-You're storing hundreds of versions of your releases. Each 100MB build differs by <1% from the previous version. You're paying to store 100GB of what's essentially 100MB of unique data.
+You're storing hundreds of versions of your software releases. Each 100MB build differs by <1% from the previous version. You're paying to store 100GB of what's essentially 100MB of unique data.

Sound familiar?

@@ -26,25 +25,22 @@ From our [ReadOnlyREST case study](docs/case-study-readonlyrest.md):
- **Before**: 201,840 files, 3.96TB storage, $1,120/year
- **After**: Same files, 4.9GB storage, $1.32/year
- **Compression**: 99.9% (not a typo)
- **Integration time**: 5 minutes
+- **Data migration**: `deltaglider migrate s3://origin-bucket s3://dest-bucket`

-## How It Works

```
Traditional S3:
v1.0.0.zip (100MB) → S3: 100MB
v1.0.1.zip (100MB) → S3: 100MB (200MB total)
v1.0.2.zip (100MB) → S3: 100MB (300MB total)

With DeltaGlider:
v1.0.0.zip (100MB) → S3: 100MB reference + 0KB delta
v1.0.1.zip (100MB) → S3: 98KB delta (100.1MB total)
v1.0.2.zip (100MB) → S3: 97KB delta (100.3MB total)
```

+DeltaGlider is great for compressed archives of similar content, like multiple releases of the same software, DB backups, etc.
+We don't expect significant benefit for multimedia content like videos, but we never tried.

## Quick Start

### Installation
+DeltaGlider comes as an SDK and a CLI, and there is also a GUI:
+* https://github.com/beshu-tech/deltaglider_commander/
+
+<div align="center">
+  <img src="https://github.com/beshu-tech/deltaglider/raw/main/docs/deltaglider.png" alt="DeltaGlider Logo"/>
+</div>

### CLI Installation

```bash
# Via pip (Python 3.11+)
@@ -57,9 +53,126 @@ uv pip install deltaglider
docker run -v ~/.aws:/root/.aws deltaglider/deltaglider --help
```

-### AWS S3 Compatible Commands
+### Docker Usage

-DeltaGlider is a **drop-in replacement** for AWS S3 CLI with automatic delta compression:
+DeltaGlider provides a secure, production-ready Docker image with encryption always enabled:

```bash
# Basic usage with AWS credentials from environment
docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  deltaglider/deltaglider ls s3://my-bucket/

# Mount AWS credentials
docker run -v ~/.aws:/root/.aws:ro \
  deltaglider/deltaglider cp file.zip s3://releases/

# Use memory cache for ephemeral CI/CD pipelines (faster)
docker run -e DG_CACHE_BACKEND=memory \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  deltaglider/deltaglider sync ./dist/ s3://releases/v1.0.0/

# Configure memory cache size (default: 100MB)
docker run -e DG_CACHE_BACKEND=memory \
  -e DG_CACHE_MEMORY_SIZE_MB=500 \
  -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
  deltaglider/deltaglider cp large-file.zip s3://releases/

# Use MinIO or custom S3 endpoint
docker run -e AWS_ENDPOINT_URL=http://minio:9000 \
  -e AWS_ACCESS_KEY_ID=minioadmin \
  -e AWS_SECRET_ACCESS_KEY=minioadmin \
  deltaglider/deltaglider ls s3://test-bucket/

# Persistent encryption key for cross-container cache sharing
# (Only needed if sharing cache across containers via volume mount)
docker run -v /shared-cache:/tmp/.deltaglider \
  -e DG_CACHE_ENCRYPTION_KEY=$(openssl rand -base64 32) \
  deltaglider/deltaglider cp file.zip s3://releases/
```

**Environment Variables**:
- `DG_LOG_LEVEL`: Logging level (default: `INFO`, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`)
- `DG_MAX_RATIO`: Maximum delta/file ratio (default: `0.5`, range: `0.0-1.0`) - [📖 Complete Guide](docs/DG_MAX_RATIO.md)
- `DG_CACHE_BACKEND`: Cache backend (default: `filesystem`, options: `filesystem`, `memory`)
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded encryption key for cross-process cache sharing
- `DG_DISABLE_EC2_DETECTION`: Disable EC2 instance detection (default: `false`, set to `true` to disable)
- `AWS_ENDPOINT_URL`: S3 endpoint URL (default: AWS S3)
- `AWS_ACCESS_KEY_ID`: AWS access key
- `AWS_SECRET_ACCESS_KEY`: AWS secret key
- `AWS_DEFAULT_REGION`: AWS region (default: `us-east-1`)

> **💡 Tip**: `DG_MAX_RATIO` is a powerful tuning parameter. See the [DG_MAX_RATIO guide](docs/DG_MAX_RATIO.md) to learn how to optimize compression for your use case.
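To make the `DG_MAX_RATIO` trade-off concrete, here is a small illustrative sketch of the decision it controls (not DeltaGlider's actual code; the 0.5 default and the delta-vs-direct rule are as described above):

```python
def should_store_as_delta(original_size: int, delta_size: int, max_ratio: float = 0.5) -> bool:
    """Store the delta only when it is small enough relative to the original."""
    return delta_size <= max_ratio * original_size

# A 100 MB build whose delta is 98 KB easily qualifies (ratio ≈ 0.001)
print(should_store_as_delta(100_000_000, 98_000))        # True
# A file whose delta is 70% of the original is stored directly instead
print(should_store_as_delta(100_000_000, 70_000_000))    # False
```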
**Security Notes**:
- Encryption is **always enabled** (cannot be disabled)
- Each container gets ephemeral encryption keys for maximum security
- Corrupted cache files are automatically deleted
- Use `DG_CACHE_ENCRYPTION_KEY` only for persistent cache sharing (store securely)

### Basic Usage

```bash
# Upload a file (automatic delta compression)
deltaglider cp my-app-v1.0.0.zip s3://releases/

# Download a file (automatic delta reconstruction)
deltaglider cp s3://releases/my-app-v1.0.0.zip ./downloaded.zip

# List objects
deltaglider ls s3://releases/

# Sync directories
deltaglider sync ./dist/ s3://releases/v1.0.0/

# Migrate existing S3 bucket to DeltaGlider-compressed storage
deltaglider migrate s3://old-bucket/ s3://new-bucket/
```

**That's it!** DeltaGlider automatically detects similar files and applies 99%+ compression. For more commands and options, see [CLI Reference](#cli-reference).

## Core Concepts

### How It Works

```
Traditional S3:
v1.0.0.zip (100MB) → S3: 100MB
v1.0.1.zip (100MB) → S3: 100MB (200MB total)
v1.0.2.zip (100MB) → S3: 100MB (300MB total)

With DeltaGlider:
v1.0.0.zip (100MB) → S3: 100MB reference + 0KB delta
v1.0.1.zip (100MB) → S3: 98KB delta (100.1MB total)
v1.0.2.zip (100MB) → S3: 97KB delta (100.3MB total)
```

DeltaGlider stores the first file in a directory (deltaspace) as a reference and subsequent similar files as tiny deltas (differences). When you download, it reconstructs the original file perfectly using the reference + delta.

### Intelligent File Type Detection

DeltaGlider automatically detects file types and applies the optimal strategy:

| File Type | Strategy | Typical Compression | Why It Works |
|-----------|----------|---------------------|--------------|
| `.zip`, `.tar`, `.gz` | Binary delta | 99%+ for similar versions | Archive structure remains consistent between versions |
| `.dmg`, `.deb`, `.rpm` | Binary delta | 95%+ for similar versions | Package formats with predictable structure |
| `.jar`, `.war`, `.ear` | Binary delta | 90%+ for similar builds | Java archives with mostly unchanged classes |
| `.exe`, `.dll`, `.so` | Direct upload | 0% (no delta benefit) | Compiled code changes unpredictably |
| `.txt`, `.json`, `.xml` | Direct upload | 0% (use gzip instead) | Text files benefit more from standard compression |
| `.sha1`, `.sha512`, `.md5` | Direct upload | 0% (already minimal) | Hash files are unique by design |
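The table above is essentially an extension-to-strategy lookup. A tiny illustrative sketch of that idea (the extension sets come from the table; DeltaGlider's real detection logic may differ):

```python
DELTA_EXTENSIONS = {".zip", ".tar", ".gz", ".dmg", ".deb", ".rpm", ".jar", ".war", ".ear"}

def choose_strategy(filename: str) -> str:
    """Pick 'delta' for archive-like formats, 'direct' for everything else."""
    ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    return "delta" if ext in DELTA_EXTENSIONS else "direct"

print(choose_strategy("my-app-v1.0.1.zip"))   # delta
print(choose_strategy("release-notes.txt"))   # direct
```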
### Key Features

- **AWS CLI Replacement**: Same commands as `aws s3` with automatic compression
- **boto3-Compatible SDK**: Works with existing boto3 code with minimal changes
- **Zero Configuration**: No databases, no manifest files, no complex setup
- **Data Integrity**: The original file's SHA256 checksum is saved in S3 metadata and verified on every reconstruction
- **S3 Compatible**: Works with AWS S3, MinIO, Cloudflare R2, and any S3-compatible storage

## CLI Reference

### All Commands

```bash
# Copy files to/from S3 (automatic delta compression for archives)
@@ -87,88 +200,61 @@ deltaglider sync s3://releases/ ./local-backup/        # Sync from S3
deltaglider sync --delete ./src/ s3://backup/           # Mirror exactly
deltaglider sync --exclude "*.log" ./src/ s3://backup/  # Exclude patterns

# Get bucket statistics with intelligent S3-based caching
deltaglider stats my-bucket               # Quick stats (~100ms with cache)
deltaglider stats s3://my-bucket          # Also accepts s3:// format
deltaglider stats s3://my-bucket/         # With or without trailing slash
deltaglider stats my-bucket --sampled     # Balanced (one sample per deltaspace)
deltaglider stats my-bucket --detailed    # Most accurate (slower, all metadata)
deltaglider stats my-bucket --refresh     # Force cache refresh
deltaglider stats my-bucket --no-cache    # Skip caching entirely
deltaglider stats my-bucket --json        # JSON output for automation

# Integrity verification & maintenance
deltaglider verify s3://releases/file.zip  # Validate stored SHA256
deltaglider purge my-bucket                # Clean expired .deltaglider/tmp files
deltaglider purge my-bucket --dry-run      # Preview purge results
deltaglider purge my-bucket --json         # Machine-readable purge stats

# Migrate existing S3 buckets to DeltaGlider compression
deltaglider migrate s3://old-bucket/ s3://new-bucket/        # Interactive migration
deltaglider migrate s3://old-bucket/ s3://new-bucket/ --yes  # Skip confirmation
deltaglider migrate --dry-run s3://old-bucket/ s3://new/     # Preview migration
deltaglider migrate s3://bucket/v1/ s3://bucket/v2/          # Migrate prefixes

# Works with MinIO, R2, and S3-compatible storage
deltaglider cp file.zip s3://bucket/ --endpoint-url http://localhost:9000
```

## Why xdelta3 Excels at Archive Compression

Traditional diff algorithms (like `diff` or `git diff`) work line-by-line on text files. Binary diff tools like `bsdiff` or `courgette` are optimized for executables. But **xdelta3** is uniquely suited for compressed archives because:

1. **Block-level matching**: xdelta3 uses a rolling hash algorithm to find matching byte sequences at any offset, not just line boundaries. This is crucial for archives where small file changes can shift all subsequent byte positions.

2. **Large window support**: xdelta3 can use reference windows up to 2GB, allowing it to find matches even when content has moved significantly within the archive. Other delta algorithms typically use much smaller windows (64KB-1MB).

3. **Compression-aware**: When you update one file in a ZIP/TAR archive, the archive format itself remains largely identical - same compression dictionary, same structure. xdelta3 preserves these similarities while other algorithms might miss them.

4. **Format agnostic**: Unlike specialized tools (e.g., `courgette` for Chrome updates), xdelta3 works on raw bytes without understanding the file format, making it perfect for any archive type.

### Real-World Example
When you rebuild a JAR file with one class changed:
- **Text diff**: 100% different (it's binary data!)
- **bsdiff**: ~30-40% of original size (optimized for executables, not archives)
- **xdelta3**: ~0.1-1% of original size (finds the unchanged parts regardless of position)

This is why DeltaGlider achieves 99%+ compression on versioned archives - xdelta3 can identify that 99% of the archive structure and content remains identical between versions.
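As an illustration of the underlying primitive, this is roughly what an encode/decode round-trip with the `xdelta3` binary looks like (standard `xdelta3 -e` / `-d` usage with a source file; DeltaGlider's own adapter wraps the binary via subprocess, but its exact flags may differ, and the file names are placeholders):

```python
import subprocess

# Encode: produce a small delta of new.zip against reference.bin
subprocess.run(
    ["xdelta3", "-e", "-s", "reference.bin", "new.zip", "new.zip.delta"],
    check=True,
)

# Decode: rebuild the original file from the reference plus the delta
subprocess.run(
    ["xdelta3", "-d", "-s", "reference.bin", "new.zip.delta", "rebuilt.zip"],
    check=True,
)
# rebuilt.zip is byte-identical to new.zip (DeltaGlider additionally verifies SHA256)
```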
## Intelligent File Type Detection
|
||||
|
||||
DeltaGlider automatically detects file types and applies the optimal strategy:
|
||||
|
||||
| File Type | Strategy | Typical Compression | Why It Works |
|
||||
|-----------|----------|-------------------|--------------|
|
||||
| `.zip`, `.tar`, `.gz` | Binary delta | 99%+ for similar versions | Archive structure remains consistent between versions |
|
||||
| `.dmg`, `.deb`, `.rpm` | Binary delta | 95%+ for similar versions | Package formats with predictable structure |
|
||||
| `.jar`, `.war`, `.ear` | Binary delta | 90%+ for similar builds | Java archives with mostly unchanged classes |
|
||||
| `.exe`, `.dll`, `.so` | Direct upload | 0% (no delta benefit) | Compiled code changes unpredictably |
|
||||
| `.txt`, `.json`, `.xml` | Direct upload | 0% (use gzip instead) | Text files benefit more from standard compression |
|
||||
| `.sha1`, `.sha512`, `.md5` | Direct upload | 0% (already minimal) | Hash files are unique by design |
|
||||
|
||||
## Performance Benchmarks
|
||||
|
||||
Testing with real software releases:
|
||||
|
||||
```python
|
||||
# 513 Elasticsearch plugin releases (82.5MB each)
|
||||
Original size: 42.3 GB
|
||||
DeltaGlider size: 115 MB
|
||||
Compression: 99.7%
|
||||
Upload speed: 3-4 files/second
|
||||
Download speed: <100ms reconstruction
|
||||
```
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Drop-in AWS CLI Replacement
|
||||
### Command Flags
|
||||
|
||||
```bash
|
||||
# Before (aws-cli)
|
||||
aws s3 cp release-v2.0.0.zip s3://releases/
|
||||
aws s3 cp --recursive ./build/ s3://releases/v2.0.0/
|
||||
aws s3 ls s3://releases/
|
||||
aws s3 rm s3://releases/old-version.zip
|
||||
# All standard AWS flags work
|
||||
deltaglider cp file.zip s3://bucket/ \
|
||||
--endpoint-url http://localhost:9000 \
|
||||
--profile production \
|
||||
--region us-west-2
|
||||
|
||||
# After (deltaglider) - Same commands, 99% less storage!
|
||||
deltaglider cp release-v2.0.0.zip s3://releases/
|
||||
deltaglider cp -r ./build/ s3://releases/v2.0.0/
|
||||
deltaglider ls s3://releases/
|
||||
deltaglider rm s3://releases/old-version.zip
|
||||
# DeltaGlider-specific flags
|
||||
deltaglider cp file.zip s3://bucket/ \
|
||||
--no-delta # Disable compression for specific files
|
||||
--max-ratio 0.8 # Only use delta if compression > 20%
|
||||
```
|
||||
|
||||
### CI/CD Pipeline (GitHub Actions)
|
||||
### CI/CD Integration
|
||||
|
||||
#### GitHub Actions
|
||||
|
||||
```yaml
|
||||
- name: Upload Release with 99% compression
|
||||
run: |
|
||||
pip install deltaglider
|
||||
# Use AWS S3 compatible syntax
|
||||
deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
|
||||
# Or use recursive for entire directories
|
||||
# Or recursive for entire directories
|
||||
deltaglider cp -r dist/ s3://releases/${{ github.ref_name }}/
|
||||
```
|
||||
|
||||
### Backup Script
|
||||
#### Daily Backup Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
@@ -177,18 +263,15 @@ tar -czf backup-$(date +%Y%m%d).tar.gz /data
|
||||
deltaglider cp backup-*.tar.gz s3://backups/
|
||||
# Only changes are stored, not full backup
|
||||
|
||||
# List backups with human-readable sizes
|
||||
deltaglider ls -h s3://backups/
|
||||
|
||||
# Clean up old backups
|
||||
deltaglider rm -r s3://backups/2023/
|
||||
```
|
||||
|
||||
### Python SDK - boto3-Compatible API
|
||||
## Python SDK
|
||||
|
||||
**[📚 Full SDK Documentation](docs/sdk/README.md)** | **[API Reference](docs/sdk/api.md)** | **[Examples](docs/sdk/examples.md)** | **[boto3 Compatibility Guide](BOTO3_COMPATIBILITY.md)**
|
||||
|
||||
#### Quick Start - boto3 Compatible API (Recommended)
|
||||
### boto3-Compatible API (Recommended)
|
||||
|
||||
DeltaGlider provides a **boto3-compatible API** for core S3 operations (21 methods covering 80% of use cases):
|
||||
|
||||
@@ -211,35 +294,30 @@ response = client.get_object(Bucket='releases', Key='v2.0.0/my-app.zip')
|
||||
with open('downloaded.zip', 'wb') as f:
|
||||
f.write(response['Body'].read())
|
||||
|
||||
# Smart list_objects with optimized performance (NEW!)
|
||||
# Fast listing (default) - no metadata fetching, ~50ms for 1000 objects
|
||||
# Smart list_objects with optimized performance
|
||||
response = client.list_objects(Bucket='releases', Prefix='v2.0.0/')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Paginated listing for large buckets
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100)
|
||||
while response.is_truncated:
|
||||
while response.get('IsTruncated'):
|
||||
for obj in response['Contents']:
|
||||
print(obj['Key'])
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
MaxKeys=100,
|
||||
ContinuationToken=response.next_continuation_token
|
||||
ContinuationToken=response.get('NextContinuationToken')
|
||||
)
|
||||
|
||||
# Get bucket statistics with smart defaults
|
||||
stats = client.get_bucket_stats('releases') # Quick stats (50ms)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
|
||||
|
||||
# Delete and inspect objects
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
client.head_object(Bucket='releases', Key='v2.0.0/my-app.zip')
|
||||
|
||||
# Bucket management - no boto3 needed!
|
||||
client.create_bucket(Bucket='my-new-bucket')
|
||||
client.list_buckets()
|
||||
client.delete_bucket(Bucket='my-new-bucket')
|
||||
```
|
||||
|
||||
### Bucket Management

**No boto3 required!** DeltaGlider provides complete bucket management:

```python
from deltaglider import create_client

client = create_client()

# Create a bucket
client.create_bucket(Bucket='my-new-bucket')

# List all buckets
response = client.list_buckets()
for bucket in response['Buckets']:
    print(bucket['Name'])

# Delete a bucket
client.delete_bucket(Bucket='my-old-bucket')
```

**Benefits:**
- ✅ No need to import boto3 separately for bucket operations
- ✅ Consistent API with DeltaGlider object operations
- ✅ Works with AWS S3, MinIO, and S3-compatible storage
- ✅ Idempotent operations (safe to retry)

See [examples/bucket_management.py](examples/bucket_management.py) for a complete example.

### Simple API (Alternative)

For simpler use cases, DeltaGlider also provides a streamlined API:

```python
from deltaglider import create_client

client = create_client()

# Upload with automatic delta compression
summary = client.upload("dist/my-app-v2.0.0.zip", "s3://releases/v2.0.0/")
print(f"Saved {summary.savings_percent:.0f}% storage space")

# Download with automatic delta reconstruction
client.download("s3://releases/v2.0.0/my-app-v2.0.0.zip", "local-app.zip")
```

### Real-World Examples

#### Software Release Storage

```python
from deltaglider import create_client

# Works exactly like boto3, but with 99% compression!
client = create_client()

# Upload multiple versions
versions = ["v1.0.0", "v1.0.1", "v1.0.2", "v1.1.0"]
for version in versions:
    with open(f"dist/my-app-{version}.zip", 'rb') as f:
        client.put_object(
            Bucket='releases',
            Key=f'{version}/my-app-{version}.zip',
            Body=f
        )

# Result:
# v1.0.0: Stored as the full reference file
# v1.0.1: Stored as 0.2MB delta (saved 99.8%)
# v1.0.2: Stored as 0.3MB delta (saved 99.7%)
# v1.1.0: Stored as 5.2MB delta (saved 94.8%)

# Download using standard boto3 API
response = client.get_object(Bucket='releases', Key='v1.1.0/my-app-v1.1.0.zip')
with open('my-app-latest.zip', 'wb') as f:
    f.write(response['Body'].read())
```

#### Automated Database Backup

```python
from datetime import datetime
from deltaglider import create_client

# Works with any S3-compatible storage
client = create_client(endpoint_url="http://minio.internal:9000")

def backup_database():
    """Daily database backup with automatic deduplication."""
    date = datetime.now().strftime("%Y%m%d")

    # Create database dump
    dump_file = f"backup-{date}.sql.gz"

    # Upload using boto3-compatible API
    with open(dump_file, 'rb') as f:
        response = client.put_object(
            Bucket='backups',
            Key=f'postgres/{date}/{dump_file}',
            Body=f,
            Metadata={'date': date, 'source': 'production'}
        )

    # Check compression effectiveness
    if 'DeltaGliderInfo' in response:
        info = response['DeltaGliderInfo']
        if info['DeltaRatio'] > 0.1:  # If delta is >10% of original
            print(f"Warning: Low compression ({info['SavingsPercent']:.0f}%), "
                  "database might have significant changes")
        print(f"Backup stored: {info['StoredSizeMB']:.1f}MB "
              f"(compressed from {info['OriginalSizeMB']:.1f}MB)")

    # List recent backups using boto3 API
    response = client.list_objects(
        Bucket='backups',
        Prefix='postgres/',
        MaxKeys=30
    )

    # Clean up old backups (days_old is a date helper defined elsewhere)
    for obj in response.get('Contents', []):
        # Parse date from key
        obj_date = obj['Key'].split('/')[1]
        if days_old(obj_date) > 30:
            client.delete_object(Bucket='backups', Key=obj['Key'])

# Run backup
backup_database()
```

For more examples and detailed API documentation, see the [SDK Documentation](docs/sdk/README.md).

## Performance & Benchmarks

### Real-World Results

Testing with 513 Elasticsearch plugin releases (82.5MB each):

```
Original size:      42.3 GB
DeltaGlider size:   115 MB
Compression:        99.7%
Upload speed:       3-4 files/second
Download speed:     <100ms reconstruction
```

### Compatibility Flags

```bash
# All standard AWS flags work
deltaglider cp file.zip s3://bucket/ \
  --endpoint-url http://localhost:9000 \
  --profile production \
  --region us-west-2

# DeltaGlider-specific flags
deltaglider cp file.zip s3://bucket/ \
  --no-delta        # Disable compression for specific files
  --max-ratio 0.8   # Only use delta if compression > 20%
```

### The Math

For `N` versions of an `S` MB file with `D%` difference between versions:

**Traditional S3**: `N × S` MB
**DeltaGlider**: `S + (N-1) × S × D%` MB

Example: 100 versions of 100MB files with 1% difference:
- **Traditional**: 10,000 MB
- **DeltaGlider**: 199 MB
- **Savings**: 98%

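A quick sanity check of that arithmetic in Python (purely illustrative):

```python
def storage_mb(n_versions: int, size_mb: float, diff_fraction: float) -> tuple[float, float]:
    """Return (traditional, deltaglider) storage in MB using the formula above."""
    traditional = n_versions * size_mb
    deltaglider = size_mb + (n_versions - 1) * size_mb * diff_fraction
    return traditional, deltaglider

traditional, deltaglider = storage_mb(100, 100, 0.01)
print(traditional, deltaglider)                       # 10000.0 199.0
print(f"{1 - deltaglider / traditional:.0%} saved")   # 98% saved
```
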
### Comparison

| Solution | Compression | Speed | Integration | Cost |
|----------|-------------|-------|-------------|------|
| **DeltaGlider** | 99%+ | Fast | Drop-in | Open source |
| S3 Versioning | 0% | Native | Built-in | $$ per version |
| Deduplication | 30-50% | Slow | Complex | Enterprise $$$ |
| Git LFS | Good | Slow | Git-only | $ per GB |
| Restic/Borg | 80-90% | Medium | Backup-only | Open source |

## Architecture & Technical Deep Dive

### Why xdelta3 Excels at Archive Compression

Traditional diff algorithms (like `diff` or `git diff`) work line-by-line on text files. Binary diff tools like `bsdiff` or `courgette` are optimized for executables. But **xdelta3** is uniquely suited for compressed archives because:

1. **Block-level matching**: xdelta3 uses a rolling hash algorithm to find matching byte sequences at any offset, not just line boundaries. This is crucial for archives where small file changes can shift all subsequent byte positions.

2. **Large window support**: xdelta3 can use reference windows up to 2GB, allowing it to find matches even when content has moved significantly within the archive. Other delta algorithms typically use much smaller windows (64KB-1MB).

3. **Compression-aware**: When you update one file in a ZIP/TAR archive, the archive format itself remains largely identical - same compression dictionary, same structure. xdelta3 preserves these similarities while other algorithms might miss them.

4. **Format agnostic**: Unlike specialized tools (e.g., `courgette` for Chrome updates), xdelta3 works on raw bytes without understanding the file format, making it perfect for any archive type.

#### Real-World Example

When you rebuild a JAR file with one class changed:
- **Text diff**: 100% different (it's binary data!)
- **bsdiff**: ~30-40% of original size (optimized for executables, not archives)
- **xdelta3**: ~0.1-1% of original size (finds the unchanged parts regardless of position)

This is why DeltaGlider achieves 99%+ compression on versioned archives - xdelta3 can identify that 99% of the archive structure and content remains identical between versions.

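You can reproduce the core operation DeltaGlider relies on with the `xdelta3` command line tool. A minimal sketch (the file names are illustrative):

```bash
# Encode: compute a delta of the new archive against the previous release (the reference)
xdelta3 -e -s my-app-v1.0.0.zip my-app-v1.0.1.zip my-app-v1.0.1.zip.delta

# The delta is typically a tiny fraction of the archive size
ls -lh my-app-v1.0.1.zip my-app-v1.0.1.zip.delta

# Decode: rebuild the new archive from the reference plus the delta
xdelta3 -d -s my-app-v1.0.0.zip my-app-v1.0.1.zip.delta my-app-v1.0.1-restored.zip
```
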
### System Architecture

DeltaGlider intelligently stores files within **DeltaSpaces** - S3 prefixes where related files share a common reference file for delta compression:

```
┌─────────────┐     ┌──────────────┐     ┌─────────────────┐
│  Your App   │────▶│ DeltaGlider  │────▶│   DeltaSpace    │
│  (CLI/SDK)  │     │    Core      │     │  (S3 prefix)    │
└─────────────┘     └──────────────┘     ├─────────────────┤
       │                                 │ reference.bin   │
┌──────▼───────┐                         │ file1.delta     │
│ Local Cache  │                         │ file2.delta     │
│ (References) │                         │ file3.delta     │
└──────────────┘                         └─────────────────┘
```

**Key Components:**
- **Integrity verification**: SHA256 on every operation
- **Local caching**: Fast repeated operations
- **Zero dependencies**: No database, no manifest files
- **Modular storage**: The storage layer is pluggable - you could easily replace S3 with a filesystem driver (using extended attributes for metadata) or any other backend

The codebase follows a ports-and-adapters (hexagonal) pattern: core business logic is decoupled from infrastructure, with storage operations abstracted through well-defined interfaces in the `ports/` directory and concrete implementations in `adapters/`, as sketched below.

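As a rough illustration of that pattern (this is not the actual interface; the real ports are richer), a storage port and a toy filesystem adapter might look like this:

```python
from pathlib import Path
from typing import Protocol


class StoragePort(Protocol):
    """Illustrative storage interface in the spirit of deltaglider's ports/ package."""

    def put(self, key: str, data: bytes) -> None: ...
    def get(self, key: str) -> bytes: ...


class FilesystemAdapter:
    """Toy adapter satisfying StoragePort with a local directory instead of S3."""

    def __init__(self, root: Path) -> None:
        self.root = root

    def put(self, key: str, data: bytes) -> None:
        path = self.root / key
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(data)

    def get(self, key: str) -> bytes:
        return (self.root / key).read_bytes()
```

Because the core only talks to the port, swapping S3 for another backend is a matter of writing a new adapter.
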
### When to Use DeltaGlider

✅ **Perfect for:**
- Software releases and versioned artifacts
- Any versioned binary data

❌ **Not ideal for:**
- Already compressed **unique** files
- Streaming or multimedia files
- Frequently changing unstructured data
- Files smaller than 1MB

## Migration from AWS CLI

Migrating from `aws s3` to `deltaglider` is as simple as changing the command name:

| AWS CLI | DeltaGlider | Compression Benefit |
|---------|-------------|---------------------|
| `aws s3 cp file.zip s3://bucket/` | `deltaglider cp file.zip s3://bucket/` | ✅ 99% for similar files |
| `aws s3 cp -r dir/ s3://bucket/` | `deltaglider cp -r dir/ s3://bucket/` | ✅ 99% for archives |
| `aws s3 ls s3://bucket/` | `deltaglider ls s3://bucket/` | - |
| `aws s3 rm s3://bucket/file` | `deltaglider rm s3://bucket/file` | - |
| `aws s3 sync dir/ s3://bucket/` | `deltaglider sync dir/ s3://bucket/` | ✅ 99% incremental |

### Migrating Existing S3 Buckets

DeltaGlider provides a dedicated `migrate` command to compress your existing S3 data:

```bash
# Migrate an entire bucket
deltaglider migrate s3://old-bucket/ s3://compressed-bucket/

# Migrate a prefix (preserves prefix structure by default)
deltaglider migrate s3://bucket/releases/ s3://bucket/archive/
# Result: s3://bucket/archive/releases/ contains the files

# Migrate without preserving the source prefix
deltaglider migrate --no-preserve-prefix s3://bucket/v1/ s3://bucket/archive/
# Result: files go directly into s3://bucket/archive/

# Preview a migration (dry run)
deltaglider migrate --dry-run s3://old/ s3://new/

# Skip the confirmation prompt
deltaglider migrate --yes s3://old/ s3://new/

# Exclude certain file patterns
deltaglider migrate --exclude "*.log" s3://old/ s3://new/
```

**Key Features:**
- **Resume Support**: Migration automatically skips files that already exist in the destination
- **Progress Tracking**: Shows real-time migration progress and statistics
- **Safety First**: Interactive confirmation shows the file count before starting
- **EC2 Cost Optimization**: Automatically detects the EC2 instance region and warns about cross-region charges
  - ✅ Green checkmark when regions align (no extra charges)
  - ℹ️ INFO when an auto-detected mismatch is found (suggests the optimal region)
  - ⚠️ WARNING when the user explicitly set the wrong `--region` (expect data transfer costs)
  - Disable with `DG_DISABLE_EC2_DETECTION=true` if needed
- **AWS Region Transparency**: Displays the actual AWS region being used
- **Prefix Preservation**: By default, the source prefix is preserved in the destination (use `--no-preserve-prefix` to disable)
- **S3-to-S3 Transfer**: Both regular S3 and DeltaGlider buckets are supported

**Prefix Preservation Examples:**
- `s3://src/data/` → `s3://dest/` creates `s3://dest/data/`
- `s3://src/a/b/c/` → `s3://dest/x/` creates `s3://dest/x/c/`
- Use `--no-preserve-prefix` to place files directly in the destination without the source prefix

The migration preserves all file names and structure while applying DeltaGlider's compression transparently.

## Production Ready

- ✅ **Battle tested**: 200K+ files in production
- ✅ **Data integrity**: SHA256 verification on every operation
- ✅ **Cost optimization**: Automatic EC2 region detection warns about cross-region charges - [📖 EC2 Detection Guide](docs/EC2_REGION_DETECTION.md)
- ✅ **S3 compatible**: Works with AWS, MinIO, Cloudflare R2, etc.
- ✅ **Atomic operations**: No partial states
- ✅ **Concurrent safe**: Multiple clients supported
- ✅ **Thoroughly tested**: 99 integration/unit tests, comprehensive test coverage
- ✅ **Type safe**: Full mypy type checking, zero type errors
- ✅ **Code quality**: Automated linting with ruff, clean codebase

## Development

```bash
# Clone the repository
git clone https://github.com/beshu-tech/deltaglider.git
cd deltaglider

# Install with dev dependencies
uv pip install -e ".[dev]"

# Run tests (99 integration/unit tests)
uv run pytest

# Run quality checks
uv run ruff check src/  # Linting
uv run mypy src/        # Type checking

# Run with local MinIO
docker-compose up -d
export AWS_ENDPOINT_URL=http://localhost:9000
```

**Q: Is this compatible with S3 encryption?**
A: Yes, DeltaGlider respects all S3 settings including SSE, KMS, and bucket policies.

|
||||
|
||||
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
||||
@@ -535,14 +663,8 @@ MIT - Use it freely in your projects.
|
||||
|
||||
## Success Stories
|
||||
|
||||
> "We reduced our artifact storage from 4TB to 5GB. This isn't hyperbole—it's math."
|
||||
> — [ReadOnlyREST Case Study](docs/case-study-readonlyrest.md)
|
||||
|
||||
> "Our CI/CD pipeline now uploads 100x faster. Deploys that took minutes now take seconds."
|
||||
> — Platform Engineer at [redacted]
|
||||
|
||||
> "We were about to buy expensive deduplication storage. DeltaGlider saved us $50K/year."
|
||||
> — CTO at [stealth startup]
|
||||
> "We reduced our artifact storage from 4TB to 5GB. CI is also much faster, due to smaller uploads."
|
||||
> — [ReadonlyREST Case Study](docs/case-study-readonlyrest.md)
|
||||
|
||||
---
|
||||
|
||||
@@ -554,4 +676,10 @@ deltaglider analyze s3://your-bucket/
|
||||
# Output: "Potential savings: 95.2% (4.8TB → 237GB)"
|
||||
```
|
||||
|
||||
Built with ❤️ by engineers who were tired of paying to store the same bytes over and over.
|
||||
## Who built this?
|
||||
|
||||
Built with ❤️ by [ReadonlyREST](https://readonlyrest.com) engineers who were tired of paying to store the same bytes over and over.
|
||||
|
||||
We also built [Anaphora](https://anaphora.it) for aggregated reports and alerting
|
||||
|
||||
And [Deltaglider Commander](https://github.com/beshu-tech/deltaglider_commander)
|
||||
|
||||
**New file: `SECURITY_FIX_ROADMAP.md`** (630 lines)
|
||||
# 🛡️ DeltaGlider Security Fix Roadmap
|
||||
|
||||
## Executive Summary
|
||||
Critical security vulnerabilities have been identified in DeltaGlider's cache system that enable multi-user attacks, data exposure, and cache poisoning. This document provides a **chronological, actionable roadmap** to eliminate these threats through bold architectural changes.
|
||||
|
||||
**Key Innovation**: Instead of patching individual issues, we propose a **"Zero-Trust Cache Architecture"** that eliminates entire classes of vulnerabilities.
|
||||
|
||||
---
|
||||
|
||||
## 🚀 The Bold Solution: Ephemeral Signed Cache
|
||||
|
||||
### Core Concept
|
||||
Replace filesystem cache with **ephemeral, cryptographically-signed, user-isolated cache** that eliminates:
|
||||
- TOCTOU vulnerabilities (no shared filesystem)
|
||||
- Multi-user interference (process isolation)
|
||||
- Cache poisoning (cryptographic signatures)
|
||||
- Information disclosure (encrypted metadata)
|
||||
- Cross-endpoint collision (content-addressed storage)
|
||||
|
||||
**Note**: DeltaGlider is designed as a standalone CLI/SDK application. All solutions maintain this architecture without requiring external services.
|
||||
|
||||
---
|
||||
|
||||
## 📋 Implementation Roadmap
|
||||
|
||||
### **DAY 1-2: Emergency Hotfix** (v5.0.3) ✅ COMPLETED
|
||||
*Stop the bleeding - minimal changes for immediate deployment*
|
||||
|
||||
#### 1. **Ephemeral Process-Isolated Cache** (2 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/app/cli/main.py
|
||||
import atexit
import shutil
import tempfile
from pathlib import Path
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
```
|
||||
|
||||
**Impact**: Each process gets isolated cache, auto-cleaned on exit. Eliminates multi-user attacks.
|
||||
**Implementation**: All legacy shared cache code removed. Ephemeral cache is now the ONLY mode.
|
||||
|
||||
#### 2. **Add SHA Validation at Use-Time** (2 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/ports/cache.py
|
||||
class CachePort(Protocol):
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get reference with atomic SHA validation - MUST use this for all operations."""
|
||||
...
|
||||
|
||||
# src/deltaglider/adapters/cache_fs.py
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
path = self.ref_path(bucket, prefix)
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Lock file for atomic read (Unix only)
|
||||
with open(path, 'rb') as f:
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
if actual_sha != expected_sha:
|
||||
path.unlink() # Remove corrupted cache
|
||||
raise CacheCorruptionError(f"SHA mismatch: cache corrupted")
|
||||
|
||||
return path
|
||||
```
|
||||
|
||||
#### 3. **Update All Usage Points** (1 hour) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/core/service.py
|
||||
# Replaced ALL instances in two locations:
|
||||
# - Line 234 (get method for decoding)
|
||||
# - Line 415 (_create_delta method for encoding)
|
||||
|
||||
ref_path = self.cache.get_validated_ref(
|
||||
delta_space.bucket,
|
||||
delta_space.prefix,
|
||||
ref_sha256 # Pass expected SHA
|
||||
)
|
||||
```
|
||||
|
||||
**Test & Deploy**: ✅ All 99 tests passing + ready for release
|
||||
|
||||
---
|
||||
|
||||
### **DAY 3-5: Quick Wins** (v5.0.3) ✅ COMPLETED
|
||||
*Low-risk improvements with high security impact*
|
||||
|
||||
#### 4. **Implement Content-Addressed Storage** (4 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_cas.py
|
||||
class ContentAddressedCache(CachePort):
|
||||
"""Cache using SHA as filename - eliminates collisions"""
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str, sha256: str) -> Path:
|
||||
# Use SHA as filename - guaranteed unique
|
||||
return self.base_dir / sha256[:2] / sha256[2:4] / sha256
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path, sha256: str) -> Path:
|
||||
path = self.ref_path(bucket, prefix, sha256)
|
||||
|
||||
# If file with this SHA exists, we're done (deduplication!)
|
||||
if path.exists():
|
||||
return path
|
||||
|
||||
# Atomic write
|
||||
path.parent.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
tmp = path.with_suffix('.tmp')
|
||||
shutil.copy2(src, tmp)
|
||||
os.chmod(tmp, 0o600)
|
||||
|
||||
# Verify content before committing
|
||||
actual_sha = self.hasher.sha256(tmp)
|
||||
if actual_sha != sha256:
|
||||
tmp.unlink()
|
||||
raise ValueError("File corruption during cache write")
|
||||
|
||||
os.replace(tmp, path) # Atomic
|
||||
return path
|
||||
```
|
||||
|
||||
**Benefits**: ✅ ACHIEVED
|
||||
- Same file cached once regardless of bucket/prefix (automatic deduplication)
|
||||
- No collision possible (SHA256 uniqueness guarantees)
|
||||
- Natural cache validation (filename IS the checksum)
|
||||
- Two-level directory structure (ab/cd/abcdef...) for filesystem optimization
|
||||
|
||||
**Implementation**: Complete in `src/deltaglider/adapters/cache_cas.py` with:
|
||||
- `_cas_path()` method for SHA256-based path computation
|
||||
- `get_validated_ref()` with atomic validation and locking
|
||||
- `write_ref()` with atomic temp-file + rename pattern
|
||||
- Ephemeral deltaspace-to-SHA mapping for compatibility
|
||||
|
||||
#### 5. **Add Secure Directory Creation** (2 hours)
|
||||
```python
|
||||
# src/deltaglider/utils/secure_fs.py
|
||||
import os
|
||||
import stat
|
||||
|
||||
def secure_makedirs(path: Path, mode: int = 0o700) -> None:
|
||||
"""Create directory with secure permissions atomically."""
|
||||
try:
|
||||
path.mkdir(parents=True, mode=mode, exist_ok=False)
|
||||
except FileExistsError:
|
||||
# Verify it's ours and has correct permissions
|
||||
st = path.stat()
|
||||
if st.st_uid != os.getuid():
|
||||
raise SecurityError(f"Directory {path} owned by different user")
|
||||
if stat.S_IMODE(st.st_mode) != mode:
|
||||
os.chmod(path, mode) # Fix permissions
|
||||
```
|
||||
|
||||
#### 6. **Unify Cache Configuration** (1 hour)
|
||||
```python
|
||||
# src/deltaglider/config.py
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
def get_cache_dir() -> Path:
|
||||
"""Single source of truth for cache directory."""
|
||||
if os.environ.get("DG_NO_CACHE") == "true":
|
||||
return None # Feature flag to disable cache
|
||||
|
||||
if os.environ.get("DG_EPHEMERAL_CACHE") == "true":
|
||||
return Path(tempfile.mkdtemp(prefix="dg-cache-"))
|
||||
|
||||
# User-specific cache by default
|
||||
cache_base = os.environ.get("DG_CACHE_DIR",
|
||||
os.path.expanduser("~/.cache/deltaglider"))
|
||||
return Path(cache_base) / "v2" # Version cache format
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 6-10: Architecture Redesign** (v5.0.3) ✅ COMPLETED
|
||||
*The bold solution that eliminates entire vulnerability classes*
|
||||
|
||||
#### 7. **Implement Memory Cache with Encryption** (8 hours) ✅ COMPLETED
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_memory.py
|
||||
class MemoryCache(CachePort):
|
||||
"""In-memory cache with LRU eviction and configurable size limits."""
|
||||
|
||||
def __init__(self, hasher: HashPort, max_size_mb: int = 100, temp_dir: Path | None = None):
|
||||
self.hasher = hasher
|
||||
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||
self._current_size = 0
|
||||
self._cache: dict[tuple[str, str], tuple[bytes, str]] = {} # (bucket, prefix) -> (content, SHA)
|
||||
self._access_order: list[tuple[str, str]] = []  # LRU tracking
self.temp_dir = temp_dir  # used by get_validated_ref to materialize content for compatibility
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Write reference to in-memory cache with LRU eviction."""
|
||||
# Read content and compute SHA
|
||||
content = src.read_bytes()
|
||||
sha256 = self.hasher.sha256_bytes(content)
|
||||
|
||||
# Check if file fits in cache
|
||||
needed_bytes = len(content)
|
||||
if needed_bytes > self.max_size_bytes:
|
||||
raise CacheCorruptionError(f"File too large for cache: {needed_bytes} > {self.max_size_bytes}")
|
||||
|
||||
# Evict LRU if needed
|
||||
self._evict_lru(needed_bytes)
|
||||
|
||||
# Store in memory
|
||||
key = (bucket, prefix)
|
||||
self._cache[key] = (content, sha256)
|
||||
self._current_size += needed_bytes
|
||||
self._access_order.append(key)
|
||||
|
||||
return src # Return original path for compatibility
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with validation."""
|
||||
key = (bucket, prefix)
|
||||
if key not in self._cache:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
content, stored_sha = self._cache[key]
|
||||
|
||||
# Validate SHA matches
|
||||
if stored_sha != expected_sha:
|
||||
raise CacheCorruptionError(f"SHA mismatch for {bucket}/{prefix}")
|
||||
|
||||
# Update LRU order
|
||||
self._access_order.remove(key)
|
||||
self._access_order.append(key)
|
||||
|
||||
# Write to temp file for compatibility
|
||||
temp_path = self.temp_dir / f"{expected_sha}.bin"
|
||||
temp_path.write_bytes(content)
|
||||
return temp_path
|
||||
```
|
||||
|
||||
# src/deltaglider/adapters/cache_encrypted.py
|
||||
class EncryptedCache(CachePort):
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption."""
|
||||
|
||||
def __init__(self, backend: CachePort, encryption_key: bytes | None = None):
|
||||
self.backend = backend
|
||||
|
||||
# Key management: ephemeral (default) or provided
|
||||
if encryption_key is None:
|
||||
self._key = Fernet.generate_key() # Ephemeral per process
|
||||
self._ephemeral = True
|
||||
else:
|
||||
self._key = encryption_key
|
||||
self._ephemeral = False
|
||||
|
||||
self._cipher = Fernet(self._key)
|
||||
# Track plaintext SHA since encrypted content has different SHA
|
||||
self._plaintext_sha_map: dict[tuple[str, str], str] = {}
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Encrypt and cache reference file."""
|
||||
# Read plaintext and compute SHA
|
||||
plaintext_data = src.read_bytes()
|
||||
plaintext_sha = hashlib.sha256(plaintext_data).hexdigest()
|
||||
|
||||
# Encrypt data
|
||||
encrypted_data = self._cipher.encrypt(plaintext_data)
|
||||
|
||||
# Write encrypted data to temp file
|
||||
temp_encrypted = src.with_suffix(".encrypted.tmp")
|
||||
temp_encrypted.write_bytes(encrypted_data)
|
||||
|
||||
try:
|
||||
# Store encrypted file via backend
|
||||
result_path = self.backend.write_ref(bucket, prefix, temp_encrypted)
|
||||
|
||||
# Store plaintext SHA mapping
|
||||
key = (bucket, prefix)
|
||||
self._plaintext_sha_map[key] = plaintext_sha
|
||||
|
||||
return result_path
|
||||
finally:
|
||||
temp_encrypted.unlink(missing_ok=True)
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with decryption and validation."""
|
||||
# Verify we have the plaintext SHA mapped
|
||||
key = (bucket, prefix)
|
||||
if key not in self._plaintext_sha_map:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
if self._plaintext_sha_map[key] != expected_sha:
|
||||
raise CacheCorruptionError(f"SHA mismatch for {bucket}/{prefix}")
|
||||
|
||||
# Get encrypted file from backend
|
||||
encrypted_path = self.backend.ref_path(bucket, prefix)
|
||||
if not encrypted_path.exists():
|
||||
raise CacheMissError(f"Encrypted cache file not found")
|
||||
|
||||
# Decrypt content
|
||||
encrypted_data = encrypted_path.read_bytes()
|
||||
try:
|
||||
decrypted_data = self._cipher.decrypt(encrypted_data)
|
||||
except Exception as e:
|
||||
raise CacheCorruptionError(f"Decryption failed: {e}") from e
|
||||
|
||||
# Validate plaintext SHA
|
||||
actual_sha = hashlib.sha256(decrypted_data).hexdigest()
|
||||
if actual_sha != expected_sha:
|
||||
raise CacheCorruptionError(f"Decrypted content SHA mismatch")
|
||||
|
||||
# Write decrypted content to temp file
|
||||
decrypted_path = encrypted_path.with_suffix(".decrypted")
|
||||
decrypted_path.write_bytes(decrypted_data)
|
||||
return decrypted_path
|
||||
```
|
||||
|
||||
**Implementation**: ✅ COMPLETED
|
||||
- **MemoryCache**: In-memory cache with LRU eviction, configurable size limits, zero filesystem I/O
|
||||
- **EncryptedCache**: Fernet (AES-128-CBC + HMAC) encryption wrapper, ephemeral keys by default
|
||||
- **Configuration**: `DG_CACHE_BACKEND` (filesystem/memory), `DG_CACHE_ENCRYPTION` (true/false)
|
||||
- **Environment Variables**: `DG_CACHE_MEMORY_SIZE_MB`, `DG_CACHE_ENCRYPTION_KEY`
|
||||
|
||||
**Benefits**: ✅ ACHIEVED
|
||||
- No filesystem access for memory cache = no permission issues
|
||||
- Encrypted at rest = secure cache storage
|
||||
- Per-process ephemeral keys = forward secrecy and process isolation
|
||||
- LRU eviction = prevents memory exhaustion
|
||||
- Zero TOCTOU window = memory operations are atomic
|
||||
- Configurable backends = flexibility for different use cases
|
||||
|
||||
#### 8. **Implement Signed Cache Entries** (6 hours)
|
||||
```python
|
||||
# src/deltaglider/adapters/cache_signed.py
|
||||
import hmac
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
class SignedCache(CachePort):
|
||||
"""Cache with cryptographic signatures and expiry."""
|
||||
|
||||
def __init__(self, base_dir: Path, secret_key: bytes = None):
|
||||
self.base_dir = base_dir
|
||||
# Per-session key if not provided
|
||||
self.secret = secret_key or os.urandom(32)
|
||||
|
||||
def _sign_metadata(self, metadata: dict) -> str:
|
||||
"""Create HMAC signature for metadata."""
|
||||
json_meta = json.dumps(metadata, sort_keys=True)
|
||||
signature = hmac.new(
|
||||
self.secret,
|
||||
json_meta.encode(),
|
||||
hashlib.sha256
|
||||
).hexdigest()
|
||||
return signature
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path, sha256: str) -> Path:
|
||||
# Create signed metadata
|
||||
metadata = {
|
||||
"sha256": sha256,
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"expires": (datetime.utcnow() + timedelta(hours=24)).isoformat(),
|
||||
"pid": os.getpid(),
|
||||
"uid": os.getuid(),
|
||||
}
|
||||
signature = self._sign_metadata(metadata)
|
||||
|
||||
# Store data + metadata
|
||||
cache_dir = self.base_dir / signature[:8] # Use signature prefix as namespace
|
||||
cache_dir.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
|
||||
data_path = cache_dir / f"{sha256}.bin"
|
||||
meta_path = cache_dir / f"{sha256}.meta"
|
||||
|
||||
# Atomic writes
|
||||
shutil.copy2(src, data_path)
|
||||
os.chmod(data_path, 0o600)
|
||||
|
||||
with open(meta_path, 'w') as f:
|
||||
json.dump({"metadata": metadata, "signature": signature}, f)
|
||||
os.chmod(meta_path, 0o600)
|
||||
|
||||
return data_path
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, sha256: str) -> Path:
|
||||
# Find and validate signed entry
|
||||
pattern = self.base_dir / "*" / f"{sha256}.meta"
|
||||
matches = list(Path(self.base_dir).glob(f"*/{sha256}.meta"))
|
||||
|
||||
for meta_path in matches:
|
||||
with open(meta_path) as f:
|
||||
entry = json.load(f)
|
||||
|
||||
# Verify signature
|
||||
expected_sig = self._sign_metadata(entry["metadata"])
|
||||
if not hmac.compare_digest(entry["signature"], expected_sig):
|
||||
meta_path.unlink() # Remove tampered entry
|
||||
continue
|
||||
|
||||
# Check expiry
|
||||
expires = datetime.fromisoformat(entry["metadata"]["expires"])
|
||||
if datetime.utcnow() > expires:
|
||||
meta_path.unlink()
|
||||
continue
|
||||
|
||||
# Validate data integrity
|
||||
data_path = meta_path.with_suffix('.bin')
|
||||
actual_sha = self.hasher.sha256(data_path)
|
||||
if actual_sha != sha256:
|
||||
data_path.unlink()
|
||||
meta_path.unlink()
|
||||
continue
|
||||
|
||||
return data_path
|
||||
|
||||
raise CacheMissError(f"No valid cache entry for {sha256}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 11-15: Advanced Security** (v6.0.0)
|
||||
*Next-generation features for standalone security*
|
||||
|
||||
#### 9. **Add Integrity Monitoring** (4 hours)
|
||||
```python
|
||||
# src/deltaglider/security/monitor.py
|
||||
import inotify
|
||||
import logging
|
||||
|
||||
class CacheIntegrityMonitor:
|
||||
"""Detect and alert on cache tampering attempts."""
|
||||
|
||||
def __init__(self, cache_dir: Path):
|
||||
self.cache_dir = cache_dir
|
||||
self.notifier = inotify.INotify()
|
||||
self.watch_desc = self.notifier.add_watch(
|
||||
str(cache_dir),
|
||||
inotify.IN_MODIFY | inotify.IN_DELETE | inotify.IN_ATTRIB
|
||||
)
|
||||
self.logger = logging.getLogger("security")
|
||||
|
||||
async def monitor(self):
|
||||
"""Monitor for unauthorized cache modifications."""
|
||||
async for event in self.notifier:
|
||||
if event.mask & inotify.IN_MODIFY:
|
||||
# File modified - verify it was by our process
|
||||
if not self._is_our_modification(event):
|
||||
self.logger.critical(
|
||||
f"SECURITY: Unauthorized cache modification detected: {event.path}"
|
||||
)
|
||||
# Immediately invalidate affected cache
|
||||
Path(event.path).unlink(missing_ok=True)
|
||||
|
||||
elif event.mask & inotify.IN_ATTRIB:
|
||||
# Permission change - always suspicious
|
||||
self.logger.warning(
|
||||
f"SECURITY: Cache permission change: {event.path}"
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **DAY 16-20: Testing & Rollout** (v6.0.0 release)
|
||||
|
||||
#### 10. **Security Test Suite** (8 hours)
|
||||
```python
|
||||
# tests/security/test_cache_attacks.py
|
||||
import pytest
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
|
||||
class TestCacheSecurity:
|
||||
"""Test all known attack vectors."""
|
||||
|
||||
def test_toctou_attack_prevented(self, cache):
|
||||
"""Verify TOCTOU window is eliminated."""
|
||||
sha = "abc123"
|
||||
cache.write_ref("bucket", "prefix", test_file, sha)
|
||||
|
||||
# Attacker thread tries to replace file during read
|
||||
def attacker():
|
||||
time.sleep(0.0001) # Try to hit the TOCTOU window
|
||||
cache_path = cache.ref_path("bucket", "prefix", sha)
|
||||
cache_path.write_bytes(b"malicious")
|
||||
|
||||
thread = threading.Thread(target=attacker)
|
||||
thread.start()
|
||||
|
||||
# Should detect tampering
|
||||
with pytest.raises(CacheCorruptionError):
|
||||
cache.get_validated_ref("bucket", "prefix", sha)
|
||||
|
||||
def test_multi_user_isolation(self, cache):
|
||||
"""Verify users can't access each other's cache."""
|
||||
# Create cache as user A
|
||||
cache_a = SignedCache(Path("/tmp/cache"), secret=b"key_a")
|
||||
cache_a.write_ref("bucket", "prefix", test_file, "sha_a")
|
||||
|
||||
# Try to read as user B with different key
|
||||
cache_b = SignedCache(Path("/tmp/cache"), secret=b"key_b")
|
||||
|
||||
with pytest.raises(CacheMissError):
|
||||
cache_b.get_validated_ref("bucket", "prefix", "sha_a")
|
||||
|
||||
def test_cache_poisoning_prevented(self, cache):
|
||||
"""Verify corrupted cache is detected."""
|
||||
sha = "abc123"
|
||||
cache.write_ref("bucket", "prefix", test_file, sha)
|
||||
|
||||
# Corrupt the cache file
|
||||
cache_path = cache.ref_path("bucket", "prefix", sha)
|
||||
with open(cache_path, 'ab') as f:
|
||||
f.write(b"corrupted")
|
||||
|
||||
# Should detect corruption
|
||||
with pytest.raises(CacheCorruptionError):
|
||||
cache.get_validated_ref("bucket", "prefix", sha)
|
||||
```
|
||||
|
||||
#### 11. **Migration Guide** (4 hours)
|
||||
```python
|
||||
# src/deltaglider/migration/v5_to_v6.py
|
||||
def migrate_cache():
|
||||
"""Migrate from v5 shared cache to v6 secure cache."""
|
||||
old_cache = Path("/tmp/.deltaglider/cache")
|
||||
|
||||
if old_cache.exists():
|
||||
print("WARNING: Old insecure cache detected at", old_cache)
|
||||
print("This cache had security vulnerabilities and will not be migrated.")
|
||||
|
||||
response = input("Delete old cache? [y/N]: ")
|
||||
if response.lower() == 'y':
|
||||
shutil.rmtree(old_cache)
|
||||
print("Old cache deleted. New secure cache will be created on demand.")
|
||||
else:
|
||||
print("Old cache retained at", old_cache)
|
||||
print("Set DG_CACHE_DIR to use a different location.")
|
||||
```
|
||||
|
||||
#### 12. **Performance Benchmarks** (4 hours)
|
||||
```python
|
||||
# benchmarks/cache_performance.py
|
||||
def benchmark_cache_implementations():
|
||||
"""Compare performance of cache implementations."""
|
||||
|
||||
implementations = [
|
||||
("Filesystem (v5)", FsCacheAdapter),
|
||||
("Content-Addressed", ContentAddressedCache),
|
||||
("Memory", MemoryCache),
|
||||
("Signed", SignedCache),
|
||||
]
|
||||
|
||||
for name, cache_class in implementations:
|
||||
cache = cache_class(test_dir)
|
||||
|
||||
# Measure write performance
|
||||
start = time.perf_counter()
|
||||
for i in range(1000):
|
||||
cache.write_ref("bucket", f"prefix{i}", test_file, f"sha{i}")
|
||||
write_time = time.perf_counter() - start
|
||||
|
||||
# Measure read performance
|
||||
start = time.perf_counter()
|
||||
for i in range(1000):
|
||||
cache.get_validated_ref("bucket", f"prefix{i}", f"sha{i}")
|
||||
read_time = time.perf_counter() - start
|
||||
|
||||
print(f"{name}: Write={write_time:.3f}s Read={read_time:.3f}s")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Decision Matrix
|
||||
|
||||
| Solution | Security | Performance | Complexity | Breaking Change |
|
||||
|----------|----------|-------------|------------|-----------------|
|
||||
| Hotfix (Day 1-2) | ⭐⭐⭐ | ⭐⭐ | ⭐ | No |
|
||||
| Content-Addressed | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐ | No |
|
||||
| Memory Cache | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | No |
|
||||
| Signed Cache | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | No |
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Recommended Approach
|
||||
|
||||
### For Immediate Production (Next 48 hours)
|
||||
Deploy **Hotfix v5.0.3** with ephemeral cache + SHA validation
|
||||
|
||||
### For Next Release (1 week)
|
||||
Implement **Content-Addressed Storage** (v5.1.0) - best balance of security and simplicity
|
||||
|
||||
### For Enterprise (1 month)
|
||||
Deploy **Signed Cache** (v6.0.0) for maximum security with built-in TTL and integrity
|
||||
|
||||
---
|
||||
|
||||
## 🚦 Success Metrics
|
||||
|
||||
After implementation, verify:
|
||||
|
||||
1. **Security Tests Pass**: All attack vectors prevented
|
||||
2. **Performance Maintained**: <10% degradation vs v5
|
||||
3. **Zero CVEs**: No security vulnerabilities in cache
|
||||
4. **User Isolation**: Multi-user systems work safely
|
||||
5. **Backward Compatible**: Existing workflows unaffected
|
||||
|
||||
---
|
||||
|
||||
## 📞 Support
|
||||
|
||||
For questions or security concerns:
|
||||
- Security Team: security@deltaglider.io
|
||||
- Lead Developer: @architect
|
||||
- Immediate Issues: Create SECURITY labeled issue
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Disclosure Timeline
|
||||
|
||||
- **Day 0**: Vulnerabilities discovered
|
||||
- **Day 1**: Hotfix released (v5.0.3)
|
||||
- **Day 7**: Improved version released (v5.1.0)
|
||||
- **Day 30**: Full disclosure published
|
||||
- **Day 45**: v6.0.0 with complete redesign
|
||||
|
||||
---
|
||||
|
||||
*Document Version: 1.0*
|
||||
*Classification: SENSITIVE - INTERNAL USE ONLY*
|
||||
*Last Updated: 2024-10-09*
|
||||
@@ -2,7 +2,7 @@ version: '3.8'

 services:
   localstack:
-    image: localstack/localstack:latest
+    image: localstack/localstack:4.4
     ports:
       - "4566:4566"
     environment:

@@ -22,7 +22,7 @@ services:
       retries: 5

   localstack:
-    image: localstack/localstack:latest
+    image: localstack/localstack:4.4
     container_name: deltaglider-localstack
     ports:
       - "4566:4566"

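For local testing against this pinned LocalStack service, a setup along these lines should work (the bucket name and dummy credentials are illustrative):

```bash
docker compose up -d localstack
export AWS_ENDPOINT_URL=http://localhost:4566
export AWS_ACCESS_KEY_ID=test
export AWS_SECRET_ACCESS_KEY=test

# Create a bucket in LocalStack, then upload through DeltaGlider
aws --endpoint-url "$AWS_ENDPOINT_URL" s3 mb s3://test-bucket
deltaglider cp my-app.zip s3://test-bucket/
```
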
**New file: `docs/BOTO3_COMPATIBILITY_VISION.md`** (84 lines)
|
||||
# boto3 Compatibility Vision
|
||||
|
||||
DeltaGlider is a drop-in replacement for boto3's S3 client. This document spells out what “drop-in”
|
||||
means in practice so new projects can adopt the SDK with confidence.
|
||||
|
||||
## Current State (v5.x and newer)
|
||||
|
||||
- `DeltaGliderClient` methods such as `list_objects`, `put_object`, `get_object`, `delete_object`,
|
||||
`delete_objects`, `head_object`, etc. return **boto3-compatible dicts**.
|
||||
- TypedDict aliases in `deltaglider.types` (e.g. `ListObjectsV2Response`, `PutObjectResponse`) give
|
||||
IDE/type-checking support without importing boto3.
|
||||
- DeltaGlider-specific metadata lives inside standard boto3 fields (typically `Metadata`), so tools
|
||||
that ignore those keys see the exact same structures as they would from boto3.
|
||||
- Tests and documentation exercise and describe the boto3-style responses (`response['Contents']`
|
||||
instead of `response.contents`).
|
||||
|
||||
```python
|
||||
from deltaglider import create_client, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(Bucket='my-bucket')
|
||||
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
```
|
||||
|
||||
## Key Design Points
|
||||
|
||||
- **TypedDict everywhere** – `put_object`, `get_object`, `list_objects`, `delete_object`, etc.
|
||||
return the same shapes boto3 does. Use the provided aliases (`ListObjectsV2Response`,
|
||||
`PutObjectResponse`, …) for IDE/completion help.
|
||||
- **Metadata namespace** – DeltaGlider-specific flags such as `deltaglider-is-delta` live under the regular `Metadata` key, so every response remains valid boto3 output (see the sketch after this list).
|
||||
- **No shims required** – responses are plain dicts. If you already know boto3, you already know how
|
||||
to consume DeltaGlider outputs.
|
||||
|
||||
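For example, checking whether an object was stored as a delta only requires reading those namespaced keys (bucket and key names here are illustrative):

```python
from deltaglider import create_client

client = create_client()

resp = client.head_object(Bucket='my-bucket', Key='file.zip')
meta = resp.get('Metadata', {})

if meta.get('deltaglider-is-delta') == 'true':
    print("Stored as delta, compression ratio:", meta.get('deltaglider-compression-ratio'))
```
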
## Benefits Summary
|
||||
|
||||
### For Users
|
||||
- **Zero learning curve** – identical data structures to boto3.
|
||||
- **Tooling compatibility** – works with any boto3-aware tool or library.
|
||||
- **Type safety** – TypedDicts provide IDE autocomplete even without boto3 installed.
|
||||
|
||||
### For DeltaGlider
|
||||
- **Cleaner internals** – no custom dataclasses to maintain.
|
||||
- **Simpler docs/tests** – examples mirror boto3 verbatim.
|
||||
- **Marketing accuracy** – "drop-in replacement" is now literal.
|
||||
|
||||
## Technical Details
|
||||
|
||||
### TypedDict refresher
|
||||
```python
|
||||
from typing import TypedDict
|
||||
|
||||
class MyResponse(TypedDict):
|
||||
Key: str
|
||||
Size: int
|
||||
|
||||
resp: MyResponse = {'Key': 'file.zip', 'Size': 1024}
|
||||
print(type(resp)) # <class 'dict'>
|
||||
```
|
||||
At runtime the structure is still a plain `dict`, but static type-checkers understand the shape.
|
||||
|
||||
### DeltaGlider Metadata
|
||||
|
||||
Delta-specific fields live inside the standard `Metadata` map. Example list_objects entry:
|
||||
```python
|
||||
{
|
||||
'Key': 'file.zip',
|
||||
'Size': 1024,
|
||||
'Metadata': {
|
||||
'deltaglider-is-delta': 'true',
|
||||
'deltaglider-compression-ratio': '0.99',
|
||||
'deltaglider-original-size': '50000000',
|
||||
}
|
||||
}
|
||||
```
|
||||
These keys are namespaced (`deltaglider-...`) so they are safe to ignore if not needed.
|
||||
|
||||
## Status Snapshot
|
||||
|
||||
- ✅ TypedDict builders are used everywhere (`build_list_objects_response`, etc.).
|
||||
- ✅ Tests assert boto3-style dict access (`response['Contents']`).
|
||||
- ✅ Documentation (README, SDK docs, examples) shows the boto3 syntax.
|
||||
**New file: `docs/CACHE_MANAGEMENT.md`** (684 lines)
|
||||
# Cache Management for Long-Running Applications
|
||||
|
||||
## Overview
|
||||
|
||||
DeltaGlider uses caching to store reference files locally for efficient delta compression. However, unlike the CLI which automatically cleans up cache on exit, **programmatic SDK usage requires manual cache management** for long-running applications.
|
||||
|
||||
This guide explains how to manage cache in production applications, including:
|
||||
- When and how to clear cache
|
||||
- Encryption key management strategies
|
||||
- Memory vs. filesystem cache trade-offs
|
||||
- Best practices for different application types
|
||||
|
||||
## The Problem
|
||||
|
||||
**CLI (Automatic Cleanup)**:
|
||||
```bash
|
||||
# Cache created in /tmp/deltaglider-xyz123/
|
||||
deltaglider cp file.zip s3://bucket/
|
||||
|
||||
# Process exits → Cache automatically deleted via atexit handler
|
||||
# ✅ No manual cleanup needed
|
||||
```
|
||||
|
||||
**SDK (Manual Cleanup Required)**:
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Long-running application (runs for hours/days)
|
||||
while True:
|
||||
client.put_object(Bucket='releases', Key='file.zip', Body=data)
|
||||
time.sleep(600) # Upload every 10 minutes
|
||||
|
||||
# ❌ Process never exits → Cache never cleaned
|
||||
# ❌ Cache grows indefinitely
|
||||
# ❌ Memory/disk exhaustion after days/weeks
|
||||
```
|
||||
|
||||
## Solution: Manual Cache Management
|
||||
|
||||
### Basic Cache Clearing
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Do some uploads
|
||||
client.put_object(Bucket='releases', Key='file1.zip', Body=data1)
|
||||
client.put_object(Bucket='releases', Key='file2.zip', Body=data2)
|
||||
|
||||
# Clear cache to free resources
|
||||
client.clear_cache()
|
||||
|
||||
# ✅ All cached references removed
|
||||
# ✅ Memory/disk freed
|
||||
# ✅ Next upload will fetch fresh reference from S3
|
||||
```
|
||||
|
||||
### When to Clear Cache
|
||||
|
||||
| Scenario | Frequency | Reason |
|
||||
|----------|-----------|--------|
|
||||
| **Long-running services** | Every 1-4 hours | Prevent memory/disk growth |
|
||||
| **After config changes** | Immediately | Old cache is invalid |
|
||||
| **High memory pressure** | As needed | Free resources |
|
||||
| **Test cleanup** | After each test | Ensure clean state |
|
||||
| **Scheduled jobs** | After job completes | Clean up before next run |
|
||||
| **Key rotation** | After rotation | Old encrypted cache unusable |
|
||||
|
||||
## Cache Strategies by Application Type
|
||||
|
||||
### 1. Long-Running Background Service
|
||||
|
||||
**Scenario**: Continuous upload service running 24/7
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import schedule
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
client = create_client()
|
||||
|
||||
def upload_task():
|
||||
"""Upload latest build."""
|
||||
try:
|
||||
with open('latest-build.zip', 'rb') as f:
|
||||
response = client.put_object(
|
||||
Bucket='releases',
|
||||
Key=f'builds/{datetime.now().isoformat()}.zip',
|
||||
Body=f
|
||||
)
|
||||
logger.info(f"Uploaded: {response['ETag']}")
|
||||
except Exception as e:
|
||||
logger.error(f"Upload failed: {e}")
|
||||
|
||||
def cleanup_task():
|
||||
"""Clear cache to prevent growth."""
|
||||
client.clear_cache()
|
||||
logger.info("Cache cleared - freed resources")
|
||||
|
||||
# Upload every 10 minutes
|
||||
schedule.every(10).minutes.do(upload_task)
|
||||
|
||||
# Clear cache every 2 hours (balance performance vs. memory)
|
||||
schedule.every(2).hours.do(cleanup_task)
|
||||
|
||||
# Run indefinitely
|
||||
while True:
|
||||
schedule.run_pending()
|
||||
time.sleep(60)
|
||||
```
|
||||
|
||||
**Cache Clearing Frequency Guidelines**:
|
||||
- Every 1 hour: High upload frequency (>100/day), memory-constrained
|
||||
- Every 2-4 hours: Moderate upload frequency (10-100/day), normal memory
|
||||
- Every 6-12 hours: Low upload frequency (<10/day), abundant memory
|
||||
|
||||
### 2. Periodic Batch Job
|
||||
|
||||
**Scenario**: Daily backup script
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import glob
|
||||
from pathlib import Path
|
||||
|
||||
def daily_backup():
|
||||
"""Upload daily backups and clean up."""
|
||||
client = create_client()
|
||||
|
||||
try:
|
||||
# Upload all backup files
|
||||
for backup in glob.glob('/backups/*.zip'):
|
||||
with open(backup, 'rb') as f:
|
||||
client.put_object(
|
||||
Bucket='backups',
|
||||
Key=f'daily/{Path(backup).name}',
|
||||
Body=f
|
||||
)
|
||||
print(f"Backed up: {backup}")
|
||||
|
||||
finally:
|
||||
# ALWAYS clear cache at end of job
|
||||
client.clear_cache()
|
||||
print("Cache cleared - job complete")
|
||||
|
||||
# Run daily via cron/systemd timer
|
||||
if __name__ == '__main__':
|
||||
daily_backup()
|
||||
```
|
||||
|
||||
**Best Practice**: Always clear cache in `finally` block to ensure cleanup even if job fails.
|
||||
|
||||
### 3. Web Application / API Server
|
||||
|
||||
**Scenario**: Flask/FastAPI app with upload endpoints
|
||||
|
||||
```python
|
||||
from fastapi import FastAPI, UploadFile, BackgroundTasks
|
||||
from deltaglider import create_client
|
||||
import asyncio
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
# Create client once at startup
|
||||
client = create_client()
|
||||
|
||||
async def periodic_cache_cleanup():
|
||||
"""Background task to clear cache periodically."""
|
||||
while True:
|
||||
await asyncio.sleep(3600) # Every hour
|
||||
client.clear_cache()
|
||||
print("Cache cleared in background")
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
"""Start background cache cleanup task."""
|
||||
asyncio.create_task(periodic_cache_cleanup())
|
||||
|
||||
@app.post("/upload")
|
||||
async def upload_file(file: UploadFile):
|
||||
"""Upload endpoint with automatic cache management."""
|
||||
content = await file.read()
|
||||
|
||||
response = client.put_object(
|
||||
Bucket='uploads',
|
||||
Key=f'files/{file.filename}',
|
||||
Body=content
|
||||
)
|
||||
|
||||
return {"message": "Uploaded", "etag": response['ETag']}
|
||||
|
||||
@app.post("/admin/clear-cache")
|
||||
async def admin_clear_cache():
|
||||
"""Manual cache clear endpoint for admin."""
|
||||
client.clear_cache()
|
||||
return {"message": "Cache cleared"}
|
||||
```
|
||||
|
||||
**Best Practice**: Run periodic cache cleanup in background task, provide manual clear endpoint for emergencies.
|
||||
|
||||
### 4. Testing / CI/CD
|
||||
|
||||
**Scenario**: Test suite using DeltaGlider
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from deltaglider import create_client
|
||||
|
||||
@pytest.fixture
|
||||
def deltaglider_client():
|
||||
"""Provide clean client for each test."""
|
||||
client = create_client()
|
||||
yield client
|
||||
# ALWAYS clear cache after test
|
||||
client.clear_cache()
|
||||
|
||||
def test_upload(deltaglider_client):
|
||||
"""Test upload with automatic cleanup."""
|
||||
response = deltaglider_client.put_object(
|
||||
Bucket='test-bucket',
|
||||
Key='test-file.zip',
|
||||
Body=b'test data'
|
||||
)
|
||||
assert response['ETag'] is not None
|
||||
# Cache automatically cleared by fixture
|
||||
|
||||
def test_download(deltaglider_client):
|
||||
"""Test download with clean cache."""
|
||||
# Cache is clean from previous test
|
||||
deltaglider_client.put_object(Bucket='test', Key='file.zip', Body=b'data')
|
||||
response = deltaglider_client.get_object(Bucket='test', Key='file.zip')
|
||||
assert response['Body'].read() == b'data'
|
||||
# Cache automatically cleared by fixture
|
||||
```
|
||||
|
||||
**Best Practice**: Use pytest fixtures to ensure cache is cleared after each test.
|
||||
|
||||
### 5. AWS Lambda / Serverless
|
||||
|
||||
**Scenario**: Lambda function with warm container reuse
|
||||
|
||||
```python
|
||||
import os
|
||||
from deltaglider import create_client
|
||||
|
||||
# Initialize client outside handler (reused across invocations)
|
||||
client = create_client()
|
||||
|
||||
# Track invocation count for cache clearing
|
||||
invocation_count = 0
|
||||
|
||||
def lambda_handler(event, context):
|
||||
"""Lambda handler with periodic cache clearing."""
|
||||
global invocation_count
|
||||
invocation_count += 1
|
||||
|
||||
try:
|
||||
# Upload file
|
||||
response = client.put_object(
|
||||
Bucket='lambda-uploads',
|
||||
Key=event['filename'],
|
||||
Body=event['data']
|
||||
)
|
||||
|
||||
# Clear cache every 50 invocations (warm container optimization)
|
||||
if invocation_count % 50 == 0:
|
||||
client.clear_cache()
|
||||
print(f"Cache cleared after {invocation_count} invocations")
|
||||
|
||||
return {'statusCode': 200, 'etag': response['ETag']}
|
||||
|
||||
except Exception as e:
|
||||
# Clear cache on error to prevent poisoned state
|
||||
client.clear_cache()
|
||||
return {'statusCode': 500, 'error': str(e)}
|
||||
```
|
||||
|
||||
**Best Practice**: Clear cache periodically (every N invocations) and on errors. Lambda warm containers can reuse cache across invocations for performance.
|
||||
|
||||
## Encryption Key Management
|
||||
|
||||
DeltaGlider always encrypts cache data. Understanding key management is critical for programmatic usage.
|
||||
|
||||
### Ephemeral Keys (Default - Recommended)
|
||||
|
||||
**How It Works**:
|
||||
- New encryption key generated per client instance
|
||||
- Cache encrypted with instance-specific key
|
||||
- Key lost when client is garbage collected
|
||||
- **Maximum security** - keys never persist
|
||||
|
||||
**When to Use**:
|
||||
- Single-process applications
|
||||
- Short-lived scripts
|
||||
- CI/CD pipelines
|
||||
- Testing
|
||||
- Maximum security requirements
|
||||
|
||||
**Example**:
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
# Create client (generates ephemeral key automatically)
|
||||
client = create_client()
|
||||
|
||||
# Upload file (encrypted with ephemeral key)
|
||||
client.put_object(Bucket='bucket', Key='file.zip', Body=data)
|
||||
|
||||
# Clear cache
|
||||
client.clear_cache()
|
||||
|
||||
# ✅ Encrypted cache cleared
|
||||
# ✅ Key was never persisted
|
||||
# ✅ Perfect forward secrecy
|
||||
```
|
||||
|
||||
**Characteristics**:
|
||||
- ✅ Maximum security (keys never leave process)
|
||||
- ✅ Perfect forward secrecy
|
||||
- ✅ No key management overhead
|
||||
- ❌ Cache not shareable between processes
|
||||
- ❌ Cache not reusable after client recreation
|
||||
|
||||
### Persistent Keys (Advanced - Shared Cache)
|
||||
|
||||
**How It Works**:
|
||||
- Use same encryption key across multiple processes/clients
|
||||
- Key stored in environment variable or secrets manager
|
||||
- All processes can read each other's encrypted cache
|
||||
- **Trade-off**: Convenience vs. security
|
||||
|
||||
**When to Use**:
|
||||
- Multi-process applications (workers, replicas)
|
||||
- Shared cache across containers
|
||||
- Cache persistence across application restarts
|
||||
- Horizontal scaling scenarios
|
||||
|
||||
**Example - Environment Variable**:
|
||||
```python
|
||||
import os
|
||||
from cryptography.fernet import Fernet
|
||||
import base64
|
||||
|
||||
# Generate persistent key (do this ONCE, securely)
|
||||
key = Fernet.generate_key()
|
||||
key_b64 = base64.b64encode(key).decode('utf-8')
|
||||
print(f"DG_CACHE_ENCRYPTION_KEY={key_b64}") # Store in secrets manager!
|
||||
|
||||
# Set in environment (or use secrets manager)
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = key_b64
|
||||
|
||||
# All client instances use same key
|
||||
client1 = create_client()
|
||||
client2 = create_client()
|
||||
|
||||
# Client1 writes to cache
|
||||
client1.put_object(Bucket='bucket', Key='file.zip', Body=data)
|
||||
|
||||
# Client2 can read same cached data (same key!)
|
||||
client2.get_object(Bucket='bucket', Key='file.zip')
|
||||
|
||||
# ✅ Cache shared between processes
|
||||
# ⚠️ Key must be securely managed
|
||||
```
|
||||
|
||||
**Example - AWS Secrets Manager**:
|
||||
```python
|
||||
import json
import os

import boto3
|
||||
from deltaglider import create_client
|
||||
|
||||
def get_encryption_key_from_secrets_manager():
|
||||
"""Retrieve encryption key from AWS Secrets Manager."""
|
||||
secrets = boto3.client('secretsmanager', region_name='us-west-2')
|
||||
response = secrets.get_secret_value(SecretId='deltaglider/cache-encryption-key')
|
||||
secret = json.loads(response['SecretString'])
|
||||
return secret['encryption_key']
|
||||
|
||||
# Retrieve key securely
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = get_encryption_key_from_secrets_manager()
|
||||
|
||||
# Create client with persistent key
|
||||
client = create_client()
|
||||
```
|
||||
|
||||
**Security Best Practices for Persistent Keys**:
|
||||
1. **Generate once**: Never regenerate unless rotating
|
||||
2. **Store securely**: AWS Secrets Manager, HashiCorp Vault, etc.
|
||||
3. **Rotate regularly**: Follow your key rotation policy
|
||||
4. **Audit access**: Log who accesses encryption keys
|
||||
5. **Principle of least privilege**: Only processes that need shared cache get the key
|
||||
|
||||
### Key Rotation
|
||||
|
||||
**When to Rotate**:
|
||||
- Regular schedule (every 90 days recommended)
|
||||
- After security incident
|
||||
- When team members leave
|
||||
- After key exposure
|
||||
|
||||
**How to Rotate**:
|
||||
```python
|
||||
import base64
import os
|
||||
from deltaglider import create_client
|
||||
|
||||
# OLD KEY (existing cache encrypted with this)
|
||||
old_key = os.environ.get('DG_CACHE_ENCRYPTION_KEY')
|
||||
|
||||
# Generate NEW KEY
|
||||
from cryptography.fernet import Fernet
|
||||
new_key = Fernet.generate_key()
|
||||
new_key_b64 = base64.b64encode(new_key).decode('utf-8')
|
||||
|
||||
# Steps for rotation:
|
||||
# 1. Clear old cache (encrypted with old key)
|
||||
client_old = create_client() # Uses old key from env
|
||||
client_old.clear_cache()
|
||||
|
||||
# 2. Update environment with new key
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = new_key_b64
|
||||
|
||||
# 3. Create new client with new key
|
||||
client_new = create_client() # Uses new key
|
||||
|
||||
# 4. Continue operations
|
||||
client_new.put_object(Bucket='bucket', Key='file.zip', Body=data)
|
||||
|
||||
# ✅ Cache now encrypted with new key
|
||||
# ✅ Old encrypted cache cleared
|
||||
```
|
||||
|
||||
## Memory vs. Filesystem Cache
|
||||
|
||||
### Filesystem Cache (Default)
|
||||
|
||||
**Characteristics**:
|
||||
- Stored in `/tmp/deltaglider-*/`
|
||||
- Persistent across client recreations (within same process)
|
||||
- Can be shared between processes (with persistent encryption key)
|
||||
- Slower than memory cache (disk I/O)
|
||||
|
||||
**Configuration**:
|
||||
```python
|
||||
import os
|
||||
|
||||
# Explicitly set filesystem cache (this is the default)
|
||||
os.environ['DG_CACHE_BACKEND'] = 'filesystem'
|
||||
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
```
|
||||
|
||||
**When to Use**:
|
||||
- Default choice for most applications
|
||||
- When cache should persist across client recreations
|
||||
- Multi-process applications (with persistent key)
|
||||
- Memory-constrained environments
|
||||
|
||||
**Cache Clearing**:
|
||||
```python
|
||||
client.clear_cache()
|
||||
# Removes all files from /tmp/deltaglider-*/
|
||||
# Frees disk space
|
||||
```
|
||||
|
||||
### Memory Cache
|
||||
|
||||
**Characteristics**:
|
||||
- Stored in process memory (RAM)
|
||||
- Fast access (no disk I/O)
|
||||
- Automatically freed when process exits
|
||||
- LRU eviction prevents unlimited growth
|
||||
- Not shared between processes
|
||||
|
||||
**Configuration**:
|
||||
```python
|
||||
import os
|
||||
|
||||
# Enable memory cache
|
||||
os.environ['DG_CACHE_BACKEND'] = 'memory'
|
||||
os.environ['DG_CACHE_MEMORY_SIZE_MB'] = '200' # Default: 100MB
|
||||
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
```
|
||||
|
||||
**When to Use**:
|
||||
- High-performance requirements
|
||||
- Ephemeral environments (containers, Lambda)
|
||||
- Short-lived applications
|
||||
- CI/CD pipelines
|
||||
- When disk I/O is bottleneck
|
||||
|
||||
**Cache Clearing**:
|
||||
```python
|
||||
client.clear_cache()
|
||||
# Frees memory immediately
|
||||
# No disk I/O
|
||||
```
|
||||
|
||||
**LRU Eviction**:
|
||||
Memory cache automatically evicts the least recently used entries when the size limit is reached. No manual intervention is needed.
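For intuition, here is a minimal sketch of size-bounded LRU eviction. It illustrates the idea only; it is not DeltaGlider's actual memory backend:

```python
from collections import OrderedDict


class LruByteCache:
    """Toy size-bounded LRU cache: evicts least recently used entries first."""

    def __init__(self, max_bytes: int):
        self.max_bytes = max_bytes
        self.current_bytes = 0
        self.entries: OrderedDict[str, bytes] = OrderedDict()

    def get(self, key: str) -> bytes | None:
        if key not in self.entries:
            return None
        self.entries.move_to_end(key)  # mark as most recently used
        return self.entries[key]

    def put(self, key: str, value: bytes) -> None:
        if key in self.entries:
            self.current_bytes -= len(self.entries.pop(key))
        self.entries[key] = value
        self.current_bytes += len(value)
        while self.current_bytes > self.max_bytes:
            _, evicted = self.entries.popitem(last=False)  # drop the oldest entry
            self.current_bytes -= len(evicted)
```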
|
||||
|
||||
## Best Practices Summary
|
||||
|
||||
### ✅ DO
|
||||
|
||||
1. **Clear cache periodically** in long-running applications
|
||||
2. **Clear cache in `finally` blocks** for batch jobs
|
||||
3. **Use fixtures for tests** to ensure clean state
|
||||
4. **Monitor cache size** in production
|
||||
5. **Use ephemeral keys** when possible (maximum security)
|
||||
6. **Store persistent keys securely** (Secrets Manager, Vault)
|
||||
7. **Rotate encryption keys** regularly
|
||||
8. **Use memory cache** for ephemeral environments
|
||||
9. **Clear cache after config changes**
|
||||
10. **Document cache strategy** for your application
|
||||
|
||||
### ❌ DON'T
|
||||
|
||||
1. **Never let cache grow unbounded** in long-running apps
|
||||
2. **Don't share ephemeral encrypted cache** between processes
|
||||
3. **Don't store persistent keys in code** or version control
|
||||
4. **Don't forget to clear cache in tests**
|
||||
5. **Don't assume cache is automatically cleaned** in SDK usage
|
||||
6. **Don't use persistent keys** unless you need cross-process sharing
|
||||
7. **Don't skip key rotation**
|
||||
8. **Don't ignore memory limits** for memory cache
|
||||
|
||||
## Monitoring Cache Health
|
||||
|
||||
### Cache Size Tracking
|
||||
|
||||
```python
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from deltaglider import create_client
|
||||
|
||||
def get_cache_size_mb(cache_dir: Path) -> float:
|
||||
"""Calculate total cache size in MB."""
|
||||
total_bytes = sum(f.stat().st_size for f in cache_dir.rglob('*') if f.is_file())
|
||||
return total_bytes / (1024 * 1024)
|
||||
|
||||
# Get cache directory (ephemeral, changes per process)
|
||||
cache_dir = Path(tempfile.gettempdir())
|
||||
deltaglider_caches = list(cache_dir.glob('deltaglider-*'))
|
||||
|
||||
if deltaglider_caches:
|
||||
cache_path = deltaglider_caches[0]
|
||||
cache_size_mb = get_cache_size_mb(cache_path)
|
||||
print(f"Cache size: {cache_size_mb:.2f} MB")
|
||||
|
||||
# Clear if over threshold
|
||||
if cache_size_mb > 500:
|
||||
client = create_client()
|
||||
client.clear_cache()
|
||||
print("Cache cleared - exceeded 500MB")
|
||||
```
|
||||
|
||||
### Memory Cache Monitoring
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ['DG_CACHE_BACKEND'] = 'memory'
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Memory cache auto-evicts at configured limit
|
||||
# Monitor via application memory usage tools (not direct API)
|
||||
|
||||
# Example with memory_profiler
|
||||
from memory_profiler import profile
|
||||
|
||||
@profile
|
||||
def upload_many_files():
|
||||
for i in range(1000):
|
||||
client.put_object(Bucket='test', Key=f'file{i}.zip', Body=b'data' * 1000)
|
||||
# Check memory profile to see cache memory usage
|
||||
|
||||
upload_many_files()
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Problem: Cache Growing Too Large
|
||||
|
||||
**Symptoms**:
|
||||
- Disk space running out (`/tmp` filling up)
|
||||
- High memory usage
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
# Implement automatic cache clearing
import psutil
import schedule  # assumes the third-party `schedule` package is installed

from deltaglider import create_client

client = create_client()
|
||||
|
||||
def smart_cache_clear():
|
||||
"""Clear cache if memory/disk pressure detected."""
|
||||
# Check disk space
|
||||
disk = psutil.disk_usage('/tmp')
|
||||
if disk.percent > 80:
|
||||
client.clear_cache()
|
||||
print("Cache cleared - disk pressure")
|
||||
return
|
||||
|
||||
# Check memory usage
|
||||
memory = psutil.virtual_memory()
|
||||
if memory.percent > 80:
|
||||
client.clear_cache()
|
||||
print("Cache cleared - memory pressure")
|
||||
|
||||
# Call periodically
|
||||
schedule.every(15).minutes.do(smart_cache_clear)
|
||||
```
|
||||
|
||||
### Problem: Decryption Failures After Key Rotation
|
||||
|
||||
**Symptoms**:
|
||||
- `CacheCorruptionError: Decryption failed`
|
||||
- After rotating encryption keys
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
import os

from deltaglider import create_client

# Clear cache before using new key
|
||||
old_client = create_client() # Old key
|
||||
old_client.clear_cache()
|
||||
|
||||
# Update encryption key
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = new_key  # new key generated as in the Key Rotation section
|
||||
|
||||
# Create new client
|
||||
new_client = create_client() # New key
|
||||
```
|
||||
|
||||
### Problem: Tests Failing Due to Cached Data
|
||||
|
||||
**Symptoms**:
|
||||
- Tests pass in isolation, fail when run together
|
||||
- Unexpected data in downloads
|
||||
|
||||
**Solution**:
|
||||
```python
|
||||
import pytest

from deltaglider import create_client

# Always clear cache in test teardown
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache_after_test():
|
||||
"""Automatically clear cache after every test."""
|
||||
yield
|
||||
# Teardown
|
||||
client = create_client()
|
||||
client.clear_cache()
|
||||
```
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [docs/sdk/getting-started.md](sdk/getting-started.md) - SDK configuration
|
||||
- [README.md](../README.md) - Docker and environment variables
|
||||
- [CLAUDE.md](../CLAUDE.md) - Development guide
|
||||
|
||||
## Quick Reference
|
||||
|
||||
```python
|
||||
import os

from deltaglider import create_client
|
||||
|
||||
# Create client
|
||||
client = create_client()
|
||||
|
||||
# Clear all cache
|
||||
client.clear_cache()
|
||||
|
||||
# Use memory cache
|
||||
os.environ['DG_CACHE_BACKEND'] = 'memory'
|
||||
|
||||
# Use persistent encryption key
|
||||
os.environ['DG_CACHE_ENCRYPTION_KEY'] = 'base64-encoded-key'
|
||||
```
|
||||
526
docs/DG_MAX_RATIO.md
Normal file
@@ -0,0 +1,526 @@
|
||||
# DG_MAX_RATIO - Delta Compression Efficiency Guard
|
||||
|
||||
## Overview
|
||||
|
||||
`DG_MAX_RATIO` is a **safety threshold** that prevents inefficient delta compression. It controls the maximum acceptable ratio of `delta_size / original_file_size`.
|
||||
|
||||
**Default**: `0.5` (50%)
|
||||
**Range**: `0.0` to `1.0`
|
||||
**Environment Variable**: `DG_MAX_RATIO`
|
||||
|
||||
## The Problem It Solves
|
||||
|
||||
When files are **too different**, xdelta3 can create a delta file that's **almost as large as the original file** (or even larger!). This defeats the purpose of compression and wastes:
|
||||
- ❌ Storage space (storing a large delta instead of the original)
|
||||
- ❌ CPU time (creating and applying the delta)
|
||||
- ❌ Network bandwidth (downloading delta + reference instead of just the file)
|
||||
|
||||
## How It Works
|
||||
|
||||
```
|
||||
1. Upload file → Create delta using xdelta3
|
||||
2. Calculate ratio = delta_size / original_file_size
|
||||
3. If ratio > DG_MAX_RATIO:
|
||||
❌ Discard delta, store original file directly
|
||||
Else:
|
||||
✅ Keep delta, save storage space
|
||||
```
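In code, the guard is a single ratio comparison. A minimal sketch of the decision (the function name is illustrative, not DeltaGlider's internal API):

```python
import os


def should_keep_delta(delta_size: int, original_size: int) -> bool:
    """Keep the delta only when it is small enough relative to the original."""
    max_ratio = float(os.environ.get("DG_MAX_RATIO", "0.5"))
    return (delta_size / original_size) <= max_ratio


# 60MB delta for a 100MB original -> ratio 0.6, rejected with the default 0.5
print(should_keep_delta(60 * 1024**2, 100 * 1024**2))  # False
```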
|
||||
|
||||
### Example Flow
|
||||
|
||||
```
|
||||
Original file: 100MB
|
||||
Reference file: 100MB (uploaded previously)
|
||||
|
||||
xdelta3 creates delta: 60MB
|
||||
Ratio = 60MB / 100MB = 0.6 (60%)
|
||||
|
||||
With DG_MAX_RATIO=0.5:
|
||||
❌ Delta rejected (60% > 50%)
|
||||
Action: Store 100MB original file
|
||||
Reason: Delta not efficient enough
|
||||
|
||||
With DG_MAX_RATIO=0.7:
|
||||
✅ Delta accepted (60% ≤ 70%)
|
||||
Action: Store 60MB delta
|
||||
Savings: 40MB (40%)
|
||||
```
|
||||
|
||||
## Default Value: 0.5 (50%)
|
||||
|
||||
**Meaning**: Only use delta compression if the delta is **≤50% of the original file size**
|
||||
|
||||
This default provides:
|
||||
- ✅ Minimum 50% storage savings when delta compression is used
|
||||
- ✅ Prevents wasting CPU on inefficient compression
|
||||
- ✅ Works well for typical versioned releases (minor updates between versions)
|
||||
- ✅ Balanced approach without manual tuning
|
||||
|
||||
## When to Adjust
|
||||
|
||||
### 🔽 Lower Value (More Conservative)
|
||||
|
||||
**Set `DG_MAX_RATIO=0.2-0.3` when:**
|
||||
- Files change significantly between versions (major updates, refactors)
|
||||
- Storage cost is **very high** (premium S3 tiers, small quotas)
|
||||
- You want to avoid **any** inefficient compression
|
||||
- You need guaranteed high-quality compression (≥70% savings)
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.3
|
||||
|
||||
# Only accept deltas that are ≤30% of original size
|
||||
# More files stored directly, but guaranteed ≥70% savings when using deltas
|
||||
```
|
||||
|
||||
**Trade-off**:
|
||||
- ✅ Higher quality compression when deltas are used
|
||||
- ❌ Fewer files will use delta compression
|
||||
- ❌ More direct uploads (higher storage for dissimilar files)
|
||||
|
||||
### 🔼 Higher Value (More Permissive)
|
||||
|
||||
**Set `DG_MAX_RATIO=0.6-0.8` when:**
|
||||
- Files are very similar (minor patches, nightly builds, incremental changes)
|
||||
- Storage cost is **cheap** (large S3 buckets, unlimited quotas)
|
||||
- Bandwidth is **expensive** (you want to avoid re-uploading the full file even with modest compression)
|
||||
- You want to maximize delta compression usage
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.7
|
||||
|
||||
# Accept deltas up to 70% of original size
|
||||
# More files use delta compression, even with modest 30% savings
|
||||
```
|
||||
|
||||
**Trade-off**:
|
||||
- ✅ More files use delta compression
|
||||
- ✅ Save bandwidth even with modest compression
|
||||
- ❌ Some deltas may only save 20-30% space
|
||||
- ❌ More CPU time spent on marginal compressions
|
||||
|
||||
## Real-World Scenarios
|
||||
|
||||
### Scenario 1: Nightly Builds (Minimal Changes) ⭐ IDEAL
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (reference)
|
||||
my-app-v1.0.1.zip → 100MB (0.1% code change)
|
||||
|
||||
Delta: 200KB (0.2% of original)
|
||||
Ratio: 0.002
|
||||
|
||||
With ANY DG_MAX_RATIO: ✅ Use delta (99.8% savings!)
|
||||
Result: Store 200KB instead of 100MB
|
||||
```
|
||||
|
||||
**This is what DeltaGlider excels at!**
|
||||
|
||||
### Scenario 2: Major Version (Significant Changes)
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (reference)
|
||||
my-app-v2.0.0.zip → 100MB (complete rewrite, 85% different)
|
||||
|
||||
Delta: 85MB (85% of original)
|
||||
Ratio: 0.85
|
||||
|
||||
With DG_MAX_RATIO=0.5: ❌ Store original (85% > 50%)
|
||||
→ Stores 100MB directly
|
||||
→ No compression benefit, but no CPU waste
|
||||
|
||||
With DG_MAX_RATIO=0.9: ✅ Use delta (85% ≤ 90%)
|
||||
→ Stores 85MB delta
|
||||
→ Only 15% savings, questionable benefit
|
||||
```
|
||||
|
||||
**Recommendation**: For major versions, default `0.5` correctly rejects inefficient compression.
|
||||
|
||||
### Scenario 3: Different File Format (Same Content)
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (ZIP archive)
|
||||
my-app-v1.0.0.tar → 100MB (TAR archive, same content)
|
||||
|
||||
Delta: 70MB (completely different format structure)
|
||||
Ratio: 0.70
|
||||
|
||||
With DG_MAX_RATIO=0.5: ❌ Store original (70% > 50%)
|
||||
→ Stores 100MB directly
|
||||
→ Correct decision: formats too different
|
||||
|
||||
With DG_MAX_RATIO=0.8: ✅ Use delta (70% ≤ 80%)
|
||||
→ Stores 70MB delta
|
||||
→ 30% savings, but CPU-intensive
|
||||
```
|
||||
|
||||
**Recommendation**: Use consistent file formats for better compression. Default `0.5` correctly rejects cross-format compression.
|
||||
|
||||
### Scenario 4: Incremental Updates (Sweet Spot) ⭐
|
||||
|
||||
```
|
||||
my-app-v1.0.0.zip → 100MB (reference)
|
||||
my-app-v1.0.1.zip → 100MB (minor bugfix, 5% code change)
|
||||
|
||||
Delta: 5MB (5% of original)
|
||||
Ratio: 0.05
|
||||
|
||||
With ANY DG_MAX_RATIO: ✅ Use delta (95% savings!)
|
||||
Result: Store 5MB instead of 100MB
|
||||
```
|
||||
|
||||
**This is the target use case for delta compression!**
|
||||
|
||||
## How to Choose the Right Value
|
||||
|
||||
### Decision Tree
|
||||
|
||||
```
|
||||
Do your files have minimal changes between versions? (< 5% different)
|
||||
├─ YES → Use default 0.5 ✅
|
||||
│ Delta compression will work perfectly
|
||||
│
|
||||
└─ NO → Are your files significantly different? (> 50% different)
|
||||
├─ YES → Lower to 0.2-0.3 🔽
|
||||
│ Avoid wasting time on inefficient compression
|
||||
│
|
||||
└─ NO → Are they moderately different? (20-50% different)
|
||||
├─ Storage is expensive → Lower to 0.3 🔽
|
||||
│ Only high-quality compression
|
||||
│
|
||||
└─ Storage is cheap → Raise to 0.6-0.7 🔼
|
||||
Accept modest savings
|
||||
```
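If you want to codify this decision tree in your own tooling, here is a small illustrative helper. The thresholds mirror the tree above; the function itself is not part of DeltaGlider:

```python
def suggest_max_ratio(expected_change: float, storage_is_expensive: bool = False) -> float:
    """Suggest a DG_MAX_RATIO from the expected change between versions (0.0-1.0)."""
    if expected_change < 0.20:
        return 0.5  # minimal/minor changes: the default works well
    if expected_change > 0.50:
        return 0.3  # significantly different files: avoid inefficient compression
    # moderately different (20-50% change)
    return 0.3 if storage_is_expensive else 0.7
```

For example, `suggest_max_ratio(0.35, storage_is_expensive=True)` returns `0.3`, matching the "storage is expensive" branch of the tree.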
|
||||
|
||||
### Quick Reference Table
|
||||
|
||||
| File Similarity | Recommended DG_MAX_RATIO | Expected Behavior |
|
||||
|----------------|--------------------------|-------------------|
|
||||
| Nearly identical (< 5% change) | **0.5 (default)** | 🟢 95%+ savings |
|
||||
| Minor updates (5-20% change) | **0.5 (default)** | 🟢 80-95% savings |
|
||||
| Moderate changes (20-50% change) | **0.4-0.6** | 🟡 50-80% savings |
|
||||
| Major changes (50-80% change) | **0.3 or lower** | 🔴 Store directly |
|
||||
| Complete rewrites (> 80% change) | **0.3 or lower** | 🔴 Store directly |
|
||||
|
||||
### Use Cases by Industry
|
||||
|
||||
**Software Releases (SaaS)**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.5 # Default
|
||||
# Nightly builds with minor changes compress perfectly
|
||||
```
|
||||
|
||||
**Mobile App Builds**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.4 # Slightly conservative
|
||||
# iOS/Android builds can vary, want quality compression only
|
||||
```
|
||||
|
||||
**Database Backups**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.7 # Permissive
|
||||
# Daily backups are very similar, accept modest savings
|
||||
```
|
||||
|
||||
**Document Archives**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.6 # Moderate
|
||||
# Documents change incrementally, accept good savings
|
||||
```
|
||||
|
||||
**Video/Media Archives**:
|
||||
```bash
|
||||
export DG_MAX_RATIO=0.2 # Very conservative
|
||||
# Media files are unique, only compress if very similar
|
||||
```
|
||||
|
||||
## Configuration Examples
|
||||
|
||||
### Docker
|
||||
|
||||
**Conservative (Premium Storage)**:
|
||||
```bash
|
||||
docker run -e DG_MAX_RATIO=0.3 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
|
||||
deltaglider/deltaglider cp file.zip s3://releases/
|
||||
```
|
||||
|
||||
**Default (Balanced)**:
|
||||
```bash
|
||||
docker run -e DG_MAX_RATIO=0.5 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
|
||||
deltaglider/deltaglider cp file.zip s3://releases/
|
||||
```
|
||||
|
||||
**Permissive (Cheap Storage)**:
|
||||
```bash
|
||||
docker run -e DG_MAX_RATIO=0.7 \
|
||||
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY \
|
||||
deltaglider/deltaglider cp file.zip s3://releases/
|
||||
```
|
||||
|
||||
### Python SDK
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
# Conservative (only high-quality compression)
|
||||
os.environ['DG_MAX_RATIO'] = '0.3'
|
||||
|
||||
# Default (balanced)
|
||||
os.environ['DG_MAX_RATIO'] = '0.5'
|
||||
|
||||
# Permissive (accept modest savings)
|
||||
os.environ['DG_MAX_RATIO'] = '0.7'
|
||||
|
||||
from deltaglider import create_client
|
||||
client = create_client()
|
||||
|
||||
summary = client.upload("file.zip", "s3://bucket/")
|
||||
print(f"Delta ratio: {summary.delta_ratio:.2f}")
|
||||
print(f"Used delta: {summary.is_delta}")
|
||||
```
|
||||
|
||||
### CLI
|
||||
|
||||
```bash
|
||||
# Conservative
|
||||
export DG_MAX_RATIO=0.3
|
||||
deltaglider cp my-app-v2.0.0.zip s3://releases/
|
||||
|
||||
# Default
|
||||
export DG_MAX_RATIO=0.5
|
||||
deltaglider cp my-app-v2.0.0.zip s3://releases/
|
||||
|
||||
# Permissive
|
||||
export DG_MAX_RATIO=0.7
|
||||
deltaglider cp my-app-v2.0.0.zip s3://releases/
|
||||
```
|
||||
|
||||
### Override Per-Upload (CLI)
|
||||
|
||||
```bash
|
||||
# Use custom ratio for specific file
|
||||
deltaglider cp large-file.zip s3://releases/ --max-ratio 0.3
|
||||
```
|
||||
|
||||
## Monitoring and Tuning
|
||||
|
||||
### Check Delta Ratios After Upload
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
summary = client.upload("file.zip", "s3://bucket/")
|
||||
|
||||
print(f"Delta ratio: {summary.delta_ratio:.2f}")
|
||||
print(f"Savings: {summary.savings_percent:.0f}%")
|
||||
print(f"Used delta: {summary.is_delta}")
|
||||
|
||||
if summary.is_delta:
|
||||
print(f"✅ Used delta compression (ratio {summary.delta_ratio:.2f})")
|
||||
if summary.delta_ratio > 0.4:
|
||||
print(f"⚠️ Consider lowering DG_MAX_RATIO for better quality")
|
||||
else:
|
||||
print(f"❌ Stored directly (delta would have been ratio ~{summary.delta_ratio:.2f})")
|
||||
if summary.delta_ratio < 0.6:
|
||||
print(f"💡 Consider raising DG_MAX_RATIO to enable compression")
|
||||
```
|
||||
|
||||
### Batch Analysis
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from pathlib import Path
|
||||
|
||||
client = create_client()
|
||||
ratios = []
|
||||
|
||||
for file in Path("releases").glob("*.zip"):
|
||||
summary = client.upload(str(file), f"s3://bucket/{file.name}")
|
||||
if summary.is_delta:
|
||||
ratios.append(summary.delta_ratio)
|
||||
|
||||
if ratios:
|
||||
avg_ratio = sum(ratios) / len(ratios)
|
||||
max_ratio = max(ratios)
|
||||
|
||||
print(f"Average delta ratio: {avg_ratio:.2f}")
|
||||
print(f"Maximum delta ratio: {max_ratio:.2f}")
|
||||
print(f"Files compressed: {len(ratios)}")
|
||||
|
||||
if avg_ratio < 0.2:
|
||||
print("💡 Consider raising DG_MAX_RATIO - you're getting excellent compression")
|
||||
elif max_ratio > 0.6:
|
||||
print("⚠️ Consider lowering DG_MAX_RATIO - some deltas are inefficient")
|
||||
```
|
||||
|
||||
### Optimization Strategy
|
||||
|
||||
1. **Start with default (0.5)**
|
||||
2. **Monitor delta ratios** for 1 week of uploads
|
||||
3. **Analyze results**:
|
||||
- If most ratios < 0.2: Consider raising to 0.6-0.7
|
||||
- If many ratios > 0.4: Consider lowering to 0.3-0.4
|
||||
- If ratios vary widely: Keep default 0.5
|
||||
4. **Adjust and re-test** for 1 week
|
||||
5. **Repeat until optimal** for your use case
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Dynamic Ratio Based on File Type
|
||||
|
||||
```python
|
||||
import os
|
||||
from pathlib import Path
|
||||
from deltaglider import create_client
|
||||
|
||||
def get_optimal_ratio(file_path: str) -> float:
|
||||
"""Determine optimal ratio based on file type."""
|
||||
suffix = Path(file_path).suffix.lower()
|
||||
|
||||
# Very compressible (source code archives)
|
||||
if suffix in ['.zip', '.tar', '.gz']:
|
||||
return 0.6
|
||||
|
||||
# Moderately compressible (compiled binaries)
|
||||
elif suffix in ['.jar', '.war', '.deb', '.rpm']:
|
||||
return 0.5
|
||||
|
||||
# Rarely compressible (media, already compressed)
|
||||
elif suffix in ['.mp4', '.jpg', '.png']:
|
||||
return 0.2
|
||||
|
||||
# Default
|
||||
return 0.5
|
||||
|
||||
file = "my-app.zip"
|
||||
os.environ['DG_MAX_RATIO'] = str(get_optimal_ratio(file))
|
||||
|
||||
client = create_client()
|
||||
summary = client.upload(file, "s3://bucket/")
|
||||
```
|
||||
|
||||
### A/B Testing Different Ratios
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import os
|
||||
|
||||
def test_ratios(file_path: str, ratios: list[float]):
|
||||
"""Test different ratios and report results."""
|
||||
results = []
|
||||
|
||||
for ratio in ratios:
|
||||
os.environ['DG_MAX_RATIO'] = str(ratio)
|
||||
client = create_client()
|
||||
|
||||
# Simulate upload (don't actually upload)
|
||||
summary = client.estimate_compression(file_path, "s3://bucket/test/")
|
||||
|
||||
results.append({
|
||||
'ratio_threshold': ratio,
|
||||
'would_use_delta': summary.delta_ratio <= ratio,
|
||||
'delta_ratio': summary.delta_ratio,
|
||||
'savings': summary.savings_percent if summary.delta_ratio <= ratio else 0
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
# Test different ratios
|
||||
file = "my-app-v2.0.0.zip"
|
||||
test_results = test_ratios(file, [0.3, 0.5, 0.7])
|
||||
|
||||
for result in test_results:
|
||||
print(f"Ratio {result['ratio_threshold']}: "
|
||||
f"Delta={result['would_use_delta']}, "
|
||||
f"Savings={result['savings']:.0f}%")
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
### Q: What happens if I set DG_MAX_RATIO=1.0?
|
||||
|
||||
**A**: Deltas up to **100% of the original size** will be accepted, including deltas that save essentially nothing. You still pay the CPU cost of creating the delta and the reconstruction cost on every download, so this is generally a bad idea and defeats the purpose of the threshold.
|
||||
|
||||
**Example**:
|
||||
```bash
export DG_MAX_RATIO=1.0

# File: 100MB, Delta: 98MB
# Ratio: 0.98
# With DG_MAX_RATIO=1.0: ✅ Delta accepted (0.98 ≤ 1.0)
# Result: you store a 98MB delta to "save" 2MB, and every download
# must fetch the reference and apply the delta to reconstruct the file

# Keep DG_MAX_RATIO well below 1.0
```
|
||||
|
||||
### Q: What happens if I set DG_MAX_RATIO=0.0?
|
||||
|
||||
**A**: Delta compression will **never** be used. All files will be stored directly. This is equivalent to disabling DeltaGlider's compression entirely.
|
||||
|
||||
### Q: Can I disable the ratio check?
|
||||
|
||||
**A**: No, and you shouldn't want to. The ratio check is a critical safety feature that prevents wasting storage and CPU on inefficient compression.
|
||||
|
||||
### Q: Does DG_MAX_RATIO affect downloading?
|
||||
|
||||
**A**: No, `DG_MAX_RATIO` only affects **uploads**. During download, DeltaGlider automatically detects whether a file is stored as a delta or directly and handles reconstruction accordingly.
|
||||
|
||||
### Q: Can I set different ratios for different buckets?
|
||||
|
||||
**A**: Not directly via environment variables, but you can change `DG_MAX_RATIO` before each upload in your code:
|
||||
|
||||
```python
|
||||
import os
|
||||
from deltaglider import create_client
|
||||
|
||||
# High-quality compression for production releases
|
||||
os.environ['DG_MAX_RATIO'] = '0.3'
|
||||
client = create_client()
|
||||
client.upload("prod-release.zip", "s3://production/")
|
||||
|
||||
# Permissive compression for dev builds
|
||||
os.environ['DG_MAX_RATIO'] = '0.7'
|
||||
client = create_client()
|
||||
client.upload("dev-build.zip", "s3://development/")
|
||||
```
|
||||
|
||||
### Q: How do I know if my DG_MAX_RATIO is set correctly?
|
||||
|
||||
**A**: Monitor your upload summaries. If most deltas have ratios close to your threshold (e.g., 0.45-0.50 with default 0.5), you might want to lower it. If most deltas have very low ratios (e.g., < 0.2), you could raise it.
|
||||
|
||||
**Ideal scenario**: Most successful delta compressions have ratios < 0.3, and inefficient deltas (> 0.5) are correctly rejected.
|
||||
|
||||
## Summary
|
||||
|
||||
**`DG_MAX_RATIO` prevents wasting time and storage on inefficient delta compression.**
|
||||
|
||||
### Quick Takeaways
|
||||
|
||||
✅ **Default 0.5 works for 90% of use cases**
|
||||
✅ **Lower values (0.2-0.3) for dissimilar files or expensive storage**
|
||||
✅ **Higher values (0.6-0.7) for very similar files or cheap storage**
|
||||
✅ **Monitor delta ratios to tune for your use case**
|
||||
✅ **Never set to 1.0 or higher (defeats the purpose)**
|
||||
✅ **Never set to 0.0 (disables delta compression entirely)**
|
||||
|
||||
### Golden Rule
|
||||
|
||||
**If you're not sure, keep the default `0.5`.**
|
||||
|
||||
It's a sensible balance that:
|
||||
- Prevents inefficient compression (no deltas > 50% of original size)
|
||||
- Allows excellent savings on similar files (most deltas are < 20%)
|
||||
- Works well for typical versioned releases
|
||||
- Requires no manual tuning for most use cases
|
||||
|
||||
---
|
||||
|
||||
**Related Documentation**:
|
||||
- [CLAUDE.md](../CLAUDE.md) - Environment variables reference
|
||||
- [README.md](../README.md) - Docker usage and configuration
|
||||
- [docs/sdk/getting-started.md](sdk/getting-started.md) - SDK configuration guide
|
||||
364
docs/DOCKER.md
Normal file
@@ -0,0 +1,364 @@
|
||||
# Docker Support for DeltaGlider
|
||||
|
||||
This document describes how to build, run, and publish Docker images for DeltaGlider.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Pull and run the latest image
|
||||
|
||||
```bash
|
||||
docker pull beshultd/deltaglider:latest
|
||||
docker run --rm beshultd/deltaglider:latest --help
|
||||
```
|
||||
|
||||
### Run with AWS credentials
|
||||
|
||||
```bash
|
||||
docker run --rm \
|
||||
-e AWS_ACCESS_KEY_ID=your_key \
|
||||
-e AWS_SECRET_ACCESS_KEY=your_secret \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:latest ls s3://your-bucket/
|
||||
```
|
||||
|
||||
### Run with MinIO (local S3 alternative)
|
||||
|
||||
```bash
|
||||
# Start MinIO
|
||||
docker run -d \
|
||||
-p 9000:9000 -p 9001:9001 \
|
||||
-e MINIO_ROOT_USER=minioadmin \
|
||||
-e MINIO_ROOT_PASSWORD=minioadmin \
|
||||
--name minio \
|
||||
minio/minio server /data --console-address ":9001"
|
||||
|
||||
# Use DeltaGlider with MinIO
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:latest ls
|
||||
```
|
||||
|
||||
## Building Locally
|
||||
|
||||
### Build with current git version
|
||||
|
||||
```bash
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker build --build-arg VERSION=${VERSION} -t beshultd/deltaglider:${VERSION} .
|
||||
```
|
||||
|
||||
### Build with custom version
|
||||
|
||||
```bash
|
||||
docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
### Multi-platform build
|
||||
|
||||
```bash
|
||||
# Create a buildx builder (one-time setup)
|
||||
docker buildx create --name deltaglider-builder --use
|
||||
|
||||
# Build for multiple platforms
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=6.0.2 \
|
||||
-t beshultd/deltaglider:6.0.2 \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
## Testing the Image
|
||||
|
||||
### Basic functionality test
|
||||
|
||||
```bash
|
||||
# Check version
|
||||
docker run --rm beshultd/deltaglider:test --version
|
||||
|
||||
# Check help
|
||||
docker run --rm beshultd/deltaglider:test --help
|
||||
|
||||
# List available commands
|
||||
docker run --rm beshultd/deltaglider:test
|
||||
```
|
||||
|
||||
### Integration test with MinIO
|
||||
|
||||
```bash
|
||||
# 1. Start MinIO
|
||||
docker run -d \
|
||||
-p 9000:9000 -p 9001:9001 \
|
||||
-e MINIO_ROOT_USER=minioadmin \
|
||||
-e MINIO_ROOT_PASSWORD=minioadmin \
|
||||
--name minio \
|
||||
minio/minio server /data --console-address ":9001"
|
||||
|
||||
# 2. Create a test file
|
||||
echo "Hello DeltaGlider" > test.txt
|
||||
|
||||
# 3. Upload to S3/MinIO
|
||||
docker run --rm \
|
||||
-v $(pwd):/data \
|
||||
-w /data \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test cp test.txt s3://test-bucket/
|
||||
|
||||
# 4. List bucket contents
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test ls s3://test-bucket/
|
||||
|
||||
# 5. Get statistics
|
||||
docker run --rm \
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000 \
|
||||
-e AWS_ACCESS_KEY_ID=minioadmin \
|
||||
-e AWS_SECRET_ACCESS_KEY=minioadmin \
|
||||
-e AWS_DEFAULT_REGION=us-east-1 \
|
||||
beshultd/deltaglider:test stats test-bucket
|
||||
|
||||
# 6. Cleanup
|
||||
docker stop minio && docker rm minio
|
||||
rm test.txt
|
||||
```
|
||||
|
||||
## Publishing to Docker Hub
|
||||
|
||||
### Manual Publishing
|
||||
|
||||
```bash
|
||||
# 1. Log in to Docker Hub
|
||||
docker login
|
||||
|
||||
# 2. Build the image
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker build --build-arg VERSION=${VERSION} \
|
||||
-t beshultd/deltaglider:${VERSION} \
|
||||
-t beshultd/deltaglider:latest \
|
||||
.
|
||||
|
||||
# 3. Push to Docker Hub
|
||||
docker push beshultd/deltaglider:${VERSION}
|
||||
docker push beshultd/deltaglider:latest
|
||||
```
|
||||
|
||||
### Multi-platform Publishing
|
||||
|
||||
```bash
|
||||
# Create builder (one-time setup)
|
||||
docker buildx create --name deltaglider-builder --use
|
||||
|
||||
# Build and push for multiple platforms
|
||||
VERSION=$(git describe --tags --always --abbrev=0 | sed 's/^v//')
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg VERSION=${VERSION} \
|
||||
-t beshultd/deltaglider:${VERSION} \
|
||||
-t beshultd/deltaglider:latest \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
## GitHub Actions Automation
|
||||
|
||||
The repository includes a GitHub Action workflow (`.github/workflows/docker-publish.yml`) that automatically builds and publishes Docker images.
|
||||
|
||||
### Automatic Publishing Triggers
|
||||
|
||||
- **On main branch push**: Tags as `latest`
|
||||
- **On develop branch push**: Tags as `develop`
|
||||
- **On version tag push** (e.g., `v6.0.2`): Tags with semver patterns:
|
||||
- `6.0.2` (full version)
|
||||
- `6.0` (major.minor)
|
||||
- `6` (major)
|
||||
- **On pull request**: Builds but doesn't push (testing only)
|
||||
|
||||
### Required GitHub Secrets
|
||||
|
||||
Set these secrets in your GitHub repository settings (`Settings > Secrets and variables > Actions`):
|
||||
|
||||
1. **DOCKERHUB_USERNAME**: Your Docker Hub username (e.g., `beshultd`)
|
||||
2. **DOCKERHUB_TOKEN**: Docker Hub access token (create at https://hub.docker.com/settings/security)
|
||||
|
||||
### Manual Workflow Trigger
|
||||
|
||||
You can manually trigger the Docker build workflow from the GitHub Actions tab:
|
||||
|
||||
1. Go to **Actions** tab
|
||||
2. Select **Build and Publish Docker Images**
|
||||
3. Click **Run workflow**
|
||||
4. Select branch and click **Run workflow**
|
||||
|
||||
## Docker Image Details
|
||||
|
||||
### Image Layers
|
||||
|
||||
The Dockerfile uses a multi-stage build:
|
||||
|
||||
1. **Builder stage**: Installs UV and Python dependencies
|
||||
2. **Runtime stage**: Minimal Python 3.12-slim with only runtime dependencies
|
||||
|
||||
### Image Features
|
||||
|
||||
- **Size**: ~150MB (compressed)
|
||||
- **Platforms**: linux/amd64, linux/arm64
|
||||
- **User**: Runs as non-root user `deltaglider` (UID 1000)
|
||||
- **Base**: Python 3.12-slim (Debian)
|
||||
- **Dependencies**:
|
||||
- Python 3.12
|
||||
- xdelta3 (binary diff tool)
|
||||
- All Python dependencies from `pyproject.toml`
|
||||
|
||||
### Environment Variables
|
||||
|
||||
The image supports the following environment variables:
|
||||
|
||||
```bash
|
||||
# Logging
|
||||
DG_LOG_LEVEL=INFO # DEBUG, INFO, WARNING, ERROR
|
||||
|
||||
# Performance & Compression
|
||||
DG_MAX_RATIO=0.5 # Max delta/file ratio (0.0-1.0)
|
||||
|
||||
# Cache Configuration
|
||||
DG_CACHE_BACKEND=filesystem # filesystem or memory
|
||||
DG_CACHE_MEMORY_SIZE_MB=100 # Memory cache size
|
||||
DG_CACHE_ENCRYPTION_KEY= # Optional encryption key
|
||||
|
||||
# AWS Configuration
|
||||
AWS_ENDPOINT_URL= # S3 endpoint (for MinIO/LocalStack)
|
||||
AWS_ACCESS_KEY_ID= # AWS access key
|
||||
AWS_SECRET_ACCESS_KEY= # AWS secret key
|
||||
AWS_DEFAULT_REGION=us-east-1 # AWS region
|
||||
```
|
||||
|
||||
### Health Check
|
||||
|
||||
The image includes a health check that runs every 30 seconds:
|
||||
|
||||
```bash
|
||||
docker inspect --format='{{.State.Health.Status}}' <container-id>
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Build Issues
|
||||
|
||||
#### "setuptools-scm was unable to detect version"
|
||||
|
||||
**Cause**: Git metadata not available during build.
|
||||
|
||||
**Solution**: Always use the `VERSION` build arg:
|
||||
|
||||
```bash
|
||||
docker build --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
#### Cache issues
|
||||
|
||||
**Cause**: Docker build cache causing stale builds.
|
||||
|
||||
**Solution**: Use `--no-cache` flag:
|
||||
|
||||
```bash
|
||||
docker build --no-cache --build-arg VERSION=6.0.2 -t beshultd/deltaglider:6.0.2 .
|
||||
```
|
||||
|
||||
### Runtime Issues
|
||||
|
||||
#### "unauthorized: access token has insufficient scopes"
|
||||
|
||||
**Cause**: Not logged in to Docker Hub or invalid credentials.
|
||||
|
||||
**Solution**:
|
||||
|
||||
```bash
|
||||
docker login
|
||||
# Enter your Docker Hub credentials
|
||||
```
|
||||
|
||||
#### "Cannot connect to MinIO/LocalStack"
|
||||
|
||||
**Cause**: Using `localhost` instead of `host.docker.internal` from inside container.
|
||||
|
||||
**Solution**: Use `host.docker.internal` for Mac/Windows or `172.17.0.1` for Linux:
|
||||
|
||||
```bash
|
||||
# Mac/Windows
|
||||
-e AWS_ENDPOINT_URL=http://host.docker.internal:9000
|
||||
|
||||
# Linux
|
||||
-e AWS_ENDPOINT_URL=http://172.17.0.1:9000
|
||||
```
|
||||
|
||||
## Docker Compose
|
||||
|
||||
For local development with MinIO:
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
minio:
|
||||
image: minio/minio:latest
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
environment:
|
||||
MINIO_ROOT_USER: minioadmin
|
||||
MINIO_ROOT_PASSWORD: minioadmin
|
||||
command: server /data --console-address ":9001"
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
deltaglider:
|
||||
image: beshultd/deltaglider:latest
|
||||
environment:
|
||||
AWS_ENDPOINT_URL: http://minio:9000
|
||||
AWS_ACCESS_KEY_ID: minioadmin
|
||||
AWS_SECRET_ACCESS_KEY: minioadmin
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
DG_LOG_LEVEL: DEBUG
|
||||
depends_on:
|
||||
- minio
|
||||
volumes:
|
||||
- ./data:/data
|
||||
working_dir: /data
|
||||
command: ["--help"]
|
||||
```
|
||||
|
||||
Run with:
|
||||
|
||||
```bash
|
||||
docker-compose up -d
|
||||
docker-compose run --rm deltaglider ls
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always specify version**: Use `--build-arg VERSION=x.y.z` when building
|
||||
2. **Use multi-stage builds**: Keeps final image small
|
||||
3. **Tag with semantic versions**: Follow semver (major.minor.patch)
|
||||
4. **Test before pushing**: Run integration tests locally
|
||||
5. **Use secrets**: Never hardcode credentials in images
|
||||
6. **Multi-platform builds**: Support both amd64 and arm64
|
||||
7. **Update README**: Keep Docker Hub description in sync with README.md
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Docker Hub Repository](https://hub.docker.com/r/beshultd/deltaglider)
|
||||
- [GitHub Repository](https://github.com/beshu-tech/deltaglider)
|
||||
- [MinIO Documentation](https://min.io/docs/minio/container/index.html)
|
||||
- [Docker Buildx Documentation](https://docs.docker.com/buildx/working-with-buildx/)
|
||||
242
docs/EC2_REGION_DETECTION.md
Normal file
@@ -0,0 +1,242 @@
|
||||
# EC2 Region Detection & Cost Optimization
|
||||
|
||||
DeltaGlider automatically detects when you're running on an EC2 instance and warns you about potential cross-region data transfer charges.
|
||||
|
||||
## Overview
|
||||
|
||||
When running `deltaglider migrate` on an EC2 instance, DeltaGlider:
|
||||
|
||||
1. **Detects EC2 Environment**: Uses IMDSv2 (Instance Metadata Service v2) to determine if running on EC2
|
||||
2. **Retrieves Instance Region**: Gets the actual AWS region where your EC2 instance is running
|
||||
3. **Compares Regions**: Checks if your EC2 region matches the S3 client region
|
||||
4. **Warns About Costs**: Displays clear warnings when regions don't match
|
||||
|
||||
## Why This Matters
|
||||
|
||||
**AWS Cross-Region Data Transfer Costs**:
|
||||
- **Same region**: No additional charges for data transfer
|
||||
- **Cross-region**: $0.02 per GB transferred (can add up quickly for large migrations)
|
||||
- **NAT Gateway**: Additional charges if going through NAT
|
||||
|
||||
**Example Cost Impact**:
|
||||
- Migrating 1TB from `us-east-1` EC2 → `us-west-2` S3 = ~$20 in data transfer charges
|
||||
- Same migration within same region = $0 in data transfer charges
|
||||
|
||||
## Output Examples
|
||||
|
||||
### Scenario 1: Regions Aligned (Optimal) ✅
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-east-1a
|
||||
S3 Client Region: us-east-1
|
||||
✓ Regions aligned - no cross-region charges
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: No warnings, optimal configuration, no extra charges.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 2: Auto-Detected Mismatch (INFO) ℹ️
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-west-2a
|
||||
S3 Client Region: us-east-1
|
||||
|
||||
ℹ️ INFO: EC2 region (us-west-2) differs from configured S3 region (us-east-1)
|
||||
Consider using --region us-west-2 to avoid cross-region charges.
|
||||
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Informational warning, suggests optimal region. User didn't explicitly set wrong region, so it's likely from their AWS config.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 3: Explicit Region Override Mismatch (WARNING) ⚠️
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate --region us-east-1 s3://old-bucket/ s3://new-bucket/
|
||||
EC2 Instance: us-west-2a
|
||||
S3 Client Region: us-east-1
|
||||
|
||||
⚠️ WARNING: EC2 region=us-west-2 != S3 client region=us-east-1
|
||||
Expect cross-region/NAT data charges. Align regions (set client region=us-west-2)
|
||||
before proceeding. Or drop --region for automatic region resolution.
|
||||
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Strong warning because user explicitly set the wrong region with `--region` flag. They might not realize the cost implications.
|
||||
|
||||
---
|
||||
|
||||
### Scenario 4: Not on EC2
|
||||
|
||||
```bash
|
||||
$ deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
S3 Client Region: us-east-1
|
||||
Migrating from s3://old-bucket/
|
||||
to s3://new-bucket/
|
||||
...
|
||||
```
|
||||
|
||||
**Result**: Simple region display, no EC2 warnings (not applicable).
|
||||
|
||||
## Configuration
|
||||
|
||||
### Disable EC2 Detection
|
||||
|
||||
If you want to disable EC2 detection (e.g., for testing or if it causes issues):
|
||||
|
||||
```bash
|
||||
export DG_DISABLE_EC2_DETECTION=true
|
||||
deltaglider migrate s3://old/ s3://new/
|
||||
```
|
||||
|
||||
Or in your script:
|
||||
|
||||
```python
|
||||
import os
|
||||
os.environ["DG_DISABLE_EC2_DETECTION"] = "true"
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
DeltaGlider uses **IMDSv2** (Instance Metadata Service v2) for security:
|
||||
|
||||
1. **Token Request** (PUT with TTL):
|
||||
```
|
||||
PUT http://169.254.169.254/latest/api/token
|
||||
X-aws-ec2-metadata-token-ttl-seconds: 21600
|
||||
```
|
||||
|
||||
2. **Metadata Request** (GET with token):
|
||||
```
|
||||
GET http://169.254.169.254/latest/meta-data/placement/region
|
||||
X-aws-ec2-metadata-token: <token>
|
||||
```
|
||||
|
||||
3. **Fast Timeout**: 1 second timeout for non-EC2 environments (no delay if not on EC2)
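A standalone sketch of the same IMDSv2 flow using only the standard library (this mirrors the two requests above; it is not the DeltaGlider adapter itself):

```python
import urllib.error
import urllib.request

IMDS = "http://169.254.169.254"


def detect_ec2_region(timeout: float = 1.0) -> str | None:
    """Return the instance region via IMDSv2, or None when not running on EC2."""
    try:
        token_req = urllib.request.Request(
            f"{IMDS}/latest/api/token",
            method="PUT",
            headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
        )
        token = urllib.request.urlopen(token_req, timeout=timeout).read().decode()

        region_req = urllib.request.Request(
            f"{IMDS}/latest/meta-data/placement/region",
            headers={"X-aws-ec2-metadata-token": token},
        )
        return urllib.request.urlopen(region_req, timeout=timeout).read().decode()
    except (urllib.error.URLError, OSError):
        return None  # fast, silent fallback when IMDS is unreachable
```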
|
||||
|
||||
### Security Notes
|
||||
|
||||
- **IMDSv2 Only**: DeltaGlider uses the more secure IMDSv2, not the legacy IMDSv1
|
||||
- **No Credentials**: Only reads metadata, never accesses credentials
|
||||
- **Graceful Fallback**: Silently skips detection if IMDS unavailable
|
||||
- **No Network Impact**: Uses local-only IP (169.254.169.254), never leaves the instance
|
||||
|
||||
## Best Practices
|
||||
|
||||
### For Cost Optimization
|
||||
|
||||
1. **Same Region**: Always try to keep EC2 instance and S3 bucket in the same region
|
||||
2. **Check First**: Run with `--dry-run` to verify the setup before actual migration
|
||||
3. **Use Auto-Detection**: Don't specify `--region` unless you have a specific reason
|
||||
4. **Monitor Costs**: Use AWS Cost Explorer to track cross-region data transfer
|
||||
|
||||
### For Terraform/IaC
|
||||
|
||||
```hcl
|
||||
# Good: EC2 and S3 in same region
|
||||
resource "aws_instance" "app" {
|
||||
region = "us-west-2"
|
||||
}
|
||||
|
||||
resource "aws_s3_bucket" "data" {
|
||||
region = "us-west-2" # Same region
|
||||
}
|
||||
```
|
||||
|
||||
### For Multi-Region Setups
|
||||
|
||||
If you MUST do cross-region transfers:
|
||||
|
||||
1. **Use VPC Endpoints**: Reduce NAT Gateway costs
|
||||
2. **Schedule Off-Peak**: AWS charges less during off-peak hours in some regions
|
||||
3. **Consider S3 Transfer Acceleration**: May be cheaper for very large transfers
|
||||
4. **Batch Operations**: Minimize number of API calls
|
||||
|
||||
## Technical Details
|
||||
|
||||
### EC2MetadataAdapter
|
||||
|
||||
Location: `src/deltaglider/adapters/ec2_metadata.py`
|
||||
|
||||
Key methods:
|
||||
- `is_running_on_ec2()`: Detects EC2 environment
|
||||
- `get_region()`: Returns AWS region code (e.g., "us-east-1")
|
||||
- `get_availability_zone()`: Returns AZ (e.g., "us-east-1a")
|
||||
|
||||
### Region Logging
|
||||
|
||||
Location: `src/deltaglider/app/cli/aws_compat.py`
|
||||
|
||||
Function: `log_aws_region(service, region_override=False)`
|
||||
|
||||
Logic:
|
||||
- If not EC2: Show S3 region only
|
||||
- If EC2 + regions match: Green checkmark ✅
|
||||
- If EC2 + auto-detected mismatch: Blue INFO ℹ️
|
||||
- If EC2 + `--region` mismatch: Yellow WARNING ⚠️
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Cannot connect to IMDS"
|
||||
|
||||
**Cause**: Network policy blocks access to 169.254.169.254
|
||||
|
||||
**Solution**:
|
||||
```bash
|
||||
# Test IMDS connectivity
|
||||
TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" \
|
||||
-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
|
||||
curl -H "X-aws-ec2-metadata-token: $TOKEN" \
|
||||
http://169.254.169.254/latest/meta-data/placement/region
|
||||
|
||||
# If it fails, disable detection
|
||||
export DG_DISABLE_EC2_DETECTION=true
|
||||
```
|
||||
|
||||
### "Wrong region detected"
|
||||
|
||||
**Cause**: Cached metadata or race condition
|
||||
|
||||
**Solution**: DeltaGlider caches metadata for performance. Restart the process to refresh.
|
||||
|
||||
### "Warning appears but I want cross-region"
|
||||
|
||||
**Cause**: You intentionally need cross-region transfer
|
||||
|
||||
**Solution**: This is just a warning, not an error. The migration will proceed. The warning helps you confirm you understand the cost implications.
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Does this slow down my migrations?**
|
||||
A: No. EC2 detection happens once before migration starts (< 100ms). It doesn't affect migration performance.
|
||||
|
||||
**Q: What if I'm not on EC2 but the detection is slow?**
|
||||
A: The timeout is 1 second. If IMDS is unreachable, it fails fast. Disable with `DG_DISABLE_EC2_DETECTION=true`.
|
||||
|
||||
**Q: Does this work on Fargate/ECS/Lambda?**
|
||||
A: On ECS with the EC2 launch type, yes: the underlying instance exposes IMDSv2. Fargate and Lambda do not expose the EC2 instance metadata endpoint, so detection is silently skipped there (the graceful fallback described above) and only the configured S3 region is shown.
|
||||
|
||||
**Q: Can I use this with LocalStack/MinIO?**
|
||||
A: Yes. When using `--endpoint-url`, DeltaGlider skips EC2 detection (not applicable for non-AWS S3).
|
||||
|
||||
**Q: Will this detect VPC endpoints?**
|
||||
A: No. VPC endpoints don't change the "region" from an EC2 perspective. The warning still applies if regions don't match.
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [AWS Data Transfer Pricing](https://aws.amazon.com/ec2/pricing/on-demand/#Data_Transfer)
|
||||
- [AWS IMDSv2 Documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html)
|
||||
- [S3 Transfer Costs](https://aws.amazon.com/s3/pricing/)
|
||||
342
docs/STATS_CACHING.md
Normal file
@@ -0,0 +1,342 @@
|
||||
# Bucket Statistics Caching
|
||||
|
||||
**TL;DR**: Bucket stats are now cached in S3 with automatic validation. What took 20 minutes now takes ~100ms when the bucket hasn't changed.
|
||||
|
||||
## Overview
|
||||
|
||||
DeltaGlider's `get_bucket_stats()` operation now includes intelligent S3-based caching that dramatically improves performance for read-heavy workloads while maintaining accuracy through automatic validation.
|
||||
|
||||
## The Problem
|
||||
|
||||
Computing bucket statistics requires:
|
||||
1. **LIST operation**: Get all objects (~50-100ms per 1000 objects)
|
||||
2. **HEAD operations**: Fetch metadata for delta files (expensive!)
|
||||
- For a bucket with 10,000 delta files: 10,000 HEAD calls
|
||||
- Even with 10 parallel workers: ~1,000 sequential batches
|
||||
- At ~100ms per batch: **100+ seconds minimum**
|
||||
- With network issues or throttling: **20+ minutes** 😱
|
||||
|
||||
This made monitoring dashboards and repeated stats checks impractical.
|
||||
|
||||
## The Solution
|
||||
|
||||
### S3-Based Cache with Automatic Validation
|
||||
|
||||
Statistics are cached in S3 at `.deltaglider/stats_{mode}.json` (one per mode). On every call:
|
||||
|
||||
1. **Quick LIST operation** (~50-100ms) - always performed for validation
|
||||
2. **Compare** current object_count + compressed_size with cache
|
||||
3. **If unchanged** → Return cached stats instantly ✅ (**~100ms total**)
|
||||
4. **If changed** → Recompute and update cache automatically
|
||||
|
||||
### Three Stats Modes
|
||||
|
||||
```bash
|
||||
# Quick mode (default): Fast listing-only, approximate compression metrics
|
||||
deltaglider stats my-bucket
|
||||
|
||||
# Sampled mode: One HEAD per deltaspace, balanced accuracy/speed
|
||||
deltaglider stats my-bucket --sampled
|
||||
|
||||
# Detailed mode: All HEAD calls, most accurate (slowest)
|
||||
deltaglider stats my-bucket --detailed
|
||||
```
|
||||
|
||||
Each mode has its own independent cache file.
|
||||
|
||||
## Performance
|
||||
|
||||
| Scenario | Before | After | Speedup |
|
||||
|----------|--------|-------|---------|
|
||||
| **First run** (cold cache) | 20 min | 20 min | 1x (must compute) |
|
||||
| **Bucket unchanged** (warm cache) | 20 min | **100ms** | **200x** ✨ |
|
||||
| **Bucket changed** (stale cache) | 20 min | 20 min | 1x (auto-recompute) |
|
||||
| **Dashboard monitoring** | 20 min/check | **100ms/check** | **200x** ✨ |
|
||||
|
||||
## CLI Usage
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```bash
|
||||
# Use cache (default behavior)
|
||||
deltaglider stats my-bucket
|
||||
|
||||
# Force recomputation even if cache valid
|
||||
deltaglider stats my-bucket --refresh
|
||||
|
||||
# Skip cache entirely (both read and write)
|
||||
deltaglider stats my-bucket --no-cache
|
||||
|
||||
# Different modes with caching
|
||||
deltaglider stats my-bucket --sampled
|
||||
deltaglider stats my-bucket --detailed
|
||||
```
|
||||
|
||||
### Cache Control Flags
|
||||
|
||||
| Flag | Description | Use Case |
|
||||
|------|-------------|----------|
|
||||
| *(none)* | Use cache if valid | **Default** - Fast monitoring |
|
||||
| `--refresh` | Force recomputation | Updated data needed now |
|
||||
| `--no-cache` | Skip caching entirely | Testing, one-off analysis |
|
||||
| `--sampled` | Balanced mode | Good accuracy, faster than detailed |
|
||||
| `--detailed` | Most accurate mode | Analytics, reports |
|
||||
|
||||
## Python SDK Usage
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Use cache (fast, ~100ms with cache hit)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Different modes with caching
|
||||
stats = client.get_bucket_stats('releases', mode='quick') # Fast
|
||||
stats = client.get_bucket_stats('releases', mode='sampled') # Balanced
|
||||
stats = client.get_bucket_stats('releases', mode='detailed') # Accurate
|
||||
```
|
||||
|
||||
## Cache Structure
|
||||
|
||||
Cache files are stored at `.deltaglider/stats_{mode}.json` in your bucket:
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1.0",
|
||||
"mode": "quick",
|
||||
"computed_at": "2025-10-14T10:30:00Z",
|
||||
"validation": {
|
||||
"object_count": 1523,
|
||||
"compressed_size": 1234567890
|
||||
},
|
||||
"stats": {
|
||||
"bucket": "releases",
|
||||
"object_count": 1523,
|
||||
"total_size": 50000000000,
|
||||
"compressed_size": 1234567890,
|
||||
"space_saved": 48765432110,
|
||||
"average_compression_ratio": 0.9753,
|
||||
"delta_objects": 1500,
|
||||
"direct_objects": 23
|
||||
}
|
||||
}
|
||||
```
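The cache file is plain JSON, so you can inspect it directly for debugging. A small sketch with boto3, assuming a bucket named `releases`:

```python
import json

import boto3

s3 = boto3.client("s3")
obj = s3.get_object(Bucket="releases", Key=".deltaglider/stats_quick.json")
cache = json.loads(obj["Body"].read())

print(cache["computed_at"])
print(cache["validation"]["object_count"])
print(cache["stats"]["average_compression_ratio"])
```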
|
||||
|
||||
## How Validation Works
|
||||
|
||||
**Smart Staleness Detection**:
|
||||
1. Always perform quick LIST operation (required anyway, ~50-100ms)
|
||||
2. Calculate current `object_count` and `compressed_size` from LIST
|
||||
3. Compare with cached values
|
||||
4. If **both match** → Cache valid, return instantly
|
||||
5. If **either differs** → Bucket changed, recompute automatically
|
||||
|
||||
This catches:
|
||||
- ✅ Objects added (count increases)
|
||||
- ✅ Objects removed (count decreases)
|
||||
- ✅ Objects replaced (size changes)
|
||||
- ✅ Content modified (size changes)
|
||||
|
||||
**Edge Case**: If only metadata changes (tags, headers) but not content/count/size, cache remains valid. This is acceptable since metadata changes are rare and don't affect core statistics.
|
||||
|
||||
## Use Cases
|
||||
|
||||
### ✅ Perfect For
|
||||
|
||||
1. **Monitoring Dashboards**
|
||||
- Check stats every minute
|
||||
- Bucket rarely changes
|
||||
- **20 min → 100ms per check** ✨
|
||||
|
||||
2. **CI/CD Status Checks**
|
||||
- Verify upload success
|
||||
- Check compression effectiveness
|
||||
- Near-instant feedback
|
||||
|
||||
3. **Repeated Analysis**
|
||||
- Multiple stats queries during investigation
|
||||
- Cache persists across sessions
|
||||
- Huge time savings
|
||||
|
||||
### ⚠️ Less Beneficial For
|
||||
|
||||
1. **Write-Heavy Buckets**
|
||||
- Bucket changes on every check
|
||||
- Cache always stale
|
||||
- **No benefit, but no harm either** (graceful degradation)
|
||||
|
||||
2. **One-Off Queries**
|
||||
- Single stats check
|
||||
- Cache doesn't help (cold cache)
|
||||
- Still works normally
|
||||
|
||||
## Cache Management
|
||||
|
||||
### Automatic Management
|
||||
|
||||
- **Creation**: Automatic on first `get_bucket_stats()` call
|
||||
- **Validation**: Automatic on every call (always current)
|
||||
- **Updates**: Automatic when bucket changes
|
||||
- **Cleanup**: Not needed (cache files are tiny ~1-10KB)
|
||||
|
||||
### Manual Management
|
||||
|
||||
```bash
|
||||
# View cache files
|
||||
deltaglider ls s3://my-bucket/.deltaglider/
|
||||
|
||||
# Delete cache manually (will be recreated automatically)
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_quick.json
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_sampled.json
|
||||
deltaglider rm s3://my-bucket/.deltaglider/stats_detailed.json
|
||||
|
||||
# Or delete entire .deltaglider prefix
|
||||
deltaglider rm -r s3://my-bucket/.deltaglider/
|
||||
```
|
||||
|
||||
## Technical Details
|
||||
|
||||
### Cache Files
|
||||
|
||||
- **Location**: `.deltaglider/` prefix in each bucket
|
||||
- **Naming**: `stats_{mode}.json` (quick, sampled, detailed)
|
||||
- **Size**: ~1-10KB per file
|
||||
- **Format**: JSON with version, mode, validation data, and stats
|
||||
|
||||
### Validation Logic
|
||||
|
||||
```python
|
||||
def is_cache_valid(cached, current):
|
||||
"""Cache is valid if object count and size unchanged."""
|
||||
return (
|
||||
cached['object_count'] == current['object_count'] and
|
||||
cached['compressed_size'] == current['compressed_size']
|
||||
)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
Cache operations are **non-fatal**:
|
||||
- ✅ Cache read fails → Compute normally, log warning
|
||||
- ✅ Cache write fails → Return computed stats, log warning
|
||||
- ✅ Corrupted cache → Ignore, recompute, overwrite
|
||||
- ✅ Version mismatch → Ignore, recompute with new version
|
||||
- ✅ Permission denied → Log warning, continue without caching
|
||||
|
||||
**The stats operation never fails due to cache issues.**
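
In code terms, this amounts to wrapping every cache interaction in broad exception handling so failures degrade to a normal computation. A simplified sketch (not the actual implementation; the callables are placeholders):

```python
import logging

logger = logging.getLogger("deltaglider.stats")

def stats_with_optional_cache(read_cache, compute_stats, write_cache):
    """Sketch: any cache failure falls back to computing stats directly."""
    try:
        cached = read_cache()  # may raise on permissions, corruption, or version mismatch
        if cached is not None:
            return cached
    except Exception as exc:
        logger.warning("Stats cache read failed; recomputing: %s", exc)

    stats = compute_stats()  # the stats operation itself never depends on the cache
    try:
        write_cache(stats)
    except Exception as exc:
        logger.warning("Stats cache write failed; returning computed stats: %s", exc)
    return stats
```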
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential improvements for the future:
|
||||
|
||||
1. **TTL-Based Expiration**: Auto-refresh after N hours even if unchanged
|
||||
2. **Cache Cleanup Command**: `deltaglider cache clear` for manual invalidation
|
||||
3. **Cache Statistics**: Show hit/miss rates, staleness info
|
||||
4. **Async Cache Updates**: Background refresh for very large buckets
|
||||
5. **Cross-Bucket Cache**: Share reference data across related buckets
|
||||
|
||||
## Comparison with Old Implementation
|
||||
|
||||
| Aspect | Old (In-Memory) | New (S3-Based) |
|
||||
|--------|----------------|----------------|
|
||||
| **Storage** | Process memory | S3 bucket |
|
||||
| **Persistence** | Lost on restart | Survives restarts |
|
||||
| **Sharing** | Per-process | Shared across all clients |
|
||||
| **Validation** | None | Automatic on every call |
|
||||
| **Staleness** | Always fresh | Automatically detected |
|
||||
| **Use Case** | Single session | Monitoring, dashboards |
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Monitoring Dashboard
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
|
||||
while True:
|
||||
# Fast stats check (~100ms with cache)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, "
|
||||
f"Compression: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
time.sleep(60) # Check every minute
|
||||
|
||||
# First run: 20 min (computes and caches)
|
||||
# All subsequent runs: ~100ms (cache hit)
|
||||
```
|
||||
|
||||
### Example 2: CI/CD Pipeline
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Upload new release
|
||||
client.upload("v2.0.0.zip", "s3://releases/v2.0.0/")
|
||||
|
||||
# Quick verification (fast with cache)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
if stats.average_compression_ratio < 0.90:
|
||||
print("Warning: Lower than expected compression")
|
||||
```
|
||||
|
||||
### Example 3: Force Fresh Stats
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Force recomputation for accurate report
|
||||
stats = client.get_bucket_stats(
|
||||
'releases',
|
||||
mode='detailed',
|
||||
refresh_cache=True
|
||||
)
|
||||
|
||||
print(f"Accurate compression report:")
|
||||
print(f" Original: {stats.total_size / 1e9:.1f} GB")
|
||||
print(f" Stored: {stats.compressed_size / 1e9:.1f} GB")
|
||||
print(f" Saved: {stats.space_saved / 1e9:.1f} GB ({stats.average_compression_ratio:.1%})")
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: Does caching affect accuracy?**
|
||||
A: No! Cache is automatically validated on every call. If the bucket changed, stats are recomputed automatically.
|
||||
|
||||
**Q: What if I need fresh stats immediately?**
|
||||
A: Use `--refresh` flag (CLI) or `refresh_cache=True` (SDK) to force recomputation.
|
||||
|
||||
**Q: Can I disable caching?**
|
||||
A: Yes, use `--no-cache` flag (CLI) or `use_cache=False` (SDK).
|
||||
|
||||
**Q: How much space do cache files use?**
|
||||
A: ~1-10KB per mode, negligible for any bucket.
|
||||
|
||||
**Q: What happens if cache write fails?**
|
||||
A: The operation continues normally - computed stats are returned and a warning is logged. Caching is optional and non-fatal.
|
||||
|
||||
**Q: Do I need to clean up cache files?**
|
||||
A: No, they're tiny and automatically managed. But you can delete `.deltaglider/` prefix if desired.
|
||||
|
||||
**Q: Does cache work across different modes?**
|
||||
A: Each mode (quick, sampled, detailed) has its own independent cache file.
|
||||
|
||||
---
|
||||
|
||||
**Implementation**: See [PR #XX] for complete implementation details and test coverage.
|
||||
|
||||
**Related**: [SDK Documentation](sdk/README.md) | [CLI Reference](../README.md#cli-reference) | [Architecture](sdk/architecture.md)
|
||||
@@ -9,7 +9,11 @@ DeltaGlider provides AWS S3 CLI compatible commands with automatic delta compres
|
||||
- `deltaglider ls [s3_url]` - List buckets and objects
|
||||
- `deltaglider rm <s3_url>` - Remove objects
|
||||
- `deltaglider sync <source> <destination>` - Synchronize directories
|
||||
- `deltaglider migrate <source> <destination>` - Migrate S3 buckets with compression and EC2 cost warnings
|
||||
- `deltaglider stats <bucket>` - Get bucket statistics and compression metrics
|
||||
- `deltaglider verify <s3_url>` - Verify file integrity
|
||||
- `deltaglider put-bucket-acl <bucket>` - Set bucket ACL (s3api compatible)
|
||||
- `deltaglider get-bucket-acl <bucket>` - Get bucket ACL (s3api compatible)
|
||||
|
||||
### Current Usage Examples
|
||||
```bash
|
||||
@@ -21,6 +25,14 @@ deltaglider cp s3://bucket/path/to/file.zip .
|
||||
|
||||
# Verify integrity
|
||||
deltaglider verify s3://bucket/path/to/file.zip.delta
|
||||
|
||||
# Set bucket ACL
|
||||
deltaglider put-bucket-acl my-bucket --acl public-read
|
||||
deltaglider put-bucket-acl my-bucket --acl private
|
||||
deltaglider put-bucket-acl my-bucket --grant-read id=12345
|
||||
|
||||
# Get bucket ACL
|
||||
deltaglider get-bucket-acl my-bucket
|
||||
```
|
||||
|
||||
## Target State: AWS S3 CLI Compatibility
|
||||
|
||||
@@ -1,347 +1,76 @@
|
||||
# Case Study: How ReadOnlyREST Reduced Storage Costs by 99.9% with DeltaGlider
|
||||
## How ReadonlyREST Cut 4TB of S3 Storage Down to 5GB (and Saved 99.9%)
|
||||
|
||||
## Executive Summary
|
||||
### TL;DR
|
||||
|
||||
**The Challenge**: ReadOnlyREST, a security plugin for Elasticsearch, was facing exponential storage costs managing 145 release versions across multiple product lines, consuming nearly 4TB of S3 storage.
|
||||
We were paying to store 4TB of mostly identical plugin builds.
|
||||
DeltaGlider deduplicated everything down to 4.9GB — 99.9% smaller, $1.1k/year cheaper, and no workflow changes.
|
||||
|
||||
**The Solution**: DeltaGlider, an intelligent delta compression system that reduced storage from 4,060GB to just 4.9GB.
|
||||
#### The Problem
|
||||
|
||||
**The Impact**:
|
||||
- 💰 **$1,119 annual savings** on storage costs
|
||||
- 📉 **99.9% reduction** in storage usage
|
||||
- ⚡ **Zero changes** to existing workflows
|
||||
- ✅ **Full data integrity** maintained
|
||||
ReadonlyREST supports ~150 Elasticsearch/Kibana versions × multiple product lines × all our own releases.
|
||||
After years of publishing builds, our S3 archive hit `4TB` (201,840 files, $93/month).
|
||||
Glacier helped, but restoring files took 48 hours — useless for CI/CD.
|
||||
|
||||
---
|
||||
Every plugin ZIP was ~82MB, but `99.7% identical` to the next one. We were paying to store duplicates.
|
||||
|
||||
## The Storage Crisis
|
||||
#### The Fix: DeltaGlider
|
||||
|
||||
### The Numbers That Kept Us Up at Night
|
||||
DeltaGlider stores binary deltas between similar files instead of full copies.
|
||||
|
||||
ReadOnlyREST maintains a comprehensive release archive:
|
||||
- **145 version folders** (v1.50.0 through v1.66.1)
|
||||
- **201,840 total files** to manage
|
||||
- **3.96 TB** of S3 storage consumed
|
||||
- **$1,120/year** in storage costs alone
|
||||
|
||||
Each version folder contained:
|
||||
- 513 plugin ZIP files (one for each Elasticsearch version)
|
||||
- 879 checksum files (SHA1 and SHA512)
|
||||
- 3 product lines (Enterprise, Pro, Free)
|
||||
|
||||
### The Hidden Problem
|
||||
|
||||
What made this particularly painful wasn't just the size—it was the **redundancy**. Each 82.5MB plugin ZIP was 99.7% identical to others in the same version, differing only in minor Elasticsearch compatibility adjustments. We were essentially storing the same data hundreds of times.
|
||||
|
||||
> "We were paying to store 4TB of data that was fundamentally just variations of the same ~250MB of unique content. It felt like photocopying War and Peace 500 times because each copy had a different page number."
|
||||
>
|
||||
> — *DevOps Lead*
|
||||
|
||||
---
|
||||
|
||||
## Enter DeltaGlider
|
||||
|
||||
### The Lightbulb Moment
|
||||
|
||||
The breakthrough came when we realized we didn't need to store complete files—just the *differences* between them. DeltaGlider applies this principle automatically:
|
||||
|
||||
1. **First file becomes the reference** (stored in full)
|
||||
2. **Similar files store only deltas** (typically 0.3% of original size)
|
||||
3. **Different files uploaded directly** (no delta overhead)
|
||||
|
||||
### Implementation: Surprisingly Simple
|
||||
|
||||
```bash
# Before DeltaGlider (standard S3 upload)
aws s3 cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
# Size on S3: 82.5MB

# With DeltaGlider
deltaglider cp readonlyrest-1.66.1_es8.0.0.zip s3://releases/
# Size on S3: 65KB (99.92% smaller!)
```

The beauty? **Zero changes to our build pipeline**. DeltaGlider works as a drop-in replacement for S3 uploads.

---

## The Results: Beyond Our Expectations

### Storage Transformation

```
BEFORE DELTAGLIDER          AFTER DELTAGLIDER
━━━━━━━━━━━━━━━━━━          ━━━━━━━━━━━━━━━━━
4,060 GB (3.96 TB)     →    4.9 GB
$93.38/month           →    $0.11/month
201,840 files          →    201,840 files (same!)
```
|
||||
|
||||
### Real Performance Metrics
|
||||
Drop-in replacement for `aws s3 cp`. No pipeline changes.
|
||||
Data integrity checked with SHA256, stored as metadata in S3.
|
||||
|
||||
From our actual production deployment:
|
||||
|
||||
| Metric | Value | Impact |
|
||||
|--------|-------|--------|
|
||||
| **Compression Ratio** | 99.9% | Near-perfect deduplication |
|
||||
| **Delta Size** | ~65KB per 82.5MB file | 1/1,269th of original |
|
||||
| **Upload Speed** | 3-4 files/second | Faster than raw S3 uploads |
|
||||
| **Download Speed** | Transparent reconstruction | No user impact |
|
||||
| **Storage Savings** | 4,055 GB | Enough for 850,000 more files |
|
||||
### The Result
|
||||
|
||||
### Version-to-Version Comparison
|
||||
| Metric | Before | After | Δ |
|
||||
|-------------- |----------|----------|--------------|
|
||||
| Storage | 4.06TB | 4.9GB | -99.9% |
|
||||
| Cost | $93/mo | $0.11/mo | -$1,119/yr |
|
||||
| Files | 201,840 | 201,840 | identical |
|
||||
| Upload speed | 1x | 3–4x | faster |
|
||||
|
||||
Testing between similar versions showed incredible efficiency:
|
||||
Each “different” ZIP? Just a 65KB delta.
|
||||
Reconstruction time: <100ms.
|
||||
Zero user impact.
|
||||
|
||||
|
||||
## Under the Hood
|
||||
|
||||
Uses xdelta3 diffs:

- Keeps one reference per group
- Stores deltas for near-identical files
- Skips small or text-based ones (`.sha`, `.json`, etc.)
|
||||
|
||||
It’s smart enough to decide what’s worth diffing automatically.
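
For the curious, the encode/decode round trip at the heart of this can be reproduced by hand with the xdelta3 tool. A rough sketch via Python's subprocess (file names are placeholders; this is not DeltaGlider's internal code):

```python
import subprocess

# Encode: store only the byte-level difference between the reference and a new build.
subprocess.run(
    ["xdelta3", "-e", "-s", "reference.zip", "new-build.zip", "new-build.zip.delta"],
    check=True,
)

# Decode: rebuild the full file from the reference plus the tiny delta.
subprocess.run(
    ["xdelta3", "-d", "-s", "reference.zip", "new-build.zip.delta", "rebuilt.zip"],
    check=True,
)
```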
|
||||
|
||||
|
||||
## Payoff
|
||||
- 4TB → 5GB overnight
- Uploads 1,200× faster
- CI bandwidth cut 99%
- 100% checksum verified integrity
- Zero vendor lock-in (open source)
|
||||
|
||||
## Takeaways

If you ship versioned artifacts, this will probably save you four figures and hours of upload time per year.
|
||||
|
||||
```
|
||||
readonlyrest-1.66.1_es7.17.0.zip (82.5MB) → reference.bin (82.5MB)
|
||||
readonlyrest-1.66.1_es7.17.1.zip (82.5MB) → 64KB delta (0.08% size)
|
||||
readonlyrest-1.66.1_es7.17.2.zip (82.5MB) → 65KB delta (0.08% size)
|
||||
...
|
||||
readonlyrest-1.66.1_es8.15.0.zip (82.5MB) → 71KB delta (0.09% size)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Technical Deep Dive
|
||||
|
||||
### How DeltaGlider Achieves 99.9% Compression
|
||||
|
||||
DeltaGlider uses binary diff algorithms (xdelta3) to identify and store only the bytes that change between files:
|
||||
|
||||
```python
|
||||
# Simplified concept
|
||||
reference = "readonlyrest-1.66.1_es7.17.0.zip" # 82.5MB
|
||||
new_file = "readonlyrest-1.66.1_es7.17.1.zip" # 82.5MB
|
||||
|
||||
delta = binary_diff(reference, new_file) # 65KB
|
||||
# Delta contains only:
|
||||
# - Elasticsearch version string changes
|
||||
# - Compatibility metadata updates
|
||||
# - Build timestamp differences
|
||||
```
|
||||
|
||||
### Intelligent File Type Detection
|
||||
|
||||
Not every file benefits from delta compression. DeltaGlider automatically:
|
||||
|
||||
- **Applies delta compression to**: `.zip`, `.tar`, `.gz`, `.dmg`, `.jar`, `.war`
|
||||
- **Uploads directly**: `.txt`, `.sha1`, `.sha512`, `.json`, `.md`
|
||||
|
||||
This intelligence meant our 127,455 checksum files were uploaded directly, avoiding unnecessary processing overhead.
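
A minimal sketch of that dispatch rule might look like the following; the extension sets mirror the lists above, while the real implementation may use a longer list and extra heuristics:

```python
from pathlib import Path

# Archive-like formats worth diffing, per the lists above (illustrative, not exhaustive).
DELTA_EXTENSIONS = {".zip", ".tar", ".gz", ".dmg", ".jar", ".war"}

def should_attempt_delta(filename: str) -> bool:
    """Heuristic: archives go through delta compression, everything else uploads directly."""
    return Path(filename).suffix.lower() in DELTA_EXTENSIONS

assert should_attempt_delta("readonlyrest-1.66.1_es8.0.0.zip")
assert not should_attempt_delta("readonlyrest-1.66.1_es8.0.0.zip.sha1")
```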
|
||||
|
||||
### Architecture That Scales
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌──────────────┐ ┌─────────────┐
|
||||
│ Client │────▶│ DeltaGlider │────▶│ S3/MinIO │
|
||||
│ (CI/CD) │ │ │ │ │
|
||||
└─────────────┘ └──────────────┘ └─────────────┘
|
||||
│
|
||||
┌──────▼───────┐
|
||||
│ Local Cache │
|
||||
│ (References) │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Business Impact
|
||||
|
||||
### Immediate ROI
|
||||
|
||||
- **Day 1**: 99.9% storage reduction
|
||||
- **Month 1**: $93 saved
|
||||
- **Year 1**: $1,119 saved
|
||||
- **5 Years**: $5,595 saved (not counting growth)
|
||||
|
||||
### Hidden Benefits We Didn't Expect
|
||||
|
||||
1. **Faster Deployments**: Uploading 65KB deltas is 1,200x faster than 82.5MB files
|
||||
2. **Reduced Bandwidth**: CI/CD pipeline bandwidth usage dropped 99%
|
||||
3. **Improved Reliability**: Fewer timeout errors on large file uploads
|
||||
4. **Better Compliance**: Automatic SHA256 integrity verification on every operation
|
||||
|
||||
### Environmental Impact
|
||||
|
||||
> "Reducing storage by 4TB means fewer drives spinning in data centers. It's a small contribution to our sustainability goals, but every bit counts."
|
||||
>
|
||||
> — *CTO*
|
||||
|
||||
---
|
||||
|
||||
## Implementation Journey
|
||||
|
||||
### Week 1: Proof of Concept
|
||||
- Tested with 10 files
|
||||
- Achieved 99.6% compression
|
||||
- Decision to proceed
|
||||
|
||||
### Week 2: Production Rollout
|
||||
- Uploaded all 201,840 files
|
||||
- Zero errors or failures
|
||||
- Immediate cost reduction
|
||||
|
||||
### Week 3: Integration
|
||||
```bash
|
||||
# Simple integration into our CI/CD
|
||||
- aws s3 cp $FILE s3://releases/
|
||||
+ deltaglider cp $FILE s3://releases/
|
||||
```
|
||||
|
||||
### Week 4: Full Migration
|
||||
- All build pipelines updated
|
||||
- Developer documentation completed
|
||||
- Monitoring dashboards configured
|
||||
|
||||
---
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
### What Worked Well
|
||||
|
||||
1. **Drop-in replacement**: No architectural changes needed
|
||||
2. **Automatic intelligence**: File type detection "just worked"
|
||||
3. **Preservation of structure**: Directory hierarchy maintained perfectly
|
||||
|
||||
### Challenges Overcome
|
||||
|
||||
1. **Initial skepticism**: "99.9% compression sounds too good to be true"
|
||||
- *Solution*: Live demonstration with real data
|
||||
|
||||
2. **Download concerns**: "Will it be slow to reconstruct files?"
|
||||
- *Solution*: Benchmarking showed <100ms reconstruction time
|
||||
|
||||
3. **Reliability questions**: "What if the reference file is corrupted?"
|
||||
- *Solution*: SHA256 verification on every operation
|
||||
|
||||
---
|
||||
|
||||
## For Decision Makers
|
||||
|
||||
### Why This Matters
|
||||
|
||||
Storage costs scale linearly with data growth. Without DeltaGlider:
|
||||
- Next 145 versions: Additional $1,120/year
|
||||
- 5-year projection: $11,200 in storage alone
|
||||
- Opportunity cost: Resources that could fund innovation
|
||||
|
||||
### Risk Assessment
|
||||
|
||||
| Risk | Mitigation | Status |
|
||||
|------|------------|--------|
|
||||
| Vendor lock-in | Open-source, standards-based | ✅ Mitigated |
|
||||
| Data corruption | SHA256 verification built-in | ✅ Mitigated |
|
||||
| Performance impact | Faster than original | ✅ No risk |
|
||||
| Complexity | Drop-in replacement | ✅ No risk |
|
||||
|
||||
### Strategic Advantages
|
||||
|
||||
1. **Cost Predictability**: Storage costs become negligible
|
||||
2. **Scalability**: Can handle 100x more versions in same space
|
||||
3. **Competitive Edge**: More resources for product development
|
||||
4. **Green IT**: Reduced carbon footprint from storage
|
||||
|
||||
---
|
||||
|
||||
## For Engineers
|
||||
|
||||
### Getting Started
|
||||
|
||||
```bash
|
||||
# Install DeltaGlider
|
||||
pip install deltaglider
|
||||
|
||||
# Upload a file (automatic compression)
|
||||
deltaglider cp my-release-v1.0.0.zip s3://releases/
|
||||
|
||||
# Download (automatic reconstruction)
|
||||
deltaglider cp s3://releases/my-release-v1.0.0.zip .
|
||||
|
||||
# It's that simple.
|
||||
deltaglider cp my-release.zip s3://releases/
|
||||
```
|
||||
|
||||
### Performance Characteristics
|
||||
|
||||
```python
|
||||
# Compression ratios by similarity
|
||||
identical_files: 99.9% # Same file, different name
|
||||
minor_changes: 99.7% # Version bumps, timestamps
|
||||
moderate_changes: 95.0% # Feature additions
|
||||
major_changes: 70.0% # Significant refactoring
|
||||
completely_different: 0% # No compression (uploaded as-is)
|
||||
```
|
||||
|
||||
### Integration Examples
|
||||
|
||||
**GitHub Actions**:
|
||||
```yaml
|
||||
- name: Upload Release
|
||||
run: deltaglider cp dist/*.zip s3://releases/${{ github.ref_name }}/
|
||||
```
|
||||
|
||||
**Jenkins Pipeline**:
|
||||
```groovy
|
||||
sh "deltaglider cp ${WORKSPACE}/target/*.jar s3://artifacts/"
|
||||
```
|
||||
|
||||
**Python Script**:
|
||||
```python
|
||||
from deltaglider import DeltaService
|
||||
service = DeltaService(bucket="releases")
|
||||
service.put("my-app-v2.0.0.zip", "v2.0.0/")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## The Bottom Line
|
||||
|
||||
DeltaGlider transformed our storage crisis into a solved problem:
|
||||
|
||||
- ✅ **4TB → 5GB** storage reduction
|
||||
- ✅ **$1,119/year** saved
|
||||
- ✅ **Zero** workflow disruption
|
||||
- ✅ **100%** data integrity maintained
|
||||
|
||||
For ReadOnlyREST, DeltaGlider wasn't just a cost-saving tool—it was a glimpse into the future of intelligent storage. When 99.9% of your data is redundant, why pay to store it 500 times?
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
### For Your Organization
|
||||
|
||||
1. **Identify similar use cases**: Version releases, backups, build artifacts
|
||||
2. **Run the calculator**: `[Your files] × [Versions] × [Similarity] = Savings`
|
||||
3. **Start small**: Test with one project's releases
|
||||
4. **Scale confidently**: Deploy across all similar data
|
||||
|
||||
### Get Started Today
|
||||
|
||||
```bash
|
||||
# See your potential savings
|
||||
git clone https://github.com/beshu-tech/deltaglider
|
||||
cd deltaglider
|
||||
python calculate_savings.py --path /your/releases
|
||||
|
||||
# Try it yourself
|
||||
docker run -p 9000:9000 minio/minio # Local S3
|
||||
pip install deltaglider
|
||||
deltaglider cp your-file.zip s3://test/
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## About ReadOnlyREST
|
||||
|
||||
ReadOnlyREST is the enterprise security plugin for Elasticsearch and OpenSearch, protecting clusters in production since 2015. Learn more at [readonlyrest.com](https://readonlyrest.com)
|
||||
|
||||
## About DeltaGlider
|
||||
|
||||
DeltaGlider is an open-source delta compression system for S3-compatible storage, turning redundant data into remarkable savings. Built with modern Python, containerized for portability, and designed for scale.
|
||||
|
||||
---
|
||||
|
||||
*"In a world where storage is cheap but not free, and data grows exponentially but changes incrementally, DeltaGlider represents a fundamental shift in how we think about storing versioned artifacts."*
|
||||
|
||||
**— ReadOnlyREST Engineering Team**
|
||||
That’s it.
|
||||
Binary image file changed (267 KiB → 4.0 MiB); content not shown.
@@ -1,6 +1,6 @@
|
||||
# DeltaGlider Python SDK Documentation
|
||||
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for versioned artifacts through intelligent binary delta compression.
|
||||
The DeltaGlider Python SDK provides a **boto3-compatible API for core S3 operations** (~20% of methods covering 80% of use cases), while achieving 99%+ compression for very similar versioned artifacts through intelligent binary delta compression.
|
||||
|
||||
## 🎯 Key Highlights
|
||||
|
||||
@@ -38,17 +38,29 @@ response = client.get_object(Bucket='releases', Key='v1.0.0/app.zip')
|
||||
# Optimized list_objects with smart performance defaults (NEW!)
|
||||
# Fast by default - no unnecessary metadata fetching
|
||||
response = client.list_objects(Bucket='releases', Prefix='v1.0.0/')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Pagination for large buckets
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100,
|
||||
ContinuationToken=response.next_continuation_token)
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100)
|
||||
while response.get('IsTruncated'):
|
||||
# Process current page
|
||||
for obj in response['Contents']:
|
||||
print(obj['Key'])
|
||||
# Get next page
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
MaxKeys=100,
|
||||
ContinuationToken=response.get('NextContinuationToken')
|
||||
)
|
||||
|
||||
# Get detailed compression stats only when needed
|
||||
response = client.list_objects(Bucket='releases', FetchMetadata=True) # Slower but detailed
|
||||
|
||||
# Quick bucket statistics
|
||||
stats = client.get_bucket_stats('releases') # Fast overview
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True) # With compression metrics
|
||||
# Bucket statistics with intelligent S3-based caching (NEW!)
|
||||
stats = client.get_bucket_stats('releases') # Fast (~100ms with cache)
|
||||
stats = client.get_bucket_stats('releases', mode='detailed') # Accurate compression metrics
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True) # Force fresh computation
|
||||
|
||||
client.delete_object(Bucket='releases', Key='old-version.zip')
|
||||
```
|
||||
@@ -101,6 +113,8 @@ client.put_object(Bucket='mybucket', Key='myfile.zip', Body=data)
|
||||
- **Data Integrity**: SHA256 verification on every operation
|
||||
- **Transparent**: Works with existing tools and workflows
|
||||
- **Production Ready**: Battle-tested with 200K+ files
|
||||
- **Thoroughly Tested**: 99 integration/unit tests with comprehensive coverage
|
||||
- **Type Safe**: Full mypy type checking, zero type errors
|
||||
|
||||
## When to Use DeltaGlider
|
||||
|
||||
@@ -192,10 +206,17 @@ from deltaglider import create_client
|
||||
client = create_client(
|
||||
endpoint_url="http://minio.internal:9000", # Custom S3 endpoint
|
||||
log_level="DEBUG", # Detailed logging
|
||||
cache_dir="/var/cache/deltaglider", # Custom cache location
|
||||
aws_access_key_id="minio",
|
||||
aws_secret_access_key="minio",
|
||||
region_name="eu-west-1",
|
||||
max_ratio=0.3, # Stricter delta acceptance
|
||||
)
|
||||
```
|
||||
|
||||
> ℹ️ The SDK now manages an encrypted, process-isolated cache automatically in `/tmp/deltaglider-*`.
|
||||
> Tune cache behavior via environment variables such as `DG_CACHE_BACKEND`,
|
||||
> `DG_CACHE_MEMORY_SIZE_MB`, and `DG_CACHE_ENCRYPTION_KEY` instead of passing a `cache_dir` argument.
|
||||
|
||||
## Real-World Example
|
||||
|
||||
```python
|
||||
@@ -285,4 +306,4 @@ url = client.generate_presigned_url(
|
||||
|
||||
## License
|
||||
|
||||
MIT License - See [LICENSE](https://github.com/beshu-tech/deltaglider/blob/main/LICENSE) for details.
|
||||
MIT License - See [LICENSE](https://github.com/beshu-tech/deltaglider/blob/main/LICENSE) for details.
|
||||
|
||||
docs/sdk/api.md
@@ -21,7 +21,6 @@ Factory function to create a configured DeltaGlider client with sensible default
|
||||
def create_client(
|
||||
endpoint_url: Optional[str] = None,
|
||||
log_level: str = "INFO",
|
||||
cache_dir: str = "/tmp/.deltaglider/cache",
|
||||
**kwargs
|
||||
) -> DeltaGliderClient
|
||||
```
|
||||
@@ -30,11 +29,12 @@ def create_client(
|
||||
|
||||
- **endpoint_url** (`Optional[str]`): S3 endpoint URL for MinIO, R2, or other S3-compatible storage. If None, uses AWS S3.
|
||||
- **log_level** (`str`): Logging verbosity level. Options: "DEBUG", "INFO", "WARNING", "ERROR". Default: "INFO".
|
||||
- **cache_dir** (`str`): Directory for local reference cache. Default: "/tmp/.deltaglider/cache".
|
||||
- **kwargs**: Additional arguments passed to `DeltaService`:
|
||||
- **tool_version** (`str`): Version string for metadata. Default: "deltaglider/0.1.0"
|
||||
- **max_ratio** (`float`): Maximum acceptable delta/file ratio. Default: 0.5
|
||||
|
||||
**Security Note**: DeltaGlider automatically uses ephemeral, process-isolated cache (`/tmp/deltaglider-*`) that is cleaned up on exit. No configuration needed.
|
||||
|
||||
#### Returns
|
||||
|
||||
`DeltaGliderClient`: Configured client instance ready for use.
|
||||
@@ -48,11 +48,8 @@ client = create_client()
|
||||
# Custom endpoint for MinIO
|
||||
client = create_client(endpoint_url="http://localhost:9000")
|
||||
|
||||
# Debug mode with custom cache
|
||||
client = create_client(
|
||||
log_level="DEBUG",
|
||||
cache_dir="/var/cache/deltaglider"
|
||||
)
|
||||
# Debug mode
|
||||
client = create_client(log_level="DEBUG")
|
||||
|
||||
# Custom delta ratio threshold
|
||||
client = create_client(max_ratio=0.3) # Only use delta if <30% of original
|
||||
@@ -94,7 +91,7 @@ def list_objects(
|
||||
StartAfter: Optional[str] = None,
|
||||
FetchMetadata: bool = False,
|
||||
**kwargs
|
||||
) -> ListObjectsResponse
|
||||
) -> dict[str, Any]
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
@@ -117,19 +114,32 @@ The method intelligently optimizes performance by:
|
||||
2. Only fetching metadata for delta files when explicitly requested
|
||||
3. Supporting efficient pagination for large buckets
|
||||
|
||||
##### Returns
|
||||
|
||||
boto3-compatible dict with:
|
||||
- **Contents** (`list[dict]`): List of S3Object dicts with Key, Size, LastModified, Metadata
|
||||
- **CommonPrefixes** (`list[dict]`): Optional list of common prefixes (folders)
|
||||
- **IsTruncated** (`bool`): Whether more results are available
|
||||
- **NextContinuationToken** (`str`): Token for next page
|
||||
- **KeyCount** (`int`): Number of keys returned
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Fast listing for UI display (no metadata fetching)
|
||||
response = client.list_objects(Bucket='releases')
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
# Paginated listing for large buckets
|
||||
response = client.list_objects(Bucket='releases', MaxKeys=100)
|
||||
while response.is_truncated:
|
||||
while response.get('IsTruncated'):
|
||||
for obj in response['Contents']:
|
||||
print(obj['Key'])
|
||||
response = client.list_objects(
|
||||
Bucket='releases',
|
||||
MaxKeys=100,
|
||||
ContinuationToken=response.next_continuation_token
|
||||
ContinuationToken=response.get('NextContinuationToken')
|
||||
)
|
||||
|
||||
# Get detailed compression stats (slower, only for analytics)
|
||||
@@ -137,37 +147,73 @@ response = client.list_objects(
|
||||
Bucket='releases',
|
||||
FetchMetadata=True # Only fetches for delta files
|
||||
)
|
||||
for obj in response['Contents']:
|
||||
metadata = obj.get('Metadata', {})
|
||||
if metadata.get('deltaglider-is-delta') == 'true':
|
||||
compression = metadata.get('deltaglider-compression-ratio', 'unknown')
|
||||
print(f"{obj['Key']}: {compression} compression")
|
||||
```
|
||||
|
||||
#### `get_bucket_stats`
|
||||
|
||||
Get statistics for a bucket with optional detailed compression metrics.
|
||||
Get statistics for a bucket with optional detailed compression metrics. Results are cached inside the bucket for performance.
|
||||
|
||||
```python
|
||||
def get_bucket_stats(
|
||||
self,
|
||||
bucket: str,
|
||||
detailed_stats: bool = False
|
||||
mode: Literal["quick", "sampled", "detailed"] = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats
|
||||
```
|
||||
|
||||
##### Parameters
|
||||
|
||||
- **bucket** (`str`): S3 bucket name.
|
||||
- **detailed_stats** (`bool`): If True, fetch accurate compression ratios for delta files. Default: False.
|
||||
- With `detailed_stats=False`: ~50ms for any bucket size (LIST calls only)
|
||||
- With `detailed_stats=True`: ~2-3s per 1000 objects (adds HEAD calls for delta files)
|
||||
- **mode** (`Literal[...]`): Accuracy/cost trade-off:
|
||||
- `"quick"` (default): LIST-only scan; compression ratios for deltas are estimated.
|
||||
- `"sampled"`: HEAD one delta per deltaspace and reuse the ratio.
|
||||
- `"detailed"`: HEAD every delta object; slowest but exact.
|
||||
- **use_cache** (`bool`): If True, read/write `.deltaglider/stats_{mode}.json` in the bucket for reuse.
|
||||
- **refresh_cache** (`bool`): Force recomputation even if a cache file is valid.
|
||||
|
||||
##### Caching Behavior
|
||||
|
||||
- Stats are cached per mode directly inside the bucket at `.deltaglider/stats_{mode}.json`.
|
||||
- Every call validates cache freshness via a quick LIST (object count + compressed size).
|
||||
- `refresh_cache=True` skips cache validation and recomputes immediately.
|
||||
- `use_cache=False` bypasses both reading and writing cache artifacts.
|
||||
|
||||
##### Returns
|
||||
|
||||
`BucketStats`: Dataclass containing:
|
||||
- **bucket** (`str`): Bucket name
|
||||
- **object_count** (`int`): Total number of objects
|
||||
- **total_size** (`int`): Original size in bytes (before compression)
|
||||
- **compressed_size** (`int`): Actual stored size in bytes
|
||||
- **space_saved** (`int`): Bytes saved through compression
|
||||
- **average_compression_ratio** (`float`): Average compression ratio (0.0-1.0)
|
||||
- **delta_objects** (`int`): Number of delta-compressed objects
|
||||
- **direct_objects** (`int`): Number of directly stored objects
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Quick stats for dashboard display
|
||||
# Quick stats (fast LIST-only)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
print(f"Objects: {stats.object_count}, Size: {stats.total_size}")
|
||||
|
||||
# Detailed stats for analytics (slower but accurate)
|
||||
stats = client.get_bucket_stats('releases', detailed_stats=True)
|
||||
print(f"Compression ratio: {stats.average_compression_ratio:.1%}")
|
||||
# Sampled/detailed modes for analytics
|
||||
sampled = client.get_bucket_stats('releases', mode='sampled')
|
||||
detailed = client.get_bucket_stats('releases', mode='detailed')
|
||||
print(f"Compression ratio: {detailed.average_compression_ratio:.1%}")
|
||||
|
||||
# Force refresh if an external tool modified the bucket
|
||||
fresh = client.get_bucket_stats('releases', mode='quick', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely when running ad-hoc diagnostics
|
||||
uncached = client.get_bucket_stats('releases', use_cache=False)
|
||||
```
|
||||
|
||||
#### `put_object`
|
||||
@@ -300,15 +346,18 @@ def list_buckets(
|
||||
|
||||
##### Returns
|
||||
|
||||
Dict with list of buckets and owner information (identical to boto3).
|
||||
Dict with the same structure boto3 returns (`Buckets`, `Owner`, `ResponseMetadata`). DeltaGlider does not inject additional metadata; use `get_bucket_stats()` for compression data.
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# List all buckets
|
||||
response = client.list_buckets()
|
||||
for bucket in response['Buckets']:
|
||||
print(f"{bucket['Name']} - Created: {bucket['CreationDate']}")
|
||||
|
||||
# Combine with get_bucket_stats for deeper insights
|
||||
stats = client.get_bucket_stats('releases', mode='detailed')
|
||||
print(f"releases -> {stats.object_count} objects, {stats.space_saved/(1024**3):.2f} GB saved")
|
||||
```
|
||||
|
||||
### Simple API Methods
|
||||
@@ -445,20 +494,54 @@ else:
|
||||
# Re-upload or investigate
|
||||
```
|
||||
|
||||
#### `lifecycle_policy`
|
||||
### Cache Management Methods
|
||||
|
||||
Set lifecycle policy for S3 prefix (placeholder for future implementation).
|
||||
#### `clear_cache`
|
||||
|
||||
Clear all locally cached reference files.
|
||||
|
||||
```python
|
||||
def lifecycle_policy(
|
||||
self,
|
||||
s3_prefix: str,
|
||||
days_before_archive: int = 30,
|
||||
days_before_delete: int = 90
|
||||
) -> None
|
||||
def clear_cache(self) -> None
|
||||
```
|
||||
|
||||
**Note**: This method is a placeholder for future S3 lifecycle policy management.
|
||||
##### Description
|
||||
|
||||
Removes all cached reference files from the local filesystem. Useful for:
|
||||
- Freeing disk space in long-running applications
|
||||
- Ensuring the next upload/download fetches fresh references from S3
|
||||
- Resetting cache after configuration or credential changes
|
||||
- Testing and development workflows
|
||||
|
||||
##### Cache Scope
|
||||
|
||||
- **Reference Cache**: Binary reference files stored in `/tmp/deltaglider-*/`
|
||||
- Encrypted at rest with ephemeral keys
|
||||
- Content-addressed storage (SHA256-based filenames)
|
||||
- Automatically cleaned up on process exit
|
||||
- **Statistics Cache**: Stored inside the bucket as `.deltaglider/stats_{mode}.json`.
|
||||
- `clear_cache()` does *not* remove these S3 objects; use `refresh_cache=True` or delete the objects manually if needed.
|
||||
|
||||
##### Examples
|
||||
|
||||
```python
|
||||
# Long-running application
|
||||
client = create_client()
|
||||
|
||||
# Work with files
|
||||
for i in range(1000):
|
||||
client.upload(f"file_{i}.zip", "s3://bucket/")
|
||||
|
||||
# Periodic cache cleanup to prevent disk buildup
|
||||
if i % 100 == 0:
|
||||
client.clear_cache()
|
||||
|
||||
# Force fresh statistics after external changes (skip cache instead of clearing)
|
||||
stats_before = client.get_bucket_stats('releases')
|
||||
stats_after = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Development workflow
|
||||
client.clear_cache() # Start with clean state
|
||||
```
|
||||
|
||||
## UploadSummary
|
||||
|
||||
@@ -708,9 +791,10 @@ DeltaGlider respects these environment variables:
|
||||
### DeltaGlider Configuration
|
||||
|
||||
- **DG_LOG_LEVEL**: Logging level (DEBUG, INFO, WARNING, ERROR)
|
||||
- **DG_CACHE_DIR**: Local cache directory
|
||||
- **DG_MAX_RATIO**: Default maximum delta ratio
|
||||
|
||||
**Note**: Cache is automatically managed (ephemeral, process-isolated) and requires no configuration.
|
||||
|
||||
### Example
|
||||
|
||||
```bash
|
||||
@@ -721,10 +805,9 @@ export AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
|
||||
# Configure DeltaGlider
|
||||
export DG_LOG_LEVEL=DEBUG
|
||||
export DG_CACHE_DIR=/var/cache/deltaglider
|
||||
export DG_MAX_RATIO=0.3
|
||||
|
||||
# Now use normally
|
||||
# Now use normally (cache managed automatically)
|
||||
python my_script.py
|
||||
```
|
||||
|
||||
@@ -816,4 +899,4 @@ client = create_client(log_level="DEBUG")
|
||||
|
||||
- **GitHub Issues**: [github.com/beshu-tech/deltaglider/issues](https://github.com/beshu-tech/deltaglider/issues)
|
||||
- **Documentation**: [github.com/beshu-tech/deltaglider](https://github.com/beshu-tech/deltaglider)
|
||||
- **PyPI Package**: [pypi.org/project/deltaglider](https://pypi.org/project/deltaglider)
|
||||
- **PyPI Package**: [pypi.org/project/deltaglider](https://pypi.org/project/deltaglider)
|
||||
|
||||
@@ -5,15 +5,17 @@ Real-world examples and patterns for using DeltaGlider in production application
|
||||
## Table of Contents
|
||||
|
||||
1. [Performance-Optimized Bucket Listing](#performance-optimized-bucket-listing)
|
||||
2. [Bucket Management](#bucket-management)
|
||||
3. [Software Release Management](#software-release-management)
|
||||
4. [Database Backup System](#database-backup-system)
|
||||
5. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
|
||||
6. [Container Registry Storage](#container-registry-storage)
|
||||
7. [Machine Learning Model Versioning](#machine-learning-model-versioning)
|
||||
8. [Game Asset Distribution](#game-asset-distribution)
|
||||
9. [Log Archive Management](#log-archive-management)
|
||||
10. [Multi-Region Replication](#multi-region-replication)
|
||||
2. [Bucket Statistics and Monitoring](#bucket-statistics-and-monitoring)
|
||||
3. [Session-Level Cache Management](#session-level-cache-management)
|
||||
4. [Bucket Management](#bucket-management)
|
||||
5. [Software Release Management](#software-release-management)
|
||||
6. [Database Backup System](#database-backup-system)
|
||||
7. [CI/CD Pipeline Integration](#cicd-pipeline-integration)
|
||||
8. [Container Registry Storage](#container-registry-storage)
|
||||
9. [Machine Learning Model Versioning](#machine-learning-model-versioning)
|
||||
10. [Game Asset Distribution](#game-asset-distribution)
|
||||
11. [Log Archive Management](#log-archive-management)
|
||||
12. [Multi-Region Replication](#multi-region-replication)
|
||||
|
||||
## Performance-Optimized Bucket Listing
|
||||
|
||||
@@ -23,6 +25,7 @@ DeltaGlider's smart `list_objects` method eliminates the N+1 query problem by in
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from deltaglider.client_models import BucketStats
|
||||
import time
|
||||
|
||||
client = create_client()
|
||||
@@ -39,19 +42,19 @@ def fast_bucket_listing(bucket: str):
|
||||
|
||||
# Process objects for display
|
||||
items = []
|
||||
for obj in response.contents:
|
||||
for obj in response['Contents']:
|
||||
metadata = obj.get("Metadata", {})
|
||||
items.append({
|
||||
"key": obj.key,
|
||||
"size": obj.size,
|
||||
"last_modified": obj.last_modified,
|
||||
"is_delta": obj.is_delta, # Determined from filename
|
||||
# No compression_ratio - would require HEAD request
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"last_modified": obj["LastModified"],
|
||||
"is_delta": metadata.get("deltaglider-is-delta") == "true",
|
||||
})
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"Listed {len(items)} objects in {elapsed*1000:.0f}ms")
|
||||
|
||||
return items, response.next_continuation_token
|
||||
return items, response.get("NextContinuationToken")
|
||||
|
||||
# Example: List first page
|
||||
items, next_token = fast_bucket_listing('releases')
|
||||
@@ -73,12 +76,12 @@ def paginated_listing(bucket: str, page_size: int = 50):
|
||||
FetchMetadata=False # Keep it fast
|
||||
)
|
||||
|
||||
all_objects.extend(response.contents)
|
||||
all_objects.extend(response["Contents"])
|
||||
|
||||
if not response.is_truncated:
|
||||
if not response.get("IsTruncated"):
|
||||
break
|
||||
|
||||
continuation_token = response.next_continuation_token
|
||||
continuation_token = response.get("NextContinuationToken")
|
||||
print(f"Fetched {len(all_objects)} objects so far...")
|
||||
|
||||
return all_objects
|
||||
@@ -94,8 +97,8 @@ print(f"Total objects: {len(all_objects)}")
|
||||
def dashboard_with_stats(bucket: str):
|
||||
"""Dashboard view with optional detailed stats."""
|
||||
|
||||
# Quick overview (fast - no metadata)
|
||||
stats = client.get_bucket_stats(bucket, detailed_stats=False)
|
||||
# Quick overview (fast LIST-only)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
print(f"Quick Stats for {bucket}:")
|
||||
print(f" Total Objects: {stats.object_count}")
|
||||
@@ -106,7 +109,7 @@ def dashboard_with_stats(bucket: str):
|
||||
|
||||
# Detailed compression analysis (slower - fetches metadata for deltas only)
|
||||
if stats.delta_objects > 0:
|
||||
detailed_stats = client.get_bucket_stats(bucket, detailed_stats=True)
|
||||
detailed_stats = client.get_bucket_stats(bucket, mode='detailed')
|
||||
print(f"\nDetailed Compression Stats:")
|
||||
print(f" Average Compression: {detailed_stats.average_compression_ratio:.1%}")
|
||||
print(f" Space Saved: {detailed_stats.space_saved / (1024**3):.2f} GB")
|
||||
@@ -129,11 +132,25 @@ def compression_analysis(bucket: str, prefix: str = ""):
|
||||
)
|
||||
|
||||
# Analyze compression effectiveness
|
||||
delta_files = [obj for obj in response.contents if obj.is_delta]
|
||||
delta_files: list[dict[str, float | int | str]] = []
|
||||
for obj in response["Contents"]:
|
||||
metadata = obj.get("Metadata", {})
|
||||
if metadata.get("deltaglider-is-delta") != "true":
|
||||
continue
|
||||
original_size = int(metadata.get("deltaglider-original-size", obj["Size"]))
|
||||
compression_ratio = float(metadata.get("deltaglider-compression-ratio", 0.0))
|
||||
delta_files.append(
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"original": original_size,
|
||||
"compressed": obj["Size"],
|
||||
"ratio": compression_ratio,
|
||||
}
|
||||
)
|
||||
|
||||
if delta_files:
|
||||
total_original = sum(obj.original_size for obj in delta_files)
|
||||
total_compressed = sum(obj.compressed_size for obj in delta_files)
|
||||
total_original = sum(obj["original"] for obj in delta_files)
|
||||
total_compressed = sum(obj["compressed"] for obj in delta_files)
|
||||
avg_ratio = (total_original - total_compressed) / total_original
|
||||
|
||||
print(f"Compression Analysis for {prefix or 'all files'}:")
|
||||
@@ -143,11 +160,11 @@ def compression_analysis(bucket: str, prefix: str = ""):
|
||||
print(f" Average Compression: {avg_ratio:.1%}")
|
||||
|
||||
# Find best and worst compression
|
||||
best = max(delta_files, key=lambda x: x.compression_ratio or 0)
|
||||
worst = min(delta_files, key=lambda x: x.compression_ratio or 1)
|
||||
best = max(delta_files, key=lambda x: x["ratio"])
|
||||
worst = min(delta_files, key=lambda x: x["ratio"])
|
||||
|
||||
print(f" Best Compression: {best.key} ({best.compression_ratio:.1%})")
|
||||
print(f" Worst Compression: {worst.key} ({worst.compression_ratio:.1%})")
|
||||
print(f" Best Compression: {best['key']} ({best['ratio']:.1%})")
|
||||
print(f" Worst Compression: {worst['key']} ({worst['ratio']:.1%})")
|
||||
|
||||
# Example: Analyze v2.0 releases
|
||||
compression_analysis('releases', 'v2.0/')
|
||||
@@ -178,7 +195,11 @@ def performance_comparison(bucket: str):
|
||||
)
|
||||
time_detailed = (time.time() - start) * 1000
|
||||
|
||||
delta_count = sum(1 for obj in response_fast.contents if obj.is_delta)
|
||||
delta_count = sum(
|
||||
1
|
||||
for obj in response_fast["Contents"]
|
||||
if obj.get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
)
|
||||
|
||||
print(f"Performance Comparison for {bucket}:")
|
||||
print(f" Fast Listing: {time_fast:.0f}ms (1 API call)")
|
||||
@@ -199,6 +220,291 @@ performance_comparison('releases')
|
||||
|
||||
2. **Never Fetch for Non-Deltas**: The SDK automatically skips metadata fetching for non-delta files even when `FetchMetadata=True`.
|
||||
|
||||
## Bucket Statistics and Monitoring
|
||||
|
||||
DeltaGlider provides powerful bucket statistics with S3-backed caching for performance.
|
||||
|
||||
### Quick Dashboard Stats (Cached)
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
def show_bucket_dashboard(bucket: str):
|
||||
"""Display real-time bucket statistics with caching."""
|
||||
|
||||
# First call: computes stats (~50ms)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
# Second call: instant (cached)
|
||||
stats = client.get_bucket_stats(bucket)
|
||||
|
||||
print(f"Dashboard for {stats.bucket}")
|
||||
print(f"=" * 60)
|
||||
print(f"Total Objects: {stats.object_count:,}")
|
||||
print(f" Delta Objects: {stats.delta_objects:,}")
|
||||
print(f" Direct Objects: {stats.direct_objects:,}")
|
||||
print()
|
||||
print(f"Original Size: {stats.total_size / (1024**3):.2f} GB")
|
||||
print(f"Stored Size: {stats.compressed_size / (1024**3):.2f} GB")
|
||||
print(f"Space Saved: {stats.space_saved / (1024**3):.2f} GB")
|
||||
print(f"Compression Ratio: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
# Example: Show stats for multiple buckets (each cached separately)
|
||||
for bucket_name in ['releases', 'backups', 'archives']:
|
||||
show_bucket_dashboard(bucket_name)
|
||||
```
|
||||
|
||||
### Detailed Compression Analysis
|
||||
|
||||
```python
|
||||
def detailed_compression_report(bucket: str):
|
||||
"""Generate detailed compression report with accurate ratios."""
|
||||
|
||||
# Detailed stats fetch metadata for delta files (slower, accurate)
|
||||
stats = client.get_bucket_stats(bucket, mode='detailed')
|
||||
|
||||
efficiency = (stats.space_saved / stats.total_size * 100) if stats.total_size > 0 else 0
|
||||
|
||||
print(f"Detailed Compression Report: {stats.bucket}")
|
||||
print(f"=" * 60)
|
||||
print(f"Object Distribution:")
|
||||
print(f" Total: {stats.object_count:,}")
|
||||
print(f" Delta-Compressed: {stats.delta_objects:,} ({stats.delta_objects/stats.object_count*100:.1f}%)")
|
||||
print(f" Direct Storage: {stats.direct_objects:,} ({stats.direct_objects/stats.object_count*100:.1f}%)")
|
||||
print()
|
||||
print(f"Storage Efficiency:")
|
||||
print(f" Original Data: {stats.total_size / (1024**3):.2f} GB")
|
||||
print(f" Actual Storage: {stats.compressed_size / (1024**3):.2f} GB")
|
||||
print(f" Space Saved: {stats.space_saved / (1024**3):.2f} GB")
|
||||
print(f" Efficiency: {efficiency:.1f}%")
|
||||
print(f" Avg Compression: {stats.average_compression_ratio:.2%}")
|
||||
|
||||
# Calculate estimated monthly costs (example: $0.023/GB S3 Standard)
|
||||
cost_without = stats.total_size / (1024**3) * 0.023
|
||||
cost_with = stats.compressed_size / (1024**3) * 0.023
|
||||
monthly_savings = cost_without - cost_with
|
||||
|
||||
print()
|
||||
print(f"Estimated Monthly S3 Costs ($0.023/GB):")
|
||||
print(f" Without DeltaGlider: ${cost_without:.2f}")
|
||||
print(f" With DeltaGlider: ${cost_with:.2f}")
|
||||
print(f" Monthly Savings: ${monthly_savings:.2f}")
|
||||
|
||||
# Example: Detailed report
|
||||
detailed_compression_report('releases')
|
||||
```
|
||||
|
||||
### List Buckets with Cached Stats
|
||||
|
||||
```python
|
||||
def list_buckets_with_stats():
|
||||
"""List buckets and augment with cached stats fetched on demand."""
|
||||
|
||||
response = client.list_buckets()
|
||||
stats_cache: dict[str, BucketStats | None] = {}
|
||||
|
||||
def ensure_stats(bucket_name: str) -> BucketStats | None:
|
||||
if bucket_name not in stats_cache:
|
||||
try:
|
||||
stats_cache[bucket_name] = client.get_bucket_stats(bucket_name)
|
||||
except Exception:
|
||||
stats_cache[bucket_name] = None
|
||||
return stats_cache[bucket_name]
|
||||
|
||||
print("All Buckets:")
|
||||
print(f"{'Name':<30} {'Objects':<10} {'Compression':<15} {'Cached'}")
|
||||
print("=" * 70)
|
||||
|
||||
for bucket in response['Buckets']:
|
||||
name = bucket['Name']
|
||||
stats = ensure_stats(name)
|
||||
|
||||
if stats:
|
||||
obj_count = f"{stats.object_count:,}"
|
||||
compression = f"{stats.average_compression_ratio:.1%}"
|
||||
cached = "✓ (S3 cache)"
|
||||
else:
|
||||
obj_count = "N/A"
|
||||
compression = "N/A"
|
||||
cached = "✗"
|
||||
|
||||
print(f"{name:<30} {obj_count:<10} {compression:<15} {cached}")
|
||||
|
||||
# Example: List with stats
|
||||
list_buckets_with_stats()
|
||||
```
|
||||
|
||||
### Monitoring Dashboard (Real-Time)
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def monitoring_dashboard(buckets: list[str], refresh_seconds: int = 60):
|
||||
"""Real-time monitoring dashboard with periodic refresh."""
|
||||
|
||||
while True:
|
||||
print("\033[2J\033[H") # Clear screen
|
||||
print(f"DeltaGlider Monitoring Dashboard - {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print("=" * 80)
|
||||
|
||||
for bucket_name in buckets:
|
||||
# Get cached stats (instant) or compute fresh
|
||||
stats = client.get_bucket_stats(bucket_name)
|
||||
|
||||
print(f"\n{bucket_name}:")
|
||||
print(f" Objects: {stats.object_count:,} | "
|
||||
f"Delta: {stats.delta_objects:,} | "
|
||||
f"Direct: {stats.direct_objects:,}")
|
||||
print(f" Size: {stats.compressed_size/(1024**3):.2f} GB | "
|
||||
f"Saved: {stats.space_saved/(1024**3):.2f} GB | "
|
||||
f"Compression: {stats.average_compression_ratio:.1%}")
|
||||
|
||||
print(f"\n{'=' * 80}")
|
||||
print(f"Refreshing in {refresh_seconds} seconds... (Ctrl+C to exit)")
|
||||
|
||||
time.sleep(refresh_seconds)
|
||||
|
||||
# Stats freshness is handled by the S3-backed cache validation on each call;
# clear the local reference cache here only to bound disk usage over time.
client.clear_cache()
|
||||
|
||||
# Example: Monitor key buckets
|
||||
try:
|
||||
monitoring_dashboard(['releases', 'backups', 'archives'], refresh_seconds=30)
|
||||
except KeyboardInterrupt:
|
||||
print("\nMonitoring stopped.")
|
||||
```
|
||||
|
||||
## Session-Level Cache Management
|
||||
|
||||
DeltaGlider maintains an encrypted reference cache for optimal performance in long-running applications.
|
||||
|
||||
### Long-Running Application Pattern
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
import time
|
||||
|
||||
def long_running_upload_service():
|
||||
"""Upload service with periodic cache cleanup."""
|
||||
|
||||
client = create_client()
|
||||
processed_count = 0
|
||||
|
||||
while True:
|
||||
# Simulate file processing
|
||||
files_to_upload = get_pending_files() # Your file queue
|
||||
|
||||
for file_path in files_to_upload:
|
||||
try:
|
||||
summary = client.upload(file_path, "s3://releases/")
|
||||
processed_count += 1
|
||||
|
||||
print(f"Uploaded {file_path}: {summary.savings_percent:.0f}% saved")
|
||||
|
||||
# Periodic cache cleanup (every 100 files)
|
||||
if processed_count % 100 == 0:
|
||||
client.clear_cache()
|
||||
print(f"Cache cleared after {processed_count} files")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error uploading {file_path}: {e}")
|
||||
|
||||
time.sleep(60) # Check for new files every minute
|
||||
|
||||
# Example: Run upload service
|
||||
# long_running_upload_service()
|
||||
```
|
||||
|
||||
### Cache Invalidation After External Changes
|
||||
|
||||
```python
|
||||
def handle_external_bucket_changes(bucket: str):
|
||||
"""Refresh statistics after external tools modify bucket."""
|
||||
|
||||
# Get initial stats (cached)
|
||||
stats_before = client.get_bucket_stats(bucket)
|
||||
print(f"Before: {stats_before.object_count} objects")
|
||||
|
||||
# External process modifies bucket
|
||||
print("External backup tool running...")
|
||||
run_external_backup_tool(bucket) # Your external tool
|
||||
|
||||
# Force a recompute of the cached stats
|
||||
stats_after = client.get_bucket_stats(bucket, refresh_cache=True)
|
||||
print(f"After: {stats_after.object_count} objects")
|
||||
print(f"Added: {stats_after.object_count - stats_before.object_count} objects")
|
||||
|
||||
# Example usage
|
||||
handle_external_bucket_changes('backups')
|
||||
```
|
||||
|
||||
### Testing with Clean Cache
|
||||
|
||||
```python
|
||||
import pytest
|
||||
from deltaglider import create_client
|
||||
|
||||
def test_upload_workflow():
|
||||
"""Test with clean cache state."""
|
||||
|
||||
client = create_client()
|
||||
client.clear_cache() # Start with clean state
|
||||
|
||||
# Test first upload (no reference exists)
|
||||
summary1 = client.upload("file1.zip", "s3://test-bucket/prefix/")
|
||||
assert not summary1.is_delta # First file is reference
|
||||
|
||||
# Test subsequent upload (uses cached reference)
|
||||
summary2 = client.upload("file2.zip", "s3://test-bucket/prefix/")
|
||||
assert summary2.is_delta # Should use delta
|
||||
|
||||
# Clear and test again
|
||||
client.clear_cache()
|
||||
summary3 = client.upload("file3.zip", "s3://test-bucket/prefix/")
|
||||
assert summary3.is_delta # Still delta (reference in S3)
|
||||
|
||||
# Run test
|
||||
# test_upload_workflow()
|
||||
```
|
||||
|
||||
### Cache Performance Monitoring
|
||||
|
||||
```python
|
||||
import time
|
||||
|
||||
def measure_cache_performance(bucket: str):
|
||||
"""Measure performance impact of caching."""
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Test 1: Cold cache
|
||||
start = time.time()
|
||||
stats1 = client.get_bucket_stats(bucket, mode='detailed', refresh_cache=True)
|
||||
cold_time = (time.time() - start) * 1000
|
||||
|
||||
# Test 2: Warm cache
|
||||
start = time.time()
|
||||
stats2 = client.get_bucket_stats(bucket, mode='detailed')
|
||||
warm_time = (time.time() - start) * 1000
|
||||
|
||||
# Test 3: Quick stats from detailed cache
|
||||
start = time.time()
|
||||
stats3 = client.get_bucket_stats(bucket, mode='quick')
|
||||
reuse_time = (time.time() - start) * 1000
|
||||
|
||||
print(f"Cache Performance for {bucket}:")
|
||||
print(f" Cold Cache (detailed): {cold_time:.0f}ms")
|
||||
print(f" Warm Cache (detailed): {warm_time:.0f}ms")
|
||||
print(f" Cache Reuse (quick): {reuse_time:.0f}ms")
|
||||
print(f" Speedup (detailed): {cold_time/warm_time:.1f}x")
|
||||
print(f" Speedup (reuse): {cold_time/reuse_time:.1f}x")
|
||||
|
||||
# Example: Measure cache performance
|
||||
measure_cache_performance('releases')
|
||||
```
|
||||
|
||||
3. **Use Pagination**: For large buckets, use `MaxKeys` and `ContinuationToken` to paginate results.
|
||||
|
||||
4. **Cache Results**: If you need metadata frequently, consider caching the results to avoid repeated HEAD requests.
|
||||
@@ -1389,4 +1695,4 @@ files_to_upload = [
|
||||
results = uploader.upload_batch(files_to_upload)
|
||||
```
|
||||
|
||||
These examples demonstrate real-world usage patterns for DeltaGlider across various domains. Each example includes error handling, monitoring, and best practices for production deployments.
|
||||
These examples demonstrate real-world usage patterns for DeltaGlider across various domains. Each example includes error handling, monitoring, and best practices for production deployments.
|
||||
|
||||
@@ -69,6 +69,42 @@ Or via environment variable:
|
||||
export AWS_ENDPOINT_URL=http://minio.local:9000
|
||||
```
|
||||
|
||||
### DeltaGlider Configuration
|
||||
|
||||
DeltaGlider supports the following environment variables:
|
||||
|
||||
**Logging & Performance**:
|
||||
- `DG_LOG_LEVEL`: Logging level (default: `INFO`, options: `DEBUG`, `INFO`, `WARNING`, `ERROR`)
|
||||
- `DG_MAX_RATIO`: Maximum delta/file ratio (default: `0.5`, range: `0.0-1.0`)
|
||||
- **See [DG_MAX_RATIO.md](../DG_MAX_RATIO.md) for complete tuning guide**
|
||||
- Controls when to use delta compression vs. direct storage
|
||||
- Lower (0.2-0.3) = conservative, only high-quality compression
|
||||
- Higher (0.6-0.7) = permissive, accept modest savings
|
||||
|
||||
**Cache Configuration**:
|
||||
- `DG_CACHE_BACKEND`: Cache backend type (default: `filesystem`, options: `filesystem`, `memory`)
|
||||
- `DG_CACHE_MEMORY_SIZE_MB`: Memory cache size in MB (default: `100`)
|
||||
- `DG_CACHE_ENCRYPTION_KEY`: Optional base64-encoded Fernet key for persistent encryption
|
||||
|
||||
**Security**:
|
||||
- Encryption is **always enabled** (cannot be disabled)
|
||||
- Ephemeral encryption keys per process (forward secrecy)
|
||||
- Corrupted cache files automatically deleted
|
||||
- Set `DG_CACHE_ENCRYPTION_KEY` only for cross-process cache sharing
|
||||
|
||||
**Example**:
|
||||
```bash
|
||||
# Use memory cache for faster performance in CI/CD
|
||||
export DG_CACHE_BACKEND=memory
|
||||
export DG_CACHE_MEMORY_SIZE_MB=500
|
||||
|
||||
# Enable debug logging
|
||||
export DG_LOG_LEVEL=DEBUG
|
||||
|
||||
# Adjust delta compression threshold
|
||||
export DG_MAX_RATIO=0.3 # More aggressive compression
|
||||
```
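If you do set a persistent key for cross-process cache sharing, it must be a base64-encoded Fernet key. A minimal sketch for generating one with the `cryptography` package (already a DeltaGlider dependency); export the printed value as `DG_CACHE_ENCRYPTION_KEY` in every process that should share the cache:

```python
from cryptography.fernet import Fernet

# Prints a base64-encoded Fernet key suitable for DG_CACHE_ENCRYPTION_KEY.
print(Fernet.generate_key().decode())
```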
|
||||
|
||||
## Your First Upload
|
||||
|
||||
### Basic Example
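A minimal sketch based on the `create_client` and `upload` API used in the other examples; the file and bucket names are illustrative:

```python
from deltaglider import create_client

client = create_client()

# Upload an archive; DeltaGlider stores it either as a full object or as a
# delta against an existing reference in the same prefix.
summary = client.upload("myapp-v1.2.3.zip", "s3://releases/myapp/")
print(f"Stored as delta: {summary.is_delta}")
```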
|
||||
|
||||
64 examples/boto3_compatible_types.py Normal file
@@ -0,0 +1,64 @@
|
||||
"""Example: Using boto3-compatible responses without importing boto3.
|
||||
|
||||
This demonstrates how DeltaGlider provides full type safety and boto3 compatibility
|
||||
without requiring boto3 imports in user code.
|
||||
|
||||
As of v5.0.0, DeltaGlider returns plain dicts (not custom dataclasses) that are
|
||||
100% compatible with boto3 S3 responses. You get IDE autocomplete through TypedDict
|
||||
type hints without any runtime overhead.
|
||||
"""
|
||||
|
||||
from deltaglider import ListObjectsV2Response, S3Object, create_client
|
||||
|
||||
# Create client (no boto3 import needed!)
|
||||
client = create_client()
|
||||
|
||||
# Type hints work perfectly without boto3
|
||||
def process_files(bucket: str, prefix: str) -> None:
|
||||
"""Process files in S3 with full type safety."""
|
||||
# Return type is fully typed - IDE autocomplete works!
|
||||
response: ListObjectsV2Response = client.list_objects(
|
||||
Bucket=bucket, Prefix=prefix, Delimiter="/"
|
||||
)
|
||||
|
||||
# Response is a plain dict - 100% boto3-compatible
|
||||
# TypedDict provides autocomplete and type checking
|
||||
for obj in response["Contents"]:
|
||||
# obj is typed as S3Object - all fields have autocomplete!
|
||||
key: str = obj["Key"] # ✅ IDE knows this is str
|
||||
size: int = obj["Size"] # ✅ IDE knows this is int
|
||||
print(f"{key}: {size} bytes")
|
||||
|
||||
# DeltaGlider metadata is in the standard Metadata field
|
||||
metadata = obj.get("Metadata", {})
|
||||
if metadata.get("deltaglider-is-delta") == "true":
|
||||
compression = metadata.get("deltaglider-compression-ratio", "unknown")
|
||||
print(f" └─ Delta file (compression: {compression})")
|
||||
|
||||
# Optional fields work too
|
||||
for prefix_dict in response.get("CommonPrefixes", []):
|
||||
print(f"Directory: {prefix_dict['Prefix']}")
|
||||
|
||||
# Pagination info
|
||||
if response.get("IsTruncated"):
|
||||
next_token = response.get("NextContinuationToken")
|
||||
print(f"More results available, token: {next_token}")
|
||||
|
||||
|
||||
# This is 100% compatible with boto3 code!
|
||||
def works_with_boto3_or_deltaglider(s3_client) -> None:
|
||||
"""This function works with EITHER boto3 or DeltaGlider client."""
|
||||
# Because the response structure is identical!
|
||||
response = s3_client.list_objects(Bucket="my-bucket")
|
||||
|
||||
for obj in response["Contents"]:
|
||||
print(obj["Key"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example usage
|
||||
print("✅ Full type safety without boto3 imports!")
|
||||
print("✅ 100% compatible with boto3")
|
||||
print("✅ Drop-in replacement")
|
||||
print("✅ Plain dict responses (not custom dataclasses)")
|
||||
print("✅ DeltaGlider metadata in standard Metadata field")
|
||||
101 examples/credentials_example.py Normal file
@@ -0,0 +1,101 @@
|
||||
"""Example: Using explicit AWS credentials with DeltaGlider.
|
||||
|
||||
This example demonstrates how to pass AWS credentials directly to
|
||||
DeltaGlider's create_client() function, which is useful when:
|
||||
|
||||
1. You need to use different credentials than your environment default
|
||||
2. You're working with temporary credentials (session tokens)
|
||||
3. You want to avoid relying on environment variables
|
||||
4. You're implementing multi-tenant systems with different AWS accounts
|
||||
"""
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
|
||||
def example_basic_credentials():
|
||||
"""Use basic AWS credentials (access key + secret key)."""
|
||||
client = create_client(
|
||||
aws_access_key_id="AKIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
region_name="us-west-2",
|
||||
)
|
||||
|
||||
# Now use the client normally
|
||||
# client.put_object(Bucket="my-bucket", Key="file.zip", Body=b"data")
|
||||
print("✓ Created client with explicit credentials")
|
||||
|
||||
|
||||
def example_temporary_credentials():
|
||||
"""Use temporary AWS credentials (with session token)."""
|
||||
client = create_client(
|
||||
aws_access_key_id="ASIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
aws_session_token="FwoGZXIvYXdzEBEaDH...", # From STS
|
||||
region_name="us-east-1",
|
||||
)
|
||||
|
||||
print("✓ Created client with temporary credentials")
|
||||
|
||||
|
||||
def example_environment_credentials():
|
||||
"""Use default credential chain (environment variables, IAM role, etc.)."""
|
||||
# When credentials are omitted, DeltaGlider uses boto3's default credential chain:
|
||||
# 1. Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
|
||||
# 2. AWS credentials file (~/.aws/credentials)
|
||||
# 3. IAM role (for EC2 instances)
|
||||
client = create_client()
|
||||
|
||||
print("✓ Created client with default credential chain")
|
||||
|
||||
|
||||
def example_minio_credentials():
|
||||
"""Use credentials for MinIO or other S3-compatible services."""
|
||||
client = create_client(
|
||||
endpoint_url="http://localhost:9000",
|
||||
aws_access_key_id="minioadmin",
|
||||
aws_secret_access_key="minioadmin",
|
||||
)
|
||||
|
||||
print("✓ Created client for MinIO with custom credentials")
|
||||
|
||||
|
||||
def example_multi_tenant():
|
||||
"""Example: Different credentials for different tenants."""
|
||||
|
||||
# Tenant A uses one AWS account
|
||||
tenant_a_client = create_client(
|
||||
aws_access_key_id="TENANT_A_KEY",
|
||||
aws_secret_access_key="TENANT_A_SECRET",
|
||||
region_name="us-west-2",
|
||||
)
|
||||
|
||||
# Tenant B uses a different AWS account
|
||||
tenant_b_client = create_client(
|
||||
aws_access_key_id="TENANT_B_KEY",
|
||||
aws_secret_access_key="TENANT_B_SECRET",
|
||||
region_name="eu-west-1",
|
||||
)
|
||||
|
||||
print("✓ Created separate clients for multi-tenant scenario")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("DeltaGlider Credentials Examples\n" + "=" * 40)
|
||||
|
||||
print("\n1. Basic credentials:")
|
||||
example_basic_credentials()
|
||||
|
||||
print("\n2. Temporary credentials:")
|
||||
example_temporary_credentials()
|
||||
|
||||
print("\n3. Environment credentials:")
|
||||
example_environment_credentials()
|
||||
|
||||
print("\n4. MinIO credentials:")
|
||||
example_minio_credentials()
|
||||
|
||||
print("\n5. Multi-tenant scenario:")
|
||||
example_multi_tenant()
|
||||
|
||||
print("\n" + "=" * 40)
|
||||
print("All examples completed successfully!")
|
||||
@@ -49,9 +49,11 @@ classifiers = [
|
||||
]
|
||||
|
||||
dependencies = [
|
||||
"boto3>=1.35.0",
|
||||
"click>=8.1.0",
|
||||
"python-dateutil>=2.9.0",
|
||||
"boto3>=1.35.0,<2.0.0",
|
||||
"click>=8.1.0,<9.0.0",
|
||||
"cryptography>=42.0.0,<45.0.0",
|
||||
"python-dateutil>=2.9.0,<3.0.0",
|
||||
"requests>=2.32.0,<3.0.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -108,6 +110,7 @@ dev-dependencies = [
|
||||
"mypy>=1.13.0",
|
||||
"boto3-stubs[s3]>=1.35.0",
|
||||
"types-python-dateutil>=2.9.0",
|
||||
"types-requests>=2.32.0",
|
||||
"setuptools-scm>=8.0.0",
|
||||
]
|
||||
|
||||
|
||||
101 scripts/check_metadata.py Normal file
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check which delta files are missing metadata."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from deltaglider import create_client
|
||||
|
||||
|
||||
def check_bucket_metadata(bucket: str) -> None:
|
||||
"""Check all delta files in a bucket for missing metadata.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
"""
|
||||
client = create_client()
|
||||
|
||||
print(f"Checking delta files in bucket: {bucket}\n")
|
||||
print("=" * 80)
|
||||
|
||||
# List all objects
|
||||
response = client.service.storage.list_objects(bucket=bucket, max_keys=10000)
|
||||
|
||||
missing_metadata = []
|
||||
has_metadata = []
|
||||
total_delta_files = 0
|
||||
|
||||
for obj in response["objects"]:
|
||||
key = obj["key"]
|
||||
|
||||
# Only check .delta files
|
||||
if not key.endswith(".delta"):
|
||||
continue
|
||||
|
||||
total_delta_files += 1
|
||||
|
||||
# Get metadata
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
|
||||
if not obj_head:
|
||||
print(f"❌ {key}: Object not found")
|
||||
continue
|
||||
|
||||
metadata = obj_head.metadata
|
||||
|
||||
# Check for required metadata fields
|
||||
required_fields = ["file_size", "file_sha256", "ref_key", "ref_sha256", "delta_size"]
|
||||
missing_fields = [f for f in required_fields if f not in metadata]
|
||||
|
||||
if missing_fields:
|
||||
missing_metadata.append({
|
||||
"key": key,
|
||||
"missing_fields": missing_fields,
|
||||
"has_metadata": bool(metadata),
|
||||
"available_keys": list(metadata.keys()) if metadata else [],
|
||||
})
|
||||
status = "⚠️ MISSING"
|
||||
detail = f"missing: {', '.join(missing_fields)}"
|
||||
else:
|
||||
has_metadata.append(key)
|
||||
status = "✅ OK"
|
||||
detail = f"file_size={metadata.get('file_size')}"
|
||||
|
||||
print(f"{status} {key}")
|
||||
print(f" {detail}")
|
||||
if metadata:
|
||||
print(f" Available keys: {', '.join(metadata.keys())}")
|
||||
print()
|
||||
|
||||
# Summary
|
||||
print("=" * 80)
|
||||
print(f"\nSummary:")
|
||||
print(f" Total delta files: {total_delta_files}")
|
||||
print(f" With complete metadata: {len(has_metadata)} ({len(has_metadata)/total_delta_files*100:.1f}%)")
|
||||
print(f" Missing metadata: {len(missing_metadata)} ({len(missing_metadata)/total_delta_files*100:.1f}%)")
|
||||
|
||||
if missing_metadata:
|
||||
print(f"\n❌ Files with missing metadata:")
|
||||
for item in missing_metadata:
|
||||
print(f" - {item['key']}")
|
||||
print(f" Missing: {', '.join(item['missing_fields'])}")
|
||||
if item['available_keys']:
|
||||
print(f" Has: {', '.join(item['available_keys'])}")
|
||||
|
||||
print(f"\n💡 Recommendation:")
|
||||
print(f" These files should be re-uploaded to get proper metadata and accurate stats.")
|
||||
print(f" You can re-upload with: deltaglider cp <local-file> s3://{bucket}/<path>")
|
||||
else:
|
||||
print(f"\n✅ All delta files have complete metadata!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: python check_metadata.py <bucket-name>")
|
||||
sys.exit(1)
|
||||
|
||||
bucket_name = sys.argv[1]
|
||||
check_bucket_metadata(bucket_name)
|
||||
@@ -7,23 +7,36 @@ except ImportError:
|
||||
__version__ = "0.0.0+unknown"
|
||||
|
||||
# Import client API
|
||||
from .client import (
|
||||
from .client import DeltaGliderClient, create_client
|
||||
from .client_models import (
|
||||
BucketStats,
|
||||
CompressionEstimate,
|
||||
DeltaGliderClient,
|
||||
ListObjectsResponse,
|
||||
ObjectInfo,
|
||||
UploadSummary,
|
||||
create_client,
|
||||
)
|
||||
from .core import DeltaService, DeltaSpace, ObjectKey
|
||||
|
||||
# Import boto3-compatible type aliases (no boto3 import required!)
|
||||
from .types import (
|
||||
CopyObjectResponse,
|
||||
CreateBucketResponse,
|
||||
DeleteObjectResponse,
|
||||
DeleteObjectsResponse,
|
||||
GetObjectResponse,
|
||||
HeadObjectResponse,
|
||||
ListBucketsResponse,
|
||||
ListObjectsV2Response,
|
||||
PutObjectResponse,
|
||||
S3Object,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
# Client
|
||||
"DeltaGliderClient",
|
||||
"create_client",
|
||||
# Data classes
|
||||
# Data classes (legacy - will be deprecated in favor of TypedDict)
|
||||
"UploadSummary",
|
||||
"CompressionEstimate",
|
||||
"ObjectInfo",
|
||||
@@ -33,4 +46,15 @@ __all__ = [
|
||||
"DeltaService",
|
||||
"DeltaSpace",
|
||||
"ObjectKey",
|
||||
# boto3-compatible types (no boto3 import needed!)
|
||||
"ListObjectsV2Response",
|
||||
"PutObjectResponse",
|
||||
"GetObjectResponse",
|
||||
"DeleteObjectResponse",
|
||||
"DeleteObjectsResponse",
|
||||
"HeadObjectResponse",
|
||||
"ListBucketsResponse",
|
||||
"CreateBucketResponse",
|
||||
"CopyObjectResponse",
|
||||
"S3Object",
|
||||
]
|
||||
|
||||
@@ -1,19 +1,27 @@
|
||||
"""Adapters for DeltaGlider."""
|
||||
|
||||
from .cache_cas import ContentAddressedCache
|
||||
from .cache_encrypted import EncryptedCache
|
||||
from .cache_fs import FsCacheAdapter
|
||||
from .cache_memory import MemoryCache
|
||||
from .clock_utc import UtcClockAdapter
|
||||
from .diff_xdelta import XdeltaAdapter
|
||||
from .ec2_metadata import EC2MetadataAdapter
|
||||
from .hash_sha import Sha256Adapter
|
||||
from .logger_std import StdLoggerAdapter
|
||||
from .metrics_noop import NoopMetricsAdapter
|
||||
from .storage_s3 import S3StorageAdapter
|
||||
|
||||
__all__ = [
|
||||
"S3StorageAdapter",
|
||||
"XdeltaAdapter",
|
||||
"Sha256Adapter",
|
||||
"ContentAddressedCache",
|
||||
"EC2MetadataAdapter",
|
||||
"EncryptedCache",
|
||||
"FsCacheAdapter",
|
||||
"UtcClockAdapter",
|
||||
"StdLoggerAdapter",
|
||||
"MemoryCache",
|
||||
"NoopMetricsAdapter",
|
||||
"S3StorageAdapter",
|
||||
"Sha256Adapter",
|
||||
"StdLoggerAdapter",
|
||||
"UtcClockAdapter",
|
||||
"XdeltaAdapter",
|
||||
]
|
||||
|
||||
270 src/deltaglider/adapters/cache_cas.py Normal file
@@ -0,0 +1,270 @@
|
||||
"""Content-Addressed Storage (CAS) cache adapter.
|
||||
|
||||
This adapter stores cached references using their SHA256 hash as the filename,
|
||||
eliminating collision risks and enabling automatic deduplication.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for file locking
|
||||
if sys.platform != "win32":
|
||||
import fcntl
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
|
||||
class ContentAddressedCache(CachePort):
|
||||
"""Content-addressed storage cache using SHA256 as filename.
|
||||
|
||||
Key Features:
|
||||
- Zero collision risk (SHA256 namespace is the filename)
|
||||
- Automatic deduplication (same content = same filename)
|
||||
- No metadata tracking needed (self-describing)
|
||||
- Secure by design (tampering changes SHA, breaks lookup)
|
||||
|
||||
Storage Layout:
|
||||
- base_dir/
|
||||
- ab/
|
||||
- cd/
|
||||
- abcdef123456... (full SHA256 as filename)
|
||||
|
||||
The two-level directory structure (first 2 chars, next 2 chars) prevents
|
||||
filesystem performance degradation from too many files in one directory.
|
||||
"""
|
||||
|
||||
def __init__(self, base_dir: Path, hasher: HashPort):
|
||||
"""Initialize content-addressed cache.
|
||||
|
||||
Args:
|
||||
base_dir: Root directory for cache storage
|
||||
hasher: Hash adapter for SHA256 computation
|
||||
"""
|
||||
self.base_dir = base_dir
|
||||
self.hasher = hasher
|
||||
# Mapping of (bucket, prefix) -> sha256 for compatibility
|
||||
# This is ephemeral and only used within a single process
|
||||
self._deltaspace_to_sha: dict[tuple[str, str], str] = {}
|
||||
|
||||
def _cas_path(self, sha256: str) -> Path:
|
||||
"""Get content-addressed path from SHA256 hash.
|
||||
|
||||
Uses two-level directory structure for filesystem optimization:
|
||||
- First 2 hex chars as L1 directory (256 buckets)
|
||||
- Next 2 hex chars as L2 directory (256 buckets per L1)
|
||||
- Full SHA as filename
|
||||
|
||||
Example: abcdef1234... -> ab/cd/abcdef1234...
|
||||
|
||||
Args:
|
||||
sha256: Full SHA256 hash (64 hex chars)
|
||||
|
||||
Returns:
|
||||
Path to file in content-addressed storage
|
||||
"""
|
||||
if len(sha256) < 4:
|
||||
raise ValueError(f"Invalid SHA256: {sha256}")
|
||||
|
||||
# Two-level directory structure
|
||||
l1_dir = sha256[:2] # First 2 chars
|
||||
l2_dir = sha256[2:4] # Next 2 chars
|
||||
|
||||
return self.base_dir / l1_dir / l2_dir / sha256
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get path where reference should be cached.
|
||||
|
||||
For CAS, we need the SHA to compute the path. This method looks up
|
||||
the SHA from the ephemeral mapping. If not found, it returns a
|
||||
placeholder path (backward compatibility with has_ref checks).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Path to cached reference (may not exist)
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# If we have the SHA mapping, use CAS path
|
||||
if key in self._deltaspace_to_sha:
|
||||
sha = self._deltaspace_to_sha[key]
|
||||
return self._cas_path(sha)
|
||||
|
||||
# Fallback: return a non-existent placeholder
|
||||
# This enables has_ref to return False for unmapped deltaspaces
|
||||
return self.base_dir / "_unmapped" / bucket / prefix / "reference.bin"
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists with given SHA.
|
||||
|
||||
In CAS, existence check is simple: if file exists at SHA path,
|
||||
it MUST have that SHA (content-addressed guarantee).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
True if reference exists with this SHA
|
||||
"""
|
||||
path = self._cas_path(sha)
|
||||
return path.exists()
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
In CAS, the SHA IS the filename, so if the file exists, it's already
|
||||
validated by definition. We still perform an integrity check to detect
|
||||
filesystem corruption.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not found in cache
|
||||
CacheCorruptionError: SHA mismatch (filesystem corruption)
|
||||
"""
|
||||
path = self._cas_path(expected_sha)
|
||||
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for SHA {expected_sha[:8]}...")
|
||||
|
||||
# Lock file and validate content atomically
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
# Acquire shared lock (Unix only)
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
|
||||
# Read and hash content
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Release lock automatically when exiting context
|
||||
|
||||
# Validate SHA (should never fail in CAS unless filesystem corruption)
|
||||
if actual_sha != expected_sha:
|
||||
# Filesystem corruption detected
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort cleanup
|
||||
|
||||
raise CacheCorruptionError(
|
||||
f"Filesystem corruption detected: file {path.name} has wrong content. "
|
||||
f"Expected SHA {expected_sha}, got {actual_sha}"
|
||||
)
|
||||
|
||||
# Update mapping for ref_path compatibility
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = expected_sha
|
||||
|
||||
return path
|
||||
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cache read error for SHA {expected_sha[:8]}...: {e}") from e
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file using content-addressed storage.
|
||||
|
||||
The file is stored at a path determined by its SHA256 hash.
|
||||
If a file with the same content already exists, it's reused
|
||||
(automatic deduplication).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to cache
|
||||
|
||||
Returns:
|
||||
Path to cached file (content-addressed)
|
||||
"""
|
||||
# Compute SHA of source file
|
||||
sha = self.hasher.sha256(src)
|
||||
path = self._cas_path(sha)
|
||||
|
||||
# If file already exists, we're done (deduplication)
|
||||
if path.exists():
|
||||
# Update mapping
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = sha
|
||||
return path
|
||||
|
||||
# Create directory structure with secure permissions
|
||||
path.parent.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
|
||||
# Atomic write using temp file + rename
|
||||
temp_path = path.parent / f".tmp.{sha}"
|
||||
try:
|
||||
shutil.copy2(src, temp_path)
|
||||
# Atomic rename (POSIX guarantee)
|
||||
temp_path.rename(path)
|
||||
except Exception:
|
||||
# Cleanup on failure
|
||||
if temp_path.exists():
|
||||
temp_path.unlink()
|
||||
raise
|
||||
|
||||
# Update mapping
|
||||
self._deltaspace_to_sha[(bucket, prefix)] = sha
|
||||
|
||||
return path
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference for given deltaspace.
|
||||
|
||||
In CAS, eviction is more complex because:
|
||||
1. Multiple deltaspaces may reference the same SHA (deduplication)
|
||||
2. We can't delete the file unless we know no other deltaspace uses it
|
||||
|
||||
For safety, we only remove the mapping, not the actual file.
|
||||
Orphaned files will be cleaned up by cache expiry (future feature).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Remove mapping (safe operation)
|
||||
if key in self._deltaspace_to_sha:
|
||||
del self._deltaspace_to_sha[key]
|
||||
|
||||
# NOTE: We don't delete the actual CAS file because:
|
||||
# - Other deltaspaces may reference the same SHA
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached references.
|
||||
|
||||
Removes all cached files and mappings. This is a destructive operation
|
||||
that forcibly removes the entire cache directory.
|
||||
|
||||
Use cases:
|
||||
- Long-running applications that need to free disk space
|
||||
- Manual cache invalidation
|
||||
- Test cleanup
|
||||
- Ensuring fresh data fetch after configuration changes
|
||||
"""
|
||||
import shutil
|
||||
|
||||
# Clear in-memory mapping
|
||||
self._deltaspace_to_sha.clear()
|
||||
|
||||
# Remove all cache files (destructive!)
|
||||
if self.base_dir.exists():
|
||||
shutil.rmtree(self.base_dir, ignore_errors=True)
|
||||
|
||||
# Recreate base directory with secure permissions
|
||||
self.base_dir.mkdir(parents=True, mode=0o700, exist_ok=True)
|
||||
# - The ephemeral cache will be cleaned on process exit anyway
|
||||
# - For persistent cache (future), we'd need reference counting
|
||||
305 src/deltaglider/adapters/cache_encrypted.py Normal file
@@ -0,0 +1,305 @@
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption.
|
||||
|
||||
This adapter wraps any CachePort implementation and adds transparent encryption/decryption.
|
||||
It uses Fernet (symmetric encryption based on AES-128-CBC with HMAC authentication).
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from cryptography.fernet import Fernet
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
|
||||
|
||||
class EncryptedCache(CachePort):
|
||||
"""Encrypted cache wrapper using Fernet symmetric encryption.
|
||||
|
||||
Wraps any CachePort implementation and transparently encrypts data at rest.
|
||||
Uses Fernet which provides:
|
||||
- AES-128-CBC encryption
|
||||
- HMAC authentication (prevents tampering)
|
||||
- Automatic key rotation support
|
||||
- Safe for ephemeral process-isolated caches
|
||||
|
||||
Key Management:
|
||||
- Ephemeral key generated per process (default, most secure)
|
||||
- Or use DG_CACHE_ENCRYPTION_KEY env var (base64-encoded Fernet key)
|
||||
- For production: use secrets management system (AWS KMS, HashiCorp Vault, etc.)
|
||||
|
||||
Security Properties:
|
||||
- Confidentiality: Data encrypted at rest
|
||||
- Integrity: HMAC prevents tampering
|
||||
- Authenticity: Only valid keys can decrypt
|
||||
- Forward Secrecy: Ephemeral keys destroyed on process exit
|
||||
"""
|
||||
|
||||
def __init__(self, backend: CachePort, encryption_key: bytes | None = None):
|
||||
"""Initialize encrypted cache wrapper.
|
||||
|
||||
Args:
|
||||
backend: Underlying cache implementation (CAS, filesystem, memory, etc.)
|
||||
encryption_key: Optional Fernet key (32 bytes base64-encoded).
|
||||
If None, generates ephemeral key for this process.
|
||||
"""
|
||||
self.backend = backend
|
||||
|
||||
# Key management: ephemeral (default) or provided
|
||||
if encryption_key is None:
|
||||
# Generate ephemeral key for this process (most secure)
|
||||
self._key = Fernet.generate_key()
|
||||
self._ephemeral = True
|
||||
else:
|
||||
# Use provided key (for persistent cache scenarios)
|
||||
self._key = encryption_key
|
||||
self._ephemeral = False
|
||||
|
||||
self._cipher = Fernet(self._key)
|
||||
|
||||
# Mapping: (bucket, prefix) -> plaintext_sha256
|
||||
# Needed because backend uses SHA for storage, but encrypted content has different SHA
|
||||
self._plaintext_sha_map: dict[tuple[str, str], str] = {}
|
||||
|
||||
@classmethod
|
||||
def from_env(cls, backend: CachePort) -> "EncryptedCache":
|
||||
"""Create encrypted cache with key from environment.
|
||||
|
||||
Looks for DG_CACHE_ENCRYPTION_KEY environment variable.
|
||||
If not found, generates ephemeral key.
|
||||
|
||||
Args:
|
||||
backend: Underlying cache implementation
|
||||
|
||||
Returns:
|
||||
EncryptedCache instance
|
||||
"""
|
||||
key_str = os.environ.get("DG_CACHE_ENCRYPTION_KEY")
|
||||
if key_str:
|
||||
# Decode base64-encoded key
|
||||
encryption_key = key_str.encode("utf-8")
|
||||
else:
|
||||
# Use ephemeral key
|
||||
encryption_key = None
|
||||
|
||||
return cls(backend, encryption_key)
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get path where reference should be cached.
|
||||
|
||||
Delegates to backend. Path structure determined by backend
|
||||
(e.g., CAS uses SHA256-based paths).
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Path from backend
|
||||
"""
|
||||
return self.backend.ref_path(bucket, prefix)
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists with given SHA.
|
||||
|
||||
Note: SHA is of the *unencrypted* content. The backend may store
|
||||
encrypted data, but we verify against original content hash.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: SHA256 of unencrypted content
|
||||
|
||||
Returns:
|
||||
True if encrypted reference exists with this SHA
|
||||
"""
|
||||
# Delegate to backend
|
||||
# Backend may use SHA for content-addressed storage of encrypted data
|
||||
return self.backend.has_ref(bucket, prefix, sha)
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with decryption and validation.
|
||||
|
||||
Retrieves encrypted data from backend, decrypts it, validates SHA,
|
||||
and returns path to decrypted temporary file.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 of *decrypted* content
|
||||
|
||||
Returns:
|
||||
Path to decrypted validated file (temporary)
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not in cache
|
||||
CacheCorruptionError: Decryption failed or SHA mismatch
|
||||
"""
|
||||
# Check if we have this plaintext SHA mapped
|
||||
key = (bucket, prefix)
|
||||
if key not in self._plaintext_sha_map:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Verify the requested SHA matches our mapping
|
||||
if self._plaintext_sha_map[key] != expected_sha:
|
||||
raise CacheCorruptionError(
|
||||
f"SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, have {self._plaintext_sha_map[key]}"
|
||||
)
|
||||
|
||||
# Get encrypted file from backend using ref_path (not validated, we validate plaintext)
|
||||
encrypted_path = self.backend.ref_path(bucket, prefix)
|
||||
if not encrypted_path.exists():
|
||||
raise CacheMissError(f"Encrypted cache file not found for {bucket}/{prefix}")
|
||||
|
||||
# Read encrypted content
|
||||
try:
|
||||
with open(encrypted_path, "rb") as f:
|
||||
encrypted_data = f.read()
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cannot read encrypted cache: {e}") from e
|
||||
|
||||
# Decrypt
|
||||
try:
|
||||
decrypted_data = self._cipher.decrypt(encrypted_data)
|
||||
except Exception as e:
|
||||
# Fernet raises InvalidToken for tampering/wrong key
|
||||
# SECURITY: Auto-delete corrupted cache files
|
||||
try:
|
||||
encrypted_path.unlink(missing_ok=True)
|
||||
# Clean up mapping
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
raise CacheCorruptionError(
|
||||
f"Decryption failed for {bucket}/{prefix}: {e}. "
|
||||
f"Corrupted cache deleted automatically."
|
||||
) from e
|
||||
|
||||
# Validate SHA of decrypted content
|
||||
import hashlib
|
||||
|
||||
actual_sha = hashlib.sha256(decrypted_data).hexdigest()
|
||||
if actual_sha != expected_sha:
|
||||
# SECURITY: Auto-delete corrupted cache files
|
||||
try:
|
||||
encrypted_path.unlink(missing_ok=True)
|
||||
# Clean up mapping
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
raise CacheCorruptionError(
|
||||
f"Decrypted content SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {actual_sha}. "
|
||||
f"Corrupted cache deleted automatically."
|
||||
)
|
||||
|
||||
# Write decrypted content to temporary file
|
||||
# Use same path as encrypted file but with .decrypted suffix
|
||||
decrypted_path = encrypted_path.with_suffix(".decrypted")
|
||||
try:
|
||||
with open(decrypted_path, "wb") as f:
|
||||
f.write(decrypted_data)
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot write decrypted cache: {e}") from e
|
||||
|
||||
return decrypted_path
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Encrypt and cache reference file.
|
||||
|
||||
Reads source file, encrypts it, and stores encrypted version via backend.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to encrypt and cache
|
||||
|
||||
Returns:
|
||||
Path to encrypted cached file (from backend)
|
||||
"""
|
||||
# Read source file
|
||||
try:
|
||||
with open(src, "rb") as f:
|
||||
plaintext_data = f.read()
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot read source file {src}: {e}") from e
|
||||
|
||||
# Compute plaintext SHA for mapping
|
||||
import hashlib
|
||||
|
||||
plaintext_sha = hashlib.sha256(plaintext_data).hexdigest()
|
||||
|
||||
# Encrypt
|
||||
encrypted_data = self._cipher.encrypt(plaintext_data)
|
||||
|
||||
# Write encrypted data to temporary file
|
||||
temp_encrypted = src.with_suffix(".encrypted.tmp")
|
||||
try:
|
||||
with open(temp_encrypted, "wb") as f:
|
||||
f.write(encrypted_data)
|
||||
|
||||
# Store encrypted file via backend
|
||||
result_path = self.backend.write_ref(bucket, prefix, temp_encrypted)
|
||||
|
||||
# Store mapping of plaintext SHA
|
||||
key = (bucket, prefix)
|
||||
self._plaintext_sha_map[key] = plaintext_sha
|
||||
|
||||
return result_path
|
||||
|
||||
finally:
|
||||
# Cleanup temporary file
|
||||
if temp_encrypted.exists():
|
||||
temp_encrypted.unlink()
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference (encrypted version).
|
||||
|
||||
Delegates to backend. Also cleans up any .decrypted temporary files and mappings.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
# Remove from plaintext SHA mapping
|
||||
key = (bucket, prefix)
|
||||
if key in self._plaintext_sha_map:
|
||||
del self._plaintext_sha_map[key]
|
||||
|
||||
# Get path to potentially clean up .decrypted files
|
||||
try:
|
||||
path = self.backend.ref_path(bucket, prefix)
|
||||
decrypted_path = path.with_suffix(".decrypted")
|
||||
if decrypted_path.exists():
|
||||
decrypted_path.unlink()
|
||||
except Exception:
|
||||
# Best effort cleanup
|
||||
pass
|
||||
|
||||
# Evict from backend
|
||||
self.backend.evict(bucket, prefix)
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached references and encryption mappings.
|
||||
|
||||
Removes all cached data and clears encryption key mappings.
|
||||
This is the proper way to forcibly clean up cache in long-running
|
||||
applications.
|
||||
|
||||
Use cases:
|
||||
- Long-running applications needing to free resources
|
||||
- Manual cache invalidation after key rotation
|
||||
- Test cleanup
|
||||
- Memory pressure situations
|
||||
|
||||
Note: After clearing, the cache will use a fresh encryption key
|
||||
(ephemeral mode) or the same persistent key (if DG_CACHE_ENCRYPTION_KEY set).
|
||||
"""
|
||||
# Clear encryption mapping
|
||||
self._plaintext_sha_map.clear()
|
||||
|
||||
# Delegate to backend to clear actual files/memory
|
||||
self.backend.clear()
|
||||
@@ -1,8 +1,15 @@
|
||||
"""Filesystem cache adapter."""
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for file locking
|
||||
if sys.platform != "win32":
|
||||
import fcntl
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
@@ -29,6 +36,60 @@ class FsCacheAdapter(CachePort):
|
||||
actual_sha = self.hasher.sha256(path)
|
||||
return actual_sha == sha
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference with atomic SHA validation.
|
||||
|
||||
This method prevents TOCTOU attacks by validating the SHA at use-time,
|
||||
not just at check-time.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix/deltaspace within bucket
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to validated cached file
|
||||
|
||||
Raises:
|
||||
CacheMissError: File not found in cache
|
||||
CacheCorruptionError: SHA mismatch detected
|
||||
"""
|
||||
path = self.ref_path(bucket, prefix)
|
||||
|
||||
if not path.exists():
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Lock file and validate content atomically
|
||||
try:
|
||||
with open(path, "rb") as f:
|
||||
# Acquire shared lock (Unix only)
|
||||
if sys.platform != "win32":
|
||||
fcntl.flock(f.fileno(), fcntl.LOCK_SH)
|
||||
|
||||
# Read and hash content
|
||||
content = f.read()
|
||||
actual_sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Release lock automatically when exiting context
|
||||
|
||||
# Validate SHA
|
||||
if actual_sha != expected_sha:
|
||||
# File corrupted or tampered - remove it
|
||||
try:
|
||||
path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort cleanup
|
||||
|
||||
raise CacheCorruptionError(
|
||||
f"Cache corruption detected for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {actual_sha}"
|
||||
)
|
||||
|
||||
return path
|
||||
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cache read error for {bucket}/{prefix}: {e}") from e
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Cache reference file."""
|
||||
path = self.ref_path(bucket, prefix)
|
||||
|
||||
279 src/deltaglider/adapters/cache_memory.py Normal file
@@ -0,0 +1,279 @@
|
||||
"""In-memory cache implementation with optional size limits.
|
||||
|
||||
This adapter stores cached references entirely in memory, avoiding filesystem I/O.
|
||||
Useful for:
|
||||
- High-performance scenarios where memory is abundant
|
||||
- Containerized environments with limited filesystem access
|
||||
- Testing and development
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Unix-only imports for compatibility
|
||||
if sys.platform != "win32":
|
||||
import fcntl # noqa: F401
|
||||
|
||||
from ..core.errors import CacheCorruptionError, CacheMissError
|
||||
from ..ports.cache import CachePort
|
||||
from ..ports.hash import HashPort
|
||||
|
||||
|
||||
class MemoryCache(CachePort):
|
||||
"""In-memory cache implementation with LRU eviction.
|
||||
|
||||
Stores cached references in memory as bytes. Useful for high-performance
|
||||
scenarios or when filesystem access is limited.
|
||||
|
||||
Features:
|
||||
- Zero filesystem I/O (everything in RAM)
|
||||
- Optional size limits with LRU eviction
|
||||
- Simple, single-process operations (no explicit locking)
|
||||
- Temporary file creation for compatibility with file-based APIs
|
||||
|
||||
Limitations:
|
||||
- Data lost on process exit (ephemeral only)
|
||||
- Memory usage proportional to cache size
|
||||
- Not suitable for very large reference files
|
||||
|
||||
Storage Layout:
|
||||
- Key: (bucket, prefix) tuple
|
||||
- Value: (content_bytes, sha256) tuple
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hasher: HashPort,
|
||||
max_size_mb: int = 100,
|
||||
temp_dir: Path | None = None,
|
||||
):
|
||||
"""Initialize in-memory cache.
|
||||
|
||||
Args:
|
||||
hasher: Hash adapter for SHA256 computation
|
||||
max_size_mb: Maximum cache size in megabytes (default 100MB)
|
||||
temp_dir: Directory for temporary files (default: system temp)
|
||||
"""
|
||||
self.hasher = hasher
|
||||
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||
|
||||
# Storage: (bucket, prefix) -> (content_bytes, sha256)
|
||||
self._cache: dict[tuple[str, str], tuple[bytes, str]] = {}
|
||||
|
||||
# Size tracking
|
||||
self._current_size = 0
|
||||
|
||||
# Access order for LRU eviction: (bucket, prefix) list
|
||||
self._access_order: list[tuple[str, str]] = []
|
||||
|
||||
# Temp directory for file-based API compatibility
|
||||
if temp_dir is None:
|
||||
import tempfile
|
||||
|
||||
self.temp_dir = Path(tempfile.gettempdir()) / "deltaglider-mem-cache"
|
||||
else:
|
||||
self.temp_dir = temp_dir
|
||||
|
||||
self.temp_dir.mkdir(parents=True, exist_ok=True, mode=0o700)
|
||||
|
||||
def _update_access(self, key: tuple[str, str]) -> None:
|
||||
"""Update LRU access order.
|
||||
|
||||
Args:
|
||||
key: Cache key (bucket, prefix)
|
||||
"""
|
||||
# Remove old position if exists
|
||||
if key in self._access_order:
|
||||
self._access_order.remove(key)
|
||||
|
||||
# Add to end (most recently used)
|
||||
self._access_order.append(key)
|
||||
|
||||
def _evict_lru(self, needed_bytes: int) -> None:
|
||||
"""Evict least recently used entries to free space.
|
||||
|
||||
Args:
|
||||
needed_bytes: Bytes needed for new entry
|
||||
"""
|
||||
while self._current_size + needed_bytes > self.max_size_bytes and self._access_order:
|
||||
# Evict least recently used
|
||||
lru_key = self._access_order[0]
|
||||
bucket, prefix = lru_key
|
||||
|
||||
# Remove from cache
|
||||
if lru_key in self._cache:
|
||||
content, _ = self._cache[lru_key]
|
||||
self._current_size -= len(content)
|
||||
del self._cache[lru_key]
|
||||
|
||||
# Remove from access order
|
||||
self._access_order.remove(lru_key)
|
||||
|
||||
def ref_path(self, bucket: str, prefix: str) -> Path:
|
||||
"""Get placeholder path for in-memory reference.
|
||||
|
||||
Returns a virtual path that doesn't actually exist on filesystem.
|
||||
Used for API compatibility.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
|
||||
Returns:
|
||||
Virtual path (may not exist on filesystem)
|
||||
"""
|
||||
# Return virtual path for compatibility
|
||||
# Actual data is in memory, but we need Path for API
|
||||
safe_bucket = bucket.replace("/", "_")
|
||||
safe_prefix = prefix.replace("/", "_")
|
||||
return self.temp_dir / safe_bucket / safe_prefix / "reference.bin"
|
||||
|
||||
def has_ref(self, bucket: str, prefix: str, sha: str) -> bool:
|
||||
"""Check if reference exists in memory with given SHA.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
True if reference exists with this SHA
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
if key not in self._cache:
|
||||
return False
|
||||
|
||||
_, cached_sha = self._cache[key]
|
||||
return cached_sha == sha
|
||||
|
||||
def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
|
||||
"""Get cached reference from memory with validation.
|
||||
|
||||
Retrieves content from memory, validates SHA, and writes to
|
||||
temporary file for compatibility with file-based APIs.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
expected_sha: Expected SHA256 hash
|
||||
|
||||
Returns:
|
||||
Path to temporary file containing content
|
||||
|
||||
Raises:
|
||||
CacheMissError: Content not in cache
|
||||
CacheCorruptionError: SHA mismatch
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Check if in cache
|
||||
if key not in self._cache:
|
||||
raise CacheMissError(f"Cache miss for {bucket}/{prefix}")
|
||||
|
||||
# Get content and validate
|
||||
content, cached_sha = self._cache[key]
|
||||
|
||||
# Update LRU
|
||||
self._update_access(key)
|
||||
|
||||
# Validate SHA
|
||||
if cached_sha != expected_sha:
|
||||
# SHA mismatch - possible corruption
|
||||
raise CacheCorruptionError(
|
||||
f"Memory cache SHA mismatch for {bucket}/{prefix}: "
|
||||
f"expected {expected_sha}, got {cached_sha}"
|
||||
)
|
||||
|
||||
# Write to temporary file for API compatibility
|
||||
temp_path = self.ref_path(bucket, prefix)
|
||||
temp_path.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
|
||||
|
||||
try:
|
||||
with open(temp_path, "wb") as f:
|
||||
f.write(content)
|
||||
except OSError as e:
|
||||
raise CacheMissError(f"Cannot write temp file: {e}") from e
|
||||
|
||||
return temp_path
|
||||
|
||||
def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
|
||||
"""Store reference file in memory.
|
||||
|
||||
Reads file content and stores in memory with SHA hash.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
src: Source file to cache
|
||||
|
||||
Returns:
|
||||
Virtual path (content is in memory)
|
||||
"""
|
||||
# Read source file
|
||||
try:
|
||||
with open(src, "rb") as f:
|
||||
content = f.read()
|
||||
except OSError as e:
|
||||
raise CacheCorruptionError(f"Cannot read source file {src}: {e}") from e
|
||||
|
||||
# Compute SHA
|
||||
sha = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Check if we need to evict
|
||||
content_size = len(content)
|
||||
if content_size > self.max_size_bytes:
|
||||
raise CacheCorruptionError(
|
||||
f"File too large for memory cache: {content_size} bytes "
|
||||
f"(limit: {self.max_size_bytes} bytes)"
|
||||
)
|
||||
|
||||
# Evict LRU entries if needed
|
||||
self._evict_lru(content_size)
|
||||
|
||||
# Store in memory
|
||||
key = (bucket, prefix)
|
||||
self._cache[key] = (content, sha)
|
||||
self._current_size += content_size
|
||||
|
||||
# Update LRU
|
||||
self._update_access(key)
|
||||
|
||||
# Return virtual path
|
||||
return self.ref_path(bucket, prefix)
|
||||
|
||||
def evict(self, bucket: str, prefix: str) -> None:
|
||||
"""Remove cached reference from memory.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Deltaspace prefix
|
||||
"""
|
||||
key = (bucket, prefix)
|
||||
|
||||
# Remove from cache
|
||||
if key in self._cache:
|
||||
content, _ = self._cache[key]
|
||||
self._current_size -= len(content)
|
||||
del self._cache[key]
|
||||
|
||||
# Remove from LRU tracking
|
||||
if key in self._access_order:
|
||||
self._access_order.remove(key)
|
||||
|
||||
# Clean up temp file if exists
|
||||
temp_path = self.ref_path(bucket, prefix)
|
||||
if temp_path.exists():
|
||||
try:
|
||||
temp_path.unlink()
|
||||
except OSError:
|
||||
pass # Best effort
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all cached content from memory.
|
||||
|
||||
Useful for testing and cleanup.
|
||||
"""
|
||||
self._cache.clear()
|
||||
self._access_order.clear()
|
||||
self._current_size = 0
|
||||
126 src/deltaglider/adapters/ec2_metadata.py Normal file
@@ -0,0 +1,126 @@
|
||||
"""EC2 Instance Metadata Service (IMDS) adapter.
|
||||
|
||||
Provides access to EC2 instance metadata using IMDSv2 with token-based authentication.
|
||||
Falls back gracefully when not running on EC2.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class EC2MetadataAdapter:
|
||||
"""Adapter for EC2 Instance Metadata Service (IMDSv2)."""
|
||||
|
||||
IMDS_BASE_URL = "http://169.254.169.254/latest"
|
||||
TOKEN_URL = f"{IMDS_BASE_URL}/api/token"
|
||||
TOKEN_TTL_SECONDS = 21600 # 6 hours
|
||||
TOKEN_HEADER = "X-aws-ec2-metadata-token"
|
||||
TIMEOUT_SECONDS = 1 # Fast timeout for non-EC2 environments
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize EC2 metadata adapter."""
|
||||
self._token: str | None = None
|
||||
self._is_ec2: bool | None = None
|
||||
self._region: str | None = None
|
||||
|
||||
def is_running_on_ec2(self) -> bool:
|
||||
"""Check if running on an EC2 instance.
|
||||
|
||||
Returns:
|
||||
True if running on EC2, False otherwise
|
||||
|
||||
Note:
|
||||
Result is cached after first check for performance.
|
||||
"""
|
||||
if self._is_ec2 is not None:
|
||||
return self._is_ec2
|
||||
|
||||
# Skip check if explicitly disabled
|
||||
if os.environ.get("DG_DISABLE_EC2_DETECTION", "").lower() in ("true", "1", "yes"):
|
||||
self._is_ec2 = False
|
||||
return False
|
||||
|
||||
try:
|
||||
# Try to get IMDSv2 token
|
||||
self._token = self._get_token()
|
||||
self._is_ec2 = self._token is not None
|
||||
except Exception:
|
||||
self._is_ec2 = False
|
||||
|
||||
return self._is_ec2
|
||||
|
||||
def get_region(self) -> str | None:
|
||||
"""Get the EC2 instance's AWS region.
|
||||
|
||||
Returns:
|
||||
AWS region code (e.g., "us-east-1") or None if not on EC2
|
||||
|
||||
Note:
|
||||
Result is cached after first successful fetch.
|
||||
"""
|
||||
if not self.is_running_on_ec2():
|
||||
return None
|
||||
|
||||
if self._region is not None:
|
||||
return self._region
|
||||
|
||||
try:
|
||||
if self._token:
|
||||
response = requests.get(
|
||||
f"{self.IMDS_BASE_URL}/meta-data/placement/region",
|
||||
headers={self.TOKEN_HEADER: self._token},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
self._region = response.text.strip()
|
||||
return self._region
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def get_availability_zone(self) -> str | None:
|
||||
"""Get the EC2 instance's availability zone.
|
||||
|
||||
Returns:
|
||||
Availability zone (e.g., "us-east-1a") or None if not on EC2
|
||||
"""
|
||||
if not self.is_running_on_ec2():
|
||||
return None
|
||||
|
||||
try:
|
||||
if self._token:
|
||||
response = requests.get(
|
||||
f"{self.IMDS_BASE_URL}/meta-data/placement/availability-zone",
|
||||
headers={self.TOKEN_HEADER: self._token},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return str(response.text.strip())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _get_token(self) -> str | None:
|
||||
"""Get IMDSv2 token for authenticated metadata requests.
|
||||
|
||||
Returns:
|
||||
IMDSv2 token or None if unable to retrieve
|
||||
|
||||
Note:
|
||||
Uses IMDSv2 for security. IMDSv1 is not supported.
|
||||
"""
|
||||
try:
|
||||
response = requests.put(
|
||||
self.TOKEN_URL,
|
||||
headers={"X-aws-ec2-metadata-token-ttl-seconds": str(self.TOKEN_TTL_SECONDS)},
|
||||
timeout=self.TIMEOUT_SECONDS,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
return response.text.strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
@@ -1,18 +1,22 @@
|
||||
"""S3 storage adapter."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections.abc import Iterator
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, BinaryIO, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from ..ports.storage import ObjectHead, PutResult, StoragePort
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from mypy_boto3_s3.client import S3Client
|
||||
|
||||
from ..ports.storage import ObjectHead, PutResult, StoragePort
|
||||
|
||||
|
||||
class S3StorageAdapter(StoragePort):
|
||||
"""S3 implementation of StoragePort."""
|
||||
@@ -21,13 +25,38 @@ class S3StorageAdapter(StoragePort):
|
||||
self,
|
||||
client: Optional["S3Client"] = None,
|
||||
endpoint_url: str | None = None,
|
||||
boto3_kwargs: dict[str, Any] | None = None,
|
||||
):
|
||||
"""Initialize with S3 client."""
|
||||
"""Initialize with S3 client.
|
||||
|
||||
Args:
|
||||
client: Pre-configured S3 client (if None, one will be created)
|
||||
endpoint_url: S3 endpoint URL override (for MinIO, LocalStack, etc.)
|
||||
boto3_kwargs: Additional kwargs to pass to boto3.client() including:
|
||||
- aws_access_key_id: AWS access key
|
||||
- aws_secret_access_key: AWS secret key
|
||||
- aws_session_token: AWS session token (for temporary credentials)
|
||||
- region_name: AWS region name
|
||||
"""
|
||||
if client is None:
|
||||
self.client = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
)
|
||||
# Build boto3 client parameters
|
||||
client_params: dict[str, Any] = {
|
||||
"service_name": "s3",
|
||||
"endpoint_url": endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
# Disable automatic request checksums (CRC32/CRC64) added in
|
||||
# boto3 1.36+. S3-compatible stores like Hetzner Object Storage
|
||||
# reject the checksum headers with BadRequest.
|
||||
"config": Config(
|
||||
request_checksum_calculation="when_required",
|
||||
response_checksum_validation="when_required",
|
||||
),
|
||||
}
|
||||
|
||||
# Merge in any additional boto3 kwargs (credentials, region, etc.)
|
||||
if boto3_kwargs:
|
||||
client_params.update(boto3_kwargs)
|
||||
|
||||
self.client = boto3.client(**client_params)
|
||||
else:
|
||||
self.client = client
|
||||
|
||||
@@ -37,12 +66,21 @@ class S3StorageAdapter(StoragePort):
|
||||
|
||||
try:
|
||||
response = self.client.head_object(Bucket=bucket, Key=object_key)
|
||||
extracted_metadata = self._extract_metadata(response.get("Metadata", {}))
|
||||
|
||||
# Debug: Log metadata received (to verify it's stored correctly)
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"HEAD {object_key}: Received metadata with {len(extracted_metadata)} keys: "
|
||||
f"{list(extracted_metadata.keys())}"
|
||||
)
|
||||
|
||||
return ObjectHead(
|
||||
key=object_key,
|
||||
size=response["ContentLength"],
|
||||
etag=response["ETag"].strip('"'),
|
||||
last_modified=response["LastModified"],
|
||||
metadata=self._extract_metadata(response.get("Metadata", {})),
|
||||
metadata=extracted_metadata,
|
||||
)
|
||||
except ClientError as e:
|
||||
if e.response["Error"]["Code"] == "404":
|
||||
@@ -79,6 +117,7 @@ class S3StorageAdapter(StoragePort):
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""List objects with S3-compatible response.
|
||||
|
||||
@@ -87,7 +126,8 @@ class S3StorageAdapter(StoragePort):
|
||||
prefix: Filter results to keys beginning with prefix
|
||||
delimiter: Delimiter for grouping keys (e.g., '/' for folders)
|
||||
max_keys: Maximum number of keys to return
|
||||
start_after: Start listing after this key
|
||||
start_after: Start listing after this key (for first page only)
|
||||
continuation_token: Token from previous response for pagination
|
||||
|
||||
Returns:
|
||||
Dict with objects, common_prefixes, and pagination info
|
||||
@@ -101,7 +141,11 @@ class S3StorageAdapter(StoragePort):
|
||||
params["Prefix"] = prefix
|
||||
if delimiter:
|
||||
params["Delimiter"] = delimiter
|
||||
if start_after:
|
||||
|
||||
# Use ContinuationToken for pagination if available, otherwise StartAfter
|
||||
if continuation_token:
|
||||
params["ContinuationToken"] = continuation_token
|
||||
elif start_after:
|
||||
params["StartAfter"] = start_after
|
||||
|
||||
try:
|
||||
@@ -145,7 +189,7 @@ class S3StorageAdapter(StoragePort):
|
||||
|
||||
try:
|
||||
response = self.client.get_object(Bucket=bucket, Key=object_key)
|
||||
return response["Body"] # type: ignore[return-value]
|
||||
return response["Body"] # type: ignore[no-any-return]
|
||||
except ClientError as e:
|
||||
if e.response["Error"]["Code"] == "NoSuchKey":
|
||||
raise FileNotFoundError(f"Object not found: {key}") from e
|
||||
@@ -173,20 +217,110 @@ class S3StorageAdapter(StoragePort):
|
||||
# AWS requires lowercase metadata keys
|
||||
clean_metadata = {k.lower(): v for k, v in metadata.items()}
|
||||
|
||||
try:
|
||||
response = self.client.put_object(
|
||||
Bucket=bucket,
|
||||
Key=object_key,
|
||||
Body=body_data,
|
||||
ContentType=content_type,
|
||||
Metadata=clean_metadata,
|
||||
# Calculate total metadata size (AWS has 2KB limit)
|
||||
total_metadata_size = sum(len(k) + len(v) for k, v in clean_metadata.items())
|
||||
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"PUT {object_key}: Sending metadata with {len(clean_metadata)} keys "
|
||||
f"({total_metadata_size} bytes): {list(clean_metadata.keys())}"
|
||||
)
|
||||
return PutResult(
|
||||
etag=response["ETag"].strip('"'),
|
||||
version_id=response.get("VersionId"),
|
||||
|
||||
# Warn if approaching AWS metadata size limit (2KB per key, 2KB total for user metadata)
|
||||
if total_metadata_size > 1800: # Warn at 1.8KB
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Metadata size ({total_metadata_size} bytes) approaching "
|
||||
f"AWS S3 limit (2KB). Some metadata may be lost!"
|
||||
)
|
||||
except ClientError as e:
|
||||
raise RuntimeError(f"Failed to put object: {e}") from e
|
||||
|
||||
import time
|
||||
|
||||
max_retries = 3
|
||||
last_error: ClientError | None = None
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
response = self.client.put_object(
|
||||
Bucket=bucket,
|
||||
Key=object_key,
|
||||
Body=body_data,
|
||||
ContentType=content_type,
|
||||
Metadata=clean_metadata,
|
||||
)
|
||||
|
||||
# VERIFICATION: Check if metadata was actually stored (especially for delta files)
|
||||
if object_key.endswith(".delta") and clean_metadata:
|
||||
try:
|
||||
# Verify metadata was stored by doing a HEAD immediately
|
||||
verify_response = self.client.head_object(Bucket=bucket, Key=object_key)
|
||||
stored_metadata = verify_response.get("Metadata", {})
|
||||
|
||||
if not stored_metadata:
|
||||
logger.error(
|
||||
f"PUT {object_key}: CRITICAL - Metadata was sent but NOT STORED! "
|
||||
f"Sent {len(clean_metadata)} keys, received 0 keys back."
|
||||
)
|
||||
elif len(stored_metadata) < len(clean_metadata):
|
||||
missing_keys = set(clean_metadata.keys()) - set(stored_metadata.keys())
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Metadata partially stored. "
|
||||
f"Sent {len(clean_metadata)} keys, stored {len(stored_metadata)} keys. "
|
||||
f"Missing keys: {missing_keys}"
|
||||
)
|
||||
elif logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
f"PUT {object_key}: Metadata verified - "
|
||||
f"all {len(clean_metadata)} keys stored"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"PUT {object_key}: Could not verify metadata: {e}")
|
||||
|
||||
return PutResult(
|
||||
etag=response["ETag"].strip('"'),
|
||||
version_id=response.get("VersionId"),
|
||||
)
|
||||
except ClientError as e:
|
||||
last_error = e
|
||||
if attempt < max_retries - 1:
|
||||
delay = 2**attempt # 1s, 2s
|
||||
# Log full error details
|
||||
error_response = e.response if hasattr(e, "response") else {}
|
||||
http_headers = error_response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
|
||||
logger.warning(
|
||||
f"PUT {object_key}: Attempt {attempt + 1}/{max_retries} failed: {e}. "
|
||||
f"Retrying in {delay}s... "
|
||||
f"Details: bucket={bucket}, key={object_key}, "
|
||||
f"body_size={len(body_data)}, content_type={content_type}, "
|
||||
f"metadata_keys={list(clean_metadata.keys())}, "
|
||||
f"endpoint={self.client.meta.endpoint_url}, "
|
||||
f"http_status={error_response.get('ResponseMetadata', {}).get('HTTPStatusCode')}, "
|
||||
f"error_code={error_response.get('Error', {}).get('Code')}, "
|
||||
f"error_message={error_response.get('Error', {}).get('Message')}, "
|
||||
f"request_id={error_response.get('ResponseMetadata', {}).get('RequestId')}, "
|
||||
f"http_headers={dict(http_headers)}"
|
||||
)
|
||||
# Enable botocore wire-level logging for the retry
|
||||
logging.getLogger("botocore").setLevel(logging.DEBUG)
|
||||
time.sleep(delay)
|
||||
else:
|
||||
# Final attempt failed — log everything
|
||||
error_response = e.response if hasattr(e, "response") else {}
|
||||
http_headers = error_response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
|
||||
logger.error(
|
||||
f"PUT {object_key}: All {max_retries} attempts failed. "
|
||||
f"Last error: {e}. "
|
||||
f"Details: bucket={bucket}, key={object_key}, "
|
||||
f"body_size={len(body_data)}, content_type={content_type}, "
|
||||
f"metadata={clean_metadata}, "
|
||||
f"endpoint={self.client.meta.endpoint_url}, "
|
||||
f"http_status={error_response.get('ResponseMetadata', {}).get('HTTPStatusCode')}, "
|
||||
f"error_code={error_response.get('Error', {}).get('Code')}, "
|
||||
f"error_message={error_response.get('Error', {}).get('Message')}, "
|
||||
f"request_id={error_response.get('ResponseMetadata', {}).get('RequestId')}, "
|
||||
f"http_headers={dict(http_headers)}"
|
||||
)
|
||||
|
||||
raise RuntimeError(f"Failed to put object: {last_error}") from last_error
|
||||
|
||||
def delete(self, key: str) -> None:
|
||||
"""Delete object."""
|
||||
|
||||
@@ -1,28 +1,120 @@
|
||||
"""AWS S3 CLI compatible commands."""
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from ...core import DeltaService, DeltaSpace, ObjectKey
|
||||
from ...core import (
|
||||
DeltaService,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
build_s3_url,
|
||||
is_s3_url,
|
||||
)
|
||||
from ...core import parse_s3_url as core_parse_s3_url
|
||||
from .sync import fetch_s3_object_heads
|
||||
|
||||
__all__ = [
|
||||
"is_s3_path",
|
||||
"parse_s3_url",
|
||||
"determine_operation",
|
||||
"upload_file",
|
||||
"download_file",
|
||||
"copy_s3_to_s3",
|
||||
"migrate_s3_to_s3",
|
||||
"handle_recursive",
|
||||
"log_aws_region",
|
||||
]
|
||||
|
||||
|
||||
def log_aws_region(service: DeltaService, region_override: bool = False) -> None:
|
||||
"""Log the AWS region being used and warn about cross-region charges.
|
||||
|
||||
This function:
|
||||
1. Detects if running on EC2
|
||||
2. Compares EC2 region with S3 client region
|
||||
3. Warns about potential cross-region data transfer charges
|
||||
4. Helps users optimize for cost and performance
|
||||
|
||||
Args:
|
||||
service: DeltaService instance with storage adapter
|
||||
region_override: True if user explicitly specified --region flag
|
||||
"""
|
||||
try:
|
||||
from ...adapters.ec2_metadata import EC2MetadataAdapter
|
||||
from ...adapters.storage_s3 import S3StorageAdapter
|
||||
|
||||
if not isinstance(service.storage, S3StorageAdapter):
|
||||
return # Not using S3 storage, skip
|
||||
|
||||
# Get S3 client region
|
||||
s3_region = service.storage.client.meta.region_name
|
||||
if not s3_region:
|
||||
s3_region = "us-east-1" # boto3 default
|
||||
|
||||
# Check if running on EC2
|
||||
ec2_metadata = EC2MetadataAdapter()
|
||||
if ec2_metadata.is_running_on_ec2():
|
||||
ec2_region = ec2_metadata.get_region()
|
||||
ec2_az = ec2_metadata.get_availability_zone()
|
||||
|
||||
# Log EC2 context
|
||||
click.echo(f"EC2 Instance: {ec2_az or ec2_region or 'unknown'}")
|
||||
click.echo(f"S3 Client Region: {s3_region}")
|
||||
|
||||
# Check for region mismatch
|
||||
if ec2_region and ec2_region != s3_region:
|
||||
if region_override:
|
||||
# User explicitly set --region, warn about costs
|
||||
click.echo("")
|
||||
click.secho(
|
||||
f"⚠️ WARNING: EC2 region={ec2_region} != S3 client region={s3_region}",
|
||||
fg="yellow",
|
||||
bold=True,
|
||||
)
|
||||
click.secho(
|
||||
f" Expect cross-region/NAT data charges. Align regions (set client region={ec2_region})",
|
||||
fg="yellow",
|
||||
)
|
||||
click.secho(
|
||||
" before proceeding. Or drop --region for automatic region resolution.",
|
||||
fg="yellow",
|
||||
)
|
||||
click.echo("")
|
||||
else:
|
||||
# Auto-detected mismatch, but user can still cancel
|
||||
click.echo("")
|
||||
click.secho(
|
||||
f"ℹ️ INFO: EC2 region ({ec2_region}) differs from configured S3 region ({s3_region})",
|
||||
fg="cyan",
|
||||
)
|
||||
click.secho(
|
||||
f" Consider using --region {ec2_region} to avoid cross-region charges.",
|
||||
fg="cyan",
|
||||
)
|
||||
click.echo("")
|
||||
elif ec2_region and ec2_region == s3_region:
|
||||
# Regions match - optimal configuration
|
||||
click.secho("✓ Regions aligned - no cross-region charges", fg="green")
|
||||
else:
|
||||
# Not on EC2, just show S3 region
|
||||
click.echo(f"S3 Client Region: {s3_region}")
|
||||
|
||||
except Exception:
|
||||
pass # Silently ignore errors getting region info
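A brief usage sketch for the helper above (assumes a wired `DeltaService` named `service`, as built elsewhere in the CLI):

# Hypothetical call sites: warn loudly only when the user forced --region.
log_aws_region(service, region_override=True)   # explicit --region: yellow warning on mismatch
log_aws_region(service)                          # auto-detected: softer cyan hint on mismatch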
|
||||
|
||||
|
||||
def is_s3_path(path: str) -> bool:
|
||||
"""Check if path is an S3 URL."""
|
||||
return path.startswith("s3://")
|
||||
return is_s3_url(path)
|
||||
|
||||
|
||||
def parse_s3_url(url: str) -> tuple[str, str]:
|
||||
"""Parse S3 URL into bucket and key."""
|
||||
if not url.startswith("s3://"):
|
||||
raise ValueError(f"Invalid S3 URL: {url}")
|
||||
|
||||
s3_path = url[5:].rstrip("/")
|
||||
parts = s3_path.split("/", 1)
|
||||
bucket = parts[0]
|
||||
key = parts[1] if len(parts) > 1 else ""
|
||||
return bucket, key
|
||||
parsed = core_parse_s3_url(url, strip_trailing_slash=True)
|
||||
return parsed.bucket, parsed.key
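Expected behaviour, assuming the core parser keeps the same semantics as the inline version it replaces (bucket and key names are examples):

# Illustrative round-trips for parse_s3_url.
assert parse_s3_url("s3://my-bucket/releases/app.zip") == ("my-bucket", "releases/app.zip")
assert parse_s3_url("s3://my-bucket/") == ("my-bucket", "")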
|
||||
|
||||
|
||||
def determine_operation(source: str, dest: str) -> str:
|
||||
@@ -57,6 +149,8 @@ def upload_file(
|
||||
|
||||
delta_space = DeltaSpace(bucket=bucket, prefix="/".join(key.split("/")[:-1]))
|
||||
|
||||
dest_url = build_s3_url(bucket, key)
|
||||
|
||||
try:
|
||||
# Check if delta should be disabled
|
||||
if no_delta:
|
||||
@@ -66,7 +160,7 @@ def upload_file(
|
||||
|
||||
if not quiet:
|
||||
file_size = local_path.stat().st_size
|
||||
click.echo(f"upload: '{local_path}' to 's3://{bucket}/{key}' ({file_size} bytes)")
|
||||
click.echo(f"upload: '{local_path}' to '{dest_url}' ({file_size} bytes)")
|
||||
else:
|
||||
# Use delta compression
|
||||
summary = service.put(local_path, delta_space, max_ratio)
|
||||
@@ -75,12 +169,12 @@ def upload_file(
|
||||
if summary.delta_size:
|
||||
ratio = round((summary.delta_size / summary.file_size) * 100, 1)
|
||||
click.echo(
|
||||
f"upload: '{local_path}' to 's3://{bucket}/{summary.key}' "
|
||||
f"upload: '{local_path}' to '{build_s3_url(bucket, summary.key)}' "
|
||||
f"(delta: {ratio}% of original)"
|
||||
)
|
||||
else:
|
||||
click.echo(
|
||||
f"upload: '{local_path}' to 's3://{bucket}/{summary.key}' "
|
||||
f"upload: '{local_path}' to '{build_s3_url(bucket, summary.key)}' "
|
||||
f"(reference: {summary.file_size} bytes)"
|
||||
)
|
||||
|
||||
@@ -112,7 +206,7 @@ def download_file(
|
||||
actual_key = delta_key
|
||||
obj_key = ObjectKey(bucket=bucket, key=delta_key)
|
||||
if not quiet:
|
||||
click.echo(f"Auto-detected delta: s3://{bucket}/{delta_key}")
|
||||
click.echo(f"Auto-detected delta: {build_s3_url(bucket, delta_key)}")
|
||||
|
||||
# Determine output path
|
||||
if local_path is None:
|
||||
@@ -136,7 +230,7 @@ def download_file(
|
||||
if not quiet:
|
||||
file_size = local_path.stat().st_size
|
||||
click.echo(
|
||||
f"download: 's3://{bucket}/{actual_key}' to '{local_path}' ({file_size} bytes)"
|
||||
f"download: '{build_s3_url(bucket, actual_key)}' to '{local_path}' ({file_size} bytes)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -149,31 +243,310 @@ def copy_s3_to_s3(
|
||||
source_url: str,
|
||||
dest_url: str,
|
||||
quiet: bool = False,
|
||||
max_ratio: float | None = None,
|
||||
no_delta: bool = False,
|
||||
) -> None:
|
||||
"""Copy object between S3 locations."""
|
||||
# For now, implement as download + upload
|
||||
# TODO: Optimize with server-side copy when possible
|
||||
"""Copy object between S3 locations with optional delta compression.
|
||||
|
||||
This performs a direct S3-to-S3 transfer using streaming to preserve
|
||||
the original file content and apply delta compression at the destination.
|
||||
"""
|
||||
source_bucket, source_key = parse_s3_url(source_url)
|
||||
dest_bucket, dest_key = parse_s3_url(dest_url)
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"copy: 's3://{source_bucket}/{source_key}' to 's3://{dest_bucket}/{dest_key}'")
|
||||
click.echo(
|
||||
f"copy: '{build_s3_url(source_bucket, source_key)}' "
|
||||
f"to '{build_s3_url(dest_bucket, dest_key)}'"
|
||||
)
|
||||
|
||||
# Use temporary file
|
||||
import tempfile
|
||||
try:
|
||||
# Get the source object as a stream
|
||||
source_stream = service.storage.get(f"{source_bucket}/{source_key}")
|
||||
|
||||
with tempfile.NamedTemporaryFile(suffix=Path(source_key).suffix) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
# Determine the destination deltaspace
|
||||
dest_key_parts = dest_key.split("/")
|
||||
if len(dest_key_parts) > 1:
|
||||
dest_prefix = "/".join(dest_key_parts[:-1])
|
||||
else:
|
||||
dest_prefix = ""
|
||||
|
||||
# Download from source
|
||||
download_file(service, source_url, tmp_path, quiet=True)
|
||||
dest_deltaspace = DeltaSpace(bucket=dest_bucket, prefix=dest_prefix)
|
||||
|
||||
# Upload to destination
|
||||
upload_file(service, tmp_path, dest_url, quiet=True)
|
||||
# If delta is disabled or max_ratio specified, use direct put
|
||||
if no_delta:
|
||||
# Direct storage put without delta compression
|
||||
service.storage.put(f"{dest_bucket}/{dest_key}", source_stream, {})
|
||||
if not quiet:
|
||||
click.echo("Copy completed (no delta compression)")
|
||||
else:
|
||||
# Write to a temporary file and use override_name to preserve original filename
|
||||
import tempfile
|
||||
|
||||
# Extract original filename from source
|
||||
original_filename = Path(source_key).name
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(source_key).suffix) as tmp:
|
||||
tmp_path = Path(tmp.name)
|
||||
|
||||
# Write stream to temp file
|
||||
with open(tmp_path, "wb") as f:
|
||||
shutil.copyfileobj(source_stream, f)
|
||||
|
||||
try:
|
||||
# Use DeltaService.put() with override_name to preserve original filename
|
||||
summary = service.put(
|
||||
tmp_path, dest_deltaspace, max_ratio, override_name=original_filename
|
||||
)
|
||||
|
||||
if not quiet:
|
||||
if summary.delta_size:
|
||||
ratio = round((summary.delta_size / summary.file_size) * 100, 1)
|
||||
click.echo(f"Copy completed with delta compression ({ratio}% of original)")
|
||||
else:
|
||||
click.echo("Copy completed (stored as reference)")
|
||||
finally:
|
||||
# Clean up temp file
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"S3-to-S3 copy failed: {e}", err=True)
|
||||
raise
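A hedged invocation sketch for the function above (URLs are placeholders):

# Hypothetical S3-to-S3 copy with delta compression applied at the destination.
copy_s3_to_s3(
    service,
    "s3://releases-old/v1.2.3/app.zip",
    "s3://releases-new/v1.2.3/app.zip",
    quiet=False,
    max_ratio=0.5,
    no_delta=False,
)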
|
||||
|
||||
|
||||
def migrate_s3_to_s3(
|
||||
service: DeltaService,
|
||||
source_url: str,
|
||||
dest_url: str,
|
||||
exclude: str | None = None,
|
||||
include: str | None = None,
|
||||
quiet: bool = False,
|
||||
no_delta: bool = False,
|
||||
max_ratio: float | None = None,
|
||||
dry_run: bool = False,
|
||||
skip_confirm: bool = False,
|
||||
preserve_prefix: bool = False,
|
||||
region_override: bool = False,
|
||||
) -> None:
|
||||
"""Migrate objects from one S3 location to another with delta compression.
|
||||
|
||||
Features:
|
||||
- Resume support: Only copies files that don't exist in destination
|
||||
- Progress tracking: Shows migration progress
|
||||
- Confirmation prompt: Shows file count before starting
|
||||
- Prefix preservation: Optionally preserves source prefix structure in destination
|
||||
- EC2 region detection: Warns about cross-region data transfer charges
|
||||
|
||||
Args:
|
||||
service: DeltaService instance
|
||||
source_url: Source S3 URL
|
||||
dest_url: Destination S3 URL
|
||||
exclude: Pattern to exclude files
|
||||
include: Pattern to include files
|
||||
quiet: Suppress output
|
||||
no_delta: Disable delta compression
|
||||
max_ratio: Maximum delta/file ratio
|
||||
dry_run: Show what would be migrated without migrating
|
||||
skip_confirm: Skip confirmation prompt
|
||||
preserve_prefix: Preserve source prefix in destination
|
||||
region_override: True if user explicitly specified --region flag
|
||||
"""
|
||||
import fnmatch
|
||||
|
||||
source_bucket, source_prefix = parse_s3_url(source_url)
|
||||
dest_bucket, dest_prefix = parse_s3_url(dest_url)
|
||||
|
||||
# Ensure prefixes end with / if they exist
|
||||
if source_prefix and not source_prefix.endswith("/"):
|
||||
source_prefix += "/"
|
||||
if dest_prefix and not dest_prefix.endswith("/"):
|
||||
dest_prefix += "/"
|
||||
|
||||
# Determine the effective destination prefix based on preserve_prefix setting
|
||||
effective_dest_prefix = dest_prefix
|
||||
if preserve_prefix and source_prefix:
|
||||
# Extract the last component of the source prefix (e.g., "prefix1/" from "path/to/prefix1/")
|
||||
source_prefix_name = source_prefix.rstrip("/").split("/")[-1]
|
||||
if source_prefix_name:
|
||||
# Append source prefix name to destination
|
||||
effective_dest_prefix = (dest_prefix or "") + source_prefix_name + "/"
|
||||
|
||||
if not quiet:
|
||||
# Log AWS region being used (helps users verify their configuration)
|
||||
# Pass region_override to warn about cross-region charges if user explicitly set --region
|
||||
log_aws_region(service, region_override=region_override)
|
||||
|
||||
source_display = build_s3_url(source_bucket, source_prefix)
|
||||
dest_display = build_s3_url(dest_bucket, dest_prefix)
|
||||
effective_dest_display = build_s3_url(dest_bucket, effective_dest_prefix)
|
||||
|
||||
if preserve_prefix and source_prefix:
|
||||
click.echo(f"Migrating from {source_display}")
|
||||
click.echo(f" to {effective_dest_display}")
|
||||
else:
|
||||
click.echo(f"Migrating from {source_display} to {dest_display}")
|
||||
click.echo("Scanning source and destination buckets...")
|
||||
|
||||
# List source objects
|
||||
source_list_prefix = f"{source_bucket}/{source_prefix}" if source_prefix else source_bucket
|
||||
source_objects = []
|
||||
|
||||
for obj in service.storage.list(source_list_prefix):
|
||||
# Skip reference.bin files (internal delta reference)
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
continue
|
||||
# Skip .delta files in source (we'll handle the original files)
|
||||
if obj.key.endswith(".delta"):
|
||||
continue
|
||||
|
||||
# Apply include/exclude filters
|
||||
rel_key = obj.key.removeprefix(source_prefix) if source_prefix else obj.key
|
||||
if exclude and fnmatch.fnmatch(rel_key, exclude):
|
||||
continue
|
||||
if include and not fnmatch.fnmatch(rel_key, include):
|
||||
continue
|
||||
|
||||
source_objects.append(obj)
|
||||
|
||||
# List destination objects to detect what needs copying
|
||||
dest_list_prefix = (
|
||||
f"{dest_bucket}/{effective_dest_prefix}" if effective_dest_prefix else dest_bucket
|
||||
)
|
||||
dest_keys = set()
|
||||
|
||||
for obj in service.storage.list(dest_list_prefix):
|
||||
# Get the relative key in destination
|
||||
rel_key = obj.key.removeprefix(effective_dest_prefix) if effective_dest_prefix else obj.key
|
||||
# Remove .delta suffix for comparison
|
||||
if rel_key.endswith(".delta"):
|
||||
rel_key = rel_key[:-6]
|
||||
# Skip reference.bin
|
||||
if not rel_key.endswith("/reference.bin"):
|
||||
dest_keys.add(rel_key)
|
||||
|
||||
# Determine files to migrate (not in destination)
|
||||
files_to_migrate = []
|
||||
total_size = 0
|
||||
|
||||
for source_obj in source_objects:
|
||||
# Get relative path from source prefix
|
||||
rel_key = source_obj.key.removeprefix(source_prefix) if source_prefix else source_obj.key
|
||||
|
||||
# Check if already exists in destination
|
||||
if rel_key not in dest_keys:
|
||||
files_to_migrate.append((source_obj, rel_key))
|
||||
total_size += source_obj.size
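The resume check is a plain set difference over relative keys, with the destination's `.delta` suffix stripped first; a small hedged example (keys invented):

# Illustrative resume logic: an existing destination delta counts as already migrated.
src_keys = {"builds/app-1.0.zip", "builds/app-1.1.zip"}
dst_raw = {"builds/app-1.0.zip.delta"}
dst_keys = {k[:-6] if k.endswith(".delta") else k for k in dst_raw}
assert sorted(src_keys - dst_keys) == ["builds/app-1.1.zip"]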
|
||||
|
||||
# Show summary and ask for confirmation
|
||||
if not files_to_migrate:
|
||||
if not quiet:
|
||||
click.echo("Copy completed")
|
||||
click.echo("All files are already migrated. Nothing to do.")
|
||||
return
|
||||
|
||||
if not quiet:
|
||||
|
||||
def format_bytes(size: int) -> str:
|
||||
size_float = float(size)
|
||||
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
||||
if size_float < 1024.0:
|
||||
return f"{size_float:.2f} {unit}"
|
||||
size_float /= 1024.0
|
||||
return f"{size_float:.2f} PB"
|
||||
|
||||
click.echo("")
|
||||
click.echo(f"Files to migrate: {len(files_to_migrate)}")
|
||||
click.echo(f"Total size: {format_bytes(total_size)}")
|
||||
if len(dest_keys) > 0:
|
||||
click.echo(f"Already migrated: {len(dest_keys)} files (will be skipped)")
|
||||
|
||||
# Handle dry run mode early (before confirmation prompt)
|
||||
if dry_run:
|
||||
if not quiet:
|
||||
click.echo("\n--- DRY RUN MODE ---")
|
||||
for _obj, rel_key in files_to_migrate[:10]: # Show first 10 files
|
||||
click.echo(f" Would migrate: {rel_key}")
|
||||
if len(files_to_migrate) > 10:
|
||||
click.echo(f" ... and {len(files_to_migrate) - 10} more files")
|
||||
return
|
||||
|
||||
# Ask for confirmation before proceeding with actual migration
|
||||
if not quiet and not skip_confirm:
|
||||
click.echo("")
|
||||
if not click.confirm("Do you want to proceed with the migration?"):
|
||||
click.echo("Migration cancelled.")
|
||||
return
|
||||
|
||||
# Perform migration
|
||||
if not quiet:
|
||||
click.echo(f"\nStarting migration of {len(files_to_migrate)} files...")
|
||||
|
||||
successful = 0
|
||||
failed = 0
|
||||
failed_files = []
|
||||
|
||||
for i, (source_obj, rel_key) in enumerate(files_to_migrate, 1):
|
||||
source_s3_url = build_s3_url(source_bucket, source_obj.key)
|
||||
|
||||
# Construct destination URL using effective prefix
|
||||
if effective_dest_prefix:
|
||||
dest_key = effective_dest_prefix + rel_key
|
||||
else:
|
||||
dest_key = rel_key
|
||||
dest_s3_url = build_s3_url(dest_bucket, dest_key)
|
||||
|
||||
try:
|
||||
if not quiet:
|
||||
progress = f"[{i}/{len(files_to_migrate)}]"
|
||||
click.echo(f"{progress} Migrating {rel_key}...", nl=False)
|
||||
|
||||
# Copy with delta compression
|
||||
copy_s3_to_s3(
|
||||
service,
|
||||
source_s3_url,
|
||||
dest_s3_url,
|
||||
quiet=True,
|
||||
max_ratio=max_ratio,
|
||||
no_delta=no_delta,
|
||||
)
|
||||
|
||||
successful += 1
|
||||
if not quiet:
|
||||
click.echo(" ✓")
|
||||
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
failed_files.append((rel_key, str(e)))
|
||||
if not quiet:
|
||||
click.echo(f" ✗ ({e})")
|
||||
|
||||
# Show final summary
|
||||
if not quiet:
|
||||
click.echo("")
|
||||
click.echo("Migration Summary:")
|
||||
click.echo(f" Successfully migrated: {successful} files")
|
||||
if failed > 0:
|
||||
click.echo(f" Failed: {failed} files")
|
||||
click.echo("\nFailed files:")
|
||||
for file, error in failed_files[:10]: # Show first 10 failures
|
||||
click.echo(f" {file}: {error}")
|
||||
if len(failed_files) > 10:
|
||||
click.echo(f" ... and {len(failed_files) - 10} more failures")
|
||||
|
||||
# Show compression statistics from cache if available (no bucket scan)
|
||||
if successful > 0 and not no_delta:
|
||||
try:
|
||||
from ...client import DeltaGliderClient
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
# Use cached stats only - don't scan bucket (prevents blocking)
|
||||
cached_stats = client._get_cached_bucket_stats(dest_bucket, "quick")
|
||||
if cached_stats and cached_stats.delta_objects > 0:
|
||||
click.echo(
|
||||
f"\nCompression achieved: {cached_stats.average_compression_ratio:.1%}"
|
||||
)
|
||||
click.echo(f"Space saved: {format_bytes(cached_stats.space_saved)}")
|
||||
except Exception:
|
||||
pass # Ignore stats errors
|
||||
|
||||
|
||||
def handle_recursive(
|
||||
@@ -228,10 +601,7 @@ def handle_recursive(
|
||||
dest_path = Path(dest)
|
||||
dest_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# List all objects with prefix
|
||||
# Note: S3StorageAdapter.list() expects "bucket/prefix" format
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = list(service.storage.list(list_prefix))
|
||||
objects = fetch_s3_object_heads(service, bucket, prefix)
|
||||
|
||||
if not quiet:
|
||||
click.echo(f"Downloading {len(objects)} files...")
|
||||
@@ -261,9 +631,22 @@ def handle_recursive(
|
||||
local_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download file
|
||||
s3_url = f"s3://{bucket}/{obj.key}"
|
||||
s3_url = build_s3_url(bucket, obj.key)
|
||||
download_file(service, s3_url, local_path, quiet)
|
||||
|
||||
else:
|
||||
click.echo("S3-to-S3 recursive copy not yet implemented", err=True)
|
||||
sys.exit(1)
|
||||
elif operation == "copy":
|
||||
# S3-to-S3 recursive copy with migration support
|
||||
migrate_s3_to_s3(
|
||||
service,
|
||||
source,
|
||||
dest,
|
||||
exclude=exclude,
|
||||
include=include,
|
||||
quiet=quiet,
|
||||
no_delta=no_delta,
|
||||
max_ratio=max_ratio,
|
||||
dry_run=False,
|
||||
skip_confirm=True, # Don't prompt for cp command
|
||||
preserve_prefix=True, # Always preserve prefix for cp -r
|
||||
region_override=False, # cp command doesn't track region override explicitly
|
||||
)
|
||||
|
||||
@@ -1,14 +1,19 @@
|
||||
"""CLI main entry point."""
|
||||
|
||||
import atexit
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import UTC
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import click
|
||||
|
||||
from ... import __version__
|
||||
from ...adapters import (
|
||||
FsCacheAdapter,
|
||||
NoopMetricsAdapter,
|
||||
S3StorageAdapter,
|
||||
Sha256Adapter,
|
||||
@@ -17,7 +22,9 @@ from ...adapters import (
|
||||
XdeltaAdapter,
|
||||
)
|
||||
from ...core import DeltaService, ObjectKey
|
||||
from ...core.config import DeltaGliderConfig
|
||||
from ...ports import MetricsPort
|
||||
from ...ports.cache import CachePort
|
||||
from .aws_compat import (
|
||||
copy_s3_to_s3,
|
||||
determine_operation,
|
||||
@@ -35,48 +42,87 @@ def create_service(
|
||||
endpoint_url: str | None = None,
|
||||
region: str | None = None,
|
||||
profile: str | None = None,
|
||||
*,
|
||||
config: DeltaGliderConfig | None = None,
|
||||
) -> DeltaService:
|
||||
"""Create service with wired adapters."""
|
||||
# Get config from environment
|
||||
cache_dir = Path(os.environ.get("DG_CACHE_DIR", "/tmp/.deltaglider/reference_cache"))
|
||||
max_ratio = float(os.environ.get("DG_MAX_RATIO", "0.5"))
|
||||
metrics_type = os.environ.get("DG_METRICS", "logging") # Options: noop, logging, cloudwatch
|
||||
"""Create service with wired adapters.
|
||||
|
||||
# Set AWS environment variables if provided
|
||||
if endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = endpoint_url
|
||||
if region:
|
||||
os.environ["AWS_DEFAULT_REGION"] = region
|
||||
if profile:
|
||||
os.environ["AWS_PROFILE"] = profile
|
||||
Args:
|
||||
log_level: Logging level (overridden by config.log_level if config provided).
|
||||
endpoint_url: S3 endpoint URL (overridden by config if provided).
|
||||
region: AWS region (overridden by config if provided).
|
||||
profile: AWS profile (overridden by config if provided).
|
||||
config: Optional pre-built config. If None, built from env vars + explicit params.
|
||||
"""
|
||||
if config is None:
|
||||
config = DeltaGliderConfig.from_env(
|
||||
log_level=log_level,
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
# SECURITY: Always use ephemeral process-isolated cache
|
||||
cache_dir = Path(tempfile.mkdtemp(prefix="deltaglider-", dir="/tmp"))
|
||||
# Register cleanup handler to remove cache on exit
|
||||
atexit.register(lambda: shutil.rmtree(cache_dir, ignore_errors=True))
|
||||
|
||||
# Set AWS environment variables if provided (for compatibility with other AWS tools)
|
||||
if config.endpoint_url:
|
||||
os.environ["AWS_ENDPOINT_URL"] = config.endpoint_url
|
||||
if config.region:
|
||||
os.environ["AWS_DEFAULT_REGION"] = config.region
|
||||
if config.profile:
|
||||
os.environ["AWS_PROFILE"] = config.profile
|
||||
|
||||
# Build boto3_kwargs for explicit parameter passing (preferred over env vars)
|
||||
boto3_kwargs: dict[str, Any] = {}
|
||||
if config.region:
|
||||
boto3_kwargs["region_name"] = config.region
|
||||
|
||||
# Create adapters
|
||||
hasher = Sha256Adapter()
|
||||
storage = S3StorageAdapter(endpoint_url=endpoint_url)
|
||||
storage = S3StorageAdapter(endpoint_url=config.endpoint_url, boto3_kwargs=boto3_kwargs)
|
||||
diff = XdeltaAdapter()
|
||||
cache = FsCacheAdapter(cache_dir, hasher)
|
||||
|
||||
# SECURITY: Configurable cache with encryption and backend selection
|
||||
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, MemoryCache
|
||||
|
||||
base_cache: CachePort
|
||||
if config.cache_backend == "memory":
|
||||
base_cache = MemoryCache(
|
||||
hasher, max_size_mb=config.cache_memory_size_mb, temp_dir=cache_dir
|
||||
)
|
||||
else:
|
||||
base_cache = ContentAddressedCache(cache_dir, hasher)
|
||||
|
||||
# Always apply encryption with ephemeral keys (security hardening)
|
||||
cache: CachePort = EncryptedCache.from_env(base_cache)
|
||||
|
||||
clock = UtcClockAdapter()
|
||||
logger = StdLoggerAdapter(level=log_level)
|
||||
logger = StdLoggerAdapter(level=config.log_level)
|
||||
|
||||
# Create metrics adapter based on configuration
|
||||
metrics: MetricsPort
|
||||
if metrics_type == "cloudwatch":
|
||||
# Import here to avoid dependency if not used
|
||||
if config.metrics_type == "cloudwatch":
|
||||
from ...adapters.metrics_cloudwatch import CloudWatchMetricsAdapter
|
||||
|
||||
metrics = CloudWatchMetricsAdapter(
|
||||
namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
|
||||
region=region,
|
||||
endpoint_url=endpoint_url if endpoint_url and "localhost" in endpoint_url else None,
|
||||
namespace=config.metrics_namespace,
|
||||
region=config.region,
|
||||
endpoint_url=(
|
||||
config.endpoint_url
|
||||
if config.endpoint_url and "localhost" in config.endpoint_url
|
||||
else None
|
||||
),
|
||||
)
|
||||
elif metrics_type == "logging":
|
||||
elif config.metrics_type == "logging":
|
||||
from ...adapters.metrics_cloudwatch import LoggingMetricsAdapter
|
||||
|
||||
metrics = LoggingMetricsAdapter(log_level=log_level)
|
||||
metrics = LoggingMetricsAdapter(log_level=config.log_level)
|
||||
else:
|
||||
metrics = NoopMetricsAdapter()
|
||||
|
||||
# Create service
|
||||
return DeltaService(
|
||||
storage=storage,
|
||||
diff=diff,
|
||||
@@ -85,17 +131,35 @@ def create_service(
|
||||
clock=clock,
|
||||
logger=logger,
|
||||
metrics=metrics,
|
||||
max_ratio=max_ratio,
|
||||
max_ratio=config.max_ratio,
|
||||
)
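A hedged usage sketch for the factory above (the LocalStack endpoint is an example value):

# Hypothetical: wire a service against a local S3-compatible endpoint for testing.
service = create_service(
    log_level="DEBUG",
    endpoint_url="http://localhost:4566",
    region="us-east-1",
)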
|
||||
|
||||
|
||||
def _version_callback(ctx: click.Context, param: click.Parameter, value: bool) -> None:
|
||||
"""Callback for --version option."""
|
||||
if value:
|
||||
click.echo(f"deltaglider {__version__}")
|
||||
ctx.exit(0)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.option("--debug", is_flag=True, help="Enable debug logging")
|
||||
@click.option(
|
||||
"--version",
|
||||
is_flag=True,
|
||||
is_eager=True,
|
||||
expose_value=False,
|
||||
callback=_version_callback,
|
||||
help="Show version and exit",
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx: click.Context, debug: bool) -> None:
|
||||
"""DeltaGlider - Delta-aware S3 file storage wrapper."""
|
||||
import logging
|
||||
|
||||
log_level = "DEBUG" if debug else os.environ.get("DG_LOG_LEVEL", "INFO")
|
||||
ctx.obj = create_service(log_level)
|
||||
logging.getLogger("deltaglider").info("deltaglider %s", __version__)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@@ -148,9 +212,6 @@ def cp(
|
||||
|
||||
# Handle recursive operations for directories
|
||||
if recursive:
|
||||
if operation == "copy":
|
||||
click.echo("S3-to-S3 recursive copy not yet implemented", err=True)
|
||||
sys.exit(1)
|
||||
handle_recursive(
|
||||
service, source, dest, recursive, exclude, include, quiet, no_delta, max_ratio
|
||||
)
|
||||
@@ -172,7 +233,7 @@ def cp(
|
||||
download_file(service, source, local_path, quiet)
|
||||
|
||||
elif operation == "copy":
|
||||
copy_s3_to_s3(service, source, dest, quiet)
|
||||
copy_s3_to_s3(service, source, dest, quiet, max_ratio, no_delta)
|
||||
|
||||
except ValueError as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
@@ -240,6 +301,13 @@ def ls(
|
||||
prefix_str: str
|
||||
bucket_name, prefix_str = parse_s3_url(s3_url)
|
||||
|
||||
# Ensure prefix ends with / if it's meant to be a directory
|
||||
# This helps with proper path handling
|
||||
if prefix_str and not prefix_str.endswith("/"):
|
||||
# Check if this is a file or directory by listing
|
||||
# For now, assume it's a directory prefix
|
||||
prefix_str = prefix_str + "/"
|
||||
|
||||
# Format bytes to human readable
|
||||
def format_bytes(size: int) -> str:
|
||||
if not human_readable:
|
||||
@@ -252,33 +320,38 @@ def ls(
|
||||
return f"{size_float:.1f}P"
|
||||
|
||||
# List objects using SDK (automatically filters .delta and reference.bin)
|
||||
from deltaglider.client import DeltaGliderClient, ListObjectsResponse
|
||||
from deltaglider.client import DeltaGliderClient
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
dg_response: ListObjectsResponse = client.list_objects(
|
||||
Bucket=bucket_name, Prefix=prefix_str, MaxKeys=10000
|
||||
dg_response = client.list_objects(
|
||||
Bucket=bucket_name,
|
||||
Prefix=prefix_str,
|
||||
MaxKeys=10000,
|
||||
Delimiter="/" if not recursive else "",
|
||||
)
|
||||
objects = dg_response.contents
|
||||
objects = dg_response["Contents"]
|
||||
|
||||
# Filter by recursive flag
|
||||
if not recursive:
|
||||
# Only show direct children
|
||||
seen_prefixes = set()
|
||||
# Show common prefixes (subdirectories) from S3 response
|
||||
for common_prefix in dg_response.get("CommonPrefixes", []):
|
||||
prefix_path = common_prefix.get("Prefix", "")
|
||||
# Show only the directory name, not the full path
|
||||
if prefix_str:
|
||||
# Strip the current prefix to show only the subdirectory
|
||||
display_name = prefix_path[len(prefix_str) :]
|
||||
else:
|
||||
display_name = prefix_path
|
||||
click.echo(f" PRE {display_name}")
|
||||
|
||||
# Only show files at current level (not in subdirectories)
|
||||
filtered_objects = []
|
||||
for obj in objects:
|
||||
rel_path = obj.key[len(prefix_str) :] if prefix_str else obj.key
|
||||
if "/" in rel_path:
|
||||
# It's in a subdirectory
|
||||
subdir = rel_path.split("/")[0] + "/"
|
||||
if subdir not in seen_prefixes:
|
||||
seen_prefixes.add(subdir)
|
||||
# Show as directory
|
||||
full_prefix = f"{prefix_str}{subdir}" if prefix_str else subdir
|
||||
click.echo(f" PRE {full_prefix}")
|
||||
else:
|
||||
# Direct file
|
||||
if rel_path: # Only add if there's actually a file at this level
|
||||
filtered_objects.append(obj)
|
||||
obj_key = obj["Key"]
|
||||
rel_path = obj_key[len(prefix_str) :] if prefix_str else obj_key
|
||||
# Only include if it's a direct child (no / in relative path)
|
||||
if "/" not in rel_path and rel_path:
|
||||
filtered_objects.append(obj)
|
||||
objects = filtered_objects
|
||||
|
||||
# Display objects (SDK already filters reference.bin and strips .delta)
|
||||
@@ -286,19 +359,26 @@ def ls(
|
||||
total_count = 0
|
||||
|
||||
for obj in objects:
|
||||
total_size += obj.size
|
||||
total_size += obj["Size"]
|
||||
total_count += 1
|
||||
|
||||
# Format the display
|
||||
size_str = format_bytes(obj.size)
|
||||
size_str = format_bytes(obj["Size"])
|
||||
# last_modified is a string from SDK, parse it if needed
|
||||
if isinstance(obj.last_modified, str):
|
||||
last_modified = obj.get("LastModified", "")
|
||||
if isinstance(last_modified, str):
|
||||
# Already a string, extract date portion
|
||||
date_str = obj.last_modified[:19].replace("T", " ")
|
||||
date_str = last_modified[:19].replace("T", " ")
|
||||
else:
|
||||
date_str = obj.last_modified.strftime("%Y-%m-%d %H:%M:%S")
|
||||
date_str = last_modified.strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
click.echo(f"{date_str} {size_str:>10} s3://{bucket_name}/{obj.key}")
|
||||
# Show only the filename relative to current prefix (like AWS CLI)
|
||||
if prefix_str:
|
||||
display_key = obj["Key"][len(prefix_str) :]
|
||||
else:
|
||||
display_key = obj["Key"]
|
||||
|
||||
click.echo(f"{date_str} {size_str:>10} {display_key}")
|
||||
|
||||
# Show summary if requested
|
||||
if summarize:
|
||||
@@ -426,24 +506,24 @@ def rm(
|
||||
|
||||
# Report the results
|
||||
if not quiet:
|
||||
if result["deleted_count"] == 0:
|
||||
if result.deleted_count == 0:
|
||||
click.echo(f"delete: No objects found with prefix: s3://{bucket}/{prefix}")
|
||||
else:
|
||||
click.echo(f"Deleted {result['deleted_count']} object(s)")
|
||||
click.echo(f"Deleted {result.deleted_count} object(s)")
|
||||
|
||||
# Show warnings if any references were kept
|
||||
for warning in result.get("warnings", []):
|
||||
for warning in result.warnings:
|
||||
if "Kept reference" in warning:
|
||||
click.echo(
|
||||
f"Keeping reference file (still in use): s3://{bucket}/{warning.split()[2]}"
|
||||
)
|
||||
|
||||
# Report any errors
|
||||
if result["failed_count"] > 0:
|
||||
for error in result.get("errors", []):
|
||||
if result.failed_count > 0:
|
||||
for error in result.errors:
|
||||
click.echo(f"Error: {error}", err=True)
|
||||
|
||||
if result["failed_count"] > 0:
|
||||
if result.failed_count > 0:
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
@@ -561,20 +641,14 @@ def sync(
|
||||
@click.pass_obj
|
||||
def verify(service: DeltaService, s3_url: str) -> None:
|
||||
"""Verify integrity of delta file."""
|
||||
# Parse S3 URL
|
||||
if not s3_url.startswith("s3://"):
|
||||
try:
|
||||
bucket, key = parse_s3_url(s3_url)
|
||||
if not key:
|
||||
raise ValueError("Missing key")
|
||||
except ValueError:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
s3_path = s3_url[5:]
|
||||
parts = s3_path.split("/", 1)
|
||||
if len(parts) != 2:
|
||||
click.echo(f"Error: Invalid S3 URL: {s3_url}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
bucket = parts[0]
|
||||
key = parts[1]
|
||||
|
||||
obj_key = ObjectKey(bucket=bucket, key=key)
|
||||
|
||||
try:
|
||||
@@ -597,6 +671,538 @@ def verify(service: DeltaService, s3_url: str) -> None:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("source")
|
||||
@click.argument("dest")
|
||||
@click.option("--exclude", help="Exclude files matching pattern")
|
||||
@click.option("--include", help="Include only files matching pattern")
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Suppress output")
|
||||
@click.option("--no-delta", is_flag=True, help="Disable delta compression")
|
||||
@click.option("--max-ratio", type=float, help="Max delta/file ratio (default: 0.5)")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be migrated without migrating")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
|
||||
@click.option(
|
||||
"--no-preserve-prefix", is_flag=True, help="Don't preserve source prefix in destination"
|
||||
)
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def migrate(
|
||||
service: DeltaService,
|
||||
source: str,
|
||||
dest: str,
|
||||
exclude: str | None,
|
||||
include: str | None,
|
||||
quiet: bool,
|
||||
no_delta: bool,
|
||||
max_ratio: float | None,
|
||||
dry_run: bool,
|
||||
yes: bool,
|
||||
no_preserve_prefix: bool,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Migrate S3 bucket/prefix to DeltaGlider-compressed storage.
|
||||
|
||||
This command facilitates the migration of existing S3 objects to another bucket
|
||||
with DeltaGlider compression. It supports:
|
||||
- Resume capability: Only copies files that don't exist in destination
|
||||
- Progress tracking: Shows migration progress
|
||||
- Confirmation prompt: Shows file count before starting (use --yes to skip)
|
||||
- Prefix preservation: By default, source prefix is preserved in destination
|
||||
|
||||
When migrating a prefix, the source prefix name is preserved by default:
|
||||
s3://src/prefix1/ → s3://dest/ creates s3://dest/prefix1/
|
||||
s3://src/a/b/c/ → s3://dest/x/ creates s3://dest/x/c/
|
||||
|
||||
Use --no-preserve-prefix to disable this behavior:
|
||||
s3://src/prefix1/ → s3://dest/ creates s3://dest/ (files at root)
|
||||
|
||||
Examples:
|
||||
deltaglider migrate s3://old-bucket/ s3://new-bucket/
|
||||
deltaglider migrate s3://old-bucket/data/ s3://new-bucket/
|
||||
deltaglider migrate --no-preserve-prefix s3://src/v1/ s3://dest/
|
||||
deltaglider migrate --dry-run s3://old-bucket/ s3://new-bucket/
|
||||
deltaglider migrate --yes --quiet s3://old-bucket/ s3://new-bucket/
|
||||
"""
|
||||
from .aws_compat import is_s3_path, migrate_s3_to_s3
|
||||
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Validate both paths are S3
|
||||
if not is_s3_path(source) or not is_s3_path(dest):
|
||||
click.echo("Error: Both source and destination must be S3 paths", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Perform migration
|
||||
migrate_s3_to_s3(
|
||||
service,
|
||||
source,
|
||||
dest,
|
||||
exclude=exclude,
|
||||
include=include,
|
||||
quiet=quiet,
|
||||
no_delta=no_delta,
|
||||
max_ratio=max_ratio,
|
||||
dry_run=dry_run,
|
||||
skip_confirm=yes,
|
||||
preserve_prefix=not no_preserve_prefix,
|
||||
region_override=region is not None, # True if user explicitly specified --region
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Migration failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command(short_help="Get bucket statistics and compression metrics")
|
||||
@click.argument("bucket")
|
||||
@click.option("--sampled", is_flag=True, help="Balanced mode: one sample per deltaspace (~5-15s)")
|
||||
@click.option(
|
||||
"--detailed", is_flag=True, help="Most accurate: HEAD for all deltas (slowest, ~1min+)"
|
||||
)
|
||||
@click.option("--refresh", is_flag=True, help="Force cache refresh even if valid")
|
||||
@click.option("--no-cache", is_flag=True, help="Skip caching entirely (both read and write)")
|
||||
@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
|
||||
@click.pass_obj
|
||||
def stats(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
sampled: bool,
|
||||
detailed: bool,
|
||||
refresh: bool,
|
||||
no_cache: bool,
|
||||
output_json: bool,
|
||||
) -> None:
|
||||
"""Get bucket statistics and compression metrics with intelligent S3-based caching.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name/
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Modes (mutually exclusive):
|
||||
- quick (default): Fast listing-only stats (~0.5s), approximate compression metrics
|
||||
- --sampled: Balanced mode - one HEAD per deltaspace (~5-15s for typical buckets)
|
||||
- --detailed: Most accurate - HEAD for every delta file (slowest, ~1min+ for large buckets)
|
||||
|
||||
Caching (NEW - massive performance improvement!):
|
||||
Stats are cached in S3 at .deltaglider/stats_{mode}.json (one per mode).
|
||||
Cache is automatically validated on every call using object count + size.
|
||||
If bucket changed, stats are recomputed automatically.
|
||||
|
||||
Performance with cache:
|
||||
- Cache hit: ~0.1s (200x faster than recomputation!)
|
||||
- Cache miss: Full computation time (creates cache for next time)
|
||||
- Cache invalid: Auto-recomputes when bucket changes
|
||||
|
||||
Options:
|
||||
--refresh: Force cache refresh even if valid (use when you need fresh data now)
|
||||
--no-cache: Skip caching entirely - always recompute (useful for testing/debugging)
|
||||
--json: Output in JSON format for automation/scripting
|
||||
|
||||
Examples:
|
||||
deltaglider stats mybucket # Fast (~0.1s with cache, ~0.5s without)
|
||||
deltaglider stats mybucket --sampled # Balanced accuracy/speed (~5-15s first run)
|
||||
deltaglider stats mybucket --detailed # Most accurate (~1-10min first run, ~0.1s cached)
|
||||
deltaglider stats mybucket --refresh # Force recomputation even if cached
|
||||
deltaglider stats mybucket --no-cache # Always compute fresh (skip cache)
|
||||
deltaglider stats mybucket --json # JSON output for scripts
|
||||
deltaglider stats s3://mybucket/ # Also accepts s3:// URLs
|
||||
|
||||
Timing Logs:
|
||||
Set DG_LOG_LEVEL=INFO to see detailed phase timing with timestamps:
|
||||
[HH:MM:SS.mmm] Phase 1: LIST completed in 0.52s - Found 1523 objects
|
||||
[HH:MM:SS.mmm] Phase 2: Cache HIT in 0.06s - Using cached stats
|
||||
[HH:MM:SS.mmm] COMPLETE: Total time 0.58s
|
||||
|
||||
See docs/STATS_CACHING.md for complete documentation.
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
from ...client_operations.stats import StatsMode
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if sampled and detailed:
|
||||
click.echo("Error: --sampled and --detailed cannot be used together", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if refresh and no_cache:
|
||||
click.echo("Error: --refresh and --no-cache cannot be used together", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
mode: StatsMode = "quick"
|
||||
if sampled:
|
||||
mode = "sampled"
|
||||
if detailed:
|
||||
mode = "detailed"
|
||||
|
||||
# Create client from service
|
||||
client = DeltaGliderClient(service=service)
|
||||
|
||||
# Get bucket stats with caching control
|
||||
use_cache = not no_cache
|
||||
bucket_stats = client.get_bucket_stats(
|
||||
bucket, mode=mode, use_cache=use_cache, refresh_cache=refresh
|
||||
)
|
||||
|
||||
if output_json:
|
||||
# JSON output
|
||||
output = {
|
||||
"bucket": bucket_stats.bucket,
|
||||
"object_count": bucket_stats.object_count,
|
||||
"total_size": bucket_stats.total_size,
|
||||
"compressed_size": bucket_stats.compressed_size,
|
||||
"space_saved": bucket_stats.space_saved,
|
||||
"average_compression_ratio": bucket_stats.average_compression_ratio,
|
||||
"delta_objects": bucket_stats.delta_objects,
|
||||
"direct_objects": bucket_stats.direct_objects,
|
||||
}
|
||||
click.echo(json.dumps(output, indent=2))
|
||||
else:
|
||||
# Human-readable output
|
||||
def format_bytes(size: float) -> str:
|
||||
"""Format bytes to human-readable size."""
|
||||
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
||||
if size < 1024.0:
|
||||
return f"{size:.2f} {unit}"
|
||||
size /= 1024.0
|
||||
return f"{size:.2f} PB"
|
||||
|
||||
click.echo(f"Bucket Statistics: {bucket_stats.bucket}")
|
||||
click.echo(f"{'=' * 60}")
|
||||
click.echo(f"Total Objects: {bucket_stats.object_count:,}")
|
||||
click.echo(f" Delta Objects: {bucket_stats.delta_objects:,}")
|
||||
click.echo(f" Direct Objects: {bucket_stats.direct_objects:,}")
|
||||
click.echo("")
|
||||
click.echo(
|
||||
f"Original Size: {format_bytes(bucket_stats.total_size)} ({bucket_stats.total_size:,} bytes)"
|
||||
)
|
||||
click.echo(
|
||||
f"Compressed Size: {format_bytes(bucket_stats.compressed_size)} ({bucket_stats.compressed_size:,} bytes)"
|
||||
)
|
||||
click.echo(
|
||||
f"Space Saved: {format_bytes(bucket_stats.space_saved)} ({bucket_stats.space_saved:,} bytes)"
|
||||
)
|
||||
click.echo(f"Compression Ratio: {bucket_stats.average_compression_ratio:.1%}")
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
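For reference, a hedged SDK-level sketch of what the command wires together (method and field names are taken from the code above; the bucket name is invented):

# Illustrative programmatic equivalent of `deltaglider stats mybucket --sampled`.
client = DeltaGliderClient(service=service)
bucket_stats = client.get_bucket_stats("mybucket", mode="sampled", use_cache=True, refresh_cache=False)
print(f"{bucket_stats.delta_objects} deltas, {bucket_stats.average_compression_ratio:.1%} avg compression")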
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("bucket")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted without deleting")
|
||||
@click.option("--json", "output_json", is_flag=True, help="Output in JSON format")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def purge(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
dry_run: bool,
|
||||
output_json: bool,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Purge expired temporary files from .deltaglider/tmp/.
|
||||
|
||||
This command scans the .deltaglider/tmp/ prefix in the specified bucket
|
||||
and deletes any files whose dg-expires-at metadata indicates they have expired.
|
||||
|
||||
These temporary files are created by the rehydration process when deltaglider-compressed
|
||||
files need to be made available for direct download (e.g., via presigned URLs).
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name/
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Examples:
|
||||
deltaglider purge mybucket # Purge expired files
|
||||
deltaglider purge mybucket --dry-run # Preview what would be deleted
|
||||
deltaglider purge mybucket --json # JSON output for automation
|
||||
deltaglider purge s3://mybucket/ # Also accepts s3:// URLs
|
||||
"""
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Perform the purge (or dry run simulation)
|
||||
if dry_run:
|
||||
# For dry run, we need to simulate what would be deleted
|
||||
prefix = ".deltaglider/tmp/"
|
||||
expired_files = []
|
||||
total_size = 0
|
||||
|
||||
# List all objects in temp directory
|
||||
from datetime import datetime
|
||||
|
||||
import boto3
|
||||
|
||||
s3_client = boto3.client(
|
||||
"s3",
|
||||
endpoint_url=endpoint_url or os.environ.get("AWS_ENDPOINT_URL"),
|
||||
region_name=region,
|
||||
)
|
||||
|
||||
paginator = s3_client.get_paginator("list_objects_v2")
|
||||
page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)
|
||||
|
||||
for page in page_iterator:
|
||||
for obj in page.get("Contents", []):
|
||||
# Get object metadata
|
||||
head_response = s3_client.head_object(Bucket=bucket, Key=obj["Key"])
|
||||
metadata = head_response.get("Metadata", {})
|
||||
|
||||
expires_at_str = metadata.get("dg-expires-at")
|
||||
if expires_at_str:
|
||||
try:
|
||||
expires_at = datetime.fromisoformat(
|
||||
expires_at_str.replace("Z", "+00:00")
|
||||
)
|
||||
if expires_at.tzinfo is None:
|
||||
expires_at = expires_at.replace(tzinfo=UTC)
|
||||
|
||||
if datetime.now(UTC) >= expires_at:
|
||||
expired_files.append(
|
||||
{
|
||||
"key": obj["Key"],
|
||||
"size": obj["Size"],
|
||||
"expires_at": expires_at_str,
|
||||
}
|
||||
)
|
||||
total_size += obj["Size"]
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if output_json:
|
||||
output = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"dry_run": True,
|
||||
"would_delete_count": len(expired_files),
|
||||
"total_size_to_free": total_size,
|
||||
"expired_files": expired_files[:10], # Show first 10
|
||||
}
|
||||
click.echo(json.dumps(output, indent=2))
|
||||
else:
|
||||
click.echo(f"Dry run: Would delete {len(expired_files)} expired file(s)")
|
||||
click.echo(f"Total space to free: {total_size:,} bytes")
|
||||
if expired_files:
|
||||
click.echo("\nFiles that would be deleted (first 10):")
|
||||
for file_info in expired_files[:10]:
|
||||
click.echo(f" {file_info['key']} (expires: {file_info['expires_at']})")
|
||||
if len(expired_files) > 10:
|
||||
click.echo(f" ... and {len(expired_files) - 10} more")
|
||||
else:
|
||||
# Perform actual purge using the service method
|
||||
result = service.purge_temp_files(bucket)
|
||||
|
||||
if output_json:
|
||||
# JSON output
|
||||
click.echo(json.dumps(result, indent=2))
|
||||
else:
|
||||
# Human-readable output
|
||||
click.echo(f"Purge Statistics for bucket: {bucket}")
|
||||
click.echo(f"{'=' * 60}")
|
||||
click.echo(f"Expired files found: {result['expired_count']}")
|
||||
click.echo(f"Files deleted: {result['deleted_count']}")
|
||||
click.echo(f"Errors: {result['error_count']}")
|
||||
click.echo(f"Space freed: {result['total_size_freed']:,} bytes")
|
||||
click.echo(f"Duration: {result['duration_seconds']:.2f} seconds")
|
||||
|
||||
if result["errors"]:
|
||||
click.echo("\nErrors encountered:")
|
||||
for error in result["errors"][:5]:
|
||||
click.echo(f" - {error}")
|
||||
if len(result["errors"]) > 5:
|
||||
click.echo(f" ... and {len(result['errors']) - 5} more errors")
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
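The dry-run path hinges on parsing the `dg-expires-at` metadata; a small hedged illustration of that parse (timestamp invented):

# Illustrative expiry check: ISO-8601 with a trailing "Z" normalised to an explicit offset.
from datetime import UTC, datetime

expires_at = datetime.fromisoformat("2025-01-01T00:00:00Z".replace("Z", "+00:00"))
if expires_at.tzinfo is None:
    expires_at = expires_at.replace(tzinfo=UTC)
is_expired = datetime.now(UTC) >= expires_at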
|
||||
|
||||
|
||||
@cli.command("put-bucket-acl")
|
||||
@click.argument("bucket")
|
||||
@click.option(
|
||||
"--acl",
|
||||
type=click.Choice(["private", "public-read", "public-read-write", "authenticated-read"]),
|
||||
help="Canned ACL to apply",
|
||||
)
|
||||
@click.option("--grant-full-control", help="Grants full control (e.g., id=account-id)")
|
||||
@click.option("--grant-read", help="Allows grantee to list objects (e.g., id=account-id)")
|
||||
@click.option("--grant-read-acp", help="Allows grantee to read the bucket ACL")
|
||||
@click.option("--grant-write", help="Allows grantee to create objects in the bucket")
|
||||
@click.option("--grant-write-acp", help="Allows grantee to write the ACL for the bucket")
|
||||
@click.option("--access-control-policy", help="Full ACL policy as JSON string")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def put_bucket_acl(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
acl: str | None,
|
||||
grant_full_control: str | None,
|
||||
grant_read: str | None,
|
||||
grant_read_acp: str | None,
|
||||
grant_write: str | None,
|
||||
grant_write_acp: str | None,
|
||||
access_control_policy: str | None,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Set the access control list (ACL) for an S3 bucket.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Examples:
|
||||
deltaglider put-bucket-acl my-bucket --acl private
|
||||
deltaglider put-bucket-acl my-bucket --acl public-read
|
||||
deltaglider put-bucket-acl my-bucket --grant-read id=12345
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
client = DeltaGliderClient(service=service)
|
||||
|
||||
kwargs: dict[str, Any] = {}
|
||||
if acl is not None:
|
||||
kwargs["ACL"] = acl
|
||||
if grant_full_control is not None:
|
||||
kwargs["GrantFullControl"] = grant_full_control
|
||||
if grant_read is not None:
|
||||
kwargs["GrantRead"] = grant_read
|
||||
if grant_read_acp is not None:
|
||||
kwargs["GrantReadACP"] = grant_read_acp
|
||||
if grant_write is not None:
|
||||
kwargs["GrantWrite"] = grant_write
|
||||
if grant_write_acp is not None:
|
||||
kwargs["GrantWriteACP"] = grant_write_acp
|
||||
if access_control_policy is not None:
|
||||
kwargs["AccessControlPolicy"] = json.loads(access_control_policy)
|
||||
|
||||
client.put_bucket_acl(Bucket=bucket, **kwargs)
|
||||
click.echo(f"ACL updated for bucket: {bucket}")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
click.echo(f"Error: Invalid JSON for --access-control-policy: {e}", err=True)
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command("get-bucket-acl")
|
||||
@click.argument("bucket")
|
||||
@click.option("--endpoint-url", help="Override S3 endpoint URL")
|
||||
@click.option("--region", help="AWS region")
|
||||
@click.option("--profile", help="AWS profile to use")
|
||||
@click.pass_obj
|
||||
def get_bucket_acl(
|
||||
service: DeltaService,
|
||||
bucket: str,
|
||||
endpoint_url: str | None,
|
||||
region: str | None,
|
||||
profile: str | None,
|
||||
) -> None:
|
||||
"""Get the access control list (ACL) for an S3 bucket.
|
||||
|
||||
BUCKET can be specified as:
|
||||
- s3://bucket-name
|
||||
- bucket-name
|
||||
|
||||
Examples:
|
||||
deltaglider get-bucket-acl my-bucket
|
||||
deltaglider get-bucket-acl s3://my-bucket
|
||||
"""
|
||||
from ...client import DeltaGliderClient
|
||||
|
||||
# Recreate service with AWS parameters if provided
|
||||
if endpoint_url or region or profile:
|
||||
service = create_service(
|
||||
log_level=os.environ.get("DG_LOG_LEVEL", "INFO"),
|
||||
endpoint_url=endpoint_url,
|
||||
region=region,
|
||||
profile=profile,
|
||||
)
|
||||
|
||||
try:
|
||||
# Parse bucket from S3 URL if needed
|
||||
if is_s3_path(bucket):
|
||||
bucket, _prefix = parse_s3_url(bucket)
|
||||
|
||||
if not bucket:
|
||||
click.echo("Error: Invalid bucket name", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
client = DeltaGliderClient(service=service)
|
||||
response = client.get_bucket_acl(Bucket=bucket)
|
||||
|
||||
# Output as JSON like aws s3api get-bucket-acl
|
||||
click.echo(json.dumps(response, indent=2, default=str))
|
||||
|
||||
except Exception as e:
|
||||
click.echo(f"Error: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""Main entry point."""
|
||||
cli()
|
||||
|
||||
@@ -5,9 +5,27 @@ from pathlib import Path
|
||||
import click
|
||||
|
||||
from ...core import DeltaService
|
||||
from ...core.object_listing import list_all_objects, object_dict_to_head
|
||||
from ...ports import ObjectHead
|
||||
|
||||
|
||||
def fetch_s3_object_heads(service: DeltaService, bucket: str, prefix: str) -> list[ObjectHead]:
|
||||
"""Retrieve all objects for a prefix, falling back to iterator when needed."""
|
||||
try:
|
||||
listing = list_all_objects(
|
||||
service.storage,
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
max_keys=1000,
|
||||
logger=getattr(service, "logger", None),
|
||||
)
|
||||
except (RuntimeError, NotImplementedError):
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
return list(service.storage.list(list_prefix))
|
||||
|
||||
return [object_dict_to_head(obj) for obj in listing.objects]
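A usage sketch for the helper above (bucket and prefix are placeholders):

# Hypothetical call: gather ObjectHead entries for a prefix and total their sizes.
heads = fetch_s3_object_heads(service, "releases", "v1.2.3/")
total_bytes = sum(h.size for h in heads)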
|
||||
|
||||
|
||||
def get_local_files(
|
||||
local_dir: Path, exclude: str | None = None, include: str | None = None
|
||||
) -> dict[str, tuple[Path, int]]:
|
||||
@@ -42,8 +60,7 @@ def get_s3_files(
|
||||
import fnmatch
|
||||
|
||||
files = {}
|
||||
list_prefix = f"{bucket}/{prefix}" if prefix else bucket
|
||||
objects = service.storage.list(list_prefix)
|
||||
objects = fetch_s3_object_heads(service, bucket, prefix)
|
||||
|
||||
for obj in objects:
|
||||
# Skip reference.bin files (internal)
|
||||
|
||||
File diff suppressed because it is too large
36
src/deltaglider/client_delete_helpers.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Helper utilities for client delete operations."""
|
||||
|
||||
from .core import DeltaService, ObjectKey
|
||||
from .core.errors import NotFoundError
|
||||
from .core.models import DeleteResult
|
||||
|
||||
|
||||
def delete_with_delta_suffix(
|
||||
service: DeltaService, bucket: str, key: str
|
||||
) -> tuple[str, DeleteResult]:
|
||||
"""Delete an object, retrying with '.delta' suffix when needed.
|
||||
|
||||
Args:
|
||||
service: DeltaService-like instance exposing ``delete(ObjectKey)``.
|
||||
bucket: Target bucket.
|
||||
key: Requested key (without forcing .delta suffix).
|
||||
|
||||
Returns:
|
||||
Tuple containing the actual key deleted in storage and the DeleteResult.
|
||||
|
||||
Raises:
|
||||
NotFoundError: Propagated when both the direct and '.delta' keys are missing.
|
||||
"""
|
||||
actual_key = key
|
||||
object_key = ObjectKey(bucket=bucket, key=actual_key)
|
||||
|
||||
try:
|
||||
delete_result = service.delete(object_key)
|
||||
except NotFoundError:
|
||||
if key.endswith(".delta"):
|
||||
raise
|
||||
actual_key = f"{key}.delta"
|
||||
object_key = ObjectKey(bucket=bucket, key=actual_key)
|
||||
delete_result = service.delete(object_key)
|
||||
|
||||
return actual_key, delete_result
|
||||
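# Illustrative usage sketch (not part of the original module): how a client-side
# delete might call the helper above. `service` stands for any DeltaService-like
# object; the bucket and key names below are placeholders.
def _example_delete(service: DeltaService) -> None:
    # Tries "releases/app-1.0.0.zip" first, then "releases/app-1.0.0.zip.delta".
    actual_key, result = delete_with_delta_suffix(service, "my-bucket", "releases/app-1.0.0.zip")
    print(f"deleted {actual_key}: {result}")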
100
src/deltaglider/client_models.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Shared data models for the DeltaGlider client."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
|
||||
class UploadSummary:
|
||||
"""User-friendly upload summary."""
|
||||
|
||||
operation: str
|
||||
bucket: str
|
||||
key: str
|
||||
original_size: int
|
||||
stored_size: int
|
||||
is_delta: bool
|
||||
delta_ratio: float = 0.0
|
||||
|
||||
@property
|
||||
def original_size_mb(self) -> float:
|
||||
"""Original size in MB."""
|
||||
return self.original_size / (1024 * 1024)
|
||||
|
||||
@property
|
||||
def stored_size_mb(self) -> float:
|
||||
"""Stored size in MB."""
|
||||
return self.stored_size / (1024 * 1024)
|
||||
|
||||
@property
|
||||
def savings_percent(self) -> float:
|
||||
"""Percentage saved through compression."""
|
||||
if self.original_size == 0:
|
||||
return 0.0
|
||||
return ((self.original_size - self.stored_size) / self.original_size) * 100
|
||||
|
||||
|
||||
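# Worked example (sketch, not part of the original module): a 100 MB artifact
# stored as a 5 MB delta yields savings_percent == 95.0. Field values below are
# placeholders chosen only to illustrate the arithmetic.
def _example_savings() -> float:
    summary = UploadSummary(
        operation="upload",
        bucket="releases",
        key="app-1.0.1.zip.delta",
        original_size=100 * 1024 * 1024,
        stored_size=5 * 1024 * 1024,
        is_delta=True,
        delta_ratio=0.05,
    )
    return summary.savings_percent  # ((100 - 5) / 100) * 100 == 95.0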
@dataclass
|
||||
class CompressionEstimate:
|
||||
"""Compression estimate for a file."""
|
||||
|
||||
original_size: int
|
||||
estimated_compressed_size: int
|
||||
estimated_ratio: float
|
||||
confidence: float
|
||||
recommended_reference: str | None = None
|
||||
should_use_delta: bool = True
|
||||
|
||||
|
||||
@dataclass
|
||||
class ObjectInfo:
|
||||
"""Detailed object information with compression stats."""
|
||||
|
||||
key: str
|
||||
size: int
|
||||
last_modified: str
|
||||
etag: str | None = None
|
||||
storage_class: str = "STANDARD"
|
||||
|
||||
# DeltaGlider-specific fields
|
||||
original_size: int | None = None
|
||||
compressed_size: int | None = None
|
||||
compression_ratio: float | None = None
|
||||
is_delta: bool = False
|
||||
reference_key: str | None = None
|
||||
delta_chain_length: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class ListObjectsResponse:
|
||||
"""Response from list_objects, compatible with boto3."""
|
||||
|
||||
name: str # Bucket name
|
||||
prefix: str = ""
|
||||
delimiter: str = ""
|
||||
max_keys: int = 1000
|
||||
common_prefixes: list[dict[str, str]] = field(default_factory=list)
|
||||
contents: list[ObjectInfo] = field(default_factory=list)
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
continuation_token: str | None = None
|
||||
key_count: int = 0
|
||||
|
||||
@property
|
||||
def objects(self) -> list[ObjectInfo]:
|
||||
"""Alias for contents, for convenience."""
|
||||
return self.contents
|
||||
|
||||
|
||||
@dataclass
|
||||
class BucketStats:
|
||||
"""Statistics for a bucket."""
|
||||
|
||||
bucket: str
|
||||
object_count: int
|
||||
total_size: int
|
||||
compressed_size: int
|
||||
space_saved: int
|
||||
average_compression_ratio: float
|
||||
delta_objects: int
|
||||
direct_objects: int
|
||||
object_limit_reached: bool = False
|
||||
39
src/deltaglider/client_operations/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""Client operation modules for DeltaGliderClient.
|
||||
|
||||
This package contains modular operation implementations:
|
||||
- bucket: S3 bucket management (create, delete, list)
|
||||
- presigned: Presigned URL generation for temporary access
|
||||
- batch: Batch upload/download operations
|
||||
- stats: Statistics and analytics operations
|
||||
"""
|
||||
|
||||
from .batch import download_batch, upload_batch, upload_chunked
|
||||
from .bucket import create_bucket, delete_bucket, get_bucket_acl, list_buckets, put_bucket_acl
|
||||
from .presigned import generate_presigned_post, generate_presigned_url
|
||||
from .stats import (
|
||||
estimate_compression,
|
||||
find_similar_files,
|
||||
get_bucket_stats,
|
||||
get_object_info,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Bucket operations
|
||||
"create_bucket",
|
||||
"delete_bucket",
|
||||
"get_bucket_acl",
|
||||
"list_buckets",
|
||||
"put_bucket_acl",
|
||||
# Presigned operations
|
||||
"generate_presigned_url",
|
||||
"generate_presigned_post",
|
||||
# Batch operations
|
||||
"upload_chunked",
|
||||
"upload_batch",
|
||||
"download_batch",
|
||||
# Stats operations
|
||||
"get_bucket_stats",
|
||||
"get_object_info",
|
||||
"estimate_compression",
|
||||
"find_similar_files",
|
||||
]
|
||||
159
src/deltaglider/client_operations/batch.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""Batch upload/download operations for DeltaGlider client.
|
||||
|
||||
This module contains DeltaGlider-specific batch operations:
|
||||
- upload_batch
|
||||
- download_batch
|
||||
- upload_chunked
|
||||
"""
|
||||
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from ..client_models import UploadSummary
|
||||
|
||||
|
||||
def upload_chunked(
|
||||
client: Any, # DeltaGliderClient
|
||||
file_path: str | Path,
|
||||
s3_url: str,
|
||||
chunk_size: int = 5 * 1024 * 1024,
|
||||
progress_callback: Callable[[int, int, int, int], None] | None = None,
|
||||
max_ratio: float = 0.5,
|
||||
) -> UploadSummary:
|
||||
"""Upload a file in chunks with progress callback.
|
||||
|
||||
This method reads the file in chunks to avoid loading large files entirely into memory,
|
||||
making it suitable for uploading very large files. Progress is reported after each chunk.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
file_path: Local file to upload
|
||||
s3_url: S3 destination URL (s3://bucket/path/filename)
|
||||
chunk_size: Size of each chunk in bytes (default 5MB)
|
||||
progress_callback: Callback(chunk_number, total_chunks, bytes_sent, total_bytes)
|
||||
max_ratio: Maximum acceptable delta/file ratio for compression
|
||||
|
||||
Returns:
|
||||
UploadSummary with compression statistics
|
||||
|
||||
Example:
|
||||
def on_progress(chunk_num, total_chunks, bytes_sent, total_bytes):
|
||||
percent = (bytes_sent / total_bytes) * 100
|
||||
print(f"Upload progress: {percent:.1f}%")
|
||||
|
||||
client.upload_chunked(
|
||||
"large_file.zip",
|
||||
"s3://bucket/releases/large_file.zip",
|
||||
chunk_size=10 * 1024 * 1024, # 10MB chunks
|
||||
progress_callback=on_progress
|
||||
)
|
||||
"""
|
||||
file_path = Path(file_path)
|
||||
file_size = file_path.stat().st_size
|
||||
|
||||
# For small files, just use regular upload
|
||||
if file_size <= chunk_size:
|
||||
if progress_callback:
|
||||
progress_callback(1, 1, file_size, file_size)
|
||||
result: UploadSummary = client.upload(file_path, s3_url, max_ratio=max_ratio)
|
||||
return result
|
||||
|
||||
# Calculate chunks
|
||||
total_chunks = (file_size + chunk_size - 1) // chunk_size
|
||||
|
||||
# No temporary file is created here: for now the file is read sequentially only
# to report progress in chunks before handing off to the regular upload path.
# Future enhancement: implement true streaming upload in storage adapter
|
||||
bytes_read = 0
|
||||
|
||||
with open(file_path, "rb") as f:
|
||||
for chunk_num in range(1, total_chunks + 1):
|
||||
# Read chunk (simulated for progress reporting)
|
||||
chunk_data = f.read(chunk_size)
|
||||
bytes_read += len(chunk_data)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(chunk_num, total_chunks, bytes_read, file_size)
|
||||
|
||||
# Perform the actual upload
|
||||
# TODO: When storage adapter supports streaming, pass chunks directly
|
||||
upload_result: UploadSummary = client.upload(file_path, s3_url, max_ratio=max_ratio)
|
||||
|
||||
# Final progress callback
|
||||
if progress_callback:
|
||||
progress_callback(total_chunks, total_chunks, file_size, file_size)
|
||||
|
||||
return upload_result
|
||||
|
||||
|
||||
def upload_batch(
|
||||
client: Any, # DeltaGliderClient
|
||||
files: list[str | Path],
|
||||
s3_prefix: str,
|
||||
max_ratio: float = 0.5,
|
||||
progress_callback: Callable[[str, int, int], None] | None = None,
|
||||
) -> list[UploadSummary]:
|
||||
"""Upload multiple files in batch.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
files: List of local file paths
|
||||
s3_prefix: S3 destination prefix (s3://bucket/prefix/)
|
||||
max_ratio: Maximum acceptable delta/file ratio
|
||||
progress_callback: Callback(filename, current_file_index, total_files)
|
||||
|
||||
Returns:
|
||||
List of UploadSummary objects
|
||||
"""
|
||||
results = []
|
||||
|
||||
for i, file_path in enumerate(files):
|
||||
file_path = Path(file_path)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(file_path.name, i + 1, len(files))
|
||||
|
||||
# Upload each file
|
||||
s3_url = f"{s3_prefix.rstrip('/')}/{file_path.name}"
|
||||
summary = client.upload(file_path, s3_url, max_ratio=max_ratio)
|
||||
results.append(summary)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def download_batch(
|
||||
client: Any, # DeltaGliderClient
|
||||
s3_urls: list[str],
|
||||
output_dir: str | Path,
|
||||
progress_callback: Callable[[str, int, int], None] | None = None,
|
||||
) -> list[Path]:
|
||||
"""Download multiple files in batch.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
s3_urls: List of S3 URLs to download
|
||||
output_dir: Local directory to save files
|
||||
progress_callback: Callback(filename, current_file_index, total_files)
|
||||
|
||||
Returns:
|
||||
List of downloaded file paths
|
||||
"""
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
results = []
|
||||
|
||||
for i, s3_url in enumerate(s3_urls):
|
||||
# Extract filename from URL
|
||||
filename = s3_url.split("/")[-1]
|
||||
if filename.endswith(".delta"):
|
||||
filename = filename[:-6] # Remove .delta suffix
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(filename, i + 1, len(s3_urls))
|
||||
|
||||
output_path = output_dir / filename
|
||||
client.download(s3_url, output_path)
|
||||
results.append(output_path)
|
||||
|
||||
return results
|
||||
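# Usage sketch (not part of the original module): driving the batch helpers with a
# simple progress callback. Assumes `client` is a DeltaGliderClient; file names,
# bucket and prefix are placeholders.
def _example_batch(client: Any) -> None:
    def report(name: str, index: int, total: int) -> None:
        print(f"[{index}/{total}] {name}")

    # Upload several artifacts under one prefix, then fetch one back.
    upload_batch(client, ["app-1.0.0.zip", "app-1.0.1.zip"], "s3://releases/v1/", progress_callback=report)
    download_batch(client, ["s3://releases/v1/app-1.0.1.zip"], "restore/", progress_callback=report)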
275
src/deltaglider/client_operations/bucket.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""Bucket management operations for DeltaGlider client.
|
||||
|
||||
This module contains boto3-compatible bucket operations:
|
||||
- create_bucket
|
||||
- delete_bucket
|
||||
- list_buckets
|
||||
- put_bucket_acl
|
||||
- get_bucket_acl
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def create_bucket(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
CreateBucketConfiguration: dict[str, str] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Create an S3 bucket (boto3-compatible).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name to create
|
||||
CreateBucketConfiguration: Optional bucket configuration (e.g., LocationConstraint)
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with bucket location
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.create_bucket(Bucket='my-bucket')
|
||||
>>> # With region
|
||||
>>> client.create_bucket(
|
||||
... Bucket='my-bucket',
|
||||
... CreateBucketConfiguration={'LocationConstraint': 'us-west-2'}
|
||||
... )
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
params: dict[str, Any] = {"Bucket": Bucket}
|
||||
if CreateBucketConfiguration:
|
||||
params["CreateBucketConfiguration"] = CreateBucketConfiguration
|
||||
|
||||
response = storage_adapter.client.create_bucket(**params)
|
||||
return {
|
||||
"Location": response.get("Location", f"/{Bucket}"),
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "BucketAlreadyExists" in error_msg or "BucketAlreadyOwnedByYou" in error_msg:
|
||||
# Bucket already exists - return success
|
||||
client.service.logger.debug(f"Bucket {Bucket} already exists")
|
||||
return {
|
||||
"Location": f"/{Bucket}",
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
raise RuntimeError(f"Failed to create bucket: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket creation")
|
||||
|
||||
|
||||
def delete_bucket(
|
||||
client: Any, # DeltaGliderClient
|
||||
Bucket: str,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Delete an S3 bucket (boto3-compatible).
|
||||
|
||||
Note: Bucket must be empty before deletion.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name to delete
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with deletion status
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.delete_bucket(Bucket='my-bucket')
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
storage_adapter.client.delete_bucket(Bucket=Bucket)
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 204,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "NoSuchBucket" in error_msg:
|
||||
# Bucket doesn't exist - return success
|
||||
client.service.logger.debug(f"Bucket {Bucket} does not exist")
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 204,
|
||||
},
|
||||
}
|
||||
raise RuntimeError(f"Failed to delete bucket: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket deletion")
|
||||
|
||||
|
||||
def list_buckets(
|
||||
client: Any, # DeltaGliderClient
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""List all S3 buckets (boto3-compatible).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with bucket list
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> response = client.list_buckets()
|
||||
>>> for bucket in response['Buckets']:
|
||||
... print(bucket['Name'])
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
raw_response = storage_adapter.client.list_buckets()
|
||||
|
||||
buckets: list[dict[str, Any]] = []
|
||||
for bucket_entry in raw_response.get("Buckets", []):
|
||||
bucket_data = dict(bucket_entry)
|
||||
name = bucket_data.get("Name")
|
||||
if isinstance(name, str) and name:
|
||||
cached_stats, cached_mode = client._get_cached_bucket_stats_for_listing(name)
|
||||
if cached_stats is not None and cached_mode is not None:
|
||||
bucket_data["DeltaGliderStats"] = {
|
||||
"Cached": True,
|
||||
"Mode": cached_mode,
|
||||
"Detailed": cached_mode == "detailed",
|
||||
"ObjectCount": cached_stats.object_count,
|
||||
"TotalSize": cached_stats.total_size,
|
||||
"CompressedSize": cached_stats.compressed_size,
|
||||
"SpaceSaved": cached_stats.space_saved,
|
||||
"AverageCompressionRatio": cached_stats.average_compression_ratio,
|
||||
"DeltaObjects": cached_stats.delta_objects,
|
||||
"DirectObjects": cached_stats.direct_objects,
|
||||
}
|
||||
|
||||
buckets.append(bucket_data)
|
||||
|
||||
return {
|
||||
"Buckets": buckets,
|
||||
"Owner": raw_response.get("Owner", {}),
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to list buckets: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket listing")
|
||||
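# Sketch (not part of the original module): reading the optional DeltaGliderStats
# block that list_buckets attaches when cached stats exist for a bucket.
def _example_list_buckets(client: Any) -> None:
    response = list_buckets(client)
    for bucket in response["Buckets"]:
        stats = bucket.get("DeltaGliderStats")
        if stats:
            print(f"{bucket['Name']}: saved {stats['SpaceSaved']:,} bytes ({stats['Mode']} stats)")
        else:
            print(f"{bucket['Name']}: no cached stats yet")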
|
||||
|
||||
def put_bucket_acl(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
ACL: str | None = None,
|
||||
AccessControlPolicy: dict[str, Any] | None = None,
|
||||
GrantFullControl: str | None = None,
|
||||
GrantRead: str | None = None,
|
||||
GrantReadACP: str | None = None,
|
||||
GrantWrite: str | None = None,
|
||||
GrantWriteACP: str | None = None,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Set the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name
|
||||
ACL: Canned ACL (private, public-read, public-read-write, authenticated-read)
|
||||
AccessControlPolicy: Full ACL policy dict
|
||||
GrantFullControl: Grants full control to the grantee
|
||||
GrantRead: Allows grantee to list objects in the bucket
|
||||
GrantReadACP: Allows grantee to read the bucket ACL
|
||||
GrantWrite: Allows grantee to create objects in the bucket
|
||||
GrantWriteACP: Allows grantee to write the ACL for the bucket
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with status
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> client.put_bucket_acl(Bucket='my-bucket', ACL='public-read')
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
params: dict[str, Any] = {"Bucket": Bucket}
|
||||
if ACL is not None:
|
||||
params["ACL"] = ACL
|
||||
if AccessControlPolicy is not None:
|
||||
params["AccessControlPolicy"] = AccessControlPolicy
|
||||
if GrantFullControl is not None:
|
||||
params["GrantFullControl"] = GrantFullControl
|
||||
if GrantRead is not None:
|
||||
params["GrantRead"] = GrantRead
|
||||
if GrantReadACP is not None:
|
||||
params["GrantReadACP"] = GrantReadACP
|
||||
if GrantWrite is not None:
|
||||
params["GrantWrite"] = GrantWrite
|
||||
if GrantWriteACP is not None:
|
||||
params["GrantWriteACP"] = GrantWriteACP
|
||||
|
||||
storage_adapter.client.put_bucket_acl(**params)
|
||||
return {
|
||||
"ResponseMetadata": {
|
||||
"HTTPStatusCode": 200,
|
||||
},
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to set bucket ACL: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket ACL operations")
|
||||
|
||||
|
||||
def get_bucket_acl(
|
||||
client: Any, # DeltaGliderClient (avoiding circular import)
|
||||
Bucket: str,
|
||||
**kwargs: Any,
|
||||
) -> dict[str, Any]:
|
||||
"""Get the ACL for an S3 bucket (boto3-compatible passthrough).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: Bucket name
|
||||
**kwargs: Additional S3 parameters (for compatibility)
|
||||
|
||||
Returns:
|
||||
Response dict with Owner and Grants
|
||||
|
||||
Example:
|
||||
>>> client = create_client()
|
||||
>>> response = client.get_bucket_acl(Bucket='my-bucket')
|
||||
>>> print(response['Owner'])
|
||||
>>> print(response['Grants'])
|
||||
"""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
response: dict[str, Any] = storage_adapter.client.get_bucket_acl(Bucket=Bucket)
|
||||
return response
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to get bucket ACL: {e}") from e
|
||||
else:
|
||||
raise NotImplementedError("Storage adapter does not support bucket ACL operations")
|
||||
124
src/deltaglider/client_operations/presigned.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Presigned URL operations for DeltaGlider client.
|
||||
|
||||
This module contains boto3-compatible presigned URL operations:
|
||||
- generate_presigned_url
|
||||
- generate_presigned_post
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def try_boto3_presigned_operation(
|
||||
client: Any, # DeltaGliderClient
|
||||
operation: str,
|
||||
**kwargs: Any,
|
||||
) -> Any | None:
|
||||
"""Try to generate presigned operation using boto3 client, return None if not available."""
|
||||
storage_adapter = client.service.storage
|
||||
|
||||
# Check if storage adapter has boto3 client
|
||||
if hasattr(storage_adapter, "client"):
|
||||
try:
|
||||
if operation == "url":
|
||||
return str(storage_adapter.client.generate_presigned_url(**kwargs))
|
||||
elif operation == "post":
|
||||
return dict(storage_adapter.client.generate_presigned_post(**kwargs))
|
||||
except AttributeError:
|
||||
# boto3 client does not expose the requested presign method
|
||||
pass
|
||||
except Exception as e:
|
||||
# Fall back to manual construction if needed
|
||||
client.service.logger.warning(f"Failed to generate presigned {operation}: {e}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def generate_presigned_url(
|
||||
client: Any, # DeltaGliderClient
|
||||
ClientMethod: str,
|
||||
Params: dict[str, Any],
|
||||
ExpiresIn: int = 3600,
|
||||
) -> str:
|
||||
"""Generate presigned URL (boto3-compatible).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
ClientMethod: Method name ('get_object' or 'put_object')
|
||||
Params: Parameters dict with Bucket and Key
|
||||
ExpiresIn: URL expiration in seconds
|
||||
|
||||
Returns:
|
||||
Presigned URL string
|
||||
"""
|
||||
# Try boto3 first, fallback to manual construction
|
||||
url = try_boto3_presigned_operation(
|
||||
client,
|
||||
"url",
|
||||
ClientMethod=ClientMethod,
|
||||
Params=Params,
|
||||
ExpiresIn=ExpiresIn,
|
||||
)
|
||||
if url is not None:
|
||||
return str(url)
|
||||
|
||||
# Fallback: construct URL manually (less secure, for dev/testing only)
|
||||
bucket = Params.get("Bucket", "")
|
||||
key = Params.get("Key", "")
|
||||
|
||||
if client.endpoint_url:
|
||||
base_url = f"{client.endpoint_url}/{bucket}"
|
||||
else:
|
||||
base_url = f"https://{bucket}.s3.amazonaws.com"
|
||||
|
||||
# Warning: This is not a real presigned URL, just a placeholder
|
||||
client.service.logger.warning("Using placeholder presigned URL - not suitable for production")
|
||||
return f"{base_url}/{key}?expires={ExpiresIn}"
|
||||
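# Usage sketch (not part of the original module): generating a temporary download
# link. Bucket and key are placeholders; with a boto3-backed storage adapter this
# returns a real signed URL, otherwise the placeholder fallback above is used.
def _example_presigned_url(client: Any) -> str:
    return generate_presigned_url(
        client,
        ClientMethod="get_object",
        Params={"Bucket": "releases", "Key": "v1/app-1.0.0.zip"},
        ExpiresIn=900,  # 15 minutes
    )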
|
||||
|
||||
def generate_presigned_post(
|
||||
client: Any, # DeltaGliderClient
|
||||
Bucket: str,
|
||||
Key: str,
|
||||
Fields: dict[str, str] | None = None,
|
||||
Conditions: list[Any] | None = None,
|
||||
ExpiresIn: int = 3600,
|
||||
) -> dict[str, Any]:
|
||||
"""Generate presigned POST data for HTML forms (boto3-compatible).
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
Bucket: S3 bucket name
|
||||
Key: Object key
|
||||
Fields: Additional fields to include
|
||||
Conditions: Upload conditions
|
||||
ExpiresIn: URL expiration in seconds
|
||||
|
||||
Returns:
|
||||
Dict with 'url' and 'fields' for form submission
|
||||
"""
|
||||
# Try boto3 first, fallback to manual construction
|
||||
response = try_boto3_presigned_operation(
|
||||
client,
|
||||
"post",
|
||||
Bucket=Bucket,
|
||||
Key=Key,
|
||||
Fields=Fields,
|
||||
Conditions=Conditions,
|
||||
ExpiresIn=ExpiresIn,
|
||||
)
|
||||
if response is not None:
|
||||
return dict(response)
|
||||
|
||||
# Fallback: return minimal structure for compatibility
|
||||
if client.endpoint_url:
|
||||
url = f"{client.endpoint_url}/{Bucket}"
|
||||
else:
|
||||
url = f"https://{Bucket}.s3.amazonaws.com"
|
||||
|
||||
return {
|
||||
"url": url,
|
||||
"fields": {
|
||||
"key": Key,
|
||||
**(Fields or {}),
|
||||
},
|
||||
}
|
||||
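# Sketch (not part of the original module): submitting a browser-style upload with
# the returned form fields. Assumes the third-party `requests` package is installed;
# bucket, key and file name are placeholders.
def _example_presigned_post(client: Any) -> None:
    import requests

    post = generate_presigned_post(client, Bucket="releases", Key="uploads/report.pdf", ExpiresIn=600)
    with open("report.pdf", "rb") as fh:
        requests.post(post["url"], data=post["fields"], files={"file": fh})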
994
src/deltaglider/client_operations/stats.py
Normal file
@@ -0,0 +1,994 @@
|
||||
"""Statistics and analysis operations for DeltaGlider client.
|
||||
|
||||
This module contains DeltaGlider-specific statistics operations:
|
||||
- get_bucket_stats
|
||||
- get_object_info
|
||||
- estimate_compression
|
||||
- find_similar_files
|
||||
"""
|
||||
|
||||
import concurrent.futures
|
||||
import json
|
||||
import re
|
||||
from dataclasses import asdict
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal
|
||||
|
||||
from ..client_models import BucketStats, CompressionEstimate, ObjectInfo
|
||||
from ..core.delta_extensions import is_delta_candidate
|
||||
from ..core.object_listing import list_all_objects
|
||||
from ..core.s3_uri import parse_s3_url
|
||||
|
||||
StatsMode = Literal["quick", "sampled", "detailed"]
|
||||
|
||||
# Cache configuration
|
||||
CACHE_VERSION = "1.0"
|
||||
CACHE_PREFIX = ".deltaglider"
|
||||
|
||||
# Listing limits (prevent runaway scans on gigantic buckets)
|
||||
QUICK_LIST_LIMIT = 60_000
|
||||
SAMPLED_LIST_LIMIT = 30_000
|
||||
|
||||
# ============================================================================
|
||||
# Internal Helper Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _first_metadata_value(metadata: dict[str, Any], *keys: str) -> str | None:
|
||||
"""Return the first non-empty metadata value matching the provided keys."""
|
||||
for key in keys:
|
||||
value = metadata.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _fetch_delta_metadata(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
delta_keys: list[str],
|
||||
max_timeout: int = 600,
|
||||
) -> dict[str, dict[str, Any]]:
|
||||
"""Fetch metadata for delta files in parallel with timeout.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
delta_keys: List of delta file keys
|
||||
max_timeout: Maximum total timeout in seconds (default: 600 = 10 min)
|
||||
|
||||
Returns:
|
||||
Dict mapping delta key -> metadata dict
|
||||
"""
|
||||
metadata_map: dict[str, dict[str, Any]] = {}
|
||||
|
||||
if not delta_keys:
|
||||
return metadata_map
|
||||
|
||||
client.service.logger.info(
|
||||
f"Fetching metadata for {len(delta_keys)} delta files in parallel..."
|
||||
)
|
||||
|
||||
def fetch_single_metadata(key: str) -> tuple[str, dict[str, Any] | None]:
|
||||
try:
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if obj_head and obj_head.metadata:
|
||||
return key, obj_head.metadata
|
||||
except Exception as e:
|
||||
client.service.logger.debug(f"Failed to fetch metadata for {key}: {e}")
|
||||
return key, None
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(delta_keys))) as executor:
|
||||
futures = [executor.submit(fetch_single_metadata, key) for key in delta_keys]
|
||||
|
||||
# Calculate timeout: 60s per file, capped at max_timeout
|
||||
timeout_per_file = 60
|
||||
total_timeout = min(len(delta_keys) * timeout_per_file, max_timeout)
|
||||
|
||||
try:
|
||||
for future in concurrent.futures.as_completed(futures, timeout=total_timeout):
|
||||
try:
|
||||
key, metadata = future.result(timeout=5) # 5s per result
|
||||
if metadata:
|
||||
metadata_map[key] = metadata
|
||||
except concurrent.futures.TimeoutError:
|
||||
client.service.logger.warning("Timeout fetching metadata for a delta file")
|
||||
continue
|
||||
except concurrent.futures.TimeoutError:
|
||||
client.service.logger.warning(
|
||||
f"_fetch_delta_metadata: Timeout after {total_timeout}s. "
|
||||
f"Fetched {len(metadata_map)}/{len(delta_keys)} metadata entries. "
|
||||
f"Continuing with partial metadata..."
|
||||
)
|
||||
# Cancel remaining futures
|
||||
for future in futures:
|
||||
future.cancel()
|
||||
|
||||
return metadata_map
|
||||
|
||||
|
||||
def _extract_deltaspace(key: str) -> str:
|
||||
"""Return the delta space (prefix) for a given object key."""
|
||||
if "/" in key:
|
||||
return key.rsplit("/", 1)[0]
|
||||
return ""
|
||||
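# Examples (illustrative): "releases/v1/app.zip.delta" -> "releases/v1",
# while a top-level key such as "app.zip.delta" maps to the root deltaspace "".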
|
||||
|
||||
def _get_cache_key(mode: StatsMode) -> str:
|
||||
"""Get the S3 key for a cache file based on mode.
|
||||
|
||||
Args:
|
||||
mode: Stats mode (quick, sampled, or detailed)
|
||||
|
||||
Returns:
|
||||
S3 key like ".deltaglider/stats_quick.json"
|
||||
"""
|
||||
return f"{CACHE_PREFIX}/stats_{mode}.json"
|
||||
|
||||
|
||||
def _read_stats_cache(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
mode: StatsMode,
|
||||
) -> tuple[BucketStats | None, dict[str, Any] | None]:
|
||||
"""Read cached stats from S3 if available.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
mode: Stats mode to read cache for
|
||||
|
||||
Returns:
|
||||
Tuple of (BucketStats | None, validation_data | None)
|
||||
Returns (None, None) if cache doesn't exist or is invalid
|
||||
"""
|
||||
cache_key = _get_cache_key(mode)
|
||||
|
||||
try:
|
||||
# Try to read cache file from S3
|
||||
obj = client.service.storage.get(f"{bucket}/{cache_key}")
|
||||
if not obj or not obj.data:
|
||||
return None, None
|
||||
|
||||
# Parse JSON
|
||||
cache_data = json.loads(obj.data.decode("utf-8"))
|
||||
|
||||
# Validate version
|
||||
if cache_data.get("version") != CACHE_VERSION:
|
||||
client.service.logger.warning(
|
||||
f"Cache version mismatch: expected {CACHE_VERSION}, got {cache_data.get('version')}"
|
||||
)
|
||||
return None, None
|
||||
|
||||
# Validate mode
|
||||
if cache_data.get("mode") != mode:
|
||||
client.service.logger.warning(
|
||||
f"Cache mode mismatch: expected {mode}, got {cache_data.get('mode')}"
|
||||
)
|
||||
return None, None
|
||||
|
||||
# Extract stats and validation data
|
||||
stats_dict = cache_data.get("stats")
|
||||
validation_data = cache_data.get("validation")
|
||||
|
||||
if not stats_dict or not validation_data:
|
||||
client.service.logger.warning("Cache missing stats or validation data")
|
||||
return None, None
|
||||
|
||||
# Reconstruct BucketStats from dict
|
||||
stats = BucketStats(**stats_dict)
|
||||
|
||||
client.service.logger.debug(
|
||||
f"Successfully read cache for {bucket} (mode={mode}, "
|
||||
f"computed_at={cache_data.get('computed_at')})"
|
||||
)
|
||||
|
||||
return stats, validation_data
|
||||
|
||||
except FileNotFoundError:
|
||||
# Cache doesn't exist yet - this is normal
|
||||
client.service.logger.debug(f"No cache found for {bucket} (mode={mode})")
|
||||
return None, None
|
||||
except json.JSONDecodeError as e:
|
||||
client.service.logger.warning(f"Invalid JSON in cache file: {e}")
|
||||
return None, None
|
||||
except Exception as e:
|
||||
client.service.logger.warning(f"Error reading cache: {e}")
|
||||
return None, None
|
||||
|
||||
|
||||
def _write_stats_cache(
|
||||
client: Any,
|
||||
bucket: str,
|
||||
mode: StatsMode,
|
||||
stats: BucketStats,
|
||||
object_count: int,
|
||||
compressed_size: int,
|
||||
) -> None:
|
||||
"""Write computed stats to S3 cache.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
mode: Stats mode being cached
|
||||
stats: Computed BucketStats to cache
|
||||
object_count: Current object count (for validation)
|
||||
compressed_size: Current compressed size (for validation)
|
||||
"""
|
||||
cache_key = _get_cache_key(mode)
|
||||
|
||||
try:
|
||||
# Build cache structure
|
||||
cache_data = {
|
||||
"version": CACHE_VERSION,
|
||||
"mode": mode,
|
||||
"computed_at": datetime.now(UTC).isoformat(),
|
||||
"validation": {
|
||||
"object_count": object_count,
|
||||
"compressed_size": compressed_size,
|
||||
},
|
||||
"stats": asdict(stats),
|
||||
}
|
||||
|
||||
# Serialize to JSON
|
||||
cache_json = json.dumps(cache_data, indent=2)
|
||||
|
||||
# Write to S3
|
||||
client.service.storage.put(
|
||||
address=f"{bucket}/{cache_key}",
|
||||
data=cache_json.encode("utf-8"),
|
||||
metadata={
|
||||
"content-type": "application/json",
|
||||
"x-deltaglider-cache": "true",
|
||||
},
|
||||
)
|
||||
|
||||
client.service.logger.info(
|
||||
f"Wrote cache for {bucket} (mode={mode}, {len(cache_json)} bytes)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# Log warning but don't fail - caching is optional
|
||||
client.service.logger.warning(f"Failed to write cache (non-fatal): {e}")
|
||||
|
||||
|
||||
def _is_cache_valid(
|
||||
cached_validation: dict[str, Any],
|
||||
current_object_count: int,
|
||||
current_compressed_size: int,
|
||||
) -> bool:
|
||||
"""Check if cached stats are still valid based on bucket state.
|
||||
|
||||
Validation strategy: Compare object count and total compressed size.
|
||||
If either changed, the cache is stale.
|
||||
|
||||
Args:
|
||||
cached_validation: Validation data from cache
|
||||
current_object_count: Current object count from LIST
|
||||
current_compressed_size: Current compressed size from LIST
|
||||
|
||||
Returns:
|
||||
True if cache is still valid, False if stale
|
||||
"""
|
||||
cached_count = cached_validation.get("object_count")
|
||||
cached_size = cached_validation.get("compressed_size")
|
||||
|
||||
if cached_count != current_object_count:
|
||||
return False
|
||||
|
||||
if cached_size != current_compressed_size:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _build_object_info_list(
|
||||
raw_objects: list[dict[str, Any]],
|
||||
metadata_map: dict[str, dict[str, Any]],
|
||||
logger: Any,
|
||||
sampled_space_metadata: dict[str, dict[str, Any]] | None = None,
|
||||
) -> list[ObjectInfo]:
|
||||
"""Build ObjectInfo list from raw objects and metadata.
|
||||
|
||||
Args:
|
||||
raw_objects: List of raw object dicts from S3 LIST
|
||||
metadata_map: Dict of key -> metadata for delta files
|
||||
logger: Logger instance
|
||||
|
||||
Returns:
|
||||
List of ObjectInfo objects
|
||||
"""
|
||||
all_objects = []
|
||||
|
||||
for obj_dict in raw_objects:
|
||||
key = obj_dict["key"]
|
||||
size = obj_dict["size"]
|
||||
is_delta = key.endswith(".delta")
|
||||
|
||||
deltaspace = _extract_deltaspace(key)
|
||||
|
||||
# Get metadata from map (empty dict if not present)
|
||||
metadata = metadata_map.get(key)
|
||||
if metadata is None and sampled_space_metadata and deltaspace in sampled_space_metadata:
|
||||
metadata = sampled_space_metadata[deltaspace]
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
|
||||
# Parse compression ratio and original size
|
||||
compression_ratio = 0.0
|
||||
# For delta files without metadata, set original_size to None to indicate unknown
|
||||
# This prevents nonsensical stats like "693 bytes compressed to 82MB"
|
||||
original_size = None if is_delta else size
|
||||
|
||||
if is_delta and metadata:
|
||||
try:
|
||||
ratio_str = metadata.get("compression_ratio", "0.0")
|
||||
compression_ratio = float(ratio_str) if ratio_str != "unknown" else 0.0
|
||||
except (ValueError, TypeError):
|
||||
compression_ratio = 0.0
|
||||
|
||||
try:
|
||||
original_size_raw = _first_metadata_value(
|
||||
metadata,
|
||||
"dg-file-size",
|
||||
"dg_file_size",
|
||||
"file_size",
|
||||
"file-size",
|
||||
"deltaglider-original-size",
|
||||
)
|
||||
if original_size_raw is not None:
|
||||
original_size = int(original_size_raw)
|
||||
logger.debug(f"Delta {key}: using original_size={original_size} from metadata")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Delta {key}: metadata missing file size. Available keys: {list(metadata.keys())}. Using None as original_size (unknown)"
|
||||
)
|
||||
original_size = None
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.warning(
|
||||
f"Delta {key}: failed to parse file size from metadata: {e}. Using None as original_size (unknown)"
|
||||
)
|
||||
original_size = None
|
||||
|
||||
all_objects.append(
|
||||
ObjectInfo(
|
||||
key=key,
|
||||
size=size,
|
||||
last_modified=obj_dict.get("last_modified", ""),
|
||||
etag=obj_dict.get("etag"),
|
||||
storage_class=obj_dict.get("storage_class", "STANDARD"),
|
||||
original_size=original_size,
|
||||
compressed_size=size,
|
||||
is_delta=is_delta,
|
||||
compression_ratio=compression_ratio,
|
||||
reference_key=_first_metadata_value(
|
||||
metadata,
|
||||
"dg-ref-key",
|
||||
"dg_ref_key",
|
||||
"ref_key",
|
||||
"ref-key",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
return all_objects
|
||||
|
||||
|
||||
def _calculate_bucket_statistics(
|
||||
all_objects: list[ObjectInfo],
|
||||
bucket: str,
|
||||
logger: Any,
|
||||
mode: StatsMode = "quick",
|
||||
) -> BucketStats:
|
||||
"""Calculate statistics from ObjectInfo list.
|
||||
|
||||
Args:
|
||||
all_objects: List of ObjectInfo objects
|
||||
bucket: Bucket name for stats
|
||||
logger: Logger instance
|
||||
mode: Stats mode (quick, sampled, or detailed) - controls warning behavior
|
||||
|
||||
Returns:
|
||||
BucketStats object
|
||||
"""
|
||||
total_original_size = 0
|
||||
total_compressed_size = 0
|
||||
delta_count = 0
|
||||
direct_count = 0
|
||||
reference_files = {} # deltaspace -> size
|
||||
|
||||
# First pass: identify object types and reference files
|
||||
for obj in all_objects:
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
deltaspace = obj.key.rsplit("/reference.bin", 1)[0] if "/" in obj.key else ""
|
||||
reference_files[deltaspace] = obj.size
|
||||
elif obj.is_delta:
|
||||
delta_count += 1
|
||||
else:
|
||||
direct_count += 1
|
||||
|
||||
# Second pass: calculate sizes
|
||||
for obj in all_objects:
|
||||
# Skip reference.bin (handled separately)
|
||||
if obj.key.endswith("/reference.bin") or obj.key == "reference.bin":
|
||||
continue
|
||||
|
||||
if obj.is_delta:
|
||||
# Delta: use original_size if available
|
||||
if obj.original_size is not None:
|
||||
logger.debug(f"Delta {obj.key}: using original_size={obj.original_size}")
|
||||
total_original_size += obj.original_size
|
||||
else:
|
||||
# original_size is None - metadata not available
|
||||
# In quick mode, this is expected (no HEAD requests)
|
||||
# In sampled/detailed mode, this means metadata is genuinely missing
|
||||
if mode != "quick":
|
||||
logger.warning(
|
||||
f"Delta {obj.key}: no original_size metadata available. "
|
||||
f"Cannot calculate original size without metadata. "
|
||||
f"Use --detailed mode for accurate stats."
|
||||
)
|
||||
# Don't add anything to total_original_size for deltas without metadata
|
||||
# This prevents nonsensical stats
|
||||
total_compressed_size += obj.size
|
||||
else:
|
||||
# Direct files: original = compressed
|
||||
total_original_size += obj.size
|
||||
total_compressed_size += obj.size
|
||||
|
||||
# Handle reference.bin files
|
||||
total_reference_size = sum(reference_files.values())
|
||||
|
||||
if delta_count > 0 and total_reference_size > 0:
|
||||
total_compressed_size += total_reference_size
|
||||
logger.info(
|
||||
f"Including {len(reference_files)} reference.bin file(s) "
|
||||
f"({total_reference_size:,} bytes) in compressed size"
|
||||
)
|
||||
elif delta_count == 0 and total_reference_size > 0:
|
||||
_log_orphaned_references(bucket, reference_files, total_reference_size, logger)
|
||||
|
||||
# Calculate final metrics
|
||||
# If we couldn't calculate original size (quick mode with deltas), set space_saved to 0
|
||||
# to avoid nonsensical negative numbers
|
||||
if total_original_size == 0 and total_compressed_size > 0:
|
||||
space_saved = 0
|
||||
avg_ratio = 0.0
|
||||
else:
|
||||
raw_space_saved = total_original_size - total_compressed_size
|
||||
space_saved = raw_space_saved if raw_space_saved > 0 else 0
|
||||
avg_ratio = (space_saved / total_original_size) if total_original_size > 0 else 0.0
|
||||
if avg_ratio < 0:
|
||||
avg_ratio = 0.0
|
||||
elif avg_ratio > 1:
|
||||
avg_ratio = 1.0
|
||||
|
||||
# Warn if quick mode with delta files (stats will be incomplete)
|
||||
if mode == "quick" and delta_count > 0 and total_original_size == 0:
|
||||
logger.warning(
|
||||
f"Quick mode cannot calculate original size for delta files (no metadata fetched). "
|
||||
f"Stats show {delta_count} delta file(s) with unknown original size. "
|
||||
f"Use --detailed for accurate compression metrics."
|
||||
)
|
||||
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=delta_count + direct_count,
|
||||
total_size=total_original_size,
|
||||
compressed_size=total_compressed_size,
|
||||
space_saved=space_saved,
|
||||
average_compression_ratio=avg_ratio,
|
||||
delta_objects=delta_count,
|
||||
direct_objects=direct_count,
|
||||
)
|
||||
|
||||
|
||||
def _log_orphaned_references(
|
||||
bucket: str,
|
||||
reference_files: dict[str, int],
|
||||
total_reference_size: int,
|
||||
logger: Any,
|
||||
) -> None:
|
||||
"""Log warning about orphaned reference.bin files.
|
||||
|
||||
Args:
|
||||
bucket: Bucket name
|
||||
reference_files: Dict of deltaspace -> size
|
||||
total_reference_size: Total size of all reference files
|
||||
logger: Logger instance
|
||||
"""
|
||||
waste_mb = total_reference_size / 1024 / 1024
|
||||
logger.warning(
|
||||
f"\n{'=' * 60}\n"
|
||||
f"WARNING: ORPHANED REFERENCE FILE(S) DETECTED!\n"
|
||||
f"{'=' * 60}\n"
|
||||
f"Found {len(reference_files)} reference.bin file(s) totaling "
|
||||
f"{total_reference_size:,} bytes ({waste_mb:.2f} MB)\n"
|
||||
f"but NO delta files are using them.\n"
|
||||
f"\n"
|
||||
f"This wastes {waste_mb:.2f} MB of storage!\n"
|
||||
f"\n"
|
||||
f"Orphaned reference files:\n"
|
||||
)
|
||||
|
||||
for deltaspace, size in reference_files.items():
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
logger.warning(f" - s3://{bucket}/{path} ({size:,} bytes)")
|
||||
|
||||
logger.warning("\nConsider removing these orphaned files:\n")
|
||||
for deltaspace in reference_files:
|
||||
path = f"{deltaspace}/reference.bin" if deltaspace else "reference.bin"
|
||||
logger.warning(f" aws s3 rm s3://{bucket}/{path}")
|
||||
|
||||
logger.warning(f"{'=' * 60}")
|
||||
|
||||
|
||||
def get_object_info(
|
||||
client: Any, # DeltaGliderClient
|
||||
s3_url: str,
|
||||
) -> ObjectInfo:
|
||||
"""Get detailed object information including compression stats.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
s3_url: S3 URL of the object
|
||||
|
||||
Returns:
|
||||
ObjectInfo with detailed metadata
|
||||
"""
|
||||
address = parse_s3_url(s3_url, allow_empty_key=False)
|
||||
bucket = address.bucket
|
||||
key = address.key
|
||||
|
||||
# Get object metadata
|
||||
obj_head = client.service.storage.head(f"{bucket}/{key}")
|
||||
if not obj_head:
|
||||
raise FileNotFoundError(f"Object not found: {s3_url}")
|
||||
|
||||
metadata = obj_head.metadata
|
||||
is_delta = key.endswith(".delta")
|
||||
|
||||
return ObjectInfo(
|
||||
key=key,
|
||||
size=obj_head.size,
|
||||
last_modified=metadata.get("last_modified", ""),
|
||||
etag=metadata.get("etag"),
|
||||
original_size=int(metadata.get("file_size", obj_head.size)),
|
||||
compressed_size=obj_head.size,
|
||||
compression_ratio=float(metadata.get("compression_ratio", 0.0)),
|
||||
is_delta=is_delta,
|
||||
reference_key=metadata.get("ref_key"),
|
||||
)
|
||||
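# Usage sketch (not part of the original module): inspecting a stored delta. The URL
# is a placeholder; for a .delta object the metadata-derived fields describe the
# original (pre-delta) file.
def _example_object_info(client: Any) -> None:
    info = get_object_info(client, "s3://releases/v1/app-1.0.1.zip.delta")
    print(info.key, info.size, info.original_size, info.compression_ratio, info.is_delta)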
|
||||
|
||||
def get_bucket_stats(
|
||||
client: Any, # DeltaGliderClient
|
||||
bucket: str,
|
||||
mode: StatsMode = "quick",
|
||||
use_cache: bool = True,
|
||||
refresh_cache: bool = False,
|
||||
) -> BucketStats:
|
||||
"""Get statistics for a bucket with configurable metadata strategies and caching.
|
||||
|
||||
Modes:
|
||||
- ``quick`` (default): Stream LIST results only. Compression metrics for delta files are
|
||||
approximate (falls back to delta size when metadata is unavailable).
|
||||
- ``sampled``: Fetch HEAD metadata for a single delta per delta-space and reuse the ratios for
|
||||
other deltas in the same space. Balances accuracy and speed.
|
||||
- ``detailed``: Fetch HEAD metadata for every delta object for the most accurate statistics.
|
||||
|
||||
Caching:
|
||||
- Stats are cached per mode in ``.deltaglider/stats_{mode}.json``
|
||||
- Cache is validated using object count and compressed size from LIST
|
||||
- If bucket changed, cache is recomputed automatically
|
||||
- Use ``refresh_cache=True`` to force recomputation
|
||||
- Use ``use_cache=False`` to skip caching entirely
|
||||
|
||||
**Robustness**: This function is designed to always return valid stats:
|
||||
- Returns partial stats if timeouts or pagination issues occur
|
||||
- Returns empty stats (zeros) if bucket listing completely fails
|
||||
- Never hangs indefinitely (max 10 min timeout, 10M object limit)
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket name
|
||||
mode: Stats mode ("quick", "sampled", or "detailed")
|
||||
use_cache: If True, use cached stats when available (default: True)
|
||||
refresh_cache: If True, force cache recomputation even if valid (default: False)
|
||||
|
||||
Returns:
|
||||
BucketStats with compression and space savings info. Always returns a valid BucketStats
|
||||
object, even if errors occur (will return empty/partial stats with warnings logged).
|
||||
|
||||
Raises:
|
||||
RuntimeError: Only if bucket listing fails immediately with no objects collected.
|
||||
All other errors result in partial/empty stats being returned.
|
||||
|
||||
Performance:
|
||||
- With cache hit: ~50-100ms (LIST + cache read + validation)
|
||||
- quick (no cache): ~50ms for any bucket size (LIST calls only)
|
||||
- sampled (no cache): LIST + one HEAD per delta-space
|
||||
- detailed (no cache): LIST + HEAD for every delta (slowest but accurate)
|
||||
- Max timeout: 10 minutes (prevents indefinite hangs)
|
||||
- Max objects: 10M (prevents infinite loops)
|
||||
|
||||
Example:
|
||||
# Use cached stats (fast, ~100ms)
|
||||
stats = client.get_bucket_stats('releases')
|
||||
|
||||
# Force refresh (slow, recomputes everything)
|
||||
stats = client.get_bucket_stats('releases', refresh_cache=True)
|
||||
|
||||
# Skip cache entirely
|
||||
stats = client.get_bucket_stats('releases', use_cache=False)
|
||||
|
||||
# Different modes with caching
|
||||
stats_sampled = client.get_bucket_stats('releases', mode='sampled')
|
||||
stats_detailed = client.get_bucket_stats('releases', mode='detailed')
|
||||
"""
|
||||
try:
|
||||
if mode not in {"quick", "sampled", "detailed"}:
|
||||
raise ValueError(f"Unknown stats mode: {mode}")
|
||||
|
||||
# Phase 1: Always do a quick LIST to get current state (needed for validation)
|
||||
import time
|
||||
|
||||
phase1_start = time.time()
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Starting LIST operation for bucket '{bucket}'"
|
||||
)
|
||||
|
||||
list_cap = QUICK_LIST_LIMIT if mode == "quick" else SAMPLED_LIST_LIMIT
|
||||
listing = list_all_objects(
|
||||
client.service.storage,
|
||||
bucket=bucket,
|
||||
max_keys=1000,
|
||||
logger=client.service.logger,
|
||||
max_objects=list_cap,
|
||||
)
|
||||
raw_objects = listing.objects
|
||||
|
||||
# Calculate validation metrics from LIST
|
||||
current_object_count = len(raw_objects)
|
||||
current_compressed_size = sum(obj["size"] for obj in raw_objects)
|
||||
limit_reached = listing.limit_reached or listing.is_truncated
|
||||
if limit_reached:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: Listing capped at {list_cap} objects (bucket likely larger)."
|
||||
)
|
||||
|
||||
phase1_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 1: LIST completed in {phase1_duration:.2f}s - "
|
||||
f"Found {current_object_count} objects, {current_compressed_size:,} bytes total"
|
||||
)
|
||||
|
||||
# Phase 2: Try to use cache if enabled and not forcing refresh
|
||||
phase2_start = time.time()
|
||||
if use_cache and not refresh_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Checking cache for mode '{mode}'"
|
||||
)
|
||||
cached_stats, cached_validation = _read_stats_cache(client, bucket, mode)
|
||||
|
||||
if cached_stats and cached_validation:
|
||||
# Validate cache against current bucket state
|
||||
if _is_cache_valid(
|
||||
cached_validation, current_object_count, current_compressed_size
|
||||
):
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache HIT in {phase2_duration:.2f}s - "
|
||||
f"Using cached stats for {bucket} (mode={mode}, bucket unchanged)"
|
||||
)
|
||||
return cached_stats
|
||||
else:
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache INVALID in {phase2_duration:.2f}s - "
|
||||
f"Bucket changed: count {cached_validation.get('object_count')} → {current_object_count}, "
|
||||
f"size {cached_validation.get('compressed_size')} → {current_compressed_size}"
|
||||
)
|
||||
else:
|
||||
phase2_duration = time.time() - phase2_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache MISS in {phase2_duration:.2f}s - "
|
||||
f"No valid cache found"
|
||||
)
|
||||
else:
|
||||
if refresh_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache SKIPPED (refresh requested)"
|
||||
)
|
||||
elif not use_cache:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 2: Cache DISABLED"
|
||||
)
|
||||
|
||||
# Phase 3: Cache miss or invalid - compute stats from scratch
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 3: Computing stats (mode={mode})"
|
||||
)
|
||||
|
||||
# Phase 4: Extract delta keys for metadata fetching
|
||||
phase4_start = time.time()
|
||||
delta_keys = [obj["key"] for obj in raw_objects if obj["key"].endswith(".delta")]
|
||||
phase4_duration = time.time() - phase4_start
|
||||
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 4: Delta extraction completed in {phase4_duration:.3f}s - "
|
||||
f"Found {len(delta_keys)} delta files"
|
||||
)
|
||||
|
||||
# Phase 5: Fetch metadata for delta files based on mode
|
||||
phase5_start = time.time()
|
||||
metadata_map: dict[str, dict[str, Any]] = {}
|
||||
sampled_space_metadata: dict[str, dict[str, Any]] | None = None
|
||||
|
||||
if delta_keys:
|
||||
if mode == "detailed":
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Fetching metadata for ALL {len(delta_keys)} delta files"
|
||||
)
|
||||
metadata_map = _fetch_delta_metadata(client, bucket, delta_keys)
|
||||
|
||||
elif mode == "sampled":
|
||||
# Sample one delta per deltaspace
|
||||
seen_spaces: set[str] = set()
|
||||
sampled_keys: list[str] = []
|
||||
for key in delta_keys:
|
||||
space = _extract_deltaspace(key)
|
||||
if space not in seen_spaces:
|
||||
seen_spaces.add(space)
|
||||
sampled_keys.append(key)
|
||||
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Sampling {len(sampled_keys)} delta files "
|
||||
f"(one per deltaspace) out of {len(delta_keys)} total delta files"
|
||||
)
|
||||
|
||||
# Log which files are being sampled
|
||||
if sampled_keys:
|
||||
for idx, key in enumerate(sampled_keys[:10], 1): # Show first 10
|
||||
space = _extract_deltaspace(key)
|
||||
client.service.logger.info(
|
||||
f" [{idx}] Sampling: {key} (deltaspace: '{space or '(root)'}')"
|
||||
)
|
||||
if len(sampled_keys) > 10:
|
||||
client.service.logger.info(f" ... and {len(sampled_keys) - 10} more")
|
||||
|
||||
if sampled_keys:
|
||||
metadata_map = _fetch_delta_metadata(client, bucket, sampled_keys)
|
||||
sampled_space_metadata = {
|
||||
_extract_deltaspace(k): metadata for k, metadata in metadata_map.items()
|
||||
}
|
||||
|
||||
phase5_duration = time.time() - phase5_start
|
||||
if mode == "quick":
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Skipped metadata fetching (quick mode) in {phase5_duration:.3f}s"
|
||||
)
|
||||
else:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 5: Metadata fetching completed in {phase5_duration:.2f}s - "
|
||||
f"Fetched {len(metadata_map)} metadata records"
|
||||
)
|
||||
|
||||
# Phase 6: Build ObjectInfo list
|
||||
phase6_start = time.time()
|
||||
all_objects = _build_object_info_list(
|
||||
raw_objects,
|
||||
metadata_map,
|
||||
client.service.logger,
|
||||
sampled_space_metadata,
|
||||
)
|
||||
phase6_duration = time.time() - phase6_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 6: ObjectInfo list built in {phase6_duration:.3f}s - "
|
||||
f"{len(all_objects)} objects processed"
|
||||
)
|
||||
|
||||
# Phase 7: Calculate final statistics
|
||||
phase7_start = time.time()
|
||||
stats = _calculate_bucket_statistics(all_objects, bucket, client.service.logger, mode)
|
||||
phase7_duration = time.time() - phase7_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 7: Statistics calculated in {phase7_duration:.3f}s - "
|
||||
f"{stats.delta_objects} delta, {stats.direct_objects} direct objects"
|
||||
)
|
||||
|
||||
# Phase 8: Write cache if enabled
|
||||
phase8_start = time.time()
|
||||
if use_cache:
|
||||
_write_stats_cache(
|
||||
client=client,
|
||||
bucket=bucket,
|
||||
mode=mode,
|
||||
stats=stats,
|
||||
object_count=current_object_count,
|
||||
compressed_size=current_compressed_size,
|
||||
)
|
||||
phase8_duration = time.time() - phase8_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 8: Cache written in {phase8_duration:.3f}s"
|
||||
)
|
||||
else:
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] Phase 8: Cache write skipped (caching disabled)"
|
||||
)
|
||||
|
||||
# Summary
|
||||
total_duration = time.time() - phase1_start
|
||||
client.service.logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] COMPLETE: Total time {total_duration:.2f}s for bucket '{bucket}' (mode={mode})"
|
||||
)
|
||||
|
||||
stats.object_limit_reached = limit_reached
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
# Last resort: return empty stats with error indication
|
||||
client.service.logger.error(
|
||||
f"get_bucket_stats: Failed to build statistics for '{bucket}': {e}. "
|
||||
f"Returning empty stats."
|
||||
)
|
||||
return BucketStats(
|
||||
bucket=bucket,
|
||||
object_count=0,
|
||||
total_size=0,
|
||||
compressed_size=0,
|
||||
space_saved=0,
|
||||
average_compression_ratio=0.0,
|
||||
delta_objects=0,
|
||||
direct_objects=0,
|
||||
object_limit_reached=False,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Public API Functions
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def estimate_compression(
|
||||
client: Any, # DeltaGliderClient
|
||||
file_path: str | Path,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
sample_size: int = 1024 * 1024,
|
||||
) -> CompressionEstimate:
|
||||
"""Estimate compression ratio before upload.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
file_path: Local file to estimate
|
||||
bucket: Target bucket
|
||||
prefix: Target prefix (for finding similar files)
|
||||
sample_size: Bytes to sample for estimation (default 1MB)
|
||||
|
||||
Returns:
|
||||
CompressionEstimate with predicted compression
|
||||
"""
|
||||
file_path = Path(file_path)
|
||||
file_size = file_path.stat().st_size
|
||||
|
||||
filename = file_path.name
|
||||
ext = file_path.suffix.lower()
|
||||
|
||||
# Already compressed formats that won't benefit from delta
|
||||
incompressible = {".jpg", ".jpeg", ".png", ".mp4", ".mp3", ".avi", ".mov"}
|
||||
|
||||
if ext in incompressible:
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
estimated_compressed_size=file_size,
|
||||
estimated_ratio=0.0,
|
||||
confidence=0.95,
|
||||
should_use_delta=False,
|
||||
)
|
||||
|
||||
if not is_delta_candidate(filename):
|
||||
# Unknown type, conservative estimate
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
estimated_compressed_size=file_size,
|
||||
estimated_ratio=0.0,
|
||||
confidence=0.5,
|
||||
should_use_delta=file_size > 1024 * 1024, # Only for files > 1MB
|
||||
)
|
||||
|
||||
# Look for similar files in the target location
|
||||
similar_files = find_similar_files(client, bucket, prefix, file_path.name)
|
||||
|
||||
if similar_files:
|
||||
# If we have similar files, estimate high compression
|
||||
estimated_ratio = 0.99 # 99% compression typical for similar versions
|
||||
confidence = 0.9
|
||||
recommended_ref = similar_files[0]["Key"] if similar_files else None
|
||||
else:
|
||||
# First file of its type
|
||||
estimated_ratio = 0.0
|
||||
confidence = 0.7
|
||||
recommended_ref = None
|
||||
|
||||
estimated_size = int(file_size * (1 - estimated_ratio))
|
||||
|
||||
return CompressionEstimate(
|
||||
original_size=file_size,
|
||||
estimated_compressed_size=estimated_size,
|
||||
estimated_ratio=estimated_ratio,
|
||||
confidence=confidence,
|
||||
recommended_reference=recommended_ref,
|
||||
should_use_delta=True,
|
||||
)
|
||||
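
A minimal sketch of how the estimate above might drive an upload decision. The `client` object and the 0.5 threshold are illustrative assumptions, not part of the library, and the file must exist locally because the helper stats it.

# Illustrative only: "client" stands in for an existing DeltaGliderClient.
estimate = estimate_compression(client, "dist/app-v1.2.3.zip", bucket="releases", prefix="builds/")
if estimate.should_use_delta and estimate.estimated_ratio > 0.5:
    # Similar versions already live under the prefix, so a delta is expected to pay off.
    print(f"Expect ~{estimate.estimated_compressed_size} of {estimate.original_size} bytes stored")
else:
    # First archive of its kind, or already-compressed media: plain upload.
    print(f"Low expected savings (confidence {estimate.confidence:.0%})")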
|
||||
|
||||
def find_similar_files(
|
||||
client: Any, # DeltaGliderClient
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
filename: str,
|
||||
limit: int = 5,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Find similar files that could serve as references.
|
||||
|
||||
Args:
|
||||
client: DeltaGliderClient instance
|
||||
bucket: S3 bucket
|
||||
prefix: Prefix to search in
|
||||
filename: Filename to match against
|
||||
limit: Maximum number of results
|
||||
|
||||
Returns:
|
||||
List of similar files with scores
|
||||
"""
|
||||
# List objects in the prefix (no metadata needed for similarity check)
|
||||
response = client.list_objects(
|
||||
Bucket=bucket,
|
||||
Prefix=prefix,
|
||||
MaxKeys=1000,
|
||||
FetchMetadata=False, # Don't need metadata for similarity
|
||||
)
|
||||
|
||||
similar: list[dict[str, Any]] = []
|
||||
base_name = Path(filename).stem
|
||||
ext = Path(filename).suffix
|
||||
|
||||
for obj in response["Contents"]:
|
||||
obj_key = obj["Key"]
|
||||
obj_base = Path(obj_key).stem
|
||||
obj_ext = Path(obj_key).suffix
|
||||
|
||||
# Skip delta files and references
|
||||
if obj_key.endswith(".delta") or obj_key.endswith("reference.bin"):
|
||||
continue
|
||||
|
||||
score = 0.0
|
||||
|
||||
# Extension match
|
||||
if ext == obj_ext:
|
||||
score += 0.5
|
||||
|
||||
# Base name similarity
|
||||
if base_name in obj_base or obj_base in base_name:
|
||||
score += 0.3
|
||||
|
||||
# Version pattern match
|
||||
if re.search(r"v?\d+[\.\d]*", base_name) and re.search(r"v?\d+[\.\d]*", obj_base):
|
||||
score += 0.2
|
||||
|
||||
if score > 0.5:
|
||||
similar.append(
|
||||
{
|
||||
"Key": obj_key,
|
||||
"Size": obj["Size"],
|
||||
"Similarity": score,
|
||||
"LastModified": obj["LastModified"],
|
||||
}
|
||||
)
|
||||
|
||||
# Sort by similarity
|
||||
similar.sort(key=lambda x: x["Similarity"], reverse=True) # type: ignore
|
||||
|
||||
return similar[:limit]
|
||||
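
A worked example of the scoring heuristic above, assuming a hypothetical `client` and bucket: the weights are 0.5 for a matching extension, 0.3 when one base name contains the other, and 0.2 when both carry a version pattern, and only scores strictly above 0.5 are kept.

# Hypothetical usage: "client" is an existing DeltaGliderClient.
candidates = find_similar_files(client, bucket="releases", prefix="builds/", filename="app-v1.2.3.zip")
for candidate in candidates:
    # e.g. "builds/app-v1.2.3-hotfix.zip" scores 0.5 (extension) + 0.3 (base name
    # containment) + 0.2 (both carry a version pattern) = 1.0, while an unrelated
    # "builds/other.zip" scores only 0.5 and is filtered out.
    print(candidate["Key"], candidate["Similarity"])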
@@ -1,5 +1,10 @@
|
||||
"""Core domain for DeltaGlider."""
|
||||
|
||||
from .delta_extensions import (
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
DEFAULT_DELTA_EXTENSIONS,
|
||||
is_delta_candidate,
|
||||
)
|
||||
from .errors import (
|
||||
DeltaGliderError,
|
||||
DiffDecodeError,
|
||||
@@ -11,14 +16,17 @@ from .errors import (
|
||||
StorageIOError,
|
||||
)
|
||||
from .models import (
|
||||
DeleteResult,
|
||||
DeltaMeta,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
PutSummary,
|
||||
RecursiveDeleteResult,
|
||||
ReferenceMeta,
|
||||
Sha256,
|
||||
VerifyResult,
|
||||
)
|
||||
from .s3_uri import S3Url, build_s3_url, is_s3_url, parse_s3_url
|
||||
from .service import DeltaService
|
||||
|
||||
__all__ = [
|
||||
@@ -30,12 +38,21 @@ __all__ = [
|
||||
"DiffDecodeError",
|
||||
"StorageIOError",
|
||||
"PolicyViolationWarning",
|
||||
"DeleteResult",
|
||||
"DeltaSpace",
|
||||
"ObjectKey",
|
||||
"RecursiveDeleteResult",
|
||||
"Sha256",
|
||||
"DeltaMeta",
|
||||
"ReferenceMeta",
|
||||
"PutSummary",
|
||||
"VerifyResult",
|
||||
"DeltaService",
|
||||
"DEFAULT_DELTA_EXTENSIONS",
|
||||
"DEFAULT_COMPOUND_DELTA_EXTENSIONS",
|
||||
"is_delta_candidate",
|
||||
"S3Url",
|
||||
"build_s3_url",
|
||||
"is_s3_url",
|
||||
"parse_s3_url",
|
||||
]
|
||||
|
||||
src/deltaglider/core/config.py (new file, 53 lines)
@@ -0,0 +1,53 @@
"""Centralized configuration for DeltaGlider."""

import os
from dataclasses import dataclass, field


@dataclass(slots=True)
class DeltaGliderConfig:
    """All DeltaGlider configuration in one place.

    Environment variables (all optional):
        DG_MAX_RATIO: Max delta/file ratio before falling back to direct storage.
            Range 0.0-1.0, default 0.5.
        DG_LOG_LEVEL: Logging level. Default "INFO".
        DG_CACHE_BACKEND: "filesystem" (default) or "memory".
        DG_CACHE_MEMORY_SIZE_MB: Memory cache size in MB. Default 100.
        DG_METRICS: Metrics backend: "noop", "logging" (default), "cloudwatch".
        DG_METRICS_NAMESPACE: CloudWatch namespace. Default "DeltaGlider".
    """

    max_ratio: float = 0.5
    log_level: str = "INFO"
    cache_backend: str = "filesystem"
    cache_memory_size_mb: int = 100
    metrics_type: str = "logging"
    metrics_namespace: str = "DeltaGlider"

    # Connection params (typically passed by CLI, not env vars)
    endpoint_url: str | None = field(default=None, repr=False)
    region: str | None = None
    profile: str | None = None

    @classmethod
    def from_env(
        cls,
        *,
        log_level: str = "INFO",
        endpoint_url: str | None = None,
        region: str | None = None,
        profile: str | None = None,
    ) -> "DeltaGliderConfig":
        """Build config from environment variables + explicit overrides."""
        return cls(
            max_ratio=float(os.environ.get("DG_MAX_RATIO", "0.5")),
            log_level=os.environ.get("DG_LOG_LEVEL", log_level),
            cache_backend=os.environ.get("DG_CACHE_BACKEND", "filesystem"),
            cache_memory_size_mb=int(os.environ.get("DG_CACHE_MEMORY_SIZE_MB", "100")),
            metrics_type=os.environ.get("DG_METRICS", "logging"),
            metrics_namespace=os.environ.get("DG_METRICS_NAMESPACE", "DeltaGlider"),
            endpoint_url=endpoint_url,
            region=region,
            profile=profile,
        )
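
A minimal sketch of resolving this config, assuming the import path deltaglider.core.config and a local S3 endpoint; only the DG_* variables listed in the docstring are consulted.

import os

from deltaglider.core.config import DeltaGliderConfig

os.environ["DG_MAX_RATIO"] = "0.3"
os.environ["DG_CACHE_BACKEND"] = "memory"

config = DeltaGliderConfig.from_env(endpoint_url="http://localhost:4566")
assert config.max_ratio == 0.3
assert config.cache_backend == "memory"
assert config.metrics_type == "logging"  # default when DG_METRICS is unset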
src/deltaglider/core/delta_extensions.py (new file, 56 lines)
@@ -0,0 +1,56 @@
|
||||
"""Shared delta compression extension policy."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Collection, Iterable
|
||||
|
||||
# Compound extensions must be checked before simple suffix matching so that
|
||||
# multi-part archives like ".tar.gz" are handled correctly.
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS: tuple[str, ...] = (".tar.gz", ".tar.bz2", ".tar.xz")
|
||||
|
||||
# Simple extensions that benefit from delta compression. Keep this structure
|
||||
# immutable so it can be safely reused across modules.
|
||||
DEFAULT_DELTA_EXTENSIONS: frozenset[str] = frozenset(
|
||||
{
|
||||
".zip",
|
||||
".tar",
|
||||
".gz",
|
||||
".tgz",
|
||||
".bz2",
|
||||
".xz",
|
||||
".7z",
|
||||
".rar",
|
||||
".dmg",
|
||||
".iso",
|
||||
".pkg",
|
||||
".deb",
|
||||
".rpm",
|
||||
".apk",
|
||||
".jar",
|
||||
".war",
|
||||
".ear",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def is_delta_candidate(
|
||||
filename: str,
|
||||
*,
|
||||
simple_extensions: Collection[str] = DEFAULT_DELTA_EXTENSIONS,
|
||||
compound_extensions: Iterable[str] = DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
) -> bool:
|
||||
"""Check if a filename should use delta compression based on extension."""
|
||||
name_lower = filename.lower()
|
||||
|
||||
for ext in compound_extensions:
|
||||
if name_lower.endswith(ext):
|
||||
return True
|
||||
|
||||
return any(name_lower.endswith(ext) for ext in simple_extensions)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"DEFAULT_COMPOUND_DELTA_EXTENSIONS",
|
||||
"DEFAULT_DELTA_EXTENSIONS",
|
||||
"is_delta_candidate",
|
||||
]
|
||||
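
A quick check of the extension policy above; the results follow directly from the frozen defaults, and the last call shows the keyword-only override for a caller-supplied policy.

from deltaglider.core import is_delta_candidate

assert is_delta_candidate("release-1.4.0.tar.gz")      # compound extension checked first
assert is_delta_candidate("INSTALLER.DMG")             # matching is case-insensitive
assert not is_delta_candidate("photo.jpg")             # not in either set
assert is_delta_candidate("disk.bin", simple_extensions={".bin"})  # custom policy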
@@ -47,3 +47,15 @@ class PolicyViolationWarning(Warning):
    """Policy violation warning."""

    pass


class CacheMissError(DeltaGliderError):
    """Cache miss - file not found in cache."""

    pass


class CacheCorruptionError(DeltaGliderError):
    """Cache corruption - SHA mismatch or tampering detected."""

    pass

@@ -1,8 +1,80 @@
|
||||
"""Core domain models."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
|
||||
# Metadata key prefix for DeltaGlider
|
||||
# AWS S3 automatically adds 'x-amz-meta-' prefix, so our keys become 'x-amz-meta-dg-*'
|
||||
METADATA_PREFIX = "dg-"
|
||||
|
||||
# Canonical metadata key aliases.
|
||||
# Each field maps to all known key formats (current prefixed, legacy underscore, legacy bare,
|
||||
# legacy hyphenated). Order matters: first match wins during lookup.
|
||||
# Both DeltaMeta.from_dict() and service-layer _meta_value() MUST use these to stay in sync.
|
||||
METADATA_KEY_ALIASES: dict[str, tuple[str, ...]] = {
|
||||
"tool": (f"{METADATA_PREFIX}tool", "dg_tool", "tool"),
|
||||
"original_name": (
|
||||
f"{METADATA_PREFIX}original-name",
|
||||
"dg_original_name",
|
||||
"original_name",
|
||||
"original-name",
|
||||
),
|
||||
"file_sha256": (
|
||||
f"{METADATA_PREFIX}file-sha256",
|
||||
"dg_file_sha256",
|
||||
"file_sha256",
|
||||
"file-sha256",
|
||||
),
|
||||
"file_size": (
|
||||
f"{METADATA_PREFIX}file-size",
|
||||
"dg_file_size",
|
||||
"file_size",
|
||||
"file-size",
|
||||
),
|
||||
"created_at": (
|
||||
f"{METADATA_PREFIX}created-at",
|
||||
"dg_created_at",
|
||||
"created_at",
|
||||
"created-at",
|
||||
),
|
||||
"ref_key": (f"{METADATA_PREFIX}ref-key", "dg_ref_key", "ref_key", "ref-key"),
|
||||
"ref_sha256": (
|
||||
f"{METADATA_PREFIX}ref-sha256",
|
||||
"dg_ref_sha256",
|
||||
"ref_sha256",
|
||||
"ref-sha256",
|
||||
),
|
||||
"delta_size": (
|
||||
f"{METADATA_PREFIX}delta-size",
|
||||
"dg_delta_size",
|
||||
"delta_size",
|
||||
"delta-size",
|
||||
),
|
||||
"delta_cmd": (
|
||||
f"{METADATA_PREFIX}delta-cmd",
|
||||
"dg_delta_cmd",
|
||||
"delta_cmd",
|
||||
"delta-cmd",
|
||||
),
|
||||
"note": (f"{METADATA_PREFIX}note", "dg_note", "note"),
|
||||
}
|
||||
|
||||
|
||||
def resolve_metadata(metadata: dict[str, str], field: str) -> str | None:
|
||||
"""Look up a metadata field using all known key aliases.
|
||||
|
||||
Returns the first non-empty match, or None if not found.
|
||||
"""
|
||||
for key in METADATA_KEY_ALIASES[field]:
|
||||
value = metadata.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
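
A small sketch of the alias lookup: the same logical field is found whether an object carries the current dg- prefixed keys or metadata written by older DeltaGlider versions.

from deltaglider.core.models import resolve_metadata

current = {"dg-file-sha256": "abc123", "dg-file-size": "1024"}
legacy = {"file_sha256": "abc123", "file-size": "1024"}

assert resolve_metadata(current, "file_sha256") == "abc123"
assert resolve_metadata(legacy, "file_sha256") == "abc123"
assert resolve_metadata(legacy, "file_size") == "1024"
assert resolve_metadata({}, "file_size") is None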
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DeltaSpace:
|
||||
@@ -23,6 +95,11 @@ class ObjectKey:
|
||||
bucket: str
|
||||
key: str
|
||||
|
||||
@property
|
||||
def full_key(self) -> str:
|
||||
"""Full S3 path: bucket/key."""
|
||||
return f"{self.bucket}/{self.key}"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Sha256:
|
||||
@@ -47,13 +124,13 @@ class ReferenceMeta:
|
||||
note: str = "reference"
|
||||
|
||||
def to_dict(self) -> dict[str, str]:
|
||||
"""Convert to S3 metadata dict."""
|
||||
"""Convert to S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
return {
|
||||
"tool": self.tool,
|
||||
"source_name": self.source_name,
|
||||
"file_sha256": self.file_sha256,
|
||||
"created_at": self.created_at.isoformat() + "Z",
|
||||
"note": self.note,
|
||||
f"{METADATA_PREFIX}tool": self.tool,
|
||||
f"{METADATA_PREFIX}source-name": self.source_name,
|
||||
f"{METADATA_PREFIX}file-sha256": self.file_sha256,
|
||||
f"{METADATA_PREFIX}created-at": self.created_at.isoformat() + "Z",
|
||||
f"{METADATA_PREFIX}note": self.note,
|
||||
}
|
||||
|
||||
|
||||
@@ -73,36 +150,79 @@ class DeltaMeta:
|
||||
note: str | None = None
|
||||
|
||||
def to_dict(self) -> dict[str, str]:
|
||||
"""Convert to S3 metadata dict."""
|
||||
"""Convert to S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
meta = {
|
||||
"tool": self.tool,
|
||||
"original_name": self.original_name,
|
||||
"file_sha256": self.file_sha256,
|
||||
"file_size": str(self.file_size),
|
||||
"created_at": self.created_at.isoformat() + "Z",
|
||||
"ref_key": self.ref_key,
|
||||
"ref_sha256": self.ref_sha256,
|
||||
"delta_size": str(self.delta_size),
|
||||
"delta_cmd": self.delta_cmd,
|
||||
f"{METADATA_PREFIX}tool": self.tool,
|
||||
f"{METADATA_PREFIX}original-name": self.original_name,
|
||||
f"{METADATA_PREFIX}file-sha256": self.file_sha256,
|
||||
f"{METADATA_PREFIX}file-size": str(self.file_size),
|
||||
f"{METADATA_PREFIX}created-at": self.created_at.isoformat() + "Z",
|
||||
f"{METADATA_PREFIX}ref-key": self.ref_key,
|
||||
f"{METADATA_PREFIX}ref-sha256": self.ref_sha256,
|
||||
f"{METADATA_PREFIX}delta-size": str(self.delta_size),
|
||||
f"{METADATA_PREFIX}delta-cmd": self.delta_cmd,
|
||||
}
|
||||
if self.note:
|
||||
meta["note"] = self.note
|
||||
meta[f"{METADATA_PREFIX}note"] = self.note
|
||||
return meta
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict[str, str]) -> "DeltaMeta":
|
||||
"""Create from S3 metadata dict."""
|
||||
"""Create from S3 metadata dict with DeltaGlider namespace prefix."""
|
||||
|
||||
def _require(field: str) -> str:
|
||||
value = resolve_metadata(data, field)
|
||||
if value is None:
|
||||
raise KeyError(METADATA_KEY_ALIASES[field][0])
|
||||
return value
|
||||
|
||||
tool = _require("tool")
|
||||
original_name = _require("original_name")
|
||||
file_sha = _require("file_sha256")
|
||||
file_size_raw = _require("file_size")
|
||||
created_at_raw = _require("created_at")
|
||||
ref_key = _require("ref_key")
|
||||
ref_sha = _require("ref_sha256")
|
||||
delta_size_raw = _require("delta_size")
|
||||
delta_cmd_value = resolve_metadata(data, "delta_cmd") or ""
|
||||
note_value = resolve_metadata(data, "note") or ""
|
||||
|
||||
try:
|
||||
file_size = int(file_size_raw)
|
||||
except (TypeError, ValueError):
|
||||
raise ValueError(f"Invalid file size metadata: {file_size_raw}") from None
|
||||
|
||||
try:
|
||||
delta_size = int(delta_size_raw)
|
||||
except (TypeError, ValueError):
|
||||
raise ValueError(f"Invalid delta size metadata: {delta_size_raw}") from None
|
||||
|
||||
created_at_text = created_at_raw.rstrip("Z")
|
||||
try:
|
||||
created_at = datetime.fromisoformat(created_at_text)
|
||||
except ValueError as exc:
|
||||
raise ValueError(f"Invalid created_at metadata: {created_at_raw}") from exc
|
||||
|
||||
if not delta_cmd_value:
|
||||
object_name = original_name or "<unknown>"
|
||||
logger.warning(
|
||||
"Delta metadata missing %s for %s; using empty command",
|
||||
f"{METADATA_PREFIX}delta-cmd",
|
||||
object_name,
|
||||
)
|
||||
delta_cmd_value = ""
|
||||
|
||||
return cls(
|
||||
tool=data["tool"],
|
||||
original_name=data["original_name"],
|
||||
file_sha256=data["file_sha256"],
|
||||
file_size=int(data["file_size"]),
|
||||
created_at=datetime.fromisoformat(data["created_at"].rstrip("Z")),
|
||||
ref_key=data["ref_key"],
|
||||
ref_sha256=data["ref_sha256"],
|
||||
delta_size=int(data["delta_size"]),
|
||||
delta_cmd=data["delta_cmd"],
|
||||
note=data.get("note"),
|
||||
tool=tool,
|
||||
original_name=original_name,
|
||||
file_sha256=file_sha,
|
||||
file_size=file_size,
|
||||
created_at=created_at,
|
||||
ref_key=ref_key,
|
||||
ref_sha256=ref_sha,
|
||||
delta_size=delta_size,
|
||||
delta_cmd=delta_cmd_value,
|
||||
note=note_value or None,
|
||||
)
|
||||
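
A sketch of the round trip, assuming DeltaMeta is a dataclass whose fields match the keys written by to_dict(): everything is stored under dg- keys, and from_dict() resolves those (or their legacy spellings) back into a DeltaMeta.

from datetime import UTC, datetime

from deltaglider.core import DeltaMeta

meta = DeltaMeta(
    tool="deltaglider/1.0.0",
    original_name="app-v1.2.3.zip",
    file_sha256="a" * 64,
    file_size=1_048_576,
    created_at=datetime.now(UTC),
    ref_key="builds/reference.bin",
    ref_sha256="b" * 64,
    delta_size=4_096,
    delta_cmd="xdelta3 -e -s reference.bin",
)
stored = meta.to_dict()
assert all(key.startswith("dg-") for key in stored)        # only namespaced keys are written
assert DeltaMeta.from_dict(stored).file_size == 1_048_576  # and they resolve back cleanly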
|
||||
|
||||
@@ -131,3 +251,33 @@ class VerifyResult:
|
||||
expected_sha256: str
|
||||
actual_sha256: str
|
||||
message: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class DeleteResult:
|
||||
"""Result of a single delete operation."""
|
||||
|
||||
key: str
|
||||
bucket: str
|
||||
deleted: bool = False
|
||||
type: str = "unknown"
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
original_name: str | None = None
|
||||
dependent_deltas: int = 0
|
||||
cleaned_reference: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class RecursiveDeleteResult:
|
||||
"""Result of a recursive delete operation."""
|
||||
|
||||
bucket: str
|
||||
prefix: str
|
||||
deleted_count: int = 0
|
||||
failed_count: int = 0
|
||||
deltas_deleted: int = 0
|
||||
references_deleted: int = 0
|
||||
direct_deleted: int = 0
|
||||
other_deleted: int = 0
|
||||
errors: list[str] = field(default_factory=list)
|
||||
warnings: list[str] = field(default_factory=list)
|
||||
|
||||
src/deltaglider/core/object_listing.py (new file, 222 lines)
@@ -0,0 +1,222 @@
|
||||
"""Shared helpers for listing bucket objects with pagination support."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from ..ports.storage import ObjectHead
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ObjectListing:
|
||||
"""All objects and prefixes returned from a bucket listing."""
|
||||
|
||||
objects: list[dict[str, Any]] = field(default_factory=list)
|
||||
common_prefixes: list[str] = field(default_factory=list)
|
||||
key_count: int = 0
|
||||
is_truncated: bool = False
|
||||
next_continuation_token: str | None = None
|
||||
limit_reached: bool = False
|
||||
|
||||
|
||||
def list_objects_page(
|
||||
storage: Any,
|
||||
*,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
start_after: str | None = None,
|
||||
continuation_token: str | None = None,
|
||||
) -> ObjectListing:
|
||||
"""Perform a single list_objects call using the storage adapter."""
|
||||
if not hasattr(storage, "list_objects"):
|
||||
raise NotImplementedError("Storage adapter does not support list_objects")
|
||||
|
||||
response = storage.list_objects(
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
delimiter=delimiter,
|
||||
max_keys=max_keys,
|
||||
start_after=start_after,
|
||||
continuation_token=continuation_token,
|
||||
)
|
||||
|
||||
return ObjectListing(
|
||||
objects=list(response.get("objects", [])),
|
||||
common_prefixes=list(response.get("common_prefixes", [])),
|
||||
key_count=response.get("key_count", len(response.get("objects", []))),
|
||||
is_truncated=bool(response.get("is_truncated", False)),
|
||||
next_continuation_token=response.get("next_continuation_token"),
|
||||
)
|
||||
|
||||
|
||||
def list_all_objects(
|
||||
storage: Any,
|
||||
*,
|
||||
bucket: str,
|
||||
prefix: str = "",
|
||||
delimiter: str = "",
|
||||
max_keys: int = 1000,
|
||||
logger: Any | None = None,
|
||||
max_iterations: int = 10_000,
|
||||
max_objects: int | None = None,
|
||||
) -> ObjectListing:
|
||||
"""Fetch all objects under the given bucket/prefix with pagination safety."""
|
||||
import time
|
||||
from datetime import UTC, datetime
|
||||
|
||||
aggregated = ObjectListing()
|
||||
continuation_token: str | None = None
|
||||
iteration_count = 0
|
||||
list_start_time = time.time()
|
||||
limit_reached = False
|
||||
|
||||
while True:
|
||||
iteration_count += 1
|
||||
if iteration_count > max_iterations:
|
||||
if logger:
|
||||
logger.warning(
|
||||
"list_all_objects: reached max iterations (%s). Returning partial results.",
|
||||
max_iterations,
|
||||
)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = continuation_token
|
||||
break
|
||||
|
||||
# Log progress every 10 pages or on first page
|
||||
if logger and (iteration_count == 1 or iteration_count % 10 == 0):
|
||||
elapsed = time.time() - list_start_time
|
||||
objects_per_sec = len(aggregated.objects) / elapsed if elapsed > 0 else 0
|
||||
token_info = f", token={continuation_token[:20]}..." if continuation_token else ""
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST pagination: "
|
||||
f"page {iteration_count}, {len(aggregated.objects)} objects so far "
|
||||
f"({objects_per_sec:.0f} obj/s, {elapsed:.1f}s elapsed{token_info})"
|
||||
)
|
||||
|
||||
# Warn if taking very long (>60s)
|
||||
if elapsed > 60 and iteration_count % 50 == 0:
|
||||
estimated_total = (len(aggregated.objects) / iteration_count) * max_iterations
|
||||
logger.warning(
|
||||
f"LIST operation is slow ({elapsed:.0f}s elapsed). "
|
||||
f"This bucket has MANY objects ({len(aggregated.objects)} so far). "
|
||||
f"Consider using a smaller prefix or enabling caching. "
|
||||
f"Estimated remaining: {estimated_total - len(aggregated.objects):.0f} objects"
|
||||
)
|
||||
|
||||
try:
|
||||
page = list_objects_page(
|
||||
storage,
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
delimiter=delimiter,
|
||||
max_keys=max_keys,
|
||||
continuation_token=continuation_token,
|
||||
)
|
||||
except Exception as exc:
|
||||
if not aggregated.objects:
|
||||
raise RuntimeError(f"Failed to list objects for bucket '{bucket}': {exc}") from exc
|
||||
if logger:
|
||||
logger.warning(
|
||||
"list_all_objects: pagination error after %s objects: %s. Returning partial results.",
|
||||
len(aggregated.objects),
|
||||
exc,
|
||||
)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = continuation_token
|
||||
break
|
||||
|
||||
aggregated.objects.extend(page.objects)
|
||||
aggregated.common_prefixes.extend(page.common_prefixes)
|
||||
aggregated.key_count += page.key_count
|
||||
|
||||
if max_objects is not None and len(aggregated.objects) >= max_objects:
|
||||
if logger:
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST capped at {max_objects} objects."
|
||||
)
|
||||
aggregated.objects = aggregated.objects[:max_objects]
|
||||
aggregated.key_count = len(aggregated.objects)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = page.next_continuation_token
|
||||
limit_reached = True
|
||||
break
|
||||
|
||||
if not page.is_truncated:
|
||||
aggregated.is_truncated = False
|
||||
aggregated.next_continuation_token = None
|
||||
if logger:
|
||||
elapsed = time.time() - list_start_time
|
||||
logger.info(
|
||||
f"[{datetime.now(UTC).strftime('%H:%M:%S.%f')[:-3]}] LIST complete: "
|
||||
f"{iteration_count} pages, {len(aggregated.objects)} objects total in {elapsed:.2f}s"
|
||||
)
|
||||
break
|
||||
|
||||
continuation_token = page.next_continuation_token
|
||||
if not continuation_token:
|
||||
if logger:
|
||||
logger.warning(
|
||||
"list_all_objects: truncated response without continuation token after %s objects.",
|
||||
len(aggregated.objects),
|
||||
)
|
||||
aggregated.is_truncated = True
|
||||
aggregated.next_continuation_token = None
|
||||
break
|
||||
|
||||
if aggregated.common_prefixes:
|
||||
seen: set[str] = set()
|
||||
unique_prefixes: list[str] = []
|
||||
for prefix in aggregated.common_prefixes:
|
||||
if prefix not in seen:
|
||||
seen.add(prefix)
|
||||
unique_prefixes.append(prefix)
|
||||
aggregated.common_prefixes = unique_prefixes
|
||||
aggregated.key_count = len(aggregated.objects)
|
||||
aggregated.limit_reached = limit_reached
|
||||
return aggregated
|
||||
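
A minimal sketch of walking a prefix with a hard cap so an unexpectedly large bucket cannot stall the caller; "storage" stands for any adapter exposing the list_objects() method used above.

from deltaglider.core.object_listing import list_all_objects

listing = list_all_objects(
    storage,                # assumed: an already-constructed storage adapter
    bucket="releases",
    prefix="builds/",
    max_objects=10_000,
)
if listing.limit_reached:
    print(f"Stopped early at {len(listing.objects)} objects; more remain in the bucket.")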
|
||||
|
||||
def _parse_last_modified(value: Any) -> datetime:
|
||||
if isinstance(value, datetime):
|
||||
dt = value
|
||||
elif value:
|
||||
text = str(value)
|
||||
if text.endswith("Z"):
|
||||
text = text[:-1] + "+00:00"
|
||||
try:
|
||||
dt = datetime.fromisoformat(text)
|
||||
except ValueError:
|
||||
dt = datetime.fromtimestamp(0, tz=timezone.utc) # noqa: UP017
|
||||
else:
|
||||
dt = datetime.fromtimestamp(0, tz=timezone.utc) # noqa: UP017
|
||||
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc) # noqa: UP017
|
||||
return dt
|
||||
|
||||
|
||||
def object_dict_to_head(obj: dict[str, Any]) -> ObjectHead:
|
||||
"""Convert a list_objects entry into ObjectHead for compatibility uses."""
|
||||
metadata = obj.get("metadata")
|
||||
if metadata is None or not isinstance(metadata, dict):
|
||||
metadata = {}
|
||||
|
||||
return ObjectHead(
|
||||
key=obj["key"],
|
||||
size=int(obj.get("size", 0)),
|
||||
etag=str(obj.get("etag", "")),
|
||||
last_modified=_parse_last_modified(obj.get("last_modified")),
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"ObjectListing",
|
||||
"list_objects_page",
|
||||
"list_all_objects",
|
||||
"object_dict_to_head",
|
||||
]
|
||||
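
And a small example of adapting one listing entry into the ObjectHead shape used elsewhere; the dictionary keys shown are the ones object_dict_to_head() reads.

from deltaglider.core.object_listing import object_dict_to_head

entry = {
    "key": "builds/app.zip.delta",
    "size": 4096,
    "etag": "d41d8cd9",
    "last_modified": "2024-05-01T12:00:00Z",
    "metadata": {"dg-file-size": "1048576"},
}
head = object_dict_to_head(entry)
assert head.size == 4096 and head.last_modified.tzinfo is not None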
src/deltaglider/core/s3_uri.py (new file, 85 lines)
@@ -0,0 +1,85 @@
|
||||
"""Utilities for working with S3-style URLs and keys."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import NamedTuple
|
||||
|
||||
S3_SCHEME = "s3://"
|
||||
|
||||
|
||||
class S3Url(NamedTuple):
|
||||
"""Normalized representation of an S3 URL."""
|
||||
|
||||
bucket: str
|
||||
key: str = ""
|
||||
|
||||
def to_url(self) -> str:
|
||||
"""Return the canonical string form."""
|
||||
if self.key:
|
||||
return f"{S3_SCHEME}{self.bucket}/{self.key}"
|
||||
return f"{S3_SCHEME}{self.bucket}"
|
||||
|
||||
def with_key(self, key: str) -> S3Url:
|
||||
"""Return a new S3Url with a different key."""
|
||||
return S3Url(self.bucket, key.lstrip("/"))
|
||||
|
||||
def join_key(self, suffix: str) -> S3Url:
|
||||
"""Append a suffix to the key using '/' semantics."""
|
||||
suffix = suffix.lstrip("/")
|
||||
if not self.key:
|
||||
return self.with_key(suffix)
|
||||
if not suffix:
|
||||
return self
|
||||
return self.with_key(f"{self.key.rstrip('/')}/{suffix}")
|
||||
|
||||
|
||||
def is_s3_url(value: str) -> bool:
|
||||
"""Check if a string is an S3 URL."""
|
||||
return value.startswith(S3_SCHEME)
|
||||
|
||||
|
||||
def parse_s3_url(
|
||||
url: str,
|
||||
*,
|
||||
allow_empty_key: bool = True,
|
||||
strip_trailing_slash: bool = False,
|
||||
) -> S3Url:
|
||||
"""Parse an S3 URL into bucket and key components."""
|
||||
if not is_s3_url(url):
|
||||
raise ValueError(f"Invalid S3 URL: {url}")
|
||||
|
||||
path = url[len(S3_SCHEME) :]
|
||||
if strip_trailing_slash:
|
||||
path = path.rstrip("/")
|
||||
|
||||
bucket, sep, key = path.partition("/")
|
||||
if not bucket:
|
||||
raise ValueError(f"S3 URL missing bucket: {url}")
|
||||
|
||||
if not sep:
|
||||
key = ""
|
||||
|
||||
key = key.lstrip("/")
|
||||
if not key and not allow_empty_key:
|
||||
raise ValueError(f"S3 URL must include a key: {url}")
|
||||
|
||||
return S3Url(bucket=bucket, key=key)
|
||||
|
||||
|
||||
def build_s3_url(bucket: str, key: str | None = None) -> str:
|
||||
"""Build an S3 URL from components."""
|
||||
if not bucket:
|
||||
raise ValueError("Bucket name cannot be empty")
|
||||
|
||||
if key:
|
||||
key = key.lstrip("/")
|
||||
return f"{S3_SCHEME}{bucket}/{key}"
|
||||
return f"{S3_SCHEME}{bucket}"
|
||||
|
||||
|
||||
__all__ = [
|
||||
"S3Url",
|
||||
"build_s3_url",
|
||||
"is_s3_url",
|
||||
"parse_s3_url",
|
||||
]
|
||||
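
A few self-checks for the URL helpers above; this is pure string manipulation, no S3 access involved.

from deltaglider.core import S3Url, build_s3_url, is_s3_url, parse_s3_url

url = parse_s3_url("s3://releases/builds/app-v1.2.3.zip")
assert url.bucket == "releases" and url.key == "builds/app-v1.2.3.zip"

nested = S3Url("releases").join_key("builds").join_key("app.zip")
assert nested.to_url() == "s3://releases/builds/app.zip"

assert build_s3_url("releases") == "s3://releases"
assert is_s3_url("https://example.com/file.zip") is False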
@@ -2,9 +2,11 @@
|
||||
|
||||
import tempfile
|
||||
import warnings
|
||||
from datetime import UTC, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, BinaryIO
|
||||
|
||||
from .. import __version__
|
||||
from ..ports import (
|
||||
CachePort,
|
||||
ClockPort,
|
||||
@@ -15,21 +17,28 @@ from ..ports import (
|
||||
StoragePort,
|
||||
)
|
||||
from ..ports.storage import ObjectHead
|
||||
from .delta_extensions import (
|
||||
DEFAULT_COMPOUND_DELTA_EXTENSIONS,
|
||||
DEFAULT_DELTA_EXTENSIONS,
|
||||
is_delta_candidate,
|
||||
)
|
||||
from .errors import (
|
||||
DiffDecodeError,
|
||||
DiffEncodeError,
|
||||
IntegrityMismatchError,
|
||||
NotFoundError,
|
||||
PolicyViolationWarning,
|
||||
StorageIOError,
|
||||
)
|
||||
from .models import (
|
||||
DeleteResult,
|
||||
DeltaMeta,
|
||||
DeltaSpace,
|
||||
ObjectKey,
|
||||
PutSummary,
|
||||
RecursiveDeleteResult,
|
||||
ReferenceMeta,
|
||||
VerifyResult,
|
||||
resolve_metadata,
|
||||
)
|
||||
|
||||
|
||||
@@ -45,10 +54,17 @@ class DeltaService:
|
||||
clock: ClockPort,
|
||||
logger: LoggerPort,
|
||||
metrics: MetricsPort,
|
||||
tool_version: str = "deltaglider/0.1.0",
|
||||
tool_version: str | None = None,
|
||||
max_ratio: float = 0.5,
|
||||
):
|
||||
"""Initialize service with ports."""
|
||||
"""Initialize service with ports.
|
||||
|
||||
Args:
|
||||
tool_version: Version string for metadata. If None, uses package __version__.
|
||||
"""
|
||||
# Use real package version if not explicitly provided
|
||||
if tool_version is None:
|
||||
tool_version = f"deltaglider/{__version__}"
|
||||
self.storage = storage
|
||||
self.diff = diff
|
||||
self.hasher = hasher
|
||||
@@ -59,51 +75,41 @@ class DeltaService:
|
||||
self.tool_version = tool_version
|
||||
self.max_ratio = max_ratio
|
||||
|
||||
# File extensions that should use delta compression
|
||||
self.delta_extensions = {
|
||||
".zip",
|
||||
".tar",
|
||||
".gz",
|
||||
".tar.gz",
|
||||
".tgz",
|
||||
".bz2",
|
||||
".tar.bz2",
|
||||
".xz",
|
||||
".tar.xz",
|
||||
".7z",
|
||||
".rar",
|
||||
".dmg",
|
||||
".iso",
|
||||
".pkg",
|
||||
".deb",
|
||||
".rpm",
|
||||
".apk",
|
||||
".jar",
|
||||
".war",
|
||||
".ear",
|
||||
}
|
||||
# File extensions that should use delta compression. Keep mutable copies
|
||||
# so advanced callers can customize the policy if needed.
|
||||
self.delta_extensions = set(DEFAULT_DELTA_EXTENSIONS)
|
||||
self.compound_delta_extensions = DEFAULT_COMPOUND_DELTA_EXTENSIONS
|
||||
|
||||
def should_use_delta(self, filename: str) -> bool:
|
||||
"""Check if file should use delta compression based on extension."""
|
||||
name_lower = filename.lower()
|
||||
# Check compound extensions first
|
||||
for ext in [".tar.gz", ".tar.bz2", ".tar.xz"]:
|
||||
if name_lower.endswith(ext):
|
||||
return True
|
||||
# Check simple extensions
|
||||
return any(name_lower.endswith(ext) for ext in self.delta_extensions)
|
||||
return is_delta_candidate(
|
||||
filename,
|
||||
simple_extensions=self.delta_extensions,
|
||||
compound_extensions=self.compound_delta_extensions,
|
||||
)
|
||||
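
A short sketch of the customization this keeps possible; "service" is assumed to be an already-wired DeltaService instance.

# The defaults come from DEFAULT_DELTA_EXTENSIONS, but the per-instance set stays
# mutable, so a deployment shipping, say, VM images can opt extra suffixes in.
service.delta_extensions.add(".qcow2")

assert service.should_use_delta("backup.tar.gz")   # compound extension
assert service.should_use_delta("disk.qcow2")      # newly added suffix
assert not service.should_use_delta("notes.txt")   # falls through to direct upload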
|
||||
def put(
|
||||
self, local_file: Path, delta_space: DeltaSpace, max_ratio: float | None = None
|
||||
self,
|
||||
local_file: Path,
|
||||
delta_space: DeltaSpace,
|
||||
max_ratio: float | None = None,
|
||||
override_name: str | None = None,
|
||||
) -> PutSummary:
|
||||
"""Upload file as reference or delta (for archive files) or directly (for other files)."""
|
||||
"""Upload file as reference or delta (for archive files) or directly (for other files).
|
||||
|
||||
Args:
|
||||
local_file: Path to the local file to upload
|
||||
delta_space: DeltaSpace (bucket + prefix) for the upload
|
||||
max_ratio: Maximum acceptable delta/file ratio (default: service max_ratio)
|
||||
override_name: Optional name to use instead of local_file.name (useful for S3-to-S3 copies)
|
||||
"""
|
||||
if max_ratio is None:
|
||||
max_ratio = self.max_ratio
|
||||
|
||||
start_time = self.clock.now()
|
||||
file_size = local_file.stat().st_size
|
||||
file_sha256 = self.hasher.sha256(local_file)
|
||||
original_name = local_file.name
|
||||
original_name = override_name if override_name else local_file.name
|
||||
|
||||
self.logger.info(
|
||||
"Starting put operation",
|
||||
@@ -167,23 +173,43 @@ class DeltaService:
|
||||
self.logger.info("Starting get operation", key=object_key.key)
|
||||
|
||||
# Get object metadata
|
||||
obj_head = self.storage.head(f"{object_key.bucket}/{object_key.key}")
|
||||
obj_head = self.storage.head(object_key.full_key)
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {object_key.key}")
|
||||
|
||||
if "file_sha256" not in obj_head.metadata:
|
||||
raise StorageIOError(f"Missing metadata on {object_key.key}")
|
||||
|
||||
# Check if this is a direct upload (non-delta)
|
||||
if obj_head.metadata.get("compression") == "none":
|
||||
# Direct download without delta processing
|
||||
# Check if this is a regular S3 object (not uploaded via DeltaGlider)
|
||||
# Regular S3 objects won't have DeltaGlider metadata (dg-file-sha256 key)
|
||||
if "dg-file-sha256" not in obj_head.metadata:
|
||||
# This is a regular S3 object, download it directly
|
||||
self.logger.info(
|
||||
"Downloading regular S3 object (no DeltaGlider metadata)",
|
||||
key=object_key.key,
|
||||
)
|
||||
self._get_direct(object_key, obj_head, out)
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.log_operation(
|
||||
op="get",
|
||||
key=object_key.key,
|
||||
deltaspace=f"{object_key.bucket}",
|
||||
sizes={"file": int(obj_head.metadata.get("file_size", 0))},
|
||||
sizes={"file": obj_head.size},
|
||||
durations={"total": duration},
|
||||
cache_hit=False,
|
||||
)
|
||||
self.metrics.timing("deltaglider.get.duration", duration)
|
||||
return
|
||||
|
||||
# Check if this is a direct upload (non-delta) uploaded via DeltaGlider
|
||||
if obj_head.metadata.get("compression") == "none":
|
||||
# Direct download without delta processing
|
||||
self._get_direct(object_key, obj_head, out)
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
file_size_meta = resolve_metadata(obj_head.metadata, "file_size")
|
||||
file_size_value = int(file_size_meta) if file_size_meta else obj_head.size
|
||||
self.logger.log_operation(
|
||||
op="get",
|
||||
key=object_key.key,
|
||||
deltaspace=f"{object_key.bucket}",
|
||||
sizes={"file": file_size_value},
|
||||
durations={"total": duration},
|
||||
cache_hit=False,
|
||||
)
|
||||
@@ -213,12 +239,15 @@ class DeltaService:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmp_path = Path(tmpdir)
|
||||
delta_path = tmp_path / "delta"
|
||||
ref_path = self.cache.ref_path(delta_space.bucket, delta_space.prefix)
|
||||
# SECURITY: Use validated ref to prevent TOCTOU attacks
|
||||
ref_path = self.cache.get_validated_ref(
|
||||
delta_space.bucket, delta_space.prefix, delta_meta.ref_sha256
|
||||
)
|
||||
out_path = tmp_path / "output"
|
||||
|
||||
# Download delta
|
||||
with open(delta_path, "wb") as f:
|
||||
delta_stream = self.storage.get(f"{object_key.bucket}/{object_key.key}")
|
||||
delta_stream = self.storage.get(object_key.full_key)
|
||||
for chunk in iter(lambda: delta_stream.read(8192), b""):
|
||||
f.write(chunk)
|
||||
|
||||
@@ -318,10 +347,13 @@ class DeltaService:
|
||||
|
||||
# Re-check for race condition
|
||||
ref_head = self.storage.head(full_ref_key)
|
||||
if ref_head and ref_head.metadata.get("file_sha256") != file_sha256:
|
||||
existing_sha = None
|
||||
if ref_head:
|
||||
existing_sha = resolve_metadata(ref_head.metadata, "file_sha256")
|
||||
if ref_head and existing_sha and existing_sha != file_sha256:
|
||||
self.logger.warning("Reference creation race detected, using existing")
|
||||
# Proceed with existing reference
|
||||
ref_sha256 = ref_head.metadata["file_sha256"]
|
||||
ref_sha256 = existing_sha
|
||||
else:
|
||||
ref_sha256 = file_sha256
|
||||
|
||||
@@ -384,14 +416,17 @@ class DeltaService:
|
||||
) -> PutSummary:
|
||||
"""Create delta file."""
|
||||
ref_key = delta_space.reference_key()
|
||||
ref_sha256 = ref_head.metadata["file_sha256"]
|
||||
ref_sha256 = resolve_metadata(ref_head.metadata, "file_sha256")
|
||||
if not ref_sha256:
|
||||
raise ValueError("Reference metadata missing file SHA256")
|
||||
|
||||
# Ensure reference is cached
|
||||
cache_hit = self.cache.has_ref(delta_space.bucket, delta_space.prefix, ref_sha256)
|
||||
if not cache_hit:
|
||||
self._cache_reference(delta_space, ref_sha256)
|
||||
|
||||
ref_path = self.cache.ref_path(delta_space.bucket, delta_space.prefix)
|
||||
# SECURITY: Use validated ref to prevent TOCTOU attacks
|
||||
ref_path = self.cache.get_validated_ref(delta_space.bucket, delta_space.prefix, ref_sha256)
|
||||
|
||||
# Create delta
|
||||
with tempfile.NamedTemporaryFile(suffix=".delta") as delta_file:
|
||||
@@ -503,7 +538,7 @@ class DeltaService:
|
||||
) -> None:
|
||||
"""Download file directly from S3 without delta processing."""
|
||||
# Download the file directly
|
||||
file_stream = self.storage.get(f"{object_key.bucket}/{object_key.key}")
|
||||
file_stream = self.storage.get(object_key.full_key)
|
||||
|
||||
if isinstance(out, Path):
|
||||
# Write to file path
|
||||
@@ -516,7 +551,7 @@ class DeltaService:
|
||||
out.write(chunk)
|
||||
|
||||
# Verify integrity if SHA256 is present
|
||||
expected_sha = obj_head.metadata.get("file_sha256")
|
||||
expected_sha = resolve_metadata(obj_head.metadata, "file_sha256")
|
||||
if expected_sha:
|
||||
if isinstance(out, Path):
|
||||
actual_sha = self.hasher.sha256(out)
|
||||
@@ -537,7 +572,7 @@ class DeltaService:
|
||||
self.logger.info(
|
||||
"Direct download complete",
|
||||
key=object_key.key,
|
||||
size=obj_head.metadata.get("file_size"),
|
||||
size=resolve_metadata(obj_head.metadata, "file_size"),
|
||||
)
|
||||
|
||||
def _upload_direct(
|
||||
@@ -585,128 +620,37 @@ class DeltaService:
|
||||
file_sha256=file_sha256,
|
||||
)
|
||||
|
||||
def delete(self, object_key: ObjectKey) -> dict[str, Any]:
|
||||
def delete(self, object_key: ObjectKey) -> DeleteResult:
|
||||
"""Delete an object (delta-aware).
|
||||
|
||||
For delta files, just deletes the delta.
|
||||
For reference files, checks if any deltas depend on it first.
|
||||
For direct uploads, simply deletes the file.
|
||||
|
||||
Returns:
|
||||
DeleteResult with deletion details including type and any warnings
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
full_key = f"{object_key.bucket}/{object_key.key}"
|
||||
full_key = object_key.full_key
|
||||
|
||||
self.logger.info("Starting delete operation", key=object_key.key)
|
||||
|
||||
# Check if object exists
|
||||
obj_head = self.storage.head(full_key)
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {object_key.key}")
|
||||
|
||||
# Determine object type
|
||||
is_reference = object_key.key.endswith("/reference.bin")
|
||||
is_delta = object_key.key.endswith(".delta")
|
||||
is_direct = obj_head.metadata.get("compression") == "none"
|
||||
result = DeleteResult(key=object_key.key, bucket=object_key.bucket)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"key": object_key.key,
|
||||
"bucket": object_key.bucket,
|
||||
"deleted": False,
|
||||
"type": "unknown",
|
||||
"warnings": [],
|
||||
}
|
||||
|
||||
if is_reference:
|
||||
# Check if any deltas depend on this reference
|
||||
prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
|
||||
dependent_deltas = []
|
||||
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
# Check if this delta references our reference
|
||||
delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
|
||||
if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
|
||||
dependent_deltas.append(obj.key)
|
||||
|
||||
if dependent_deltas:
|
||||
warnings_list = result["warnings"]
|
||||
assert isinstance(warnings_list, list)
|
||||
warnings_list.append(
|
||||
f"Reference has {len(dependent_deltas)} dependent delta(s). "
|
||||
"Deleting this will make those deltas unrecoverable."
|
||||
)
|
||||
self.logger.warning(
|
||||
"Reference has dependent deltas",
|
||||
ref_key=object_key.key,
|
||||
delta_count=len(dependent_deltas),
|
||||
deltas=dependent_deltas[:5], # Log first 5
|
||||
)
|
||||
|
||||
# Delete the reference
|
||||
if object_key.key.endswith("/reference.bin"):
|
||||
self._delete_reference(object_key, full_key, result)
|
||||
elif object_key.key.endswith(".delta"):
|
||||
self._delete_delta(object_key, full_key, obj_head, result)
|
||||
elif obj_head.metadata.get("compression") == "none":
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "reference"
|
||||
result["dependent_deltas"] = len(dependent_deltas)
|
||||
|
||||
# Clear from cache if present
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")
|
||||
|
||||
elif is_delta:
|
||||
# Delete the delta file
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "delta"
|
||||
result["original_name"] = obj_head.metadata.get("original_name", "unknown")
|
||||
|
||||
# Check if this was the last delta in the DeltaSpace - if so, clean up reference.bin
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
|
||||
# Check if any other delta files exist in this DeltaSpace
|
||||
remaining_deltas = []
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
remaining_deltas.append(obj.key)
|
||||
|
||||
if not remaining_deltas:
|
||||
# No more deltas - clean up the orphaned reference.bin
|
||||
ref_full_key = f"{object_key.bucket}/{ref_key}"
|
||||
ref_head = self.storage.head(ref_full_key)
|
||||
if ref_head:
|
||||
self.storage.delete(ref_full_key)
|
||||
self.logger.info(
|
||||
"Cleaned up orphaned reference.bin",
|
||||
ref_key=ref_key,
|
||||
reason="no remaining deltas",
|
||||
)
|
||||
result["cleaned_reference"] = ref_key
|
||||
|
||||
# Clear from cache
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")
|
||||
|
||||
elif is_direct:
|
||||
# Simply delete the direct upload
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "direct"
|
||||
result["original_name"] = obj_head.metadata.get("original_name", object_key.key)
|
||||
|
||||
result.deleted = True
|
||||
result.type = "direct"
|
||||
result.original_name = obj_head.metadata.get("original_name", object_key.key)
|
||||
else:
|
||||
# Unknown file type, delete anyway
|
||||
self.storage.delete(full_key)
|
||||
result["deleted"] = True
|
||||
result["type"] = "unknown"
|
||||
result.deleted = True
|
||||
result.type = "unknown"
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.log_operation(
|
||||
@@ -718,169 +662,139 @@ class DeltaService:
|
||||
cache_hit=False,
|
||||
)
|
||||
self.metrics.timing("deltaglider.delete.duration", duration)
|
||||
self.metrics.increment(f"deltaglider.delete.{result['type']}")
|
||||
self.metrics.increment(f"deltaglider.delete.{result.type}")
|
||||
|
||||
return result
|
||||
|
||||
def delete_recursive(self, bucket: str, prefix: str) -> dict[str, Any]:
|
||||
def _delete_reference(self, object_key: ObjectKey, full_key: str, result: DeleteResult) -> None:
|
||||
"""Handle deletion of a reference.bin file."""
|
||||
prefix = object_key.key.rsplit("/", 1)[0] if "/" in object_key.key else ""
|
||||
dependent_deltas = []
|
||||
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{prefix}"):
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key:
|
||||
delta_head = self.storage.head(f"{object_key.bucket}/{obj.key}")
|
||||
if delta_head and delta_head.metadata.get("ref_key") == object_key.key:
|
||||
dependent_deltas.append(obj.key)
|
||||
|
||||
if dependent_deltas:
|
||||
result.warnings.append(
|
||||
f"Reference has {len(dependent_deltas)} dependent delta(s). "
|
||||
"Deleting this will make those deltas unrecoverable."
|
||||
)
|
||||
self.logger.warning(
|
||||
"Reference has dependent deltas",
|
||||
ref_key=object_key.key,
|
||||
delta_count=len(dependent_deltas),
|
||||
deltas=dependent_deltas[:5],
|
||||
)
|
||||
|
||||
self.storage.delete(full_key)
|
||||
result.deleted = True
|
||||
result.type = "reference"
|
||||
result.dependent_deltas = len(dependent_deltas)
|
||||
|
||||
if "/" in object_key.key:
|
||||
deltaspace_prefix = object_key.key.rsplit("/", 1)[0]
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {object_key.key}: {e}")
|
||||
|
||||
def _delete_delta(
|
||||
self,
|
||||
object_key: ObjectKey,
|
||||
full_key: str,
|
||||
obj_head: ObjectHead,
|
||||
result: DeleteResult,
|
||||
) -> None:
|
||||
"""Handle deletion of a delta file, cleaning up orphaned references."""
|
||||
self.storage.delete(full_key)
|
||||
result.deleted = True
|
||||
result.type = "delta"
|
||||
result.original_name = obj_head.metadata.get("original_name", "unknown")
|
||||
|
||||
if "/" not in object_key.key:
|
||||
return
|
||||
|
||||
deltaspace_prefix = "/".join(object_key.key.split("/")[:-1])
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
|
||||
remaining_deltas = [
|
||||
obj.key
|
||||
for obj in self.storage.list(f"{object_key.bucket}/{deltaspace_prefix}")
|
||||
if obj.key.endswith(".delta") and obj.key != object_key.key
|
||||
]
|
||||
|
||||
if not remaining_deltas:
|
||||
ref_full_key = f"{object_key.bucket}/{ref_key}"
|
||||
ref_head = self.storage.head(ref_full_key)
|
||||
if ref_head:
|
||||
self.storage.delete(ref_full_key)
|
||||
self.logger.info(
|
||||
"Cleaned up orphaned reference.bin",
|
||||
ref_key=ref_key,
|
||||
reason="no remaining deltas",
|
||||
)
|
||||
result.cleaned_reference = ref_key
|
||||
|
||||
try:
|
||||
self.cache.evict(object_key.bucket, deltaspace_prefix)
|
||||
except Exception as e:
|
||||
self.logger.debug(f"Could not clear cache for {deltaspace_prefix}: {e}")
|
||||
|
||||
def delete_recursive(self, bucket: str, prefix: str) -> RecursiveDeleteResult:
|
||||
"""Recursively delete all objects under a prefix (delta-aware).
|
||||
|
||||
Handles delta relationships intelligently:
|
||||
- Deletes deltas before references
|
||||
- Warns about orphaned deltas
|
||||
- Handles direct uploads
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
prefix: Prefix to delete recursively
|
||||
|
||||
Returns:
|
||||
RecursiveDeleteResult with deletion statistics and any warnings
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
self.logger.info("Starting recursive delete", bucket=bucket, prefix=prefix)
|
||||
|
||||
# Ensure prefix ends with / for proper directory deletion
|
||||
if prefix and not prefix.endswith("/"):
|
||||
prefix = f"{prefix}/"
|
||||
|
||||
# Collect all objects under prefix
|
||||
objects_to_delete = []
|
||||
references = []
|
||||
deltas = []
|
||||
direct_uploads = []
|
||||
affected_deltaspaces = set()
|
||||
# Phase 1: classify objects by type
|
||||
references, deltas, direct_uploads, other_objects, affected_deltaspaces = (
|
||||
self._classify_objects_for_deletion(bucket, prefix)
|
||||
)
|
||||
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
|
||||
if not obj.key.startswith(prefix) and prefix:
|
||||
continue
|
||||
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
references.append(obj.key)
|
||||
elif obj.key.endswith(".delta"):
|
||||
deltas.append(obj.key)
|
||||
# Track which deltaspaces are affected by this deletion
|
||||
if "/" in obj.key:
|
||||
deltaspace_prefix = "/".join(obj.key.split("/")[:-1])
|
||||
affected_deltaspaces.add(deltaspace_prefix)
|
||||
else:
|
||||
# Check if it's a direct upload
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head and obj_head.metadata.get("compression") == "none":
|
||||
direct_uploads.append(obj.key)
|
||||
else:
|
||||
objects_to_delete.append(obj.key)
|
||||
|
||||
# Also check for references in parent directories that might be affected
|
||||
# by the deletion of delta files in affected deltaspaces
|
||||
for deltaspace_prefix in affected_deltaspaces:
|
||||
ref_key = f"{deltaspace_prefix}/reference.bin"
|
||||
# Also check for references in parent deltaspaces affected by delta deletion
|
||||
for ds_prefix in affected_deltaspaces:
|
||||
ref_key = f"{ds_prefix}/reference.bin"
|
||||
if ref_key not in references:
|
||||
# Check if this reference exists
|
||||
ref_head = self.storage.head(f"{bucket}/{ref_key}")
|
||||
if ref_head:
|
||||
references.append(ref_key)
|
||||
|
||||
result: dict[str, Any] = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"deleted_count": 0,
|
||||
"failed_count": 0,
|
||||
"deltas_deleted": len(deltas),
|
||||
"references_deleted": len(references),
|
||||
"direct_deleted": len(direct_uploads),
|
||||
"other_deleted": len(objects_to_delete),
|
||||
"errors": [],
|
||||
"warnings": [],
|
||||
}
|
||||
result = RecursiveDeleteResult(
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
deltas_deleted=len(deltas),
|
||||
references_deleted=len(references),
|
||||
direct_deleted=len(direct_uploads),
|
||||
other_deleted=len(other_objects),
|
||||
)
|
||||
|
||||
# Delete in order: other files -> direct uploads -> deltas -> references (with checks)
|
||||
# This ensures we don't delete references that deltas depend on prematurely
|
||||
regular_files = objects_to_delete + direct_uploads + deltas
|
||||
|
||||
# Delete regular files first
|
||||
for key in regular_files:
|
||||
# Phase 2: delete non-reference files first (dependency order)
|
||||
for key in other_objects + direct_uploads + deltas:
|
||||
try:
|
||||
self.storage.delete(f"{bucket}/{key}")
|
||||
deleted_count = result["deleted_count"]
|
||||
assert isinstance(deleted_count, int)
|
||||
result["deleted_count"] = deleted_count + 1
|
||||
result.deleted_count += 1
|
||||
self.logger.debug(f"Deleted {key}")
|
||||
except Exception as e:
|
||||
failed_count = result["failed_count"]
|
||||
assert isinstance(failed_count, int)
|
||||
result["failed_count"] = failed_count + 1
|
||||
errors_list = result["errors"]
|
||||
assert isinstance(errors_list, list)
|
||||
errors_list.append(f"Failed to delete {key}: {str(e)}")
|
||||
result.failed_count += 1
|
||||
result.errors.append(f"Failed to delete {key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete {key}: {e}")
|
||||
|
||||
# Handle references intelligently - only delete if no files outside deletion scope depend on them
|
||||
references_kept = 0
|
||||
for ref_key in references:
|
||||
try:
|
||||
# Extract deltaspace prefix from reference.bin path
|
||||
if ref_key.endswith("/reference.bin"):
|
||||
deltaspace_prefix = ref_key[:-14] # Remove "/reference.bin"
|
||||
else:
|
||||
deltaspace_prefix = ""
|
||||
# Phase 3: delete references only if safe
|
||||
references_kept = self._delete_references_if_safe(bucket, prefix, references, result)
|
||||
result.references_deleted -= references_kept
|
||||
|
||||
# Check if there are any remaining files in this deltaspace
|
||||
# (outside of the deletion prefix)
|
||||
deltaspace_list_prefix = (
|
||||
f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
|
||||
)
|
||||
remaining_objects = list(self.storage.list(deltaspace_list_prefix))
|
||||
|
||||
# Filter out objects that are being deleted (within our deletion scope)
|
||||
# and the reference.bin file itself
|
||||
deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
|
||||
has_remaining_files = False
|
||||
|
||||
for remaining_obj in remaining_objects:
|
||||
obj_full_path = f"{bucket}/{remaining_obj.key}"
|
||||
# Skip if this object is within our deletion scope
|
||||
if prefix and obj_full_path.startswith(deletion_prefix_full):
|
||||
continue
|
||||
# Skip if this is the reference.bin file itself
|
||||
if remaining_obj.key == ref_key:
|
||||
continue
|
||||
# If we find any other file, the reference is still needed
|
||||
has_remaining_files = True
|
||||
break
|
||||
|
||||
if not has_remaining_files:
|
||||
# Safe to delete this reference.bin
|
||||
self.storage.delete(f"{bucket}/{ref_key}")
|
||||
deleted_count = result["deleted_count"]
|
||||
assert isinstance(deleted_count, int)
|
||||
result["deleted_count"] = deleted_count + 1
|
||||
self.logger.debug(f"Deleted reference {ref_key}")
|
||||
else:
|
||||
# Keep the reference as it's still needed
|
||||
references_kept += 1
|
||||
warnings_list = result["warnings"]
|
||||
assert isinstance(warnings_list, list)
|
||||
warnings_list.append(f"Kept reference {ref_key} (still in use)")
|
||||
self.logger.info(
|
||||
f"Kept reference {ref_key} - still in use outside deletion scope"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
failed_count = result["failed_count"]
|
||||
assert isinstance(failed_count, int)
|
||||
result["failed_count"] = failed_count + 1
|
||||
errors_list = result["errors"]
|
||||
assert isinstance(errors_list, list)
|
||||
errors_list.append(f"Failed to delete reference {ref_key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete reference {ref_key}: {e}")
|
||||
|
||||
# Update reference deletion count
|
||||
references_deleted = result["references_deleted"]
|
||||
assert isinstance(references_deleted, int)
|
||||
result["references_deleted"] = references_deleted - references_kept
|
||||
|
||||
# Clear any cached references for this prefix
|
||||
# Clear cached references
|
||||
if references:
|
||||
try:
|
||||
self.cache.evict(bucket, prefix.rstrip("/") if prefix else "")
|
||||
@@ -892,11 +806,291 @@ class DeltaService:
|
||||
"Recursive delete complete",
|
||||
bucket=bucket,
|
||||
prefix=prefix,
|
||||
deleted=result["deleted_count"],
|
||||
failed=result["failed_count"],
|
||||
deleted=result.deleted_count,
|
||||
failed=result.failed_count,
|
||||
duration=duration,
|
||||
)
|
||||
self.metrics.timing("deltaglider.delete_recursive.duration", duration)
|
||||
self.metrics.increment("deltaglider.delete_recursive.completed")
|
||||
|
||||
return result
|
||||
|
||||
def _classify_objects_for_deletion(
|
||||
self, bucket: str, prefix: str
|
||||
) -> tuple[list[str], list[str], list[str], list[str], set[str]]:
|
||||
"""Classify objects under a prefix into references, deltas, direct uploads, and other.
|
||||
|
||||
Returns:
|
||||
(references, deltas, direct_uploads, other_objects, affected_deltaspaces)
|
||||
"""
|
||||
references: list[str] = []
|
||||
deltas: list[str] = []
|
||||
direct_uploads: list[str] = []
|
||||
other_objects: list[str] = []
|
||||
affected_deltaspaces: set[str] = set()
|
||||
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}" if prefix else bucket):
|
||||
if prefix and not obj.key.startswith(prefix):
|
||||
continue
|
||||
|
||||
if obj.key.endswith("/reference.bin"):
|
||||
references.append(obj.key)
|
||||
elif obj.key.endswith(".delta"):
|
||||
deltas.append(obj.key)
|
||||
if "/" in obj.key:
|
||||
affected_deltaspaces.add("/".join(obj.key.split("/")[:-1]))
|
||||
else:
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head and obj_head.metadata.get("compression") == "none":
|
||||
direct_uploads.append(obj.key)
|
||||
else:
|
||||
other_objects.append(obj.key)
|
||||
|
||||
return references, deltas, direct_uploads, other_objects, affected_deltaspaces
|
||||
|
||||
def _delete_references_if_safe(
|
||||
self,
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
references: list[str],
|
||||
result: RecursiveDeleteResult,
|
||||
) -> int:
|
||||
"""Delete references only if no files outside the deletion scope depend on them.
|
||||
|
||||
Returns the number of references kept (not deleted).
|
||||
"""
|
||||
references_kept = 0
|
||||
deletion_prefix_full = f"{bucket}/{prefix}" if prefix else bucket
|
||||
|
||||
for ref_key in references:
|
||||
try:
|
||||
if ref_key.endswith("/reference.bin"):
|
||||
deltaspace_prefix = ref_key[:-14] # Remove "/reference.bin"
|
||||
else:
|
||||
deltaspace_prefix = ""
|
||||
|
||||
ds_list_prefix = f"{bucket}/{deltaspace_prefix}" if deltaspace_prefix else bucket
|
||||
has_remaining_files = any(
|
||||
not (prefix and f"{bucket}/{obj.key}".startswith(deletion_prefix_full))
|
||||
and obj.key != ref_key
|
||||
for obj in self.storage.list(ds_list_prefix)
|
||||
)
|
||||
|
||||
if not has_remaining_files:
|
||||
self.storage.delete(f"{bucket}/{ref_key}")
|
||||
result.deleted_count += 1
|
||||
self.logger.debug(f"Deleted reference {ref_key}")
|
||||
else:
|
||||
references_kept += 1
|
||||
result.warnings.append(f"Kept reference {ref_key} (still in use)")
|
||||
self.logger.info(
|
||||
f"Kept reference {ref_key} - still in use outside deletion scope"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
result.failed_count += 1
|
||||
result.errors.append(f"Failed to delete reference {ref_key}: {str(e)}")
|
||||
self.logger.error(f"Failed to delete reference {ref_key}: {e}")
|
||||
|
||||
return references_kept
|
||||
|
||||
def rehydrate_for_download(
|
||||
self,
|
||||
bucket: str,
|
||||
key: str,
|
||||
expires_in_seconds: int = 3600,
|
||||
) -> str | None:
|
||||
"""Rehydrate a deltaglider-compressed file for direct download.
|
||||
|
||||
If the file is deltaglider-compressed, this will:
|
||||
1. Download and decompress the file
|
||||
2. Re-upload to .deltaglider/tmp/ with expiration metadata
|
||||
3. Return the new temporary file key
|
||||
|
||||
If the file is not deltaglider-compressed, returns None.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket name
|
||||
key: Object key
|
||||
expires_in_seconds: How long the temporary file should exist
|
||||
|
||||
Returns:
|
||||
New key for temporary file, or None if not deltaglider-compressed
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
|
||||
# Check if object exists and is deltaglider-compressed
|
||||
obj_head = self.storage.head(f"{bucket}/{key}")
|
||||
|
||||
# If not found directly, try with .delta extension
|
||||
if obj_head is None and not key.endswith(".delta"):
|
||||
obj_head = self.storage.head(f"{bucket}/{key}.delta")
|
||||
if obj_head is not None:
|
||||
# Found the delta version, update the key
|
||||
key = f"{key}.delta"
|
||||
|
||||
if obj_head is None:
|
||||
raise NotFoundError(f"Object not found: {key}")
|
||||
|
||||
# Check if this is a deltaglider file
|
||||
is_delta = key.endswith(".delta")
|
||||
has_dg_metadata = "dg-file-sha256" in obj_head.metadata
|
||||
|
||||
if not is_delta and not has_dg_metadata:
|
||||
# Not a deltaglider file, return None
|
||||
self.logger.debug(f"File {key} is not deltaglider-compressed")
|
||||
return None
|
||||
|
||||
# Generate temporary file path
|
||||
import uuid
|
||||
|
||||
# Use the original filename without .delta extension for the temp file
|
||||
original_name = key.removesuffix(".delta") if key.endswith(".delta") else key
|
||||
temp_filename = f"{uuid.uuid4().hex}_{Path(original_name).name}"
|
||||
temp_key = f".deltaglider/tmp/{temp_filename}"
|
||||
|
||||
# Download and decompress the file
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmp_path = Path(tmpdir)
|
||||
decompressed_path = tmp_path / "decompressed"
|
||||
|
||||
# Use the existing get method to decompress
|
||||
object_key = ObjectKey(bucket=bucket, key=key)
|
||||
self.get(object_key, decompressed_path)
|
||||
|
||||
# Calculate expiration time
|
||||
expires_at = self.clock.now() + timedelta(seconds=expires_in_seconds)
|
||||
|
||||
# Create metadata for temporary file
|
||||
metadata = {
|
||||
"dg-expires-at": expires_at.isoformat(),
|
||||
"dg-original-key": key,
|
||||
"dg-original-filename": Path(original_name).name,
|
||||
"dg-rehydrated": "true",
|
||||
"dg-created-at": self.clock.now().isoformat(),
|
||||
}
|
||||
|
||||
# Upload the decompressed file
|
||||
self.logger.info(
|
||||
"Uploading rehydrated file",
|
||||
original_key=key,
|
||||
temp_key=temp_key,
|
||||
expires_at=expires_at.isoformat(),
|
||||
)
|
||||
|
||||
self.storage.put(
|
||||
f"{bucket}/{temp_key}",
|
||||
decompressed_path,
|
||||
metadata,
|
||||
)
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
self.logger.info(
|
||||
"Rehydration complete",
|
||||
original_key=key,
|
||||
temp_key=temp_key,
|
||||
duration=duration,
|
||||
)
|
||||
self.metrics.timing("deltaglider.rehydrate.duration", duration)
|
||||
self.metrics.increment("deltaglider.rehydrate.completed")
|
||||
|
||||
return temp_key
|
||||
|
||||
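As a usage illustration of the method above: a caller can request rehydration and fall back to the original key when the object is not delta-compressed. The `service` handle, bucket, and key below are placeholders, not part of the diff:

```python
# Hypothetical caller: serve a download for either a rehydrated copy
# or the original (uncompressed) object.
temp_key = service.rehydrate_for_download("releases", "app-v2.zip", expires_in_seconds=900)

if temp_key is None:
    # Not deltaglider-compressed: the stored object can be served as-is.
    download_key = "app-v2.zip"
else:
    # A decompressed copy now lives under .deltaglider/tmp/ until it expires.
    download_key = temp_key
```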
def purge_temp_files(self, bucket: str) -> dict[str, Any]:
|
||||
"""Purge expired temporary files from .deltaglider/tmp/.
|
||||
|
||||
Scans the .deltaglider/tmp/ prefix and deletes any files
|
||||
whose dg-expires-at metadata indicates they have expired.
|
||||
|
||||
Args:
|
||||
bucket: S3 bucket to purge temp files from
|
||||
|
||||
Returns:
|
||||
dict with purge statistics
|
||||
"""
|
||||
start_time = self.clock.now()
|
||||
prefix = ".deltaglider/tmp/"
|
||||
|
||||
self.logger.info("Starting temp file purge", bucket=bucket, prefix=prefix)
|
||||
|
||||
deleted_count = 0
|
||||
expired_count = 0
|
||||
error_count = 0
|
||||
total_size_freed = 0
|
||||
errors = []
|
||||
|
||||
# List all objects in temp directory
|
||||
for obj in self.storage.list(f"{bucket}/{prefix}"):
|
||||
if not obj.key.startswith(prefix):
|
||||
continue
|
||||
|
||||
try:
|
||||
# Get object metadata
|
||||
obj_head = self.storage.head(f"{bucket}/{obj.key}")
|
||||
if obj_head is None:
|
||||
continue
|
||||
|
||||
# Check expiration
|
||||
expires_at_str = obj_head.metadata.get("dg-expires-at")
|
||||
if not expires_at_str:
|
||||
# No expiration metadata, skip
|
||||
self.logger.debug(f"No expiration metadata for {obj.key}")
|
||||
continue
|
||||
|
||||
# Parse expiration time
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00"))
|
||||
if expires_at.tzinfo is None:
|
||||
expires_at = expires_at.replace(tzinfo=UTC)
|
||||
except ValueError:
|
||||
self.logger.warning(
|
||||
f"Invalid expiration format for {obj.key}: {expires_at_str}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if expired
|
||||
if self.clock.now() >= expires_at:
|
||||
expired_count += 1
|
||||
# Delete the file
|
||||
self.storage.delete(f"{bucket}/{obj.key}")
|
||||
deleted_count += 1
|
||||
total_size_freed += obj.size
|
||||
self.logger.debug(
|
||||
f"Deleted expired temp file {obj.key}",
|
||||
expired_at=expires_at_str,
|
||||
size=obj.size,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
errors.append(f"Error processing {obj.key}: {str(e)}")
|
||||
self.logger.error(f"Failed to process temp file {obj.key}: {e}")
|
||||
|
||||
duration = (self.clock.now() - start_time).total_seconds()
|
||||
|
||||
result = {
|
||||
"bucket": bucket,
|
||||
"prefix": prefix,
|
||||
"deleted_count": deleted_count,
|
||||
"expired_count": expired_count,
|
||||
"error_count": error_count,
|
||||
"total_size_freed": total_size_freed,
|
||||
"duration_seconds": duration,
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
self.logger.info(
|
||||
"Temp file purge complete",
|
||||
bucket=bucket,
|
||||
deleted=deleted_count,
|
||||
size_freed=total_size_freed,
|
||||
duration=duration,
|
||||
)
|
||||
|
||||
self.metrics.timing("deltaglider.purge.duration", duration)
|
||||
self.metrics.gauge("deltaglider.purge.deleted_count", deleted_count)
|
||||
self.metrics.gauge("deltaglider.purge.size_freed", total_size_freed)
|
||||
|
||||
return result
|
||||
|
||||
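A short sketch of how the purge above might be invoked from a periodic job; the bucket name is a placeholder, while the result keys match the dict built in this diff:

```python
# Illustrative periodic cleanup of rehydrated temp files.
stats = service.purge_temp_files("releases")

print(
    f"purged {stats['deleted_count']} of {stats['expired_count']} expired files, "
    f"freed {stats['total_size_freed']} bytes in {stats['duration_seconds']:.2f}s"
)
for err in stats["errors"]:
    print(f"warning: {err}")
```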
@@ -15,6 +15,26 @@ class CachePort(Protocol):
        """Check if reference exists and matches SHA."""
        ...

    def get_validated_ref(self, bucket: str, prefix: str, expected_sha: str) -> Path:
        """Get cached reference with atomic SHA validation.

        This method MUST be used instead of ref_path() to prevent TOCTOU attacks.
        It validates the SHA256 hash at the time of use, not just at cache check time.

        Args:
            bucket: S3 bucket name
            prefix: Prefix/deltaspace within bucket
            expected_sha: Expected SHA256 hash of the file

        Returns:
            Path to the validated cached file

        Raises:
            CacheMissError: If cached file doesn't exist
            CacheCorruptionError: If SHA doesn't match (file corrupted or tampered)
        """
        ...

    def write_ref(self, bucket: str, prefix: str, src: Path) -> Path:
        """Cache reference file."""
        ...
@@ -22,3 +42,18 @@ class CachePort(Protocol):
    def evict(self, bucket: str, prefix: str) -> None:
        """Remove cached reference."""
        ...

    def clear(self) -> None:
        """Clear all cached references.

        This method forcibly removes all cached data, useful for:
        - Long-running applications that need to free memory
        - Test cleanup
        - Manual cache invalidation
        - Ensuring fresh data fetch

        Note: For filesystem caches, this removes all files in the cache directory.
        For memory caches, this clears all in-memory data.
        For encrypted caches, this also clears encryption key mappings.
        """
        ...
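To make the TOCTOU guarantee above concrete, a consumer of the port validates at the moment of use and treats corruption as a signal to refresh the cache. The exception import path and the `fetch_reference_from_s3` helper below are assumptions for illustration only:

```python
# Illustrative consumer of CachePort.get_validated_ref.
# CacheMissError / CacheCorruptionError follow the docstring above;
# their import path and fetch_reference_from_s3 are hypothetical.
try:
    ref_path = cache.get_validated_ref(bucket, prefix, expected_sha)
except CacheMissError:
    ref_path = cache.write_ref(bucket, prefix, fetch_reference_from_s3(bucket, prefix))
except CacheCorruptionError:
    # Cached bytes no longer match the expected SHA256: evict and re-fetch.
    cache.evict(bucket, prefix)
    ref_path = cache.write_ref(bucket, prefix, fetch_reference_from_s3(bucket, prefix))
```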
src/deltaglider/response_builders.py (new file, 152 lines)
@@ -0,0 +1,152 @@
|
||||
"""Type-safe response builders using TypedDicts for internal type safety.
|
||||
|
||||
This module provides builder functions that construct boto3-compatible responses
|
||||
with full compile-time type validation using TypedDicts. At runtime, TypedDicts
|
||||
are plain dicts, so there's no conversion overhead.
|
||||
|
||||
Benefits:
|
||||
- Field name typos caught by mypy (e.g., "HTTPStatusCode" → "HttpStatusCode")
|
||||
- Wrong types caught by mypy (e.g., string instead of int)
|
||||
- Missing required fields caught by mypy
|
||||
- Extra unknown fields caught by mypy
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from .types import (
|
||||
CommonPrefix,
|
||||
DeleteObjectResponse,
|
||||
GetObjectResponse,
|
||||
ListObjectsV2Response,
|
||||
PutObjectResponse,
|
||||
ResponseMetadata,
|
||||
S3Object,
|
||||
)
|
||||
|
||||
|
||||
def build_response_metadata(status_code: int = 200) -> ResponseMetadata:
|
||||
"""Build ResponseMetadata with full type safety via TypedDict.
|
||||
|
||||
TypedDict is a dict at runtime - no conversion needed!
|
||||
mypy validates all fields match ResponseMetadata TypedDict.
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
"""
|
||||
# Build as TypedDict - mypy validates field names and types!
|
||||
metadata: ResponseMetadata = {
|
||||
"HTTPStatusCode": status_code,
|
||||
# All other fields are NotRequired - can be omitted!
|
||||
}
|
||||
return metadata # Returns dict at runtime, ResponseMetadata type at compile-time
|
||||
|
||||
|
||||
def build_put_response(
|
||||
etag: str,
|
||||
*,
|
||||
version_id: str | None = None,
|
||||
deltaglider_info: dict[str, Any] | None = None,
|
||||
) -> PutObjectResponse:
|
||||
"""Build PutObjectResponse with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: PutObjectResponse = {
|
||||
"ETag": etag,
|
||||
"ResponseMetadata": build_response_metadata(),
|
||||
}
|
||||
|
||||
if version_id:
|
||||
response["VersionId"] = version_id
|
||||
|
||||
# DeltaGlider extension - add as Any field
|
||||
if deltaglider_info:
|
||||
response["DeltaGliderInfo"] = deltaglider_info # type: ignore[typeddict-item]
|
||||
|
||||
return response # Returns dict at runtime, PutObjectResponse type at compile-time
|
||||
|
||||
|
||||
def build_get_response(
|
||||
body: Any,
|
||||
content_length: int,
|
||||
etag: str,
|
||||
metadata: dict[str, Any],
|
||||
) -> GetObjectResponse:
|
||||
"""Build GetObjectResponse with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: GetObjectResponse = {
|
||||
"Body": body,
|
||||
"ContentLength": content_length,
|
||||
"ETag": etag,
|
||||
"Metadata": metadata,
|
||||
"ResponseMetadata": build_response_metadata(),
|
||||
}
|
||||
return response # Returns dict at runtime, GetObjectResponse type at compile-time
|
||||
|
||||
|
||||
def build_list_objects_response(
|
||||
bucket: str,
|
||||
prefix: str,
|
||||
delimiter: str,
|
||||
max_keys: int,
|
||||
contents: list[S3Object],
|
||||
common_prefixes: list[CommonPrefix] | None,
|
||||
is_truncated: bool,
|
||||
next_continuation_token: str | None,
|
||||
continuation_token: str | None,
|
||||
) -> ListObjectsV2Response:
|
||||
"""Build ListObjectsV2Response with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: ListObjectsV2Response = {
|
||||
"IsTruncated": is_truncated,
|
||||
"Contents": contents,
|
||||
"Name": bucket,
|
||||
"Prefix": prefix,
|
||||
"Delimiter": delimiter,
|
||||
"MaxKeys": max_keys,
|
||||
"KeyCount": len(contents),
|
||||
"ResponseMetadata": build_response_metadata(),
|
||||
}
|
||||
|
||||
# Add optional fields
|
||||
if common_prefixes:
|
||||
response["CommonPrefixes"] = common_prefixes
|
||||
|
||||
if next_continuation_token:
|
||||
response["NextContinuationToken"] = next_continuation_token
|
||||
|
||||
if continuation_token:
|
||||
response["ContinuationToken"] = continuation_token
|
||||
|
||||
return response # Returns dict at runtime, ListObjectsV2Response type at compile-time
|
||||
|
||||
|
||||
def build_delete_response(
|
||||
delete_marker: bool = False,
|
||||
status_code: int = 204,
|
||||
deltaglider_info: dict[str, Any] | None = None,
|
||||
) -> DeleteObjectResponse:
|
||||
"""Build DeleteObjectResponse with full type safety via TypedDict.
|
||||
|
||||
Uses our types.py TypedDict which has proper NotRequired fields.
|
||||
mypy validates all field names, types, and structure.
|
||||
"""
|
||||
# Build as TypedDict - mypy catches typos and type errors!
|
||||
response: DeleteObjectResponse = {
|
||||
"DeleteMarker": delete_marker,
|
||||
"ResponseMetadata": build_response_metadata(status_code),
|
||||
}
|
||||
|
||||
# DeltaGlider extension
|
||||
if deltaglider_info:
|
||||
response["DeltaGliderInfo"] = deltaglider_info # type: ignore[typeddict-item]
|
||||
|
||||
return response # Returns dict at runtime, DeleteObjectResponse type at compile-time
|
||||
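The payoff of routing responses through these TypedDict builders is that field-name mistakes surface at type-check time instead of in production. A minimal sketch, with a deliberately misspelled key; the ETag value is a placeholder:

```python
from deltaglider.response_builders import build_put_response
from deltaglider.types import PutObjectResponse

response: PutObjectResponse = build_put_response("9b2cf535f27731c974343645a3985328")

etag = response["ETag"]  # OK: "ETag" is a declared key
# response["VersionID"]  # mypy flags this: "VersionID" is not a key of
#                        # PutObjectResponse (the declared key is "VersionId")
```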
src/deltaglider/types.py (new file, 355 lines)
@@ -0,0 +1,355 @@
|
||||
"""Type definitions for boto3-compatible responses.
|
||||
|
||||
These TypedDict definitions provide type hints for DeltaGlider's boto3-compatible
|
||||
responses. All methods return plain `dict[str, Any]` at runtime for maximum
|
||||
flexibility and boto3 compatibility.
|
||||
|
||||
## Basic Usage (Recommended)
|
||||
|
||||
Use DeltaGlider with simple dict access - no type imports needed:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
|
||||
client = create_client()
|
||||
|
||||
# Returns plain dict - 100% boto3 compatible
|
||||
response = client.put_object(Bucket='my-bucket', Key='file.zip', Body=data)
|
||||
print(response['ETag'])
|
||||
|
||||
# List objects with dict access
|
||||
listing = client.list_objects(Bucket='my-bucket')
|
||||
for obj in listing['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
```
|
||||
|
||||
## Optional Type Hints
|
||||
|
||||
For IDE autocomplete and type checking, you can use our convenience TypedDicts:
|
||||
|
||||
```python
|
||||
from deltaglider import create_client
|
||||
from deltaglider.types import PutObjectResponse, ListObjectsV2Response
|
||||
|
||||
client = create_client()
|
||||
response: PutObjectResponse = client.put_object(...) # IDE autocomplete
|
||||
listing: ListObjectsV2Response = client.list_objects(...)
|
||||
```
|
||||
|
||||
## Advanced: boto3-stubs Integration
|
||||
|
||||
For strictest type checking (requires boto3-stubs installation):
|
||||
|
||||
```bash
|
||||
pip install boto3-stubs[s3]
|
||||
```
|
||||
|
||||
```python
|
||||
from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef
|
||||
response: PutObjectOutputTypeDef = client.put_object(...)
|
||||
```
|
||||
|
||||
**Note**: boto3-stubs TypeDefs are very strict and require ALL optional fields.
|
||||
DeltaGlider returns partial dicts for better boto3 compatibility, so boto3-stubs
|
||||
types may show false positive errors. Use `dict[str, Any]` or our TypedDicts instead.
|
||||
|
||||
## Design Philosophy
|
||||
|
||||
DeltaGlider returns `dict[str, Any]` from all boto3-compatible methods because:
|
||||
1. **Flexibility**: boto3 responses vary by service and operation
|
||||
2. **Compatibility**: Exact match with boto3 runtime behavior
|
||||
3. **Simplicity**: No complex type dependencies for users
|
||||
4. **Optional Typing**: Users choose their preferred level of type safety
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal, NotRequired, TypedDict
|
||||
|
||||
# ============================================================================
|
||||
# S3 Object Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class S3Object(TypedDict):
|
||||
"""An S3 object returned in list operations.
|
||||
|
||||
Compatible with boto3's S3.Client.list_objects_v2() response Contents.
|
||||
"""
|
||||
|
||||
Key: str
|
||||
Size: int
|
||||
LastModified: datetime
|
||||
ETag: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
Owner: NotRequired[dict[str, str]]
|
||||
Metadata: NotRequired[dict[str, str]]
|
||||
|
||||
|
||||
class CommonPrefix(TypedDict):
|
||||
"""A common prefix (directory) in S3 listing.
|
||||
|
||||
Compatible with boto3's S3.Client.list_objects_v2() response CommonPrefixes.
|
||||
"""
|
||||
|
||||
Prefix: str
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Response Metadata (used in all responses)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ResponseMetadata(TypedDict):
|
||||
"""Metadata about the API response.
|
||||
|
||||
Compatible with all boto3 responses.
|
||||
"""
|
||||
|
||||
RequestId: NotRequired[str]
|
||||
HostId: NotRequired[str]
|
||||
HTTPStatusCode: int
|
||||
HTTPHeaders: NotRequired[dict[str, str]]
|
||||
RetryAttempts: NotRequired[int]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# List Operations Response Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class ListObjectsV2Response(TypedDict):
|
||||
"""Response from list_objects_v2 operation.
|
||||
|
||||
100% compatible with boto3's S3.Client.list_objects_v2() response.
|
||||
|
||||
Example:
|
||||
```python
|
||||
client = create_client()
|
||||
response: ListObjectsV2Response = client.list_objects(
|
||||
Bucket='my-bucket',
|
||||
Prefix='path/',
|
||||
Delimiter='/'
|
||||
)
|
||||
|
||||
for obj in response['Contents']:
|
||||
print(f"{obj['Key']}: {obj['Size']} bytes")
|
||||
|
||||
for prefix in response.get('CommonPrefixes', []):
|
||||
print(f"Directory: {prefix['Prefix']}")
|
||||
```
|
||||
"""
|
||||
|
||||
Contents: list[S3Object]
|
||||
Name: NotRequired[str] # Bucket name
|
||||
Prefix: NotRequired[str]
|
||||
Delimiter: NotRequired[str]
|
||||
MaxKeys: NotRequired[int]
|
||||
CommonPrefixes: NotRequired[list[CommonPrefix]]
|
||||
EncodingType: NotRequired[str]
|
||||
KeyCount: NotRequired[int]
|
||||
ContinuationToken: NotRequired[str]
|
||||
NextContinuationToken: NotRequired[str]
|
||||
StartAfter: NotRequired[str]
|
||||
IsTruncated: NotRequired[bool]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Put/Get/Delete Response Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class PutObjectResponse(TypedDict):
|
||||
"""Response from put_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.put_object() response.
|
||||
"""
|
||||
|
||||
ETag: str
|
||||
VersionId: NotRequired[str]
|
||||
ServerSideEncryption: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
class GetObjectResponse(TypedDict):
|
||||
"""Response from get_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.get_object() response.
|
||||
"""
|
||||
|
||||
Body: Any # StreamingBody in boto3, bytes in DeltaGlider
|
||||
ContentLength: int
|
||||
ContentType: NotRequired[str]
|
||||
ETag: NotRequired[str]
|
||||
LastModified: NotRequired[datetime]
|
||||
Metadata: NotRequired[dict[str, str]]
|
||||
VersionId: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
class DeleteObjectResponse(TypedDict):
|
||||
"""Response from delete_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.delete_object() response.
|
||||
"""
|
||||
|
||||
DeleteMarker: NotRequired[bool]
|
||||
VersionId: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
class DeletedObject(TypedDict):
|
||||
"""A successfully deleted object.
|
||||
|
||||
Compatible with boto3's S3.Client.delete_objects() response Deleted.
|
||||
"""
|
||||
|
||||
Key: str
|
||||
VersionId: NotRequired[str]
|
||||
DeleteMarker: NotRequired[bool]
|
||||
DeleteMarkerVersionId: NotRequired[str]
|
||||
|
||||
|
||||
class DeleteError(TypedDict):
|
||||
"""An error that occurred during deletion.
|
||||
|
||||
Compatible with boto3's S3.Client.delete_objects() response Errors.
|
||||
"""
|
||||
|
||||
Key: str
|
||||
Code: str
|
||||
Message: str
|
||||
VersionId: NotRequired[str]
|
||||
|
||||
|
||||
class DeleteObjectsResponse(TypedDict):
|
||||
"""Response from delete_objects operation.
|
||||
|
||||
Compatible with boto3's S3.Client.delete_objects() response.
|
||||
"""
|
||||
|
||||
Deleted: NotRequired[list[DeletedObject]]
|
||||
Errors: NotRequired[list[DeleteError]]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Head Object Response
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class HeadObjectResponse(TypedDict):
|
||||
"""Response from head_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.head_object() response.
|
||||
"""
|
||||
|
||||
ContentLength: int
|
||||
ContentType: NotRequired[str]
|
||||
ETag: NotRequired[str]
|
||||
LastModified: NotRequired[datetime]
|
||||
Metadata: NotRequired[dict[str, str]]
|
||||
VersionId: NotRequired[str]
|
||||
StorageClass: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Bucket Operations
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class Bucket(TypedDict):
|
||||
"""An S3 bucket.
|
||||
|
||||
Compatible with boto3's S3.Client.list_buckets() response Buckets.
|
||||
"""
|
||||
|
||||
Name: str
|
||||
CreationDate: datetime
|
||||
|
||||
|
||||
class ListBucketsResponse(TypedDict):
|
||||
"""Response from list_buckets operation.
|
||||
|
||||
Compatible with boto3's S3.Client.list_buckets() response.
|
||||
"""
|
||||
|
||||
Buckets: list[Bucket]
|
||||
Owner: NotRequired[dict[str, str]]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
class CreateBucketResponse(TypedDict):
|
||||
"""Response from create_bucket operation.
|
||||
|
||||
Compatible with boto3's S3.Client.create_bucket() response.
|
||||
"""
|
||||
|
||||
Location: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Multipart Upload Types
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class CompletedPart(TypedDict):
|
||||
"""A completed part in a multipart upload."""
|
||||
|
||||
PartNumber: int
|
||||
ETag: str
|
||||
|
||||
|
||||
class CompleteMultipartUploadResponse(TypedDict):
|
||||
"""Response from complete_multipart_upload operation."""
|
||||
|
||||
Location: NotRequired[str]
|
||||
Bucket: NotRequired[str]
|
||||
Key: NotRequired[str]
|
||||
ETag: NotRequired[str]
|
||||
VersionId: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Copy Operations
|
||||
# ============================================================================
|
||||
|
||||
|
||||
class CopyObjectResponse(TypedDict):
|
||||
"""Response from copy_object operation.
|
||||
|
||||
Compatible with boto3's S3.Client.copy_object() response.
|
||||
"""
|
||||
|
||||
CopyObjectResult: NotRequired[dict[str, Any]]
|
||||
ETag: NotRequired[str]
|
||||
LastModified: NotRequired[datetime]
|
||||
VersionId: NotRequired[str]
|
||||
ResponseMetadata: NotRequired[ResponseMetadata]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Type Aliases for Convenience
|
||||
# ============================================================================
|
||||
|
||||
# Common parameter types
|
||||
BucketName = str
|
||||
ObjectKey = str
|
||||
Prefix = str
|
||||
Delimiter = str
|
||||
|
||||
# Storage class options
|
||||
StorageClass = Literal[
|
||||
"STANDARD",
|
||||
"REDUCED_REDUNDANCY",
|
||||
"STANDARD_IA",
|
||||
"ONEZONE_IA",
|
||||
"INTELLIGENT_TIERING",
|
||||
"GLACIER",
|
||||
"DEEP_ARCHIVE",
|
||||
"GLACIER_IR",
|
||||
]
|
||||
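Since most fields above are `NotRequired`, callers should read them defensively. A short sketch using the delete_objects shapes defined here; the function name is illustrative:

```python
from deltaglider.types import DeleteObjectsResponse


def report(resp: DeleteObjectsResponse) -> None:
    # Both lists are NotRequired, so default to empty when absent.
    for deleted in resp.get("Deleted", []):
        print(f"deleted {deleted['Key']}")
    for error in resp.get("Errors", []):
        print(f"failed {error['Key']}: {error['Code']} - {error['Message']}")
```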
@@ -8,7 +8,7 @@ from unittest.mock import Mock
|
||||
import pytest
|
||||
|
||||
from deltaglider.adapters import (
|
||||
FsCacheAdapter,
|
||||
ContentAddressedCache,
|
||||
NoopMetricsAdapter,
|
||||
Sha256Adapter,
|
||||
StdLoggerAdapter,
|
||||
@@ -59,9 +59,9 @@ def real_hasher():
|
||||
|
||||
@pytest.fixture
|
||||
def cache_adapter(temp_dir, real_hasher):
|
||||
"""Create filesystem cache adapter."""
|
||||
"""Create content-addressed storage cache adapter."""
|
||||
cache_dir = temp_dir / "cache"
|
||||
return FsCacheAdapter(cache_dir, real_hasher)
|
||||
return ContentAddressedCache(cache_dir, real_hasher)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
@@ -15,10 +15,19 @@ from deltaglider.app.cli.main import cli
|
||||
def extract_json_from_cli_output(output: str) -> dict:
|
||||
"""Extract JSON from CLI output that may contain log messages."""
|
||||
lines = output.split("\n")
|
||||
json_start = next(i for i, line in enumerate(lines) if line.strip().startswith("{"))
|
||||
json_end = next(i for i in range(json_start, len(lines)) if lines[i].strip() == "}") + 1
|
||||
json_text = "\n".join(lines[json_start:json_end])
|
||||
return json.loads(json_text)
|
||||
for i, line in enumerate(lines):
|
||||
if line.strip().startswith("{"):
|
||||
json_start = i
|
||||
json_end = (
|
||||
next(
|
||||
(j for j in range(json_start, len(lines)) if lines[j].strip() == "}"),
|
||||
len(lines) - 1,
|
||||
)
|
||||
+ 1
|
||||
)
|
||||
json_text = "\n".join(lines[json_start:json_end])
|
||||
return json.loads(json_text)
|
||||
raise ValueError("No JSON found in CLI output")
|
||||
|
||||
|
||||
@pytest.mark.e2e
|
||||
@@ -74,23 +83,25 @@ class TestLocalStackE2E:
|
||||
# Upload first file (becomes reference)
|
||||
result = runner.invoke(cli, ["cp", str(file1), f"s3://{test_bucket}/plugins/"])
|
||||
assert result.exit_code == 0
|
||||
output1 = extract_json_from_cli_output(result.output)
|
||||
assert output1["operation"] == "create_reference"
|
||||
assert output1["key"] == "plugins/reference.bin"
|
||||
assert "reference" in result.output.lower() or "upload:" in result.output
|
||||
|
||||
# Verify reference was created
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="plugins/")
|
||||
# Verify reference was created (deltaspace is root, files are at root level)
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket)
|
||||
assert "Contents" in objects
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
assert "plugins/reference.bin" in keys
|
||||
assert "plugins/plugin-v1.0.0.zip.delta" in keys
|
||||
# Files are stored at root level: reference.bin and plugin-v1.0.0.zip.delta
|
||||
assert "reference.bin" in keys
|
||||
assert "plugin-v1.0.0.zip.delta" in keys
|
||||
|
||||
# Upload second file (creates delta)
|
||||
result = runner.invoke(cli, ["cp", str(file2), f"s3://{test_bucket}/plugins/"])
|
||||
assert result.exit_code == 0
|
||||
output2 = extract_json_from_cli_output(result.output)
|
||||
assert output2["operation"] == "create_delta"
|
||||
assert output2["key"] == "plugins/plugin-v1.0.1.zip.delta"
|
||||
assert "delta_ratio" in output2
|
||||
assert "upload:" in result.output
|
||||
|
||||
# Verify delta was created
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket)
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
assert "plugin-v1.0.1.zip.delta" in keys
|
||||
|
||||
# Download and verify second file
|
||||
output_file = tmpdir / "downloaded.zip"
|
||||
@@ -98,7 +109,7 @@ class TestLocalStackE2E:
|
||||
cli,
|
||||
[
|
||||
"cp",
|
||||
f"s3://{test_bucket}/plugins/plugin-v1.0.1.zip.delta",
|
||||
f"s3://{test_bucket}/plugin-v1.0.1.zip.delta",
|
||||
str(output_file),
|
||||
],
|
||||
)
|
||||
@@ -108,41 +119,42 @@ class TestLocalStackE2E:
|
||||
# Verify integrity
|
||||
result = runner.invoke(
|
||||
cli,
|
||||
["verify", f"s3://{test_bucket}/plugins/plugin-v1.0.1.zip.delta"],
|
||||
["verify", f"s3://{test_bucket}/plugin-v1.0.1.zip.delta"],
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
verify_output = extract_json_from_cli_output(result.output)
|
||||
assert verify_output["valid"] is True
|
||||
|
||||
def test_multiple_deltaspaces(self, test_bucket, s3_client):
|
||||
"""Test multiple deltaspace directories with separate references."""
|
||||
"""Test shared deltaspace with multiple files."""
|
||||
runner = CliRunner()
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create test files for different deltaspaces
|
||||
# Create test files for the same deltaspace
|
||||
file_a1 = tmpdir / "app-a-v1.zip"
|
||||
file_a1.write_text("Application A version 1")
|
||||
|
||||
file_b1 = tmpdir / "app-b-v1.zip"
|
||||
file_b1.write_text("Application B version 1")
|
||||
|
||||
# Upload to different deltaspaces
|
||||
# Upload to same deltaspace (apps/) with different target paths
|
||||
result = runner.invoke(cli, ["cp", str(file_a1), f"s3://{test_bucket}/apps/app-a/"])
|
||||
assert result.exit_code == 0
|
||||
|
||||
result = runner.invoke(cli, ["cp", str(file_b1), f"s3://{test_bucket}/apps/app-b/"])
|
||||
assert result.exit_code == 0
|
||||
|
||||
# Verify each deltaspace has its own reference
|
||||
objects_a = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="apps/app-a/")
|
||||
keys_a = [obj["Key"] for obj in objects_a["Contents"]]
|
||||
assert "apps/app-a/reference.bin" in keys_a
|
||||
|
||||
objects_b = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="apps/app-b/")
|
||||
keys_b = [obj["Key"] for obj in objects_b["Contents"]]
|
||||
assert "apps/app-b/reference.bin" in keys_b
|
||||
# Verify deltaspace has reference (both files share apps/ deltaspace)
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket, Prefix="apps/")
|
||||
assert "Contents" in objects
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
# Should have: apps/reference.bin, apps/app-a-v1.zip.delta, apps/app-b-v1.zip.delta
|
||||
# Both files share the same deltaspace (apps/) so only one reference
|
||||
assert "apps/reference.bin" in keys
|
||||
assert "apps/app-a-v1.zip.delta" in keys
|
||||
assert "apps/app-b-v1.zip.delta" in keys
|
||||
|
||||
def test_large_delta_warning(self, test_bucket, s3_client):
|
||||
"""Test delta compression with different content."""
|
||||
@@ -174,9 +186,11 @@ class TestLocalStackE2E:
|
||||
], # Very low threshold
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
# Even with completely different content, xdelta3 is efficient
|
||||
output = extract_json_from_cli_output(result.output)
|
||||
assert output["operation"] == "create_delta"
|
||||
# Delta ratio should be small even for different files (xdelta3 is very efficient)
|
||||
assert "delta_ratio" in output
|
||||
assert output["delta_ratio"] > 0.01 # Should exceed the very low threshold we set
|
||||
# Should still upload successfully even though delta exceeds threshold
|
||||
assert "upload:" in result.output
|
||||
|
||||
# Verify delta was created
|
||||
objects = s3_client.list_objects_v2(Bucket=test_bucket)
|
||||
assert "Contents" in objects
|
||||
keys = [obj["Key"] for obj in objects["Contents"]]
|
||||
assert "file2.zip.delta" in keys
|
||||
|
||||
@@ -130,17 +130,26 @@ class TestSyncCommand:
|
||||
|
||||
# Mock service methods
|
||||
mock_service.storage.list.return_value = [] # No existing files
|
||||
mock_service.put.return_value = PutSummary(
|
||||
operation="create_reference",
|
||||
bucket="test-bucket",
|
||||
key="backup/file.zip.delta",
|
||||
original_name="file.zip",
|
||||
file_size=8,
|
||||
file_sha256="ghi789",
|
||||
delta_size=None,
|
||||
delta_ratio=None,
|
||||
ref_key=None,
|
||||
)
|
||||
# Mock list_objects to raise NotImplementedError so it falls back to list()
|
||||
mock_service.storage.list_objects.side_effect = NotImplementedError()
|
||||
|
||||
# Mock service.put to avoid actual execution
|
||||
def mock_put(local_path, delta_space, max_ratio=None):
|
||||
return PutSummary(
|
||||
operation="create_reference",
|
||||
bucket="test-bucket",
|
||||
key=f"{delta_space.prefix}/{local_path.name}.delta"
|
||||
if delta_space.prefix
|
||||
else f"{local_path.name}.delta",
|
||||
original_name=local_path.name,
|
||||
file_size=local_path.stat().st_size,
|
||||
file_sha256="ghi789",
|
||||
delta_size=None,
|
||||
delta_ratio=None,
|
||||
ref_key=None,
|
||||
)
|
||||
|
||||
mock_service.put.side_effect = mock_put
|
||||
|
||||
with patch("deltaglider.app.cli.main.create_service", return_value=mock_service):
|
||||
result = runner.invoke(cli, ["sync", str(test_dir), "s3://test-bucket/backup/"])
|
||||
@@ -175,6 +184,8 @@ class TestSyncCommand:
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
# Mock list_objects to raise NotImplementedError so it falls back to list()
|
||||
mock_service.storage.list_objects.side_effect = NotImplementedError()
|
||||
mock_service.storage.head.side_effect = [
|
||||
None, # file1.zip doesn't exist
|
||||
Mock(), # file1.zip.delta exists
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
"""Tests for bucket management APIs."""
|
||||
|
||||
from typing import Any
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.main import create_service
|
||||
from deltaglider.client import DeltaGliderClient
|
||||
from deltaglider.client_models import BucketStats
|
||||
|
||||
|
||||
class TestBucketManagement:
|
||||
@@ -123,6 +125,48 @@ class TestBucketManagement:
|
||||
assert response["Buckets"] == []
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_list_buckets_includes_cached_stats(self):
|
||||
"""Bucket list should merge cached stats when available."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.list_buckets.return_value = {
|
||||
"Buckets": [
|
||||
{"Name": "bucket1", "CreationDate": "2025-01-01T00:00:00Z"},
|
||||
{"Name": "bucket2", "CreationDate": "2025-01-02T00:00:00Z"},
|
||||
],
|
||||
"Owner": {"DisplayName": "test-user", "ID": "12345"},
|
||||
}
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
cached_stats = BucketStats(
|
||||
bucket="bucket1",
|
||||
object_count=10,
|
||||
total_size=1000,
|
||||
compressed_size=600,
|
||||
space_saved=400,
|
||||
average_compression_ratio=0.4,
|
||||
delta_objects=6,
|
||||
direct_objects=4,
|
||||
)
|
||||
client._store_bucket_stats_cache("bucket1", mode="detailed", stats=cached_stats)
|
||||
|
||||
response = client.list_buckets()
|
||||
|
||||
bucket1 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket1")
|
||||
assert bucket1["DeltaGliderStats"]["Cached"] is True
|
||||
assert bucket1["DeltaGliderStats"]["Detailed"] is True
|
||||
assert bucket1["DeltaGliderStats"]["Mode"] == "detailed"
|
||||
assert bucket1["DeltaGliderStats"]["ObjectCount"] == cached_stats.object_count
|
||||
assert bucket1["DeltaGliderStats"]["TotalSize"] == cached_stats.total_size
|
||||
|
||||
bucket2 = next(bucket for bucket in response["Buckets"] if bucket["Name"] == "bucket2")
|
||||
assert "DeltaGliderStats" not in bucket2
|
||||
|
||||
def test_delete_bucket_success(self):
|
||||
"""Test deleting a bucket successfully."""
|
||||
service = create_service()
|
||||
@@ -178,6 +222,71 @@ class TestBucketManagement:
|
||||
with pytest.raises(RuntimeError, match="Failed to delete bucket"):
|
||||
client.delete_bucket(Bucket="full-bucket")
|
||||
|
||||
def test_get_bucket_stats_caches_per_session(self, monkeypatch):
|
||||
"""Verify bucket stats are cached within the client session."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_storage.client = Mock()
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
quick_stats = BucketStats(
|
||||
bucket="bucket1",
|
||||
object_count=5,
|
||||
total_size=500,
|
||||
compressed_size=300,
|
||||
space_saved=200,
|
||||
average_compression_ratio=0.4,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
detailed_stats = BucketStats(
|
||||
bucket="bucket1",
|
||||
object_count=5,
|
||||
total_size=520,
|
||||
compressed_size=300,
|
||||
space_saved=220,
|
||||
average_compression_ratio=0.423,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
call_count = {"value": 0}
|
||||
|
||||
def fake_get_bucket_stats(
|
||||
_: Any, bucket: str, mode: str, use_cache: bool = True, refresh_cache: bool = False
|
||||
) -> BucketStats:
|
||||
call_count["value"] += 1
|
||||
assert bucket == "bucket1"
|
||||
if mode == "detailed":
|
||||
return detailed_stats
|
||||
if mode == "sampled":
|
||||
return detailed_stats # sampled treated as detailed for cache propagation
|
||||
return quick_stats
|
||||
|
||||
monkeypatch.setattr("deltaglider.client._get_bucket_stats", fake_get_bucket_stats)
|
||||
|
||||
# First call should invoke underlying function
|
||||
result_quick = client.get_bucket_stats("bucket1")
|
||||
assert result_quick is quick_stats
|
||||
assert call_count["value"] == 1
|
||||
|
||||
# Second quick call - caching is now done in _get_bucket_stats (S3-based)
|
||||
# So each call goes through _get_bucket_stats (which handles caching internally)
|
||||
assert client.get_bucket_stats("bucket1") is quick_stats
|
||||
assert call_count["value"] == 2
|
||||
|
||||
# Detailed call triggers new computation
|
||||
result_detailed = client.get_bucket_stats("bucket1", mode="detailed")
|
||||
assert result_detailed is detailed_stats
|
||||
assert call_count["value"] == 3
|
||||
|
||||
# Quick call - each mode has its own cache in _get_bucket_stats
|
||||
assert client.get_bucket_stats("bucket1") is quick_stats
|
||||
assert call_count["value"] == 4
|
||||
|
||||
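For context, the caching behaviour exercised by this test corresponds to the following caller-side pattern; the bucket name is a placeholder and the mode strings mirror the test above:

```python
# Quick stats use listing data only; detailed mode recomputes and is
# cached separately (per the session-cache behaviour verified above).
quick = client.get_bucket_stats("release-artifacts")
detailed = client.get_bucket_stats("release-artifacts", mode="detailed")

print(quick.object_count, quick.total_size)
print(detailed.average_compression_ratio)
```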
def test_bucket_methods_without_boto3_client(self):
|
||||
"""Test that bucket methods raise NotImplementedError when storage doesn't support it."""
|
||||
service = create_service()
|
||||
@@ -199,6 +308,148 @@ class TestBucketManagement:
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.list_buckets()
|
||||
|
||||
def test_put_bucket_acl_with_canned_acl(self):
|
||||
"""Test setting a canned ACL on a bucket."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(Bucket="test-bucket", ACL="public-read")
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", ACL="public-read"
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_with_grants(self):
|
||||
"""Test setting ACL with grant parameters."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(
|
||||
Bucket="test-bucket",
|
||||
GrantRead="id=12345",
|
||||
GrantWrite="id=67890",
|
||||
)
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", GrantRead="id=12345", GrantWrite="id=67890"
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_with_access_control_policy(self):
|
||||
"""Test setting ACL with a full AccessControlPolicy dict."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.return_value = None
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
policy = {
|
||||
"Grants": [
|
||||
{
|
||||
"Grantee": {"Type": "CanonicalUser", "ID": "abc123"},
|
||||
"Permission": "FULL_CONTROL",
|
||||
}
|
||||
],
|
||||
"Owner": {"ID": "abc123"},
|
||||
}
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.put_bucket_acl(Bucket="test-bucket", AccessControlPolicy=policy)
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
mock_boto3_client.put_bucket_acl.assert_called_once_with(
|
||||
Bucket="test-bucket", AccessControlPolicy=policy
|
||||
)
|
||||
|
||||
def test_put_bucket_acl_failure(self):
|
||||
"""Test that put_bucket_acl raises RuntimeError on boto3 failure."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.put_bucket_acl.side_effect = Exception("AccessDenied")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to set bucket ACL"):
|
||||
client.put_bucket_acl(Bucket="test-bucket", ACL="public-read")
|
||||
|
||||
def test_put_bucket_acl_no_boto3_client(self):
|
||||
"""Test that put_bucket_acl raises NotImplementedError without boto3 client."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
delattr(mock_storage, "client")
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
client.put_bucket_acl(Bucket="test-bucket", ACL="private")
|
||||
|
||||
def test_get_bucket_acl_success(self):
|
||||
"""Test getting bucket ACL successfully."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
acl_response = {
|
||||
"Owner": {"DisplayName": "test-user", "ID": "abc123"},
|
||||
"Grants": [
|
||||
{
|
||||
"Grantee": {
|
||||
"Type": "CanonicalUser",
|
||||
"DisplayName": "test-user",
|
||||
"ID": "abc123",
|
||||
},
|
||||
"Permission": "FULL_CONTROL",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.get_bucket_acl.return_value = acl_response
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.get_bucket_acl(Bucket="test-bucket")
|
||||
|
||||
assert response["Owner"]["DisplayName"] == "test-user"
|
||||
assert len(response["Grants"]) == 1
|
||||
assert response["Grants"][0]["Permission"] == "FULL_CONTROL"
|
||||
mock_boto3_client.get_bucket_acl.assert_called_once_with(Bucket="test-bucket")
|
||||
|
||||
def test_get_bucket_acl_failure(self):
|
||||
"""Test that get_bucket_acl raises RuntimeError on boto3 failure."""
|
||||
service = create_service()
|
||||
mock_storage = Mock()
|
||||
service.storage = mock_storage
|
||||
|
||||
mock_boto3_client = Mock()
|
||||
mock_boto3_client.get_bucket_acl.side_effect = Exception("NoSuchBucket")
|
||||
mock_storage.client = mock_boto3_client
|
||||
|
||||
client = DeltaGliderClient(service)
|
||||
|
||||
with pytest.raises(RuntimeError, match="Failed to get bucket ACL"):
|
||||
client.get_bucket_acl(Bucket="nonexistent-bucket")
|
||||
|
||||
def test_complete_bucket_lifecycle(self):
|
||||
"""Test complete bucket lifecycle: create, use, delete."""
|
||||
service = create_service()
|
||||
|
||||
@@ -10,7 +10,6 @@ from deltaglider import create_client
|
||||
from deltaglider.client import (
|
||||
BucketStats,
|
||||
CompressionEstimate,
|
||||
ListObjectsResponse,
|
||||
ObjectInfo,
|
||||
)
|
||||
|
||||
@@ -44,7 +43,15 @@ class MockStorage:
|
||||
if obj_head is not None:
|
||||
yield obj_head
|
||||
|
||||
def list_objects(self, bucket, prefix="", delimiter="", max_keys=1000, start_after=None):
|
||||
def list_objects(
|
||||
self,
|
||||
bucket,
|
||||
prefix="",
|
||||
delimiter="",
|
||||
max_keys=1000,
|
||||
start_after=None,
|
||||
continuation_token=None,
|
||||
):
|
||||
"""Mock list_objects operation for S3 features."""
|
||||
objects = []
|
||||
common_prefixes = set()
|
||||
@@ -125,7 +132,7 @@ class MockStorage:
|
||||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
"""Create a client with mocked storage."""
|
||||
client = create_client(cache_dir=str(tmp_path / "cache"))
|
||||
client = create_client()
|
||||
|
||||
# Replace storage with mock
|
||||
mock_storage = MockStorage()
|
||||
@@ -146,6 +153,65 @@ def client(tmp_path):
|
||||
return client
|
||||
|
||||
|
||||
class TestCredentialHandling:
|
||||
"""Test AWS credential passing."""
|
||||
|
||||
def test_create_client_with_explicit_credentials(self, tmp_path):
|
||||
"""Test that credentials can be passed directly to create_client."""
|
||||
# This test verifies the API accepts credentials, not that they work
|
||||
# (we'd need a real S3 or LocalStack for that)
|
||||
client = create_client(
|
||||
aws_access_key_id="AKIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
region_name="us-west-2",
|
||||
)
|
||||
|
||||
# Verify the client was created
|
||||
assert client is not None
|
||||
assert client.service is not None
|
||||
|
||||
# Verify credentials were passed to the storage adapter's boto3 client
|
||||
# The storage adapter should have a client with these credentials
|
||||
storage = client.service.storage
|
||||
assert hasattr(storage, "client")
|
||||
|
||||
# Check that the boto3 client was configured with our credentials
|
||||
# Note: boto3 doesn't expose credentials directly, but we can verify
|
||||
# the client was created (if credentials were invalid, this would fail)
|
||||
assert storage.client is not None
|
||||
|
||||
def test_create_client_with_session_token(self, tmp_path):
|
||||
"""Test passing temporary credentials with session token."""
|
||||
client = create_client(
|
||||
aws_access_key_id="ASIAIOSFODNN7EXAMPLE",
|
||||
aws_secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
|
||||
aws_session_token="FwoGZXIvYXdzEBEaDH...",
|
||||
)
|
||||
|
||||
assert client is not None
|
||||
assert client.service.storage.client is not None
|
||||
|
||||
def test_create_client_without_credentials_uses_environment(self, tmp_path):
|
||||
"""Test that omitting credentials falls back to environment/IAM."""
|
||||
# This should use boto3's default credential chain
|
||||
client = create_client()
|
||||
|
||||
assert client is not None
|
||||
assert client.service.storage.client is not None
|
||||
|
||||
def test_create_client_with_endpoint_and_credentials(self, tmp_path):
|
||||
"""Test passing both endpoint URL and credentials."""
|
||||
client = create_client(
|
||||
endpoint_url="http://localhost:9000",
|
||||
aws_access_key_id="minioadmin",
|
||||
aws_secret_access_key="minioadmin",
|
||||
)
|
||||
|
||||
assert client is not None
|
||||
# Endpoint should be available
|
||||
assert client.endpoint_url == "http://localhost:9000"
|
||||
|
||||
|
||||
class TestBoto3Compatibility:
|
||||
"""Test boto3-compatible methods."""
|
||||
|
||||
@@ -196,28 +262,56 @@ class TestBoto3Compatibility:
|
||||
content = response["Body"].read()
|
||||
assert content == b"Test Content"
|
||||
|
||||
def test_get_object_regular_s3_file(self, client):
|
||||
"""Test get_object with regular S3 files (not uploaded via DeltaGlider)."""
|
||||
|
||||
content = b"Regular S3 File Content"
|
||||
|
||||
# Add as a regular S3 object WITHOUT DeltaGlider metadata
|
||||
client.service.storage.objects["test-bucket/regular-file.pdf"] = {
|
||||
"data": content,
|
||||
"size": len(content),
|
||||
"metadata": {}, # No DeltaGlider metadata
|
||||
}
|
||||
|
||||
# Should successfully download the regular S3 object
|
||||
response = client.get_object(Bucket="test-bucket", Key="regular-file.pdf")
|
||||
|
||||
assert "Body" in response
|
||||
downloaded_content = response["Body"].read()
|
||||
assert downloaded_content == content
|
||||
assert response["ContentLength"] == len(content)
|
||||
|
||||
def test_list_objects(self, client):
|
||||
"""Test list_objects with various options."""
|
||||
"""Test list_objects with various options (boto3-compatible dict response)."""
|
||||
# List all objects (default: FetchMetadata=False)
|
||||
response = client.list_objects(Bucket="test-bucket")
|
||||
|
||||
assert isinstance(response, ListObjectsResponse)
|
||||
assert response.key_count > 0
|
||||
assert len(response.contents) > 0
|
||||
# Response is now a boto3-compatible dict (not ListObjectsResponse)
|
||||
assert isinstance(response, dict)
|
||||
assert response["KeyCount"] > 0
|
||||
assert len(response["Contents"]) > 0
|
||||
|
||||
# Verify S3Object structure
|
||||
for obj in response["Contents"]:
|
||||
assert "Key" in obj
|
||||
assert "Size" in obj
|
||||
assert "LastModified" in obj
|
||||
assert "Metadata" in obj # DeltaGlider metadata
|
||||
|
||||
# Test with FetchMetadata=True (should only affect delta files)
|
||||
response_with_metadata = client.list_objects(Bucket="test-bucket", FetchMetadata=True)
|
||||
assert isinstance(response_with_metadata, ListObjectsResponse)
|
||||
assert response_with_metadata.key_count > 0
|
||||
assert isinstance(response_with_metadata, dict)
|
||||
assert response_with_metadata["KeyCount"] > 0
|
||||
|
||||
def test_list_objects_with_delimiter(self, client):
|
||||
"""Test list_objects with delimiter for folder simulation."""
|
||||
"""Test list_objects with delimiter for folder simulation (boto3-compatible dict response)."""
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="", Delimiter="/")
|
||||
|
||||
# Should have common prefixes for folders
|
||||
assert len(response.common_prefixes) > 0
|
||||
assert {"Prefix": "folder1/"} in response.common_prefixes
|
||||
assert {"Prefix": "folder2/"} in response.common_prefixes
|
||||
assert len(response.get("CommonPrefixes", [])) > 0
|
||||
assert {"Prefix": "folder1/"} in response["CommonPrefixes"]
|
||||
assert {"Prefix": "folder2/"} in response["CommonPrefixes"]
|
||||
|
||||
def test_delete_object(self, client):
|
||||
"""Test delete_object."""
|
||||
@@ -229,6 +323,24 @@ class TestBoto3Compatibility:
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
assert "test-bucket/to-delete.txt" not in client.service.storage.objects
|
||||
|
||||
def test_delete_object_with_delta_suffix_fallback(self, client):
|
||||
"""Test delete_object with automatic .delta suffix fallback."""
|
||||
# Add object with .delta suffix (as DeltaGlider stores it)
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {
|
||||
"size": 100,
|
||||
"metadata": {
|
||||
"original_name": "file.zip",
|
||||
"compression": "delta",
|
||||
},
|
||||
}
|
||||
|
||||
# Delete using original name (without .delta)
|
||||
response = client.delete_object(Bucket="test-bucket", Key="file.zip")
|
||||
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 204
|
||||
assert response["DeltaGliderInfo"]["Deleted"] is True
|
||||
assert "test-bucket/file.zip.delta" not in client.service.storage.objects
|
||||
|
||||
def test_delete_objects(self, client):
|
||||
"""Test batch delete."""
|
||||
# Add objects
|
||||
@@ -330,7 +442,7 @@ class TestDeltaGliderFeatures:
|
||||
|
||||
def test_get_bucket_stats(self, client):
|
||||
"""Test getting bucket statistics."""
|
||||
# Test quick stats (default: detailed_stats=False)
|
||||
# Test quick stats (LIST only)
|
||||
stats = client.get_bucket_stats("test-bucket")
|
||||
|
||||
assert isinstance(stats, BucketStats)
|
||||
@@ -338,8 +450,8 @@ class TestDeltaGliderFeatures:
|
||||
assert stats.total_size > 0
|
||||
assert stats.delta_objects >= 1 # We have archive.zip.delta
|
||||
|
||||
# Test with detailed_stats=True
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", detailed_stats=True)
|
||||
# Test with detailed mode
|
||||
detailed_stats = client.get_bucket_stats("test-bucket", mode="detailed")
|
||||
assert isinstance(detailed_stats, BucketStats)
|
||||
assert detailed_stats.object_count == stats.object_count
|
||||
|
||||
|
||||
tests/integration/test_delete_objects_recursive.py (new file, 548 lines)
@@ -0,0 +1,548 @@
|
||||
"""Comprehensive tests for DeltaGliderClient.delete_objects_recursive() method."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider import create_client
|
||||
from deltaglider.core.models import DeleteResult, RecursiveDeleteResult
|
||||
|
||||
|
||||
class MockStorage:
|
||||
"""Mock storage for testing."""
|
||||
|
||||
def __init__(self):
|
||||
self.objects = {}
|
||||
self.delete_calls = []
|
||||
|
||||
def head(self, key):
|
||||
"""Mock head operation."""
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
if key in self.objects:
|
||||
obj = self.objects[key]
|
||||
return ObjectHead(
|
||||
key=key,
|
||||
size=obj["size"],
|
||||
etag=obj.get("etag", "mock-etag"),
|
||||
last_modified=obj.get("last_modified", datetime.now(UTC)),
|
||||
metadata=obj.get("metadata", {}),
|
||||
)
|
||||
return None
|
||||
|
||||
def list(self, prefix):
|
||||
"""Mock list operation for StoragePort interface."""
|
||||
for key, _obj in self.objects.items():
|
||||
if key.startswith(prefix):
|
||||
obj_head = self.head(key)
|
||||
if obj_head is not None:
|
||||
yield obj_head
|
||||
|
||||
def delete(self, key):
|
||||
"""Mock delete operation."""
|
||||
self.delete_calls.append(key)
|
||||
if key in self.objects:
|
||||
del self.objects[key]
|
||||
return True
|
||||
return False
|
||||
|
||||
def get(self, key):
|
||||
"""Mock get operation."""
|
||||
if key in self.objects:
|
||||
return self.objects[key].get("content", b"mock-content")
|
||||
return None
|
||||
|
||||
def put(self, key, data, metadata=None):
|
||||
"""Mock put operation."""
|
||||
self.objects[key] = {
|
||||
"size": len(data),
|
||||
"content": data,
|
||||
"metadata": metadata or {},
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_storage():
|
||||
"""Create mock storage."""
|
||||
return MockStorage()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
"""Create DeltaGliderClient with mock storage."""
|
||||
# Use create_client to get a properly configured client
|
||||
client = create_client()
|
||||
|
||||
# Replace storage with mock
|
||||
mock_storage = MockStorage()
|
||||
client.service.storage = mock_storage
|
||||
|
||||
return client
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveBasicFunctionality:
|
||||
"""Test basic functionality of delete_objects_recursive."""
|
||||
|
||||
def test_delete_single_object_with_file_prefix(self, client):
|
||||
"""Test deleting a single object when prefix is a file (no trailing slash)."""
|
||||
# Setup: Add a regular file
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify response structure
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert "DeletedCount" in response
|
||||
assert "FailedCount" in response
|
||||
assert "DeltaGliderInfo" in response
|
||||
|
||||
# Verify DeltaGliderInfo structure
|
||||
info = response["DeltaGliderInfo"]
|
||||
assert "DeltasDeleted" in info
|
||||
assert "ReferencesDeleted" in info
|
||||
assert "DirectDeleted" in info
|
||||
assert "OtherDeleted" in info
|
||||
|
||||
def test_delete_directory_with_trailing_slash(self, client):
|
||||
"""Test deleting all objects under a prefix with trailing slash."""
|
||||
# Setup: Add multiple files under a prefix
|
||||
client.service.storage.objects["test-bucket/dir/file1.txt"] = {"size": 100}
|
||||
client.service.storage.objects["test-bucket/dir/file2.txt"] = {"size": 200}
|
||||
client.service.storage.objects["test-bucket/dir/sub/file3.txt"] = {"size": 300}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
def test_delete_empty_prefix_returns_zero_counts(self, client):
|
||||
"""Test deleting with empty prefix returns zero counts."""
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveDeltaSuffixHandling:
|
||||
"""Test delta suffix fallback logic."""
|
||||
|
||||
def test_delete_file_with_delta_suffix_fallback(self, client):
|
||||
"""Test that delete falls back to .delta suffix if original not found."""
|
||||
# Setup: Add file with .delta suffix
|
||||
client.service.storage.objects["test-bucket/archive.zip.delta"] = {
|
||||
"size": 500,
|
||||
"metadata": {"original_name": "archive.zip"},
|
||||
}
|
||||
|
||||
# Execute: Delete using original name (without .delta)
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="archive.zip")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert "test-bucket/archive.zip.delta" not in client.service.storage.objects
|
||||
|
||||
def test_delete_file_already_with_delta_suffix(self, client):
|
||||
"""Test deleting a file that already has .delta suffix."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {"size": 300}
|
||||
|
||||
# Execute: Delete using .delta suffix directly
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.zip.delta")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_delta_suffix_not_added_for_directory_prefix(self, client):
|
||||
"""Test that .delta suffix is not added when prefix ends with /."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/dir/file.txt"] = {"size": 100}
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
|
||||
|
||||
# Verify - should not attempt to delete "dir/.delta"
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveStatisticsAggregation:
|
||||
"""Test statistics aggregation from core service."""
|
||||
|
||||
def test_aggregates_deleted_count_from_service_and_single_deletes(self, client):
|
||||
"""Test that deleted counts are aggregated correctly."""
|
||||
# Setup: Mock service.delete_recursive to return specific counts
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=5,
|
||||
failed_count=0,
|
||||
deltas_deleted=2,
|
||||
references_deleted=1,
|
||||
direct_deleted=2,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify aggregation
|
||||
assert response["DeletedCount"] == 5
|
||||
assert response["FailedCount"] == 0
|
||||
assert response["DeltaGliderInfo"]["DeltasDeleted"] == 2
|
||||
assert response["DeltaGliderInfo"]["ReferencesDeleted"] == 1
|
||||
assert response["DeltaGliderInfo"]["DirectDeleted"] == 2
|
||||
assert response["DeltaGliderInfo"]["OtherDeleted"] == 0
|
||||
|
||||
def test_aggregates_single_delete_counts_with_service_counts(self, client):
|
||||
"""Test that single file deletes are aggregated with service counts."""
|
||||
# Setup: Add file to trigger single delete path
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service.delete_recursive to return additional counts
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=3,
|
||||
failed_count=0,
|
||||
deltas_deleted=1,
|
||||
references_deleted=0,
|
||||
direct_deleted=2,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify that counts include both single delete and service delete
|
||||
assert response["DeletedCount"] >= 3 # At least service count
|
||||
assert response["DeltaGliderInfo"]["DeltasDeleted"] >= 1
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveErrorHandling:
|
||||
"""Test error handling and error aggregation."""
|
||||
|
||||
def test_single_delete_error_captured_in_errors_list(self, client):
|
||||
"""Test that errors from single deletes are captured."""
|
||||
# Setup: Add file
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.side_effect = RuntimeError("Simulated delete error")
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify error captured
|
||||
assert response["FailedCount"] > 0
|
||||
assert "Errors" in response
|
||||
assert any("Simulated delete error" in err for err in response["Errors"])
|
||||
|
||||
def test_service_errors_propagated_in_response(self, client):
|
||||
"""Test that errors from service.delete_recursive are propagated."""
|
||||
# Mock service to return errors
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=2,
|
||||
failed_count=1,
|
||||
deltas_deleted=2,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
errors=["Error deleting object1", "Error deleting object2"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify
|
||||
assert response["FailedCount"] == 1
|
||||
assert "Errors" in response
|
||||
assert "Error deleting object1" in response["Errors"]
|
||||
assert "Error deleting object2" in response["Errors"]
|
||||
|
||||
def test_combines_single_and_service_errors(self, client):
|
||||
"""Test that errors from both single deletes and service are combined."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service to also return errors
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=1,
|
||||
failed_count=1,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
errors=["Service delete error"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to raise exception
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.side_effect = RuntimeError("Single delete error")
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify both errors present
|
||||
assert "Errors" in response
|
||||
errors_str = " ".join(response["Errors"])
|
||||
assert "Single delete error" in errors_str
|
||||
assert "Service delete error" in errors_str
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveWarningsHandling:
|
||||
"""Test warning aggregation."""
|
||||
|
||||
def test_service_warnings_propagated_in_response(self, client):
|
||||
"""Test that warnings from service.delete_recursive are propagated."""
|
||||
# Mock service to return warnings
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=3,
|
||||
failed_count=0,
|
||||
deltas_deleted=2,
|
||||
references_deleted=1,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
warnings=["Reference deleted, 2 dependent deltas invalidated"],
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="test/")
|
||||
|
||||
# Verify
|
||||
assert "Warnings" in response
|
||||
assert "Reference deleted, 2 dependent deltas invalidated" in response["Warnings"]
|
||||
|
||||
def test_single_delete_warnings_propagated(self, client):
|
||||
"""Test that warnings from single deletes are captured."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/ref.bin"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="ref.bin",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return warnings
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"ref.bin",
|
||||
DeleteResult(
|
||||
key="ref.bin",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="reference",
|
||||
warnings=["Warning from single delete"],
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="ref.bin")
|
||||
|
||||
# Verify
|
||||
assert "Warnings" in response
|
||||
assert "Warning from single delete" in response["Warnings"]
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveSingleDeleteDetails:
|
||||
"""Test SingleDeletes detail tracking."""
|
||||
|
||||
def test_single_delete_details_included_for_file_prefix(self, client):
|
||||
"""Test that SingleDeletes details are included when deleting file prefix."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"file.txt",
|
||||
DeleteResult(
|
||||
key="file.txt",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="direct",
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify
|
||||
assert "SingleDeletes" in response["DeltaGliderInfo"]
|
||||
single_deletes = response["DeltaGliderInfo"]["SingleDeletes"]
|
||||
assert len(single_deletes) > 0
|
||||
assert single_deletes[0]["Key"] == "file.txt"
|
||||
assert single_deletes[0]["Type"] == "direct"
|
||||
assert "DependentDeltas" in single_deletes[0]
|
||||
assert "Warnings" in single_deletes[0]
|
||||
|
||||
def test_single_delete_includes_stored_key_when_different(self, client):
|
||||
"""Test that StoredKey is included when actual key differs from requested."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.zip.delta"] = {"size": 200}
|
||||
|
||||
# Mock delete_with_delta_suffix to return different key
|
||||
from deltaglider import client_delete_helpers
|
||||
|
||||
original_delete = client_delete_helpers.delete_with_delta_suffix
|
||||
|
||||
def mock_delete(service, bucket, key):
|
||||
actual_key = "file.zip.delta" if key == "file.zip" else key
|
||||
return (
|
||||
actual_key,
|
||||
DeleteResult(
|
||||
key=actual_key,
|
||||
bucket=bucket,
|
||||
deleted=True,
|
||||
type="delta",
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
client_delete_helpers.delete_with_delta_suffix = mock_delete
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.zip",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
try:
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.zip")
|
||||
|
||||
# Verify
|
||||
assert "SingleDeletes" in response["DeltaGliderInfo"]
|
||||
single_deletes = response["DeltaGliderInfo"]["SingleDeletes"]
|
||||
if len(single_deletes) > 0:
|
||||
# If actual key differs, StoredKey should be present
|
||||
detail = single_deletes[0]
|
||||
if detail["Key"] != "file.zip.delta":
|
||||
assert "StoredKey" in detail
|
||||
finally:
|
||||
client_delete_helpers.delete_with_delta_suffix = original_delete
|
||||
|
||||
|
||||
class TestDeleteObjectsRecursiveEdgeCases:
|
||||
"""Test edge cases and boundary conditions."""
|
||||
|
||||
def test_nonexistent_prefix_returns_zero_counts(self, client):
|
||||
"""Test deleting nonexistent prefix returns zero counts."""
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="nonexistent/path/")
|
||||
|
||||
# Verify
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
assert response["DeletedCount"] >= 0
|
||||
assert response["FailedCount"] == 0
|
||||
|
||||
def test_duplicate_candidates_handled_correctly(self, client):
|
||||
"""Test that duplicate delete candidates are handled correctly."""
|
||||
# Setup: This tests the seen_candidates logic
|
||||
client.service.storage.objects["test-bucket/file.delta"] = {"size": 100}
|
||||
|
||||
# Execute: Should not attempt to delete "file.delta" twice
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.delta")
|
||||
|
||||
# Verify no errors
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
|
||||
def test_unknown_result_type_categorized_as_other(self, client):
|
||||
"""Test that unknown result types are categorized as 'other'."""
|
||||
# Setup
|
||||
client.service.storage.objects["test-bucket/file.txt"] = {"size": 100}
|
||||
|
||||
# Mock service
|
||||
mock_result = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="file.txt",
|
||||
deleted_count=0,
|
||||
failed_count=0,
|
||||
deltas_deleted=0,
|
||||
references_deleted=0,
|
||||
direct_deleted=0,
|
||||
other_deleted=0,
|
||||
)
|
||||
client.service.delete_recursive = Mock(return_value=mock_result)
|
||||
|
||||
# Mock delete_with_delta_suffix to return unknown type
|
||||
with patch("deltaglider.client.delete_with_delta_suffix") as mock_delete:
|
||||
mock_delete.return_value = (
|
||||
"file.txt",
|
||||
DeleteResult(
|
||||
key="file.txt",
|
||||
bucket="test-bucket",
|
||||
deleted=True,
|
||||
type="unknown_type", # Not in single_counts keys
|
||||
dependent_deltas=0,
|
||||
),
|
||||
)
|
||||
|
||||
# Execute
|
||||
response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="file.txt")
|
||||
|
||||
# Verify it's categorized as "other"
|
||||
assert response["DeltaGliderInfo"]["OtherDeleted"] >= 1
|
||||
# Also verify the detail shows the unknown type
|
||||
if "SingleDeletes" in response["DeltaGliderInfo"]:
|
||||
assert response["DeltaGliderInfo"]["SingleDeletes"][0]["Type"] == "unknown_type"
|
||||
|
||||
def test_kwargs_parameter_accepted(self, client):
|
||||
"""Test that additional kwargs are accepted without error."""
|
||||
# Execute with extra parameters
|
||||
response = client.delete_objects_recursive(
|
||||
Bucket="test-bucket",
|
||||
Prefix="test/",
|
||||
ExtraParam="value", # Should be ignored
|
||||
AnotherParam=123,
|
||||
)
|
||||
|
||||
# Verify no errors
|
||||
assert response["ResponseMetadata"]["HTTPStatusCode"] == 200
|
||||
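Taken together, the tests in this new file pin down the response contract of delete_objects_recursive. A condensed sketch of the expected shape, with illustrative values only:

response = client.delete_objects_recursive(Bucket="test-bucket", Prefix="dir/")
# Boto3-style envelope, per the assertions above:
# {
#     "ResponseMetadata": {"HTTPStatusCode": 200},
#     "DeletedCount": 5,
#     "FailedCount": 0,
#     "Errors": [...],       # populated when individual deletes fail
#     "Warnings": [...],     # e.g. reference-cleanup notices
#     "DeltaGliderInfo": {
#         "DeltasDeleted": 2,
#         "ReferencesDeleted": 1,
#         "DirectDeleted": 2,
#         "OtherDeleted": 0,
#         "SingleDeletes": [{"Key": "file.txt", "Type": "direct", ...}],  # present for file prefixes
#     },
# }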
@@ -53,8 +53,11 @@ class TestSDKFiltering:
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="releases/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Verify .delta suffix is stripped
|
||||
keys = [obj.key for obj in response.contents]
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
assert "releases/app-v1.zip" in keys
|
||||
assert "releases/app-v2.zip" in keys
|
||||
assert "releases/README.md" in keys
|
||||
@@ -63,8 +66,10 @@ class TestSDKFiltering:
|
||||
for key in keys:
|
||||
assert not key.endswith(".delta"), f"Found .delta suffix in: {key}"
|
||||
|
||||
# Verify is_delta flag is set correctly
|
||||
delta_objects = [obj for obj in response.contents if obj.is_delta]
|
||||
# Verify is_delta flag is set correctly in Metadata
|
||||
delta_objects = [
|
||||
obj for obj in contents if obj.get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
]
|
||||
assert len(delta_objects) == 2
|
||||
|
||||
def test_list_objects_filters_reference_bin(self):
|
||||
@@ -106,15 +111,18 @@ class TestSDKFiltering:
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="releases/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Verify NO reference.bin files in output
|
||||
keys = [obj.key for obj in response.contents]
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
for key in keys:
|
||||
assert not key.endswith("reference.bin"), f"Found reference.bin in: {key}"
|
||||
|
||||
# Should only have the app.zip (with .delta stripped)
|
||||
assert len(response.contents) == 1
|
||||
assert response.contents[0].key == "releases/app.zip"
|
||||
assert response.contents[0].is_delta is True
|
||||
assert len(contents) == 1
|
||||
assert contents[0]["Key"] == "releases/app.zip"
|
||||
assert contents[0].get("Metadata", {}).get("deltaglider-is-delta") == "true"
|
||||
|
||||
def test_list_objects_combined_filtering(self):
|
||||
"""Test filtering of both .delta and reference.bin together."""
|
||||
@@ -170,12 +178,15 @@ class TestSDKFiltering:
|
||||
client = DeltaGliderClient(service)
|
||||
response = client.list_objects(Bucket="test-bucket", Prefix="data/")
|
||||
|
||||
# Response is now a boto3-compatible dict
|
||||
contents = response["Contents"]
|
||||
|
||||
# Should filter out 2 reference.bin files
|
||||
# Should strip .delta from 3 files
|
||||
# Should keep 1 regular file as-is
|
||||
assert len(response.contents) == 4 # 3 deltas + 1 regular file
|
||||
assert len(contents) == 4 # 3 deltas + 1 regular file
|
||||
|
||||
keys = [obj.key for obj in response.contents]
|
||||
keys = [obj["Key"] for obj in contents]
|
||||
expected_keys = ["data/file1.zip", "data/file2.zip", "data/file3.txt", "data/sub/app.jar"]
|
||||
assert sorted(keys) == sorted(expected_keys)
|
||||
|
||||
@@ -232,12 +243,12 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify reference.bin cleanup was triggered
|
||||
assert "cleaned_reference" in result
|
||||
assert result["cleaned_reference"] == "releases/reference.bin"
|
||||
assert result.cleaned_reference is not None
|
||||
assert result.cleaned_reference == "releases/reference.bin"
|
||||
|
||||
# Verify both files were deleted
|
||||
assert mock_storage.delete.call_count == 2
|
||||
@@ -284,11 +295,11 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app-v1.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify reference.bin was NOT cleaned up
|
||||
assert "cleaned_reference" not in result
|
||||
assert result.cleaned_reference is None
|
||||
|
||||
# Verify only the delta was deleted, not reference.bin
|
||||
assert mock_storage.delete.call_count == 1
|
||||
@@ -331,11 +342,11 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/app.zip.delta"))
|
||||
|
||||
# Verify delta was deleted
|
||||
assert result["deleted"] is True
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted is True
|
||||
assert result.type == "delta"
|
||||
|
||||
# Verify no reference cleanup (since it didn't exist)
|
||||
assert "cleaned_reference" not in result
|
||||
assert result.cleaned_reference is None
|
||||
|
||||
# Only delta should be deleted
|
||||
assert mock_storage.delete.call_count == 1
|
||||
@@ -384,7 +395,7 @@ class TestSingleDeleteCleanup:
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="releases/1.0/app.zip.delta"))
|
||||
|
||||
# Should clean up only 1.0/reference.bin
|
||||
assert result["cleaned_reference"] == "releases/1.0/reference.bin"
|
||||
assert result.cleaned_reference == "releases/1.0/reference.bin"
|
||||
|
||||
# Verify correct files deleted
|
||||
delete_calls = [call[0][0] for call in mock_storage.delete.call_args_list]
|
||||
@@ -425,9 +436,9 @@ class TestRecursiveDeleteCleanup:
|
||||
result = service.delete_recursive("test-bucket", "data/")
|
||||
|
||||
# Should delete both delta and reference
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result.deleted_count == 2
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -5,6 +5,7 @@ from unittest.mock import Mock, patch
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.main import create_service
|
||||
from deltaglider.core.models import RecursiveDeleteResult
|
||||
from deltaglider.ports.storage import ObjectHead
|
||||
|
||||
|
||||
@@ -28,10 +29,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete_recursive("test-bucket", "nonexistent/")
|
||||
|
||||
assert result["deleted_count"] == 0
|
||||
assert result["failed_count"] == 0
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert result.deleted_count == 0
|
||||
assert result.failed_count == 0
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
def test_delete_recursive_returns_structured_result(self):
|
||||
"""Test that delete_recursive returns a properly structured result."""
|
||||
@@ -57,26 +58,22 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Verify structure
|
||||
required_keys = [
|
||||
"bucket",
|
||||
"prefix",
|
||||
"deleted_count",
|
||||
"failed_count",
|
||||
"deltas_deleted",
|
||||
"references_deleted",
|
||||
"direct_deleted",
|
||||
"other_deleted",
|
||||
"errors",
|
||||
"warnings",
|
||||
]
|
||||
for key in required_keys:
|
||||
assert key in result, f"Missing key: {key}"
|
||||
# Verify structure - result is a RecursiveDeleteResult dataclass
|
||||
assert hasattr(result, "bucket")
|
||||
assert hasattr(result, "prefix")
|
||||
assert hasattr(result, "deleted_count")
|
||||
assert hasattr(result, "failed_count")
|
||||
assert hasattr(result, "deltas_deleted")
|
||||
assert hasattr(result, "references_deleted")
|
||||
assert hasattr(result, "direct_deleted")
|
||||
assert hasattr(result, "other_deleted")
|
||||
assert hasattr(result, "errors")
|
||||
assert hasattr(result, "warnings")
|
||||
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert isinstance(result.deleted_count, int)
|
||||
assert isinstance(result.failed_count, int)
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
def test_delete_recursive_categorizes_objects_correctly(self):
|
||||
"""Test that delete_recursive correctly categorizes different object types."""
|
||||
@@ -117,12 +114,12 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should categorize correctly - the exact categorization depends on implementation
|
||||
assert result["deltas_deleted"] == 1 # app.zip.delta
|
||||
assert result["references_deleted"] == 1 # reference.bin
|
||||
assert result.deltas_deleted == 1 # app.zip.delta
|
||||
assert result.references_deleted == 1 # reference.bin
|
||||
# Direct and other files may be categorized differently based on metadata detection
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2 # readme.txt + config.json
|
||||
assert result["deleted_count"] == 4 # total
|
||||
assert result["failed_count"] == 0
|
||||
assert result.direct_deleted + result.other_deleted == 2 # readme.txt + config.json
|
||||
assert result.deleted_count == 4 # total
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_delete_recursive_handles_storage_errors_gracefully(self):
|
||||
"""Test that delete_recursive handles individual storage errors gracefully."""
|
||||
@@ -151,10 +148,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "test/")
|
||||
|
||||
# Should handle partial failure
|
||||
assert result["deleted_count"] == 1 # good.zip.delta succeeded
|
||||
assert result["failed_count"] == 1 # bad.zip.delta failed
|
||||
assert len(result["errors"]) == 1
|
||||
assert "bad" in result["errors"][0]
|
||||
assert result.deleted_count == 1 # good.zip.delta succeeded
|
||||
assert result.failed_count == 1 # bad.zip.delta failed
|
||||
assert len(result.errors) == 1
|
||||
assert "bad" in result.errors[0]
|
||||
|
||||
def test_affected_deltaspaces_discovery(self):
|
||||
"""Test that the system discovers affected deltaspaces when deleting deltas."""
|
||||
@@ -206,8 +203,8 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "project/team-a/v1/")
|
||||
|
||||
# Should have discovered and evaluated the parent reference
|
||||
assert result["deleted_count"] >= 1 # At least the delta file
|
||||
assert result["failed_count"] == 0
|
||||
assert result.deleted_count >= 1 # At least the delta file
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_cli_uses_core_service_method(self):
|
||||
"""Test that CLI rm -r command uses the core service delete_recursive method."""
|
||||
@@ -222,14 +219,12 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
mock_create_service.return_value = mock_service
|
||||
|
||||
# Mock successful deletion
|
||||
mock_service.delete_recursive.return_value = {
|
||||
"bucket": "test-bucket",
|
||||
"prefix": "test/",
|
||||
"deleted_count": 2,
|
||||
"failed_count": 0,
|
||||
"warnings": [],
|
||||
"errors": [],
|
||||
}
|
||||
mock_service.delete_recursive.return_value = RecursiveDeleteResult(
|
||||
bucket="test-bucket",
|
||||
prefix="test/",
|
||||
deleted_count=2,
|
||||
failed_count=0,
|
||||
)
|
||||
|
||||
result = runner.invoke(cli, ["rm", "-r", "s3://test-bucket/test/"])
|
||||
|
||||
@@ -294,8 +289,8 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
|
||||
result = service.delete(ObjectKey(bucket="test-bucket", key="test/file.zip.delta"))
|
||||
|
||||
assert result["deleted"]
|
||||
assert result["type"] == "delta"
|
||||
assert result.deleted
|
||||
assert result.type == "delta"
|
||||
|
||||
def test_reference_cleanup_intelligence_basic(self):
|
||||
"""Basic test to verify reference cleanup intelligence is working."""
|
||||
@@ -328,10 +323,10 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "simple/")
|
||||
|
||||
# Should delete both delta and reference since there are no other dependencies
|
||||
assert result["deleted_count"] == 2
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result["failed_count"] == 0
|
||||
assert result.deleted_count == 2
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
assert result.failed_count == 0
|
||||
|
||||
def test_comprehensive_result_validation(self):
|
||||
"""Test that all result fields are properly populated."""
|
||||
@@ -366,31 +361,31 @@ class TestRecursiveDeleteReferenceCleanup:
|
||||
result = service.delete_recursive("test-bucket", "mixed/")
|
||||
|
||||
# Validate all expected fields are present and have correct types
|
||||
assert isinstance(result["bucket"], str)
|
||||
assert isinstance(result["prefix"], str)
|
||||
assert isinstance(result["deleted_count"], int)
|
||||
assert isinstance(result["failed_count"], int)
|
||||
assert isinstance(result["deltas_deleted"], int)
|
||||
assert isinstance(result["references_deleted"], int)
|
||||
assert isinstance(result["direct_deleted"], int)
|
||||
assert isinstance(result["other_deleted"], int)
|
||||
assert isinstance(result["errors"], list)
|
||||
assert isinstance(result["warnings"], list)
|
||||
assert isinstance(result.bucket, str)
|
||||
assert isinstance(result.prefix, str)
|
||||
assert isinstance(result.deleted_count, int)
|
||||
assert isinstance(result.failed_count, int)
|
||||
assert isinstance(result.deltas_deleted, int)
|
||||
assert isinstance(result.references_deleted, int)
|
||||
assert isinstance(result.direct_deleted, int)
|
||||
assert isinstance(result.other_deleted, int)
|
||||
assert isinstance(result.errors, list)
|
||||
assert isinstance(result.warnings, list)
|
||||
|
||||
# Validate counts add up
|
||||
total_by_type = (
|
||||
result["deltas_deleted"]
|
||||
+ result["references_deleted"]
|
||||
+ result["direct_deleted"]
|
||||
+ result["other_deleted"]
|
||||
result.deltas_deleted
|
||||
+ result.references_deleted
|
||||
+ result.direct_deleted
|
||||
+ result.other_deleted
|
||||
)
|
||||
assert result["deleted_count"] == total_by_type
|
||||
assert result.deleted_count == total_by_type
|
||||
|
||||
# Validate specific counts for this scenario
|
||||
assert result["deltas_deleted"] == 1
|
||||
assert result["references_deleted"] == 1
|
||||
assert result.deltas_deleted == 1
|
||||
assert result.references_deleted == 1
|
||||
# Direct and other files may be categorized differently
|
||||
assert result["direct_deleted"] + result["other_deleted"] == 2
|
||||
assert result.direct_deleted + result.other_deleted == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
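The hunks above migrate service.delete_recursive() from returning a plain dict (result["deleted_count"]) to returning a RecursiveDeleteResult accessed by attribute. Judging only from the fields asserted in these tests, the dataclass presumably looks roughly like this; the defaults are an assumption inferred from constructor calls that omit errors and warnings:

from dataclasses import dataclass, field

@dataclass
class RecursiveDeleteResult:
    bucket: str
    prefix: str
    deleted_count: int = 0
    failed_count: int = 0
    deltas_deleted: int = 0
    references_deleted: int = 0
    direct_deleted: int = 0
    other_deleted: int = 0
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)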
271
tests/integration/test_s3_migration.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""Test S3-to-S3 migration functionality."""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.app.cli.aws_compat import migrate_s3_to_s3
|
||||
from deltaglider.core import DeltaService
|
||||
from deltaglider.ports import ObjectHead
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_service():
|
||||
"""Create a mock DeltaService."""
|
||||
service = MagicMock(spec=DeltaService)
|
||||
service.storage = MagicMock()
|
||||
return service
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_resume(mock_service):
|
||||
"""Test migration with resume support (skips existing files)."""
|
||||
# Setup mock storage with source files
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
etag="abc123",
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="file2.zip",
|
||||
size=2048,
|
||||
etag="def456",
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="subdir/file3.zip",
|
||||
size=512,
|
||||
etag="ghi789",
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
# Destination already has file1.zip (as .delta)
|
||||
dest_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip.delta",
|
||||
size=100,
|
||||
last_modified="2024-01-02T00:00:00Z",
|
||||
etag="delta123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
# Configure mock to return appropriate objects
|
||||
def list_side_effect(prefix):
|
||||
if "source-bucket" in prefix:
|
||||
return iter(source_objects)
|
||||
elif "dest-bucket" in prefix:
|
||||
return iter(dest_objects)
|
||||
return iter([])
|
||||
|
||||
mock_service.storage.list.side_effect = list_side_effect
|
||||
|
||||
# Mock the copy operation and click functions
|
||||
# Use quiet=True to skip EC2 detection logging
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True):
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection and logging
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Should copy only file2.zip and subdir/file3.zip (file1 already exists)
|
||||
assert mock_copy.call_count == 2
|
||||
|
||||
# Verify the files being migrated
|
||||
call_args = [call[0] for call in mock_copy.call_args_list]
|
||||
migrated_files = [(args[1], args[2]) for args in call_args]
|
||||
|
||||
assert ("s3://source-bucket/file2.zip", "s3://dest-bucket/file2.zip") in migrated_files
|
||||
assert (
|
||||
"s3://source-bucket/subdir/file3.zip",
|
||||
"s3://dest-bucket/subdir/file3.zip",
|
||||
) in migrated_files
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_dry_run(mock_service):
|
||||
"""Test dry run mode shows what would be migrated without actually migrating."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_service.storage.list.return_value = iter(source_objects)
|
||||
|
||||
# Mock the copy operation and EC2 detection
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("deltaglider.app.cli.aws_compat.click.echo") as mock_echo:
|
||||
with patch("deltaglider.app.cli.aws_compat.log_aws_region"):
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=False, # Allow output to test dry run messages
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=True,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Should not actually copy anything in dry run mode
|
||||
mock_copy.assert_not_called()
|
||||
|
||||
# Should show dry run message
|
||||
echo_calls = [str(call[0][0]) for call in mock_echo.call_args_list if call[0]]
|
||||
assert any("DRY RUN MODE" in msg for msg in echo_calls)
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_filters(mock_service):
|
||||
"""Test migration with include/exclude filters."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="file2.log",
|
||||
size=256,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="def456",
|
||||
metadata={},
|
||||
),
|
||||
ObjectHead(
|
||||
key="file3.tar",
|
||||
size=512,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="ghi789",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_service.storage.list.return_value = iter(source_objects)
|
||||
|
||||
# Mock the copy operation
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("click.echo"):
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True):
|
||||
# Exclude .log files
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude="*.log",
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Should copy file1.zip and file3.tar, but not file2.log
|
||||
assert mock_copy.call_count == 2
|
||||
|
||||
call_args = [call[0] for call in mock_copy.call_args_list]
|
||||
migrated_sources = [args[1] for args in call_args]
|
||||
|
||||
assert "s3://source-bucket/file1.zip" in migrated_sources
|
||||
assert "s3://source-bucket/file3.tar" in migrated_sources
|
||||
assert "s3://source-bucket/file2.log" not in migrated_sources
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_skip_confirm(mock_service):
|
||||
"""Test skipping confirmation prompt with skip_confirm=True."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
mock_service.storage.list.return_value = iter(source_objects)
|
||||
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("click.echo"):
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm") as mock_confirm:
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/",
|
||||
"s3://dest-bucket/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=True, # Skip confirmation
|
||||
)
|
||||
|
||||
# Should not ask for confirmation
|
||||
mock_confirm.assert_not_called()
|
||||
|
||||
# Should still perform the copy
|
||||
mock_copy.assert_called_once()
|
||||
|
||||
|
||||
def test_migrate_s3_to_s3_with_prefix(mock_service):
|
||||
"""Test migration with source and destination prefixes."""
|
||||
source_objects = [
|
||||
ObjectHead(
|
||||
key="data/file1.zip",
|
||||
size=1024,
|
||||
last_modified="2024-01-01T00:00:00Z",
|
||||
etag="abc123",
|
||||
metadata={},
|
||||
),
|
||||
]
|
||||
|
||||
def list_side_effect(prefix):
|
||||
if "source-bucket/data" in prefix:
|
||||
return iter(source_objects)
|
||||
return iter([])
|
||||
|
||||
mock_service.storage.list.side_effect = list_side_effect
|
||||
|
||||
with patch("deltaglider.app.cli.aws_compat.copy_s3_to_s3") as mock_copy:
|
||||
with patch("click.echo"):
|
||||
with patch("deltaglider.app.cli.aws_compat.click.confirm", return_value=True):
|
||||
migrate_s3_to_s3(
|
||||
mock_service,
|
||||
"s3://source-bucket/data/",
|
||||
"s3://dest-bucket/archive/",
|
||||
exclude=None,
|
||||
include=None,
|
||||
quiet=True, # Skip EC2 detection
|
||||
no_delta=False,
|
||||
max_ratio=None,
|
||||
dry_run=False,
|
||||
skip_confirm=False,
|
||||
)
|
||||
|
||||
# Verify the correct destination path is used
|
||||
mock_copy.assert_called_once()
|
||||
call_args = mock_copy.call_args[0]
|
||||
assert call_args[1] == "s3://source-bucket/data/file1.zip"
|
||||
assert call_args[2] == "s3://dest-bucket/archive/file1.zip"
|
||||
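The resume behaviour exercised in this file (skip a source key when the destination already holds it, either directly or as a .delta) can be summarized as below. already_migrated is a hypothetical helper for illustration, not necessarily how migrate_s3_to_s3 is implemented:

def already_migrated(dest_keys: set[str], key: str) -> bool:
    """True if the destination already has the object, stored as-is or as a delta."""
    return key in dest_keys or f"{key}.delta" in dest_keys

dest_keys = {"file1.zip.delta"}
assert already_migrated(dest_keys, "file1.zip")       # skipped on resume
assert not already_migrated(dest_keys, "file2.zip")   # still copied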
255
tests/integration/test_stats_command.py
Normal file
@@ -0,0 +1,255 @@
|
||||
"""Integration tests for stats CLI command."""
|
||||
|
||||
import json
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from click.testing import CliRunner
|
||||
|
||||
from deltaglider.app.cli.main import cli
|
||||
from deltaglider.client_models import BucketStats
|
||||
|
||||
|
||||
class TestStatsCommand:
|
||||
"""Test stats CLI command."""
|
||||
|
||||
def test_stats_json_output(self):
|
||||
"""Test stats command with JSON output."""
|
||||
# Create mock bucket stats
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=10,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=7,
|
||||
direct_objects=3,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
# Setup mock client
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
# Run command
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--json"])
|
||||
|
||||
# Verify
|
||||
assert result.exit_code == 0
|
||||
output = json.loads(result.output)
|
||||
assert output["bucket"] == "test-bucket"
|
||||
assert output["object_count"] == 10
|
||||
assert output["total_size"] == 1000000
|
||||
assert output["compressed_size"] == 500000
|
||||
assert output["space_saved"] == 500000
|
||||
assert output["average_compression_ratio"] == 0.5
|
||||
assert output["delta_objects"] == 7
|
||||
assert output["direct_objects"] == 3
|
||||
|
||||
# Verify client was called correctly
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_json_output_detailed(self):
|
||||
"""Test stats command with detailed JSON output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=2000000,
|
||||
compressed_size=100000,
|
||||
space_saved=1900000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=5,
|
||||
direct_objects=0,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--detailed", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
output = json.loads(result.output)
|
||||
assert output["average_compression_ratio"] == 0.95
|
||||
|
||||
# Verify detailed flag was passed
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="detailed", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_json_output_sampled(self):
|
||||
"""Test stats command with sampled JSON output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=2000000,
|
||||
compressed_size=100000,
|
||||
space_saved=1900000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=5,
|
||||
direct_objects=0,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="sampled", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_sampled_and_detailed_conflict(self):
|
||||
"""--sampled and --detailed flags must be mutually exclusive."""
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--detailed"])
|
||||
|
||||
assert result.exit_code == 1
|
||||
assert "cannot be used together" in result.output
|
||||
|
||||
def test_stats_human_readable_output(self):
|
||||
"""Test stats command with human-readable output."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=10,
|
||||
total_size=1500000, # ~1.43 MB
|
||||
compressed_size=300000, # ~293 KB
|
||||
space_saved=1200000, # ~1.14 MB
|
||||
average_compression_ratio=0.8,
|
||||
delta_objects=7,
|
||||
direct_objects=3,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "test-bucket"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
output = result.output
|
||||
|
||||
# Verify human-readable format
|
||||
assert "Bucket Statistics: test-bucket" in output
|
||||
assert "Total Objects:" in output
|
||||
assert "10" in output
|
||||
assert "Delta Objects:" in output
|
||||
assert "7" in output
|
||||
assert "Direct Objects:" in output
|
||||
assert "3" in output
|
||||
assert "Original Size:" in output
|
||||
assert "Compressed Size:" in output
|
||||
assert "Space Saved:" in output
|
||||
assert "Compression Ratio:" in output
|
||||
assert "80.0%" in output # 0.8 = 80%
|
||||
|
||||
def test_stats_error_handling(self):
|
||||
"""Test stats command error handling."""
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.side_effect = Exception("Bucket not found")
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "nonexistent-bucket"])
|
||||
|
||||
assert result.exit_code == 1
|
||||
assert "Error: Bucket not found" in result.output
|
||||
|
||||
def test_stats_with_s3_url(self):
|
||||
"""Test stats command with s3:// URL format."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "s3://test-bucket", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify bucket name was parsed correctly from S3 URL
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_with_s3_url_trailing_slash(self):
|
||||
"""Test stats command with s3:// URL format with trailing slash."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "s3://test-bucket/", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify bucket name was parsed correctly from S3 URL with trailing slash
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
|
||||
def test_stats_with_s3_url_with_prefix(self):
|
||||
"""Test stats command with s3:// URL format with prefix (should ignore prefix)."""
|
||||
mock_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=5,
|
||||
total_size=1000000,
|
||||
compressed_size=500000,
|
||||
space_saved=500000,
|
||||
average_compression_ratio=0.5,
|
||||
delta_objects=3,
|
||||
direct_objects=2,
|
||||
)
|
||||
|
||||
with patch("deltaglider.client.DeltaGliderClient") as mock_client_class:
|
||||
mock_client = Mock()
|
||||
mock_client.get_bucket_stats.return_value = mock_stats
|
||||
mock_client_class.return_value = mock_client
|
||||
|
||||
runner = CliRunner()
|
||||
result = runner.invoke(cli, ["stats", "s3://test-bucket/some/prefix/", "--json"])
|
||||
|
||||
assert result.exit_code == 0
|
||||
# Verify only bucket name was extracted, prefix ignored
|
||||
mock_client.get_bucket_stats.assert_called_once_with(
|
||||
"test-bucket", mode="quick", use_cache=True, refresh_cache=False
|
||||
)
|
||||
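For reference, the CLI invocations covered above correspond to calls of this form; the --sampled/--detailed pair is rejected when combined, as the conflict test checks:

from click.testing import CliRunner
from deltaglider.app.cli.main import cli

runner = CliRunner()
runner.invoke(cli, ["stats", "s3://test-bucket", "--json"])              # quick mode (default)
runner.invoke(cli, ["stats", "test-bucket", "--detailed", "--json"])     # detailed mode
runner.invoke(cli, ["stats", "test-bucket", "--sampled", "--detailed"])  # exit code 1: flags conflict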
189
tests/unit/test_cache_encrypted.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Tests for encrypted cache adapter."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from cryptography.fernet import Fernet
|
||||
|
||||
from deltaglider.adapters import ContentAddressedCache, EncryptedCache, Sha256Adapter
|
||||
from deltaglider.core.errors import CacheCorruptionError, CacheMissError
|
||||
|
||||
|
||||
class TestEncryptedCache:
|
||||
"""Test encrypted cache wrapper functionality."""
|
||||
|
||||
@pytest.fixture
|
||||
def temp_dir(self):
|
||||
"""Create temporary directory for tests."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
yield Path(tmpdir)
|
||||
|
||||
@pytest.fixture
|
||||
def hasher(self):
|
||||
"""Create SHA256 hasher."""
|
||||
return Sha256Adapter()
|
||||
|
||||
@pytest.fixture
|
||||
def backend(self, temp_dir, hasher):
|
||||
"""Create CAS backend."""
|
||||
return ContentAddressedCache(temp_dir, hasher)
|
||||
|
||||
@pytest.fixture
|
||||
def encrypted_cache(self, backend):
|
||||
"""Create encrypted cache with ephemeral key."""
|
||||
return EncryptedCache(backend)
|
||||
|
||||
def test_ephemeral_key_generation(self, backend):
|
||||
"""Test that ephemeral key is generated automatically."""
|
||||
cache = EncryptedCache(backend)
|
||||
|
||||
assert cache._ephemeral is True
|
||||
assert cache._key is not None
|
||||
assert len(cache._key) == 44 # Base64-encoded 32-byte key
|
||||
|
||||
def test_provided_key_usage(self, backend):
|
||||
"""Test using provided encryption key."""
|
||||
key = Fernet.generate_key()
|
||||
cache = EncryptedCache(backend, encryption_key=key)
|
||||
|
||||
assert cache._ephemeral is False
|
||||
assert cache._key == key
|
||||
|
||||
def test_write_and_read_encrypted(self, encrypted_cache, temp_dir):
|
||||
"""Test writing and reading encrypted content."""
|
||||
# Create test file
|
        test_file = temp_dir / "test.txt"
        test_content = b"Secret data that should be encrypted"
        test_file.write_bytes(test_content)

        # Compute expected SHA
        import hashlib

        expected_sha = hashlib.sha256(test_content).hexdigest()

        # Write to encrypted cache
        encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Read back and validate
        decrypted_path = encrypted_cache.get_validated_ref(
            "test-bucket", "test-prefix", expected_sha
        )

        # Verify decrypted content matches original
        decrypted_content = decrypted_path.read_bytes()
        assert decrypted_content == test_content

    def test_encrypted_storage_not_readable(self, encrypted_cache, backend, temp_dir):
        """Test that stored data is actually encrypted."""
        # Create test file
        test_file = temp_dir / "test.txt"
        test_content = b"Plaintext secret"
        test_file.write_bytes(test_content)

        # Write to encrypted cache
        encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Get the encrypted file path from backend
        backend_path = backend.ref_path("test-bucket", "test-prefix")

        # Read encrypted content directly
        encrypted_content = backend_path.read_bytes()

        # Verify content is NOT the same as plaintext
        assert encrypted_content != test_content
        # Verify content doesn't contain plaintext substring
        assert b"secret" not in encrypted_content.lower()

    def test_cache_miss(self, encrypted_cache):
        """Test cache miss error."""
        with pytest.raises(CacheMissError):
            encrypted_cache.get_validated_ref("no-bucket", "no-prefix", "fakehash")

    def test_decryption_with_wrong_sha(self, encrypted_cache, temp_dir):
        """Test that wrong SHA is detected after decryption."""
        # Create test file
        test_file = temp_dir / "test.txt"
        test_content = b"Test content"
        test_file.write_bytes(test_content)

        # Write to cache
        encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Try to read with wrong SHA
        with pytest.raises(CacheCorruptionError, match="SHA mismatch"):
            encrypted_cache.get_validated_ref("test-bucket", "test-prefix", "wrong_sha_hash_here")

    def test_decryption_with_wrong_key(self, temp_dir):
        """Test that decryption fails with wrong key."""
        # Create shared backend
        from deltaglider.adapters import ContentAddressedCache, Sha256Adapter

        hasher = Sha256Adapter()
        backend = ContentAddressedCache(temp_dir / "shared", hasher)

        # Create two caches with different keys sharing same backend
        cache1 = EncryptedCache(backend)

        # Write with cache1
        test_file = temp_dir / "test.txt"
        test_content = b"Encrypted data"
        test_file.write_bytes(test_content)

        import hashlib

        expected_sha = hashlib.sha256(test_content).hexdigest()

        cache1.write_ref("test-bucket", "test-prefix", test_file)

        # Create cache2 with different key (fresh instance, different ephemeral key)
        # and manually add to its mapping (simulating persistent storage scenario)
        cache2 = EncryptedCache(backend)
        cache2._plaintext_sha_map[("test-bucket", "test-prefix")] = expected_sha

        # Try to read with cache2 (different key) - should fail decryption
        with pytest.raises(CacheCorruptionError, match="Decryption failed"):
            cache2.get_validated_ref("test-bucket", "test-prefix", expected_sha)

    def test_evict_cleans_decrypted_files(self, encrypted_cache, temp_dir):
        """Test that evict cleans up .decrypted temporary files."""
        # Create and store file
        test_file = temp_dir / "test.txt"
        test_content = b"Test"
        test_file.write_bytes(test_content)

        import hashlib

        expected_sha = hashlib.sha256(test_content).hexdigest()

        encrypted_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Read to create .decrypted file
        decrypted_path = encrypted_cache.get_validated_ref(
            "test-bucket", "test-prefix", expected_sha
        )
        assert decrypted_path.exists()

        # Evict
        encrypted_cache.evict("test-bucket", "test-prefix")

        # Verify .decrypted file is removed
        assert not decrypted_path.exists()

    def test_from_env_with_no_key(self, backend, monkeypatch):
        """Test from_env creates ephemeral key when env var not set."""
        monkeypatch.delenv("DG_CACHE_ENCRYPTION_KEY", raising=False)

        cache = EncryptedCache.from_env(backend)

        assert cache._ephemeral is True

    def test_from_env_with_key(self, backend, monkeypatch):
        """Test from_env uses key from environment."""
        key = Fernet.generate_key()
        monkeypatch.setenv("DG_CACHE_ENCRYPTION_KEY", key.decode("utf-8"))

        cache = EncryptedCache.from_env(backend)

        assert cache._ephemeral is False
        assert cache._key == key
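The wrong-key test above relies on a basic property of Fernet: a token produced with one key cannot be decrypted with another. A minimal sketch of that property, using only the cryptography.fernet API these tests already touch (the EncryptedCache internals are not part of this diff, so this is illustrative, not the adapter's code):

from cryptography.fernet import Fernet, InvalidToken

key_a = Fernet.generate_key()
key_b = Fernet.generate_key()

token = Fernet(key_a).encrypt(b"Secret data")       # ciphertext is what lands on disk
assert Fernet(key_a).decrypt(token) == b"Secret data"

try:
    Fernet(key_b).decrypt(token)                    # different key -> decryption fails
except InvalidToken:
    pass  # the adapter surfaces this as CacheCorruptionError("Decryption failed")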
tests/unit/test_cache_memory.py (new file, 200 lines added)
@@ -0,0 +1,200 @@
"""Tests for in-memory cache adapter."""

import tempfile
from pathlib import Path

import pytest

from deltaglider.adapters import MemoryCache, Sha256Adapter
from deltaglider.core.errors import CacheCorruptionError, CacheMissError


class TestMemoryCache:
    """Test in-memory cache functionality."""

    @pytest.fixture
    def temp_dir(self):
        """Create temporary directory for tests."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir)

    @pytest.fixture
    def hasher(self):
        """Create SHA256 hasher."""
        return Sha256Adapter()

    @pytest.fixture
    def memory_cache(self, hasher, temp_dir):
        """Create memory cache with 1MB limit."""
        return MemoryCache(hasher, max_size_mb=1, temp_dir=temp_dir)

    def test_write_and_read(self, memory_cache, temp_dir):
        """Test basic write and read functionality."""
        # Create test file
        test_file = temp_dir / "test.txt"
        test_content = b"Hello, memory cache!"
        test_file.write_bytes(test_content)

        # Compute expected SHA
        import hashlib

        expected_sha = hashlib.sha256(test_content).hexdigest()

        # Write to memory cache
        memory_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Read back
        retrieved_path = memory_cache.get_validated_ref("test-bucket", "test-prefix", expected_sha)

        # Verify content
        assert retrieved_path.read_bytes() == test_content

    def test_has_ref_true(self, memory_cache, temp_dir):
        """Test has_ref returns True for existing content."""
        test_file = temp_dir / "test.txt"
        test_content = b"Test"
        test_file.write_bytes(test_content)

        import hashlib

        sha = hashlib.sha256(test_content).hexdigest()

        memory_cache.write_ref("test-bucket", "test-prefix", test_file)

        assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is True

    def test_has_ref_false(self, memory_cache):
        """Test has_ref returns False for non-existent content."""
        assert memory_cache.has_ref("no-bucket", "no-prefix", "fakehash") is False

    def test_cache_miss(self, memory_cache):
        """Test cache miss error."""
        with pytest.raises(CacheMissError):
            memory_cache.get_validated_ref("no-bucket", "no-prefix", "fakehash")

    def test_sha_mismatch_detection(self, memory_cache, temp_dir):
        """Test that SHA mismatch is detected."""
        test_file = temp_dir / "test.txt"
        test_file.write_bytes(b"Content")

        memory_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Try to read with wrong SHA
        with pytest.raises(CacheCorruptionError, match="SHA mismatch"):
            memory_cache.get_validated_ref("test-bucket", "test-prefix", "wrong_sha")

    def test_lru_eviction(self, hasher, temp_dir):
        """Test LRU eviction when cache is full."""
        # Create small cache (only 10KB)
        small_cache = MemoryCache(hasher, max_size_mb=0.01, temp_dir=temp_dir)

        # Create files that will exceed cache limit
        file1 = temp_dir / "file1.txt"
        file2 = temp_dir / "file2.txt"
        file3 = temp_dir / "file3.txt"

        # Each file is 5KB
        file1.write_bytes(b"A" * 5000)
        file2.write_bytes(b"B" * 5000)
        file3.write_bytes(b"C" * 5000)

        # Write file1 and file2 (total 10KB, at limit)
        small_cache.write_ref("bucket", "prefix1", file1)
        small_cache.write_ref("bucket", "prefix2", file2)

        # Verify both are in cache
        import hashlib

        sha1 = hashlib.sha256(b"A" * 5000).hexdigest()
        sha2 = hashlib.sha256(b"B" * 5000).hexdigest()

        assert small_cache.has_ref("bucket", "prefix1", sha1) is True
        assert small_cache.has_ref("bucket", "prefix2", sha2) is True

        # Write file3 (5KB) - should evict file1 (LRU)
        small_cache.write_ref("bucket", "prefix3", file3)

        # file1 should be evicted
        assert small_cache.has_ref("bucket", "prefix1", sha1) is False

        # file2 and file3 should still be in cache
        sha3 = hashlib.sha256(b"C" * 5000).hexdigest()
        assert small_cache.has_ref("bucket", "prefix2", sha2) is True
        assert small_cache.has_ref("bucket", "prefix3", sha3) is True

    def test_file_too_large_for_cache(self, hasher, temp_dir):
        """Test error when file exceeds cache size limit."""
        small_cache = MemoryCache(hasher, max_size_mb=0.001, temp_dir=temp_dir)  # 1KB limit

        large_file = temp_dir / "large.txt"
        large_file.write_bytes(b"X" * 2000)  # 2KB file

        with pytest.raises(CacheCorruptionError, match="too large"):
            small_cache.write_ref("bucket", "prefix", large_file)

    def test_evict_removes_from_memory(self, memory_cache, temp_dir):
        """Test that evict removes content from memory."""
        test_file = temp_dir / "test.txt"
        test_content = b"Test"
        test_file.write_bytes(test_content)

        import hashlib

        sha = hashlib.sha256(test_content).hexdigest()

        memory_cache.write_ref("test-bucket", "test-prefix", test_file)

        # Verify it's in cache
        assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is True

        # Evict
        memory_cache.evict("test-bucket", "test-prefix")

        # Verify it's gone
        assert memory_cache.has_ref("test-bucket", "test-prefix", sha) is False

    def test_clear_removes_all(self, memory_cache, temp_dir):
        """Test that clear removes all cached content."""
        # Add multiple files
        for i in range(3):
            test_file = temp_dir / f"test{i}.txt"
            test_file.write_bytes(f"Content {i}".encode())
            memory_cache.write_ref("bucket", f"prefix{i}", test_file)

        # Verify cache is not empty
        assert memory_cache._current_size > 0
        assert len(memory_cache._cache) == 3

        # Clear
        memory_cache.clear()

        # Verify cache is empty
        assert memory_cache._current_size == 0
        assert len(memory_cache._cache) == 0
        assert len(memory_cache._access_order) == 0

    def test_access_order_updated_on_read(self, memory_cache, temp_dir):
        """Test that LRU access order is updated on reads."""
        # Create two files
        file1 = temp_dir / "file1.txt"
        file2 = temp_dir / "file2.txt"
        file1.write_bytes(b"File 1")
        file2.write_bytes(b"File 2")

        # Write both
        memory_cache.write_ref("bucket", "prefix1", file1)
        memory_cache.write_ref("bucket", "prefix2", file2)

        # Access order should be: [prefix1, prefix2]
        assert memory_cache._access_order[0] == ("bucket", "prefix1")
        assert memory_cache._access_order[1] == ("bucket", "prefix2")

        # Read prefix1 again
        import hashlib

        sha1 = hashlib.sha256(b"File 1").hexdigest()
        memory_cache.get_validated_ref("bucket", "prefix1", sha1)

        # Access order should now be: [prefix2, prefix1]
        assert memory_cache._access_order[0] == ("bucket", "prefix2")
        assert memory_cache._access_order[1] == ("bucket", "prefix1")
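Outside pytest, the same cache behaviour can be exercised directly. A minimal usage sketch, assuming only the constructor and methods these tests exercise (write_ref / has_ref / get_validated_ref / evict); bucket and prefix names here are illustrative:

import hashlib
import tempfile
from pathlib import Path

from deltaglider.adapters import MemoryCache, Sha256Adapter

with tempfile.TemporaryDirectory() as tmp:
    tmp_path = Path(tmp)
    cache = MemoryCache(Sha256Adapter(), max_size_mb=1, temp_dir=tmp_path)

    ref = tmp_path / "reference.bin"
    ref.write_bytes(b"reference payload")
    sha = hashlib.sha256(ref.read_bytes()).hexdigest()

    cache.write_ref("releases", "pro", ref)             # cache keyed by (bucket, prefix)
    if cache.has_ref("releases", "pro", sha):            # hit only if the SHA matches
        local = cache.get_validated_ref("releases", "pro", sha)
        assert local.read_bytes() == b"reference payload"
    cache.evict("releases", "pro")                        # frees the slot for LRU accounting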
@@ -50,10 +50,10 @@ class TestDeltaServicePut:
         ref_sha = service.hasher.sha256(io.BytesIO(ref_content))

         ref_metadata = {
-            "tool": "deltaglider/0.1.0",
-            "source_name": "original.zip",
-            "file_sha256": ref_sha,
-            "created_at": "2025-01-01T00:00:00Z",
+            "dg-tool": "deltaglider/0.1.0",
+            "dg-source-name": "original.zip",
+            "dg-file-sha256": ref_sha,
+            "dg-created-at": "2025-01-01T00:00:00Z",
         }
         mock_storage.head.return_value = ObjectHead(
             key="test/prefix/reference.bin",
@@ -98,7 +98,7 @@ class TestDeltaServicePut:
         ref_sha = service.hasher.sha256(io.BytesIO(ref_content))

         ref_metadata = {
-            "file_sha256": ref_sha,
+            "dg-file-sha256": ref_sha,
         }
         mock_storage.head.return_value = ObjectHead(
             key="test/prefix/reference.bin",
@@ -147,22 +147,36 @@ class TestDeltaServiceGet:
         service.get(delta_key, temp_dir / "output.zip")

     def test_get_missing_metadata(self, service, mock_storage, temp_dir):
-        """Test get with missing metadata."""
+        """Test get with missing metadata (regular S3 object)."""
         # Setup
         delta_key = ObjectKey(bucket="test-bucket", key="test/file.zip.delta")

+        # Create test content
+        test_content = b"regular S3 file content"
+
+        # Mock a regular S3 object without DeltaGlider metadata
         mock_storage.head.return_value = ObjectHead(
             key="test/file.zip.delta",
-            size=100,
+            size=len(test_content),
             etag="abc",
             last_modified=None,
-            metadata={},  # Missing required metadata
+            metadata={},  # Missing DeltaGlider metadata - this is a regular S3 object
         )

-        # Execute and verify
-        from deltaglider.core.errors import StorageIOError
+        # Mock the storage.get to return the content
+        from unittest.mock import MagicMock

-        with pytest.raises(StorageIOError):
-            service.get(delta_key, temp_dir / "output.zip")
+        mock_stream = MagicMock()
+        mock_stream.read.side_effect = [test_content, b""]  # Return content then EOF
+        mock_storage.get.return_value = mock_stream
+
+        # Execute - should successfully download regular S3 object
+        output_path = temp_dir / "output.zip"
+        service.get(delta_key, output_path)
+
+        # Verify - file should be downloaded
+        assert output_path.exists()
+        assert output_path.read_bytes() == test_content


 class TestDeltaServiceVerify:
@@ -186,15 +200,15 @@ class TestDeltaServiceVerify:
         ref_sha = service.hasher.sha256(io.BytesIO(ref_content))

         delta_metadata = {
-            "tool": "deltaglider/0.1.0",
-            "original_name": "file.zip",
-            "file_sha256": test_sha,
-            "file_size": str(len(test_content)),
-            "created_at": "2025-01-01T00:00:00Z",
-            "ref_key": "test/reference.bin",
-            "ref_sha256": ref_sha,
-            "delta_size": "100",
-            "delta_cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta",
+            "dg-tool": "deltaglider/0.1.0",
+            "dg-original-name": "file.zip",
+            "dg-file-sha256": test_sha,
+            "dg-file-size": str(len(test_content)),
+            "dg-created-at": "2025-01-01T00:00:00Z",
+            "dg-ref-key": "test/reference.bin",
+            "dg-ref-sha256": ref_sha,
+            "dg-delta-size": "100",
+            "dg-delta-cmd": "xdelta3 -e -9 -s reference.bin file.zip file.zip.delta",
         }
         mock_storage.head.return_value = ObjectHead(
             key="test/file.zip.delta",
tests/unit/test_delta_extensions.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
"""Tests for shared delta extension policy."""

from deltaglider.core.delta_extensions import (
    DEFAULT_COMPOUND_DELTA_EXTENSIONS,
    DEFAULT_DELTA_EXTENSIONS,
    is_delta_candidate,
)


def test_is_delta_candidate_matches_default_extensions():
    """All default extensions should be detected as delta candidates."""
    for ext in DEFAULT_DELTA_EXTENSIONS:
        assert is_delta_candidate(f"file{ext}")


def test_is_delta_candidate_matches_compound_extensions():
    """Compound extensions should be handled even with multiple suffixes."""
    for ext in DEFAULT_COMPOUND_DELTA_EXTENSIONS:
        assert is_delta_candidate(f"file{ext}")


def test_is_delta_candidate_rejects_other_extensions():
    """Non delta-friendly extensions should return False."""
    assert not is_delta_candidate("document.txt")
    assert not is_delta_candidate("image.jpeg")
tests/unit/test_object_listing.py (new file, 112 lines added)
@@ -0,0 +1,112 @@
"""Unit tests for object_listing pagination."""

from unittest.mock import Mock

from deltaglider.core.object_listing import list_all_objects, list_objects_page


def test_list_objects_page_passes_continuation_token():
    """Test that list_objects_page passes continuation_token to storage."""
    storage = Mock()
    storage.list_objects.return_value = {
        "objects": [],
        "common_prefixes": [],
        "is_truncated": False,
        "next_continuation_token": None,
        "key_count": 0,
    }

    list_objects_page(
        storage,
        bucket="test-bucket",
        continuation_token="test-token",
    )

    # Verify continuation_token was passed
    storage.list_objects.assert_called_once()
    call_kwargs = storage.list_objects.call_args.kwargs
    assert call_kwargs["continuation_token"] == "test-token"


def test_list_all_objects_uses_continuation_token_for_pagination():
    """Test that list_all_objects uses continuation_token (not start_after) for pagination."""
    storage = Mock()

    # Mock 3 pages of results
    responses = [
        {
            "objects": [{"key": f"obj{i}"} for i in range(1000)],
            "common_prefixes": [],
            "is_truncated": True,
            "next_continuation_token": "token1",
            "key_count": 1000,
        },
        {
            "objects": [{"key": f"obj{i}"} for i in range(1000, 2000)],
            "common_prefixes": [],
            "is_truncated": True,
            "next_continuation_token": "token2",
            "key_count": 1000,
        },
        {
            "objects": [{"key": f"obj{i}"} for i in range(2000, 2500)],
            "common_prefixes": [],
            "is_truncated": False,
            "next_continuation_token": None,
            "key_count": 500,
        },
    ]

    storage.list_objects.side_effect = responses

    result = list_all_objects(
        storage,
        bucket="test-bucket",
        prefix="",
    )

    # Should have made 3 calls
    assert storage.list_objects.call_count == 3

    # Should have collected all objects
    assert len(result.objects) == 2500

    # Should not be truncated
    assert not result.is_truncated

    # Verify the calls used continuation_token correctly
    calls = storage.list_objects.call_args_list
    assert len(calls) == 3

    # First call should have no continuation_token
    assert calls[0].kwargs.get("continuation_token") is None

    # Second call should use token1
    assert calls[1].kwargs.get("continuation_token") == "token1"

    # Third call should use token2
    assert calls[2].kwargs.get("continuation_token") == "token2"


def test_list_all_objects_prevents_infinite_loop():
    """Test that list_all_objects has max_iterations protection."""
    storage = Mock()

    # Mock infinite pagination (always returns more)
    storage.list_objects.return_value = {
        "objects": [{"key": "obj"}],
        "common_prefixes": [],
        "is_truncated": True,
        "next_continuation_token": "token",
        "key_count": 1,
    }

    result = list_all_objects(
        storage,
        bucket="test-bucket",
        max_iterations=10,  # Low limit for testing
    )

    # Should stop at max_iterations
    assert storage.list_objects.call_count == 10
    assert result.is_truncated
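The loop these tests pin down is the standard continuation-token pattern. A minimal sketch of the idea, independent of list_all_objects' actual implementation (which is not part of this diff); collect_all is a hypothetical helper name and the page dict shape is the one the mocks above return:

def collect_all(storage, bucket: str, prefix: str = "", max_iterations: int = 1000) -> list[dict]:
    """Drain a paginated listing by feeding next_continuation_token back in."""
    objects: list[dict] = []
    token = None
    for _ in range(max_iterations):  # hard stop guards against a server that never terminates
        page = storage.list_objects(bucket=bucket, prefix=prefix, continuation_token=token)
        objects.extend(page["objects"])
        if not page["is_truncated"]:
            break
        token = page["next_continuation_token"]
    return objects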
tests/unit/test_s3_compat.py (new file, 70 lines added)
@@ -0,0 +1,70 @@
"""Tests for S3-compatible storage compatibility.

Ensures the S3 adapter works with non-AWS S3 endpoints (Hetzner, MinIO, etc.)
that don't support newer AWS-specific features like automatic request checksums.
"""

from unittest.mock import MagicMock, patch

from deltaglider.adapters.storage_s3 import S3StorageAdapter


class TestS3CompatibleEndpoints:
    """Verify S3 adapter configuration for non-AWS endpoint compatibility."""

    def test_client_disables_automatic_checksums(self):
        """boto3 1.36+ sends CRC32/CRC64 checksums by default.

        S3-compatible stores (Hetzner, MinIO) reject these with BadRequest.
        The adapter must set request_checksum_calculation='when_required'.
        """
        with patch("deltaglider.adapters.storage_s3.boto3.client") as mock_client:
            S3StorageAdapter(endpoint_url="https://example.com")

            mock_client.assert_called_once()
            call_kwargs = mock_client.call_args
            config = call_kwargs.kwargs.get("config") or call_kwargs[1].get("config")

            assert config is not None, "boto3 client must be created with a Config object"
            assert config.request_checksum_calculation == "when_required"
            assert config.response_checksum_validation == "when_required"

    def test_put_object_no_checksum_kwargs(self, temp_dir):
        """put_object must not pass ChecksumAlgorithm or similar kwargs."""
        mock_client = MagicMock()
        mock_client.put_object.return_value = {"ETag": '"abc123"'}

        adapter = S3StorageAdapter(client=mock_client)

        test_file = temp_dir / "test.sha1"
        test_file.write_text("abc123")

        adapter.put(
            "my-bucket/test/test.sha1",
            test_file,
            {"compression": "none", "tool": "deltaglider"},
        )

        mock_client.put_object.assert_called_once()
        call_kwargs = mock_client.put_object.call_args.kwargs

        checksum_keys = {
            "ChecksumAlgorithm",
            "ChecksumCRC32",
            "ChecksumCRC32C",
            "ChecksumCRC64NVME",
            "ChecksumSHA1",
            "ChecksumSHA256",
            "ContentMD5",
        }
        passed_checksum_keys = checksum_keys & set(call_kwargs.keys())
        assert not passed_checksum_keys, (
            f"put_object must not pass checksum kwargs for S3-compatible "
            f"endpoint support, but found: {passed_checksum_keys}"
        )

    def test_preconfigured_client_is_used_as_is(self):
        """When a pre-configured client is passed, it should be used directly."""
        mock_client = MagicMock()
        adapter = S3StorageAdapter(client=mock_client)
        assert adapter.client is mock_client
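For reference, the setting these tests assert is an ordinary botocore Config option. A sketch of how such a client is typically built (the Config parameter names are standard botocore, matching the attributes asserted above; the endpoint URL is illustrative, not taken from the adapter):

import boto3
from botocore.config import Config

s3 = boto3.client(
    "s3",
    endpoint_url="https://s3.example-compatible-store.com",  # non-AWS endpoint (illustrative)
    config=Config(
        request_checksum_calculation="when_required",   # don't send CRC32/CRC64 unless the API needs it
        response_checksum_validation="when_required",
    ),
)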
tests/unit/test_s3_uri.py (new file, 44 lines added)
@@ -0,0 +1,44 @@
"""Tests for S3 URI helpers."""

import pytest

from deltaglider.core.s3_uri import build_s3_url, is_s3_url, parse_s3_url


def test_is_s3_url_detects_scheme() -> None:
    """is_s3_url should only match the S3 scheme."""
    assert is_s3_url("s3://bucket/path")
    assert not is_s3_url("https://example.com/object")


def test_parse_s3_url_returns_bucket_and_key() -> None:
    """Parsing should split bucket and key correctly."""
    parsed = parse_s3_url("s3://my-bucket/path/to/object.txt")
    assert parsed.bucket == "my-bucket"
    assert parsed.key == "path/to/object.txt"


def test_parse_strips_trailing_slash_when_requested() -> None:
    """strip_trailing_slash should normalise directory-style URLs."""
    parsed = parse_s3_url("s3://my-bucket/path/to/", strip_trailing_slash=True)
    assert parsed.bucket == "my-bucket"
    assert parsed.key == "path/to"


def test_parse_requires_key_when_configured() -> None:
    """allow_empty_key=False should reject bucket-only URLs."""
    with pytest.raises(ValueError):
        parse_s3_url("s3://bucket-only", allow_empty_key=False)


def test_build_s3_url_round_trip() -> None:
    """build_s3_url should round-trip with parse_s3_url."""
    url = build_s3_url("bucket", "dir/file.tar")
    parsed = parse_s3_url(url)
    assert parsed.bucket == "bucket"
    assert parsed.key == "dir/file.tar"


def test_build_s3_url_for_bucket_root() -> None:
    """When key is missing, build_s3_url should omit the trailing slash."""
    assert build_s3_url("root-bucket") == "s3://root-bucket"
tests/unit/test_stats_algorithm.py (new file, 479 lines added)
@@ -0,0 +1,479 @@
|
||||
"""Exhaustive tests for the bucket statistics algorithm."""
|
||||
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from deltaglider.client_operations.stats import get_bucket_stats
|
||||
|
||||
|
||||
class TestBucketStatsAlgorithm:
|
||||
"""Test suite for get_bucket_stats algorithm."""
|
||||
|
||||
@pytest.fixture
|
||||
def mock_client(self):
|
||||
"""Create a mock DeltaGliderClient."""
|
||||
client = Mock()
|
||||
client.service = Mock()
|
||||
client.service.storage = Mock()
|
||||
client.service.logger = Mock()
|
||||
return client
|
||||
|
||||
def test_empty_bucket(self, mock_client):
|
||||
"""Test statistics for an empty bucket."""
|
||||
# Setup: Empty bucket
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "empty-bucket")
|
||||
|
||||
# Verify
|
||||
assert stats.bucket == "empty-bucket"
|
||||
assert stats.object_count == 0
|
||||
assert stats.total_size == 0
|
||||
assert stats.compressed_size == 0
|
||||
assert stats.space_saved == 0
|
||||
assert stats.average_compression_ratio == 0.0
|
||||
assert stats.delta_objects == 0
|
||||
assert stats.direct_objects == 0
|
||||
|
||||
def test_bucket_with_only_direct_files(self, mock_client):
|
||||
"""Test bucket with only direct files (no compression)."""
|
||||
# Setup: Bucket with 3 direct files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "file1.pdf", "size": 1000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file2.html", "size": 500000, "last_modified": "2024-01-02"},
|
||||
{"key": "file3.txt", "size": 250000, "last_modified": "2024-01-03"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "direct-only-bucket")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 3
|
||||
assert stats.total_size == 1750000 # Sum of all files
|
||||
assert stats.compressed_size == 1750000 # Same as total (no compression)
|
||||
assert stats.space_saved == 0
|
||||
assert stats.average_compression_ratio == 0.0
|
||||
assert stats.delta_objects == 0
|
||||
assert stats.direct_objects == 3
|
||||
|
||||
def test_bucket_with_delta_compression(self, mock_client):
|
||||
"""Test bucket with delta-compressed files."""
|
||||
# Setup: Bucket with reference.bin and 2 delta files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{"key": "file2.zip.delta", "size": 60000, "last_modified": "2024-01-03"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata for delta files
|
||||
def mock_head(path):
|
||||
if "file1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000", "compression_ratio": "0.997"}
|
||||
return head
|
||||
elif "file2.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19600000", "compression_ratio": "0.997"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "compressed-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 2 # Only delta files counted (not reference.bin)
|
||||
assert stats.total_size == 39100000 # 19.5M + 19.6M
|
||||
assert stats.compressed_size == 20110000 # reference (20M) + deltas (50K + 60K)
|
||||
assert stats.space_saved == 18990000 # ~19MB saved
|
||||
assert stats.average_compression_ratio > 0.48 # ~48.6% compression
|
||||
assert stats.delta_objects == 2
|
||||
assert stats.direct_objects == 0
|
||||
|
||||
def test_orphaned_reference_bin_detection(self, mock_client):
|
||||
"""Test detection of orphaned reference.bin files."""
|
||||
# Setup: Bucket with reference.bin but no delta files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "regular.pdf", "size": 1000000, "last_modified": "2024-01-02"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "orphaned-ref-bucket")
|
||||
|
||||
# Verify stats
|
||||
assert stats.object_count == 1 # Only regular.pdf
|
||||
assert stats.total_size == 1000000 # Only regular.pdf size
|
||||
assert stats.compressed_size == 1000000 # reference.bin NOT included
|
||||
assert stats.space_saved == 0
|
||||
assert stats.delta_objects == 0
|
||||
assert stats.direct_objects == 1
|
||||
|
||||
# Verify warning was logged
|
||||
warning_calls = mock_client.service.logger.warning.call_args_list
|
||||
assert any("ORPHANED REFERENCE FILE" in str(call) for call in warning_calls)
|
||||
assert any("20,000,000 bytes" in str(call) for call in warning_calls)
|
||||
assert any(
|
||||
"aws s3 rm s3://orphaned-ref-bucket/reference.bin" in str(call)
|
||||
for call in warning_calls
|
||||
)
|
||||
|
||||
def test_mixed_bucket(self, mock_client):
|
||||
"""Test bucket with both delta and direct files."""
|
||||
# Setup: Mixed bucket
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "pro/reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "pro/v1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{"key": "pro/v2.zip.delta", "size": 60000, "last_modified": "2024-01-03"},
|
||||
{"key": "docs/readme.pdf", "size": 500000, "last_modified": "2024-01-04"},
|
||||
{"key": "docs/manual.html", "size": 300000, "last_modified": "2024-01-05"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata for delta files
|
||||
def mock_head(path):
|
||||
if "v1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000"}
|
||||
return head
|
||||
elif "v2.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19600000"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "mixed-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 4 # 2 delta + 2 direct files
|
||||
assert stats.total_size == 39900000 # 19.5M + 19.6M + 0.5M + 0.3M
|
||||
assert stats.compressed_size == 20910000 # ref (20M) + deltas (110K) + direct (800K)
|
||||
assert stats.space_saved == 18990000
|
||||
assert stats.delta_objects == 2
|
||||
assert stats.direct_objects == 2
|
||||
|
||||
def test_sha1_files_included(self, mock_client):
|
||||
"""Test that .sha1 checksum files are counted properly."""
|
||||
# Setup: Bucket with .sha1 files
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "file1.zip", "size": 1000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file1.zip.sha1", "size": 41, "last_modified": "2024-01-01"},
|
||||
{"key": "file2.tar", "size": 2000000, "last_modified": "2024-01-02"},
|
||||
{"key": "file2.tar.sha1", "size": 41, "last_modified": "2024-01-02"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "sha1-bucket")
|
||||
|
||||
# Verify - .sha1 files ARE counted
|
||||
assert stats.object_count == 4
|
||||
assert stats.total_size == 3000082 # All files including .sha1
|
||||
assert stats.compressed_size == 3000082
|
||||
assert stats.direct_objects == 4
|
||||
|
||||
def test_multiple_deltaspaces(self, mock_client):
|
||||
"""Test bucket with multiple deltaspaces (different prefixes)."""
|
||||
# Setup: Multiple deltaspaces
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "pro/reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "pro/v1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{
|
||||
"key": "enterprise/reference.bin",
|
||||
"size": 25000000,
|
||||
"last_modified": "2024-01-03",
|
||||
},
|
||||
{"key": "enterprise/v1.zip.delta", "size": 70000, "last_modified": "2024-01-04"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata
|
||||
def mock_head(path):
|
||||
if "pro/v1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000"}
|
||||
return head
|
||||
elif "enterprise/v1.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "24500000"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "multi-deltaspace-bucket", mode="detailed")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 2 # Only delta files
|
||||
assert stats.total_size == 44000000 # 19.5M + 24.5M
|
||||
assert stats.compressed_size == 45120000 # Both references + both deltas
|
||||
assert stats.delta_objects == 2
|
||||
assert stats.direct_objects == 0
|
||||
|
||||
def test_pagination_handling(self, mock_client):
|
||||
"""Test handling of paginated results."""
|
||||
# Setup: Paginated responses
|
||||
mock_client.service.storage.list_objects.side_effect = [
|
||||
{
|
||||
"objects": [
|
||||
{"key": f"file{i}.txt", "size": 1000, "last_modified": "2024-01-01"}
|
||||
for i in range(1000)
|
||||
],
|
||||
"is_truncated": True,
|
||||
"next_continuation_token": "token1",
|
||||
},
|
||||
{
|
||||
"objects": [
|
||||
{"key": f"file{i}.txt", "size": 1000, "last_modified": "2024-01-01"}
|
||||
for i in range(1000, 1500)
|
||||
],
|
||||
"is_truncated": False,
|
||||
},
|
||||
]
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "paginated-bucket")
|
||||
|
||||
# Verify
|
||||
assert stats.object_count == 1500
|
||||
assert stats.total_size == 1500000
|
||||
assert stats.compressed_size == 1500000
|
||||
assert stats.direct_objects == 1500
|
||||
|
||||
# Verify pagination was handled
|
||||
assert mock_client.service.storage.list_objects.call_count == 2
|
||||
|
||||
def test_delta_file_without_metadata(self, mock_client):
|
||||
"""Test handling of delta files with missing metadata in quick mode."""
|
||||
# Setup: Delta file without metadata
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# No metadata available (quick mode doesn't fetch metadata)
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute in quick mode (default)
|
||||
stats = get_bucket_stats(mock_client, "no-metadata-bucket", mode="quick")
|
||||
|
||||
# Verify - without metadata, original size cannot be calculated
|
||||
assert stats.object_count == 1
|
||||
assert stats.total_size == 0 # Cannot calculate without metadata
|
||||
assert stats.compressed_size == 20050000 # reference + delta
|
||||
assert stats.space_saved == 0 # Cannot calculate without metadata
|
||||
assert stats.delta_objects == 1
|
||||
|
||||
# Verify warning was logged about incomplete stats in quick mode
|
||||
warning_calls = mock_client.service.logger.warning.call_args_list
|
||||
assert any("Quick mode cannot calculate" in str(call) for call in warning_calls)
|
||||
|
||||
def test_parallel_metadata_fetching(self, mock_client):
|
||||
"""Test that metadata is fetched in parallel for performance."""
|
||||
# Setup: Many delta files
|
||||
num_deltas = 50
|
||||
objects = [{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"}]
|
||||
objects.extend(
|
||||
[
|
||||
{
|
||||
"key": f"file{i}.zip.delta",
|
||||
"size": 50000 + i,
|
||||
"last_modified": f"2024-01-{i + 2:02d}",
|
||||
}
|
||||
for i in range(num_deltas)
|
||||
]
|
||||
)
|
||||
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": objects,
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata
|
||||
def mock_head(path):
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19500000"}
|
||||
return head
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute with mocked ThreadPoolExecutor
|
||||
with patch("concurrent.futures.ThreadPoolExecutor") as mock_executor:
|
||||
mock_pool = MagicMock()
|
||||
mock_executor.return_value.__enter__.return_value = mock_pool
|
||||
|
||||
# Simulate parallel execution
|
||||
futures = []
|
||||
for i in range(num_deltas):
|
||||
future = Mock()
|
||||
future.result.return_value = (f"file{i}.zip.delta", {"dg-file-size": "19500000"})
|
||||
futures.append(future)
|
||||
|
||||
mock_pool.submit.side_effect = futures
|
||||
patch_as_completed = patch(
|
||||
"concurrent.futures.as_completed",
|
||||
return_value=futures,
|
||||
)
|
||||
|
||||
with patch_as_completed:
|
||||
_ = get_bucket_stats(mock_client, "parallel-bucket", mode="detailed")
|
||||
|
||||
# Verify ThreadPoolExecutor was used with correct max_workers
|
||||
mock_executor.assert_called_once_with(max_workers=10) # min(10, 50) = 10
|
||||
|
||||
def test_stats_modes_control_metadata_fetch(self, mock_client):
|
||||
"""Metadata fetching should depend on the selected stats mode."""
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "alpha/reference.bin", "size": 100, "last_modified": "2024-01-01"},
|
||||
{"key": "alpha/file1.zip.delta", "size": 10, "last_modified": "2024-01-02"},
|
||||
{"key": "alpha/file2.zip.delta", "size": 12, "last_modified": "2024-01-03"},
|
||||
{"key": "beta/reference.bin", "size": 200, "last_modified": "2024-01-04"},
|
||||
{"key": "beta/file1.zip.delta", "size": 20, "last_modified": "2024-01-05"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
metadata_by_key = {
|
||||
"alpha/file1.zip.delta": {"dg-file-size": "100", "compression_ratio": "0.9"},
|
||||
"alpha/file2.zip.delta": {"dg-file-size": "120", "compression_ratio": "0.88"},
|
||||
"beta/file1.zip.delta": {"dg-file-size": "210", "compression_ratio": "0.9"},
|
||||
}
|
||||
|
||||
def mock_head(path: str):
|
||||
for key, metadata in metadata_by_key.items():
|
||||
if key in path:
|
||||
head = Mock()
|
||||
head.metadata = metadata
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Quick mode: no metadata fetch
|
||||
_ = get_bucket_stats(mock_client, "mode-test", mode="quick")
|
||||
assert mock_client.service.storage.head.call_count == 0
|
||||
|
||||
# Sampled mode: one HEAD per delta-space (alpha, beta)
|
||||
mock_client.service.storage.head.reset_mock()
|
||||
stats_sampled = get_bucket_stats(mock_client, "mode-test", mode="sampled")
|
||||
assert mock_client.service.storage.head.call_count == 2
|
||||
|
||||
# Detailed mode: HEAD for every delta (3 total)
|
||||
mock_client.service.storage.head.reset_mock()
|
||||
stats_detailed = get_bucket_stats(mock_client, "mode-test", mode="detailed")
|
||||
assert mock_client.service.storage.head.call_count == 3
|
||||
|
||||
# Sampled totals should be close to detailed but not identical
|
||||
assert stats_detailed.total_size == 100 + 120 + 210
|
||||
assert stats_sampled.total_size == 100 + 100 + 210
|
||||
|
||||
def test_error_handling_in_metadata_fetch(self, mock_client):
|
||||
"""Test graceful handling of errors during metadata fetch."""
|
||||
# Setup
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{"key": "file1.zip.delta", "size": 50000, "last_modified": "2024-01-02"},
|
||||
{"key": "file2.zip.delta", "size": 60000, "last_modified": "2024-01-03"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
|
||||
# Mock metadata fetch to fail for one file
|
||||
def mock_head(path):
|
||||
if "file1.zip.delta" in path:
|
||||
raise Exception("S3 error")
|
||||
elif "file2.zip.delta" in path:
|
||||
head = Mock()
|
||||
head.metadata = {"dg-file-size": "19600000"}
|
||||
return head
|
||||
return None
|
||||
|
||||
mock_client.service.storage.head.side_effect = mock_head
|
||||
|
||||
# Execute - should handle error gracefully
|
||||
stats = get_bucket_stats(mock_client, "error-bucket", mode="detailed")
|
||||
|
||||
# Verify - file1 has no metadata (error), file2 uses metadata
|
||||
assert stats.object_count == 2
|
||||
assert stats.delta_objects == 2
|
||||
# file1 has no metadata so not counted in original size, file2 uses metadata (19600000)
|
||||
assert stats.total_size == 19600000
|
||||
|
||||
# Verify warning was logged for file1
|
||||
warning_calls = mock_client.service.logger.warning.call_args_list
|
||||
assert any(
|
||||
"file1.zip.delta" in str(call) and "no original_size metadata" in str(call)
|
||||
for call in warning_calls
|
||||
)
|
||||
|
||||
def test_multiple_orphaned_references(self, mock_client):
|
||||
"""Test detection of multiple orphaned reference.bin files."""
|
||||
# Setup: Multiple orphaned references
|
||||
mock_client.service.storage.list_objects.return_value = {
|
||||
"objects": [
|
||||
{"key": "pro/reference.bin", "size": 20000000, "last_modified": "2024-01-01"},
|
||||
{
|
||||
"key": "enterprise/reference.bin",
|
||||
"size": 25000000,
|
||||
"last_modified": "2024-01-02",
|
||||
},
|
||||
{"key": "community/reference.bin", "size": 15000000, "last_modified": "2024-01-03"},
|
||||
{"key": "regular.pdf", "size": 1000000, "last_modified": "2024-01-04"},
|
||||
],
|
||||
"is_truncated": False,
|
||||
}
|
||||
mock_client.service.storage.head.return_value = None
|
||||
|
||||
# Execute
|
||||
stats = get_bucket_stats(mock_client, "multi-orphaned-bucket")
|
||||
|
||||
# Verify stats
|
||||
assert stats.object_count == 1 # Only regular.pdf
|
||||
assert stats.total_size == 1000000
|
||||
assert stats.compressed_size == 1000000 # No references included
|
||||
assert stats.space_saved == 0
|
||||
|
||||
# Verify warnings for all orphaned references
|
||||
warning_calls = [str(call) for call in mock_client.service.logger.warning.call_args_list]
|
||||
warning_text = " ".join(warning_calls)
|
||||
|
||||
assert "ORPHANED REFERENCE FILE" in warning_text
|
||||
assert "3 reference.bin file(s)" in warning_text
|
||||
assert "60,000,000 bytes" in warning_text # Total of all references
|
||||
assert "s3://multi-orphaned-bucket/pro/reference.bin" in warning_text
|
||||
assert "s3://multi-orphaned-bucket/enterprise/reference.bin" in warning_text
|
||||
assert "s3://multi-orphaned-bucket/community/reference.bin" in warning_text
|
||||
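Worked through on the numbers used in test_bucket_with_delta_compression above: original size comes from each delta's dg-file-size metadata, while physical size is the reference plus the deltas, so

total_size      = 19_500_000 + 19_600_000          # sum of dg-file-size for the two deltas
compressed_size = 20_000_000 + 50_000 + 60_000     # reference.bin + both .delta objects
space_saved     = total_size - compressed_size     # 18,990,000 bytes
ratio           = space_saved / total_size         # ~0.486, hence the "> 0.48" assertion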
tests/unit/test_stats_caching.py (new file, 284 lines added)
@@ -0,0 +1,284 @@
|
||||
"""Unit tests for bucket stats caching functionality."""
|
||||
|
||||
import json
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from deltaglider.client_models import BucketStats
|
||||
from deltaglider.client_operations.stats import (
|
||||
_get_cache_key,
|
||||
_is_cache_valid,
|
||||
_read_stats_cache,
|
||||
_write_stats_cache,
|
||||
)
|
||||
|
||||
|
||||
def test_get_cache_key():
|
||||
"""Test cache key generation for different modes."""
|
||||
assert _get_cache_key("quick") == ".deltaglider/stats_quick.json"
|
||||
assert _get_cache_key("sampled") == ".deltaglider/stats_sampled.json"
|
||||
assert _get_cache_key("detailed") == ".deltaglider/stats_detailed.json"
|
||||
|
||||
|
||||
def test_is_cache_valid_when_unchanged():
|
||||
"""Test cache validation when bucket hasn't changed."""
|
||||
cached_validation = {
|
||||
"object_count": 100,
|
||||
"compressed_size": 50000,
|
||||
}
|
||||
|
||||
assert _is_cache_valid(cached_validation, 100, 50000) is True
|
||||
|
||||
|
||||
def test_is_cache_valid_when_count_changed():
|
||||
"""Test cache validation when object count changed."""
|
||||
cached_validation = {
|
||||
"object_count": 100,
|
||||
"compressed_size": 50000,
|
||||
}
|
||||
|
||||
# Object count changed
|
||||
assert _is_cache_valid(cached_validation, 101, 50000) is False
|
||||
|
||||
|
||||
def test_is_cache_valid_when_size_changed():
|
||||
"""Test cache validation when compressed size changed."""
|
||||
cached_validation = {
|
||||
"object_count": 100,
|
||||
"compressed_size": 50000,
|
||||
}
|
||||
|
||||
# Compressed size changed
|
||||
assert _is_cache_valid(cached_validation, 100, 60000) is False
|
||||
|
||||
|
||||
def test_write_and_read_cache_roundtrip():
|
||||
"""Test writing and reading cache with valid data."""
|
||||
# Create mock client and storage
|
||||
mock_storage = MagicMock()
|
||||
mock_logger = MagicMock()
|
||||
mock_service = MagicMock()
|
||||
mock_service.storage = mock_storage
|
||||
mock_service.logger = mock_logger
|
||||
mock_client = MagicMock()
|
||||
mock_client.service = mock_service
|
||||
|
||||
# Create test stats
|
||||
test_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=150,
|
||||
total_size=1000000,
|
||||
compressed_size=50000,
|
||||
space_saved=950000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=140,
|
||||
direct_objects=10,
|
||||
)
|
||||
|
||||
# Capture what was written to storage
|
||||
written_data = None
|
||||
|
||||
def capture_put(address, data, metadata):
|
||||
nonlocal written_data
|
||||
written_data = data
|
||||
|
||||
mock_storage.put = capture_put
|
||||
|
||||
# Write cache
|
||||
_write_stats_cache(
|
||||
client=mock_client,
|
||||
bucket="test-bucket",
|
||||
mode="quick",
|
||||
stats=test_stats,
|
||||
object_count=150,
|
||||
compressed_size=50000,
|
||||
)
|
||||
|
||||
# Verify something was written
|
||||
assert written_data is not None
|
||||
|
||||
# Parse written data
|
||||
cache_data = json.loads(written_data.decode("utf-8"))
|
||||
|
||||
# Verify structure
|
||||
assert cache_data["version"] == "1.0"
|
||||
assert cache_data["mode"] == "quick"
|
||||
assert "computed_at" in cache_data
|
||||
assert cache_data["validation"]["object_count"] == 150
|
||||
assert cache_data["validation"]["compressed_size"] == 50000
|
||||
assert cache_data["stats"]["bucket"] == "test-bucket"
|
||||
assert cache_data["stats"]["object_count"] == 150
|
||||
assert cache_data["stats"]["delta_objects"] == 140
|
||||
|
||||
# Now test reading it back
|
||||
mock_obj = MagicMock()
|
||||
mock_obj.data = written_data
|
||||
mock_storage.get = MagicMock(return_value=mock_obj)
|
||||
|
||||
stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")
|
||||
|
||||
# Verify read stats match original
|
||||
assert stats is not None
|
||||
assert validation is not None
|
||||
assert stats.bucket == "test-bucket"
|
||||
assert stats.object_count == 150
|
||||
assert stats.delta_objects == 140
|
||||
assert stats.average_compression_ratio == 0.95
|
||||
assert validation["object_count"] == 150
|
||||
assert validation["compressed_size"] == 50000
|
||||
|
||||
|
||||
def test_read_cache_missing_file():
|
||||
"""Test reading cache when file doesn't exist."""
|
||||
mock_storage = MagicMock()
|
||||
mock_logger = MagicMock()
|
||||
mock_service = MagicMock()
|
||||
mock_service.storage = mock_storage
|
||||
mock_service.logger = mock_logger
|
||||
mock_client = MagicMock()
|
||||
mock_client.service = mock_service
|
||||
|
||||
# Simulate FileNotFoundError
|
||||
mock_storage.get.side_effect = FileNotFoundError("No such key")
|
||||
|
||||
stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")
|
||||
|
||||
assert stats is None
|
||||
assert validation is None
|
||||
|
||||
|
||||
def test_read_cache_invalid_json():
|
||||
"""Test reading cache with corrupted JSON."""
|
||||
mock_storage = MagicMock()
|
||||
mock_logger = MagicMock()
|
||||
mock_service = MagicMock()
|
||||
mock_service.storage = mock_storage
|
||||
mock_service.logger = mock_logger
|
||||
mock_client = MagicMock()
|
||||
mock_client.service = mock_service
|
||||
|
||||
# Return invalid JSON
|
||||
mock_obj = MagicMock()
|
||||
mock_obj.data = b"not valid json {]["
|
||||
mock_storage.get = MagicMock(return_value=mock_obj)
|
||||
|
||||
stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")
|
||||
|
||||
assert stats is None
|
||||
assert validation is None
|
||||
mock_logger.warning.assert_called_once()
|
||||
|
||||
|
||||
def test_read_cache_version_mismatch():
|
||||
"""Test reading cache with wrong version."""
|
||||
mock_storage = MagicMock()
|
||||
mock_logger = MagicMock()
|
||||
mock_service = MagicMock()
|
||||
mock_service.storage = mock_storage
|
||||
mock_service.logger = mock_logger
|
||||
mock_client = MagicMock()
|
||||
mock_client.service = mock_service
|
||||
|
||||
# Cache with wrong version
|
||||
cache_data = {
|
||||
"version": "2.0", # Wrong version
|
||||
"mode": "quick",
|
||||
"validation": {"object_count": 100, "compressed_size": 50000},
|
||||
"stats": {
|
||||
"bucket": "test",
|
||||
"object_count": 100,
|
||||
"total_size": 1000,
|
||||
"compressed_size": 500,
|
||||
"space_saved": 500,
|
||||
"average_compression_ratio": 0.5,
|
||||
"delta_objects": 90,
|
||||
"direct_objects": 10,
|
||||
},
|
||||
}
|
||||
|
||||
mock_obj = MagicMock()
|
||||
mock_obj.data = json.dumps(cache_data).encode("utf-8")
|
||||
mock_storage.get = MagicMock(return_value=mock_obj)
|
||||
|
||||
stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")
|
||||
|
||||
assert stats is None
|
||||
assert validation is None
|
||||
mock_logger.warning.assert_called_once()
|
||||
|
||||
|
||||
def test_read_cache_mode_mismatch():
|
||||
"""Test reading cache with wrong mode."""
|
||||
mock_storage = MagicMock()
|
||||
mock_logger = MagicMock()
|
||||
mock_service = MagicMock()
|
||||
mock_service.storage = mock_storage
|
||||
mock_service.logger = mock_logger
|
||||
mock_client = MagicMock()
|
||||
mock_client.service = mock_service
|
||||
|
||||
# Cache with mismatched mode
|
||||
cache_data = {
|
||||
"version": "1.0",
|
||||
"mode": "detailed", # Wrong mode
|
||||
"validation": {"object_count": 100, "compressed_size": 50000},
|
||||
"stats": {
|
||||
"bucket": "test",
|
||||
"object_count": 100,
|
||||
"total_size": 1000,
|
||||
"compressed_size": 500,
|
||||
"space_saved": 500,
|
||||
"average_compression_ratio": 0.5,
|
||||
"delta_objects": 90,
|
||||
"direct_objects": 10,
|
||||
},
|
||||
}
|
||||
|
||||
mock_obj = MagicMock()
|
||||
mock_obj.data = json.dumps(cache_data).encode("utf-8")
|
||||
mock_storage.get = MagicMock(return_value=mock_obj)
|
||||
|
||||
# Request "quick" mode but cache has "detailed"
|
||||
stats, validation = _read_stats_cache(mock_client, "test-bucket", "quick")
|
||||
|
||||
assert stats is None
|
||||
assert validation is None
|
||||
mock_logger.warning.assert_called_once()
|
||||
|
||||
|
||||
def test_write_cache_handles_errors_gracefully():
|
||||
"""Test that cache write failures don't crash the program."""
|
||||
mock_storage = MagicMock()
|
||||
mock_logger = MagicMock()
|
||||
mock_service = MagicMock()
|
||||
mock_service.storage = mock_storage
|
||||
mock_service.logger = mock_logger
|
||||
mock_client = MagicMock()
|
||||
mock_client.service = mock_service
|
||||
|
||||
# Simulate S3 permission error
|
||||
mock_storage.put.side_effect = PermissionError("Access denied")
|
||||
|
||||
test_stats = BucketStats(
|
||||
bucket="test-bucket",
|
||||
object_count=150,
|
||||
total_size=1000000,
|
||||
compressed_size=50000,
|
||||
space_saved=950000,
|
||||
average_compression_ratio=0.95,
|
||||
delta_objects=140,
|
||||
direct_objects=10,
|
||||
)
|
||||
|
||||
# Should not raise exception
|
||||
_write_stats_cache(
|
||||
client=mock_client,
|
||||
bucket="test-bucket",
|
||||
mode="quick",
|
||||
stats=test_stats,
|
||||
object_count=150,
|
||||
compressed_size=50000,
|
||||
)
|
||||
|
||||
# Should log warning
|
||||
mock_logger.warning.assert_called_once()
|
||||
assert "Failed to write cache" in str(mock_logger.warning.call_args)
|
||||
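The validation block these tests exercise is a cheap change-fingerprint: cached stats are reused only while the bucket's object count and physical size are unchanged. A sketch of that check, assuming the (cached_validation, object_count, compressed_size) arguments shown above; the real helper is the private _is_cache_valid, so this is illustrative only:

def is_cache_valid(cached_validation: dict, object_count: int, compressed_size: int) -> bool:
    # Any upload or delete changes at least one of these two numbers,
    # so matching both is enough to trust the cached BucketStats.
    return (
        cached_validation.get("object_count") == object_count
        and cached_validation.get("compressed_size") == compressed_size
    )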